Example #1
class BagOfWordSentiment():
    def __init__(self, no_of_grams=4, verbose=True, no_of_testcases=1000):
        self.verbose = verbose
        self.logger = Logger('BagOfWordSentiment',
                             'logs\\bag_of_words.log',
                             is_verbose=self.verbose)

        self.no_of_grams = no_of_grams

        self.double_negations, self.double_negations_collection = set(), set()
        self.negations, self.negation_collection = set(), set()
        self.positive_words, self.positive_word_collection = set(), set()
        self.negative_words, self.negative_word_collection = set(), set()

        self.no_of_testcases = no_of_testcases
        self.positve_test_bag = list()
        self.negative_test_bag = list()

    def ready(self):
        self.logger.info("Bag of words loading")
        self.load_data()
        self.logger.info("Bag of words ready")

    def classify(self, sentence):
        '''
        classifies the sentence as positive, negative, or neutral using the bag-of-words method
        '''
        positive_score, negative_score = self.find_score(sentence)

        if positive_score > negative_score:
            self.logger.info("sentence - " + sentence + " - is positive")
            return ("positive", 1, positive_score)

        if positive_score < negative_score:
            self.logger.info("sentence - " + sentence + " - is negative")
            return ("negative", 0, negative_score)

        if positive_score == negative_score:
            self.logger.info("sentence - " + sentence + " - is neutral")
            return ("neutral", -1, positive_score)

    def find_score(self, sentence):
        '''
        finds the positive and negative scores for a given sentence
        '''
        positive_score, negative_score = 0, 0
        self.logger.info("sentence : " + sentence)
        sentence = self.tokenise(sentence)
        self.logger.info("tokenised sentence after cleaning : " +
                         str(sentence))

        kgrams = list()
        for k in range(self.no_of_grams, 0, -1):
            kgrams.extend(self.get_kgrams(sentence, k))
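        # note: k-grams are built from the longest (no_of_grams) down to unigrams, so
        # multi-word phrases are matched and consumed before their individual words are scored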

        for kgram in kgrams:
            phrase = ' '.join(kgram)
            sentence = ' '.join(sentence)

            if phrase in sentence:
                self.logger.info("considering phrase '" + phrase + "' from '" +
                                 sentence + "'")
                #check this phrase for double negation
                contains_double_negation, remaining_phrase = self.is_double_negation(
                    phrase)
                if contains_double_negation:
                    if self.is_positive(remaining_phrase):
                        positive_score += 1
                        sentence = sentence.replace(phrase, ' ')
                        sentence = self.tokenise(sentence)
                        self.logger.info(
                            "double negation of positive phrase : " + phrase)
                        continue
                    if self.is_negative(remaining_phrase):
                        negative_score += 1
                        sentence = sentence.replace(phrase, ' ')
                        sentence = self.tokenise(sentence)
                        self.logger.info(
                            "double negation of negative phrase : " + phrase)
                        continue

                #check this phrase for negations
                contains_negation, remaining_phrase = self.is_negation(phrase)
                if contains_negation:
                    if self.is_positive(remaining_phrase):
                        negative_score += 1
                        sentence = sentence.replace(phrase, ' ')
                        sentence = self.tokenise(sentence)
                        self.logger.info("negation of positive phrase : " +
                                         phrase)
                        continue
                    if self.is_negative(remaining_phrase):
                        positive_score += 1
                        sentence = sentence.replace(phrase, ' ')
                        sentence = self.tokenise(sentence)
                        self.logger.info("negation of negative phrase : " +
                                         phrase)
                        continue

                #check for positive phrase
                if self.is_positive(phrase):
                    positive_score += 1
                    sentence = sentence.replace(phrase, ' ')
                    sentence = self.tokenise(sentence)
                    self.logger.info("positive phrase : " + phrase)
                    continue

                #check for negative phrase
                if self.is_negative(phrase):
                    negative_score += 1
                    sentence = sentence.replace(phrase, ' ')
                    sentence = self.tokenise(sentence)
                    self.logger.info("negative phrase : " + phrase)
                    continue

                self.logger.info("cannot deduce sentiment from phrase '" +
                                 phrase + "'")
            sentence = self.tokenise(sentence)

        return positive_score, negative_score

    def is_double_negation(self, phrase):
        '''
        checks whether the phrase starts with a double negation and, if so, returns the remaining phrase
        '''
        for double_negation in self.double_negations:
            double_negation = double_negation + " "
            if phrase.startswith(double_negation):
                remaining_phrase = phrase.replace(double_negation, '')
                return True, remaining_phrase

        for double_negation in self.double_negations_collection:
            if phrase.startswith(double_negation):
                phrase_length = len(phrase.split(" "))
                double_negation_length = len(double_negation.split(" "))
                diff = phrase_length - double_negation_length

                if diff <= 0:
                    return False, phrase

                remaining_phrase = ' '.join(phrase.split(" ")[-diff:])
                return True, remaining_phrase
        return False, phrase

    def is_negation(self, phrase):
        '''
        checks whether the phrase starts with a negation and, if so, returns the remaining phrase
        '''
        for negation in self.negations:
            negation = negation + " "
            if phrase.startswith(negation):
                remaining_phrase = phrase.replace(negation, '')
                return True, remaining_phrase

        for negation in self.negation_collection:
            if phrase.startswith(negation):
                phrase_length = len(phrase.split(" "))
                negation_length = len(negation.split(" "))
                diff = phrase_length - negation_length

                if diff <= 0:
                    return False, phrase

                remaining_phrase = ' '.join(phrase.split(" ")[-diff:])
                return True, remaining_phrase
        return False, phrase

    def is_positive(self, word):
        '''
        checks whether a word is in the bag of positive words or starts with a known positive word prefix
        '''
        if word in self.positive_words:
            return True
        for positive_word in self.positive_word_collection:
            if word.startswith(positive_word):
                return True
        return False

    def is_negative(self, word):
        '''
        checks whether a word is in the bag of negative words or starts with a known negative word prefix
        '''
        if word in self.negative_words:
            return True
        for negative_word in self.negative_word_collection:
            if word.startswith(negative_word):
                return True
        return False

    def get_kgrams(self, sentence, k=1):
        '''
        returns a list of k-grams from a given sentence
        '''
        grams = list()
        for i in range(len(sentence)):
            grams.append(sentence[i:i + k])
            if i + k >= len(sentence):
                break
        return grams

    def load_data(self):
        '''
        loads the data necessary for analysis
        '''
        double_negation_files = [
            'res\\bag_of_words_dataset\\double_negation.txt'
        ]
        negations_files = ['res\\bag_of_words_dataset\\negation.txt']
        positive_word_files = ['res\\bag_of_words_dataset\\positive_words.txt']
        negative_word_files = ['res\\bag_of_words_dataset\\negative_words.txt']

        self.double_negations, self.double_negations_collection = self.get_words(
            self.load_data_from_files(double_negation_files))
        self.negations, self.negation_collection = self.get_words(
            self.load_data_from_files(negations_files))
        self.positive_words, self.positive_word_collection = self.get_words(
            self.load_data_from_files(positive_word_files))
        self.negative_words, self.negative_word_collection = self.get_words(
            self.load_data_from_files(negative_word_files))

        self.logger.info("words loaded")
        self.logger.info("double negations : " + str(
            len(self.double_negations) +
            len(self.double_negations_collection)))
        self.logger.info(
            "negations : " +
            str(len(self.negations) + len(self.negation_collection)))
        self.logger.info(
            "positive words : " +
            str(len(self.positive_words) + len(self.positive_word_collection)))
        self.logger.info(
            "negative words : " +
            str(len(self.negative_words) + len(self.negative_word_collection)))

    def get_words(self, input_words):
        '''
        cleans the input words and groups them into a set of exact words and
        a set of word prefixes (words marked with '*' that have different forms)
        '''
        words = set()
        multiple_words = set()
        for word in input_words:
            word = word.replace('\n', '').replace('(1)', '').replace("'", '')
            word = word.replace('_', ' ').replace('-', ' ').strip().lower()
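            # words marked with '*' in the source files are prefixes covering several
            # word forms; they are kept separately and matched later with startswith()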
            if '*' in word:
                word = word.replace('*', '')
                multiple_words.add(word.strip())
                continue
            words.add(word)
        return words, multiple_words

    def tokenise(self, sentence):
        '''
        split the sentence into words
        '''
        sentence = self.clean(sentence)
        tokens = sentence.split(' ')
        filtered_tokens = list()
        for token in tokens:
            if len(token.strip()) != 0:
                filtered_tokens.append(token)
        return filtered_tokens

    def clean(self, sentence):
        '''
        clean the sentence by removing ignored characters
        '''
        ignore_characters = '''\t\n&"`~@#$%^*;+=<>//.,()[]{!}?:;_-'''
        sentence = sentence.lower().strip()
        sentence = self.remove_stop_words(sentence)
        sentence = self.replace_characters(sentence, ignore_characters)
        sentence = sentence.replace("'", '')
        return sentence.lower().strip()

    def remove_stop_words(self, sentence):
        stop_words = self.load_data_from_files(
            ['res\\bag_of_words_dataset\\refined_stop_words.txt'])
        sentence = sentence.split(" ")
        stop_word_set = set()
        for stop_word in stop_words:
            stop_word_set.add(
                stop_word.replace('\n', '').replace('\t', '').strip())
        new_sentence = list()
        for word in sentence:
            if word not in stop_word_set:
                new_sentence.append(word)
        return ' '.join(new_sentence)

    def replace_characters(self, text, characters):
        '''
        replaces the specified characters in the text with blank spaces
        '''
        for char in characters:
            text = text.replace(char, ' ')
        return text

    def load_data_from_files(self, filenames, encoding="utf8"):
        '''
        load the data as a list from the specified filenames
        '''
        data = list()
        for filename in filenames:
            with open(filename, encoding=encoding) as file:
                data.extend(file.readlines())
        return data

    def find_accuracy(self):

        self.load_test_cases()
        self.create_test_set()

        correct, wrong = 0, 0
        total = len(self.positve_test_bag) + len(self.negative_test_bag)

        _correct, _wrong = self.test_for_bag(self.positve_test_bag,
                                             actual_result=1)
        correct += _correct
        wrong += _wrong

        _correct, _wrong = self.test_for_bag(self.negative_test_bag,
                                             actual_result=0)
        correct += _correct
        wrong += _wrong

        self.accuracy = (correct / total) * 100
        self.logger.info("total test sentences : " + str(total))
        self.logger.info("correct output : " + str(correct))
        self.logger.info("wrong output : " + str(wrong))
        self.logger.info("accuracy (%) : " + str(int(self.accuracy)))
        return (self.accuracy, total, correct, wrong)

    def test_for_bag(self, bag, actual_result):
        self.logger.is_verbose = False
        correct, wrong = 0, 0
        for sentence in bag:
            result = self.classify(sentence=sentence)
            if result[1] == actual_result:
                correct += 1
            else:
                wrong += 1
        self.logger.is_verbose = True
        self.logger.debug("total test sentences in bag : " + str(len(bag)))
        self.logger.debug("correct output : " + str(correct))
        self.logger.debug("wrong output : " + str(wrong))
        self.logger.debug("accuracy (%) : " +
                          str(int((correct / len(bag)) * 100)))
        return correct, wrong

    def create_test_set(self):
        '''
        randomly selects test sentences from the positive and negative bags, producing an even split of positive and negative test sentences
        '''
        from numpy import random as np_random
        count = self.no_of_testcases // 2
        while (count != 0):
            # np.random.randint excludes the high bound, so these draw valid bag indices
            # (random_integers is deprecated in NumPy in favour of randint)
            index = np_random.randint(low=0, high=len(self.positive_bag))
            self.positve_test_bag.append(self.positive_bag.pop(index))
            index = np_random.randint(low=0, high=len(self.negative_bag))
            self.negative_test_bag.append(self.negative_bag.pop(index))
            count -= 1

        self.logger.info("test sentences selected")
        self.logger.info(
            "Total sentences for testing : " +
            str(len(self.positve_test_bag) + len(self.negative_test_bag)))
        self.logger.info("positive sentences for testing : " +
                         str(len(self.positve_test_bag)))
        self.logger.info("negative sentences for testing : " +
                         str(len(self.negative_test_bag)))

    def load_test_cases(self):
        '''
        loads the positive and negative sentences from filenames specified
        '''
        mixed_bag_paths = [
            'res\\dataset\\uci_dataset\\yelp_labelled.txt',
            'res\\dataset\\uci_dataset\\amazon_cells_labelled.txt',
            'res\\dataset\\uci_dataset\\imdb_labelled.txt'
        ]

        # the following datasets contain hard test cases
        positive_bag_paths = [
            'res\\dataset\\polarity_dataset\\rt-polarity-pos.txt'
        ]
        negative_bag_paths = [
            'res\\dataset\\polarity_dataset\\rt-polarity-neg.txt'
        ]
        # uncomment the two lines below to exclude the difficult test cases
        # positive_bag_paths = []
        # negative_bag_paths = []

        self.positive_bag, self.negative_bag = list(), list()
        count_positive, count_negative = 0, 0
        for filename in mixed_bag_paths:
            for mixed_data in self.load_data_from_files([filename]):
                sentence, label = mixed_data.split('\t')
                label = int(label)
                if label == 1:  #if sentence is positive
                    self.positive_bag.append(sentence)
                    count_positive += 1
                else:
                    self.negative_bag.append(sentence)
                    count_negative += 1
        self.logger.debug("sentences from mixed bag imported")
        self.logger.debug("positive sentences : " + str(count_positive))
        self.logger.debug("negative sentences : " + str(count_negative))

        count_positive = 0
        for filename in positive_bag_paths:
            for sentence in self.load_data_from_files([filename]):
                self.positive_bag.append(sentence)
                count_positive += 1
        self.logger.debug("sentences from positive bag imported")
        self.logger.debug("positive sentences : " + str(count_positive))

        count_negative = 0
        for filename in negative_bag_paths:
            for sentence in self.load_data_from_files([filename]):
                self.negative_bag.append(sentence)
                count_negative += 1
        self.logger.debug("sentences from negative bag imported")
        self.logger.debug("negative sentences : " + str(count_negative))

        self.logger.debug("sentences imported")
        self.logger.debug("Total sentences : " +
                          str(len(self.positive_bag) + len(self.negative_bag)))
        self.logger.debug("positive sentences : " +
                          str(len(self.positive_bag)))
        self.logger.debug("negative sentences : " +
                          str(len(self.negative_bag)))
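

# A minimal usage sketch (not part of the original source). It assumes the project-local
# Logger class used above is available and that the res\bag_of_words_dataset\* word lists
# exist on disk; the exact classify() output depends on those word lists.
def _demo_bag_of_words():
    classifier = BagOfWordSentiment(no_of_grams=4, verbose=True)
    classifier.ready()  # load the word bags and stop words from disk
    verdict, label, score = classifier.classify("the plot was not bad at all")
    print(verdict, label, score)  # e.g. ('positive', 1, 1), depending on the word lists
    # accuracy, total, correct, wrong = classifier.find_accuracy()
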
Example #2
class NaiveBayers():
    def __init__(self, verbose=True, training_cases=2500, testcases=500):
        self.verbose = verbose
        self.training_cases = training_cases
        self.testcases = testcases
        self.training = list()
        self.test = list()
        self.frequency = dict()
        self.stop_words = self.get_stop_words()
        self.positive_words = 0
        self.negative_words = 0
        self.positive_sentence_count = 0
        self.negative_sentence_count = 0
        self.total_sentences = 0
        self.logger = Logger('NaiveBayers', 'NaiveBayers.log')
        self.filenames = [
            'res\\benchmark\\yelp_labelled.txt',
            'res\\benchmark\\amazon_cells_labelled.txt',
            'res\\benchmark\\imdb_labelled.txt'
        ]

    def _print(self, message):
        if self.verbose:
            print(message)

    def clean(self, sentence):
        ignore_characters = '''\t\n&"`~@#$%^*;+=<>//.,()[]{}:;!?'''
        sentence = self.replace_characters(sentence, ignore_characters)
        return sentence.lower().strip()

    def tokenise(self, sentence):
        sentence = self.clean(sentence)
        tokens = sentence.split(' ')
        filtered_tokens = list()
        for token in tokens:
            if len(token.strip()) != 0:
                filtered_tokens.append(token)
        return filtered_tokens

    def replace_characters(self, text, characters):
        for char in characters:
            text = text.replace(char, ' ')
        return text

    def get_data(self):
        data = list()
        for filename in self.filenames:
            self._print("Filename : " + filename)
            for datum in tqdm(self.load_data_from_file(filename)):
                sentence, label = datum.split('\t')
                label = int(label)
                sentence = self.clean(sentence)
                data.append([sentence, label])
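        # first training_cases sentences train the model, the last testcases are held out;
        # the two slices can overlap if the files contain fewer than training_cases + testcases lines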
        self.training = data[:self.training_cases]
        self.test = data[-self.testcases:]

    def load_data_from_file(self, filename, encoding="utf8"):
        with open(filename, encoding=encoding) as file:
            data = file.readlines()
        return data

    def get_kgrams(self, sentence, k=1):
        grams = list()
        for i in range(len(sentence)):
            grams.append(sentence[i:i + k])
            if i + k >= len(sentence):
                break
        return grams

    def train(self):
        # try:
        #     with open('frequency.pickle', "rb") as file:
        #         self.frequency = pickle.load(file)
        #     with open("count.pickle", "rb") as file:
        #         self.positive_words, self.negative_words = pickle.load(file)
        # except Exception as error:
        #     self.logger.debug("Frequency file not found")
        #     self.train_unigrams()
        self.find_frequency_unigrams()
        self.train_from_negative_sentences()
        self.train_from_positive_sentences()
        # print(self.positive_words)
        # print(self.negative_words)
        # print(len(self.frequency))
        self.find_probablility_unigrams()
        # print(len(self.probablility))
        self.logger.info("Training completed")
        self.logger.info("Number of positive sentences : " +
                         str(self.positive_sentence_count))
        self.logger.info("Number of negative sentences : " +
                         str(self.negative_sentence_count))

    def classify(self, sentence):
        sentence = self.preprocess(sentence)
        positive_probablity = self.positive_sentence_count / self.total_sentences
        negative_probablity = self.negative_sentence_count / self.total_sentences
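        # start from the class priors, then multiply in per-word likelihoods under the
        # naive independence assumption; words unseen in training contribute a factor of 1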
        self.logger.debug("sentence : " + str(sentence))
        self.logger.debug("words considered : ")
        for word in sentence:
            word = word[0]
            word_positive_probability, word_negative_probability = 1, 1
            if word in self.probablility:
                word_positive_probability, word_negative_probability = self.probablility[
                    word]
                self.logger.debug("word : " + word +
                                  " word_positive_probability : " +
                                  str(word_positive_probability) +
                                  " word_negative_probability : " +
                                  str(word_negative_probability))
            positive_probablity *= word_positive_probability
            negative_probablity *= word_negative_probability

        self.logger.debug("positive_probablity : " + str(positive_probablity))
        self.logger.debug("negative_probablity : " + str(negative_probablity))

        # if abs(positive_probablity - negative_probablity) < 0.0000000000000001:
        #     self.logger.debug("sentence is neutral")
        #     return ("neutral" , -1)
        if positive_probablity > negative_probablity:
            self.logger.debug("sentence is positive")
            return ("positive", 1)
        if negative_probablity > positive_probablity:
            self.logger.debug("sentence is negative")
            return ("negative", 0)
        # probabilities are exactly equal: report neutral so callers can always unpack a tuple
        self.logger.debug("sentence is neutral")
        return ("neutral", -1)

    def test_classifier(self):
        correct, wrong = 0, 0
        total = len(self.test)
        for sentence, actual_label in self.test:
            verdict, label = self.classify(sentence)
            if label == actual_label:
                correct += 1
            else:
                wrong += 1

        self.logger.info("correct : " + str(correct))
        self.logger.info("wrong : " + str(wrong))
        self.logger.info("total : " + str(total))
        self.logger.info("accuracy : " + str(int((correct / total) * 100)))

    def get_stop_words(self):
        data = self.load_data_from_file('res\\eng_stop_words.txt')
        return set([datum.replace('\n', '') for datum in data])

    def remove_stop_words(self, sentence):
        filtered_words = list()
        for word in sentence:
            if word in self.stop_words:
                continue
            filtered_words.append(word)
        return filtered_words

    def find_probablility_unigrams(self):
        self.probablility = dict()
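        # add-one (Laplace) smoothing: each count gets +1 and the denominator adds the
        # vocabulary size, so unseen word/class pairs never produce a zero probability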
        for word in self.frequency:
            positive_probablity = (self.frequency[word][0] + 1) / (
                self.positive_words + len(self.frequency))
            negative_probablity = (self.frequency[word][1] + 1) / (
                self.negative_words + len(self.frequency))
            self.probablility[word] = [
                positive_probablity, negative_probablity
            ]

    def preprocess(self, sentence):
        sentence = self.tokenise(sentence)
        #sentence = self.remove_stop_words(sentence)
        sentence = self.get_kgrams(sentence, k=1)
        return sentence

    def train_from_negative_sentences(self):
        negative_files = ['res\\rt-polaritydata\\rt-polarity-neg.txt']
        for filename in negative_files:
            new_sentences = self.load_data_from_file(filename)

        for sentence in new_sentences:
            sentence = self.preprocess(sentence)
            self.negative_sentence_count += 1
            for word in sentence:
                word = word[0]
                if word not in self.frequency:
                    self.frequency[word] = [0, 0]
                self.frequency[word][1] += 1
                self.negative_words += 1

    def train_from_positive_sentences(self):
        positive_files = ['res\\rt-polaritydata\\rt-polarity-pos.txt']

        data = list()
        for filename in positive_files:
            new_sentences = self.load_data_from_file(filename)

        for sentence in new_sentences:
            sentence = self.preprocess(sentence)
            self.positive_sentence_count += 1
            for word in sentence:
                word = word[0]
                if word not in self.frequency:
                    self.frequency[word] = [0, 0]
                self.frequency[word][0] += 1
                self.positive_words += 1

    def find_frequency_unigrams(self):
        for sentence, label in self.training:
            self.total_sentences += 1
            sentence = self.preprocess(sentence)
            if label == 1:
                #positive sentence
                self.positive_sentence_count += 1
                for word in sentence:
                    word = word[0]
                    if word not in self.frequency:
                        self.frequency[word] = [0, 0]
                    self.frequency[word][0] += 1
                    self.positive_words += 1
            elif label == 0:
                #negative sentence
                self.negative_sentence_count += 1
                for word in sentence:
                    word = word[0]
                    if word not in self.frequency:
                        self.frequency[word] = [0, 0]
                    self.frequency[word][1] += 1
                    self.negative_words += 1
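

# A minimal usage sketch (not part of the original source). It assumes tqdm and the
# project-local Logger class are importable and that the res\benchmark\* and
# res\rt-polaritydata\* files referenced in NaiveBayers exist on disk.
def _demo_naive_bayers():
    nb = NaiveBayers(verbose=True, training_cases=2500, testcases=500)
    nb.get_data()         # read and clean the labelled benchmark sentences
    nb.train()            # count unigram frequencies and compute smoothed probabilities
    nb.test_classifier()  # logs correct/wrong counts and accuracy on the test split
    print(nb.classify("what a wonderful phone"))
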
Example #3
class NaiveBayes():
    '''
    implementation of a Naive Bayes classifier
    '''
    def __init__(self, verbose=True, test_set_count=500, no_of_grams=1):
        self.logger = Logger('NaiveBayes',
                             'logs\\NaiveBayes.log',
                             is_verbose=verbose)
        self.verbose = verbose
        self.counts = dict()
        self.positive_bag = []
        self.negative_bag = []

        self.positve_test_bag = []
        self.negative_test_bag = []

        self.counts["test set"] = test_set_count
        self.counts["positive phrases"] = 0
        self.counts["negative phrases"] = 0
        self.counts["total sentences"] = 0
        self.counts["positive sentences"] = 0
        self.counts["negative sentences"] = 0

        self.no_of_grams = no_of_grams

        self.phrase_occurrences = dict()
        self.phrase_probabilities = dict()

    def ready(self):
        self.logger.info("starting Naive Bayes classifier")
        self.load_data()
        self.create_test_set()
        self.fit()
        # self.find_accuracy()
        self.logger.info("Naive Bayes classifier ready.")

    def classify(self, sentence):
        '''
        classifies a given sentence as positive, negative, or neutral
        '''
        positive_probablity, negative_probablity = self.find_conditional_probability(
            sentence)

        # if positive_probablity == 1 and negative_probablity == 1: #unable to classify a sentence
        #     self.logger.debug("sentence - " + sentence + " - is neutral")
        #     return ("neutral", -1, positive_probablity)

        if positive_probablity == 1 and negative_probablity != 1:
            self.logger.info("sentence - " + sentence + " - is negative")
            return ("negative", 0, negative_probablity)

        if positive_probablity != 1 and negative_probablity == 1:
            self.logger.info("sentence - " + sentence + " - is positive")
            return ("positive", 1, positive_probablity)

        if positive_probablity > negative_probablity:
            self.logger.info("sentence - " + sentence + " - is positive")
            return ("positive", 1, positive_probablity)

        if negative_probablity > positive_probablity:
            self.logger.info("sentence - " + sentence + " - is negative")
            return ("negative", 0, negative_probablity)

        if negative_probablity == positive_probablity:  #unable to classify a sentence
            self.logger.info("sentence - " + sentence + " - is neutral")
            self.logger.info("no sense can be deduced from this sentence")
            return ("neutral", -1, positive_probablity)

    def find_conditional_probability(self, sentence):
        '''
        finds the conditional probabilities for a given sentence from phrase_probabilities
        '''
        sentence_str = sentence
        sentence = self.preprocess(sentence)

        sentence_positive_probablity = 1
        sentence_negative_probablity = 1

        positive_class_probability = self.counts[
            "positive sentences"] / self.counts["total sentences"]
        negative_class_probability = self.counts[
            "negative sentences"] / self.counts["total sentences"]

        sentence_positive_probablity *= positive_class_probability
        sentence_negative_probablity *= negative_class_probability

        kgrams = list()
        for k in range(self.no_of_grams, 0, -1):
            kgrams.extend(self.get_kgrams(sentence, k))
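        # each known phrase multiplies in its class probability raised to its occurrence
        # count, and is then blanked out of the sentence so longer matches suppress shorter ones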

        for kgram in kgrams:  # this approach gives around 80% accuracy
            phrase = ' '.join(kgram)
            sentence = ' '.join(sentence)
            if phrase in sentence and phrase in self.phrase_probabilities:
                phrase_positive_probability, phrase_negative_probability = self.phrase_probabilities[
                    phrase]
                count = sentence.count(phrase)
                self.logger.info(phrase + " " +
                                 str(phrase_positive_probability) + " " +
                                 str(phrase_negative_probability) + " " +
                                 str(count))
                sentence_positive_probablity *= phrase_positive_probability**count
                sentence_negative_probablity *= phrase_negative_probability**count
                sentence = sentence.replace(phrase, ' ')
            sentence = self.preprocess(sentence)

        # for kgram in kgrams:  # this alternative gives around 75% accuracy
        #     phrase = ' '.join(kgram)
        #     if phrase in self.phrase_probabilities:
        #         phrase_positive_probability, phrase_negative_probability = self.phrase_probabilities[phrase]
        #         self.logger.debug(phrase + " " + str(phrase_positive_probability) + " " + str(phrase_negative_probability))
        #         sentence_positive_probablity *= phrase_positive_probability
        #         sentence_negative_probablity *= phrase_negative_probability

        return sentence_positive_probablity, sentence_negative_probablity

    def fit(self):
        '''
        trains the model with sentences in positive and negative bags
        '''
        self.logger.info("training started")
        self.logger.info("total sentences : " +
                         str(self.counts["total sentences"]))
        self.logger.info("positive sentences : " +
                         str(self.counts["positive sentences"]))
        self.logger.info("negative sentences : " +
                         str(self.counts["negative sentences"]))

        self.get_occurrences_from_bags()
        self.logger.info("calculated occurrences")
        self.logger.info("unique phrases : " +
                         str(len(self.phrase_occurrences)))
        self.logger.info("phrases in positive class : " +
                         str(self.counts["positive phrases"]))
        self.logger.info("phrases in negative class : " +
                         str(self.counts["negative phrases"]))

        self.get_conditional_probabilities()
        self.logger.info("conditional probability for phrases calculated")
        self.logger.info("training completed")

    def get_conditional_probabilities(self):
        '''
        calculates the conditional probability for phrase|positive class and phrase|negative class
        '''
        total_unique_phrases = len(self.phrase_occurrences)
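        # add-one (Laplace) smoothing over all unique phrases, so phrases unseen in a
        # class never zero out the product in find_conditional_probability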
        for phrase in self.phrase_occurrences:
            positive_probablity = (self.phrase_occurrences[phrase][0] + 1) / (
                self.counts["positive phrases"] + total_unique_phrases)
            negative_probablity = (self.phrase_occurrences[phrase][1] + 1) / (
                self.counts["negative phrases"] + total_unique_phrases)
            self.phrase_probabilities[phrase] = [
                positive_probablity, negative_probablity
            ]

    def get_occurrences_from_bags(self):
        '''
        calculates the occurrences of the phrases
        '''
        self.get_occurrences_from_positive_bag()
        self.get_occurrences_from_negative_bag()

    def get_occurrences_from_positive_bag(self):
        '''
        counts the occurrences of k-grams (1 up to no_of_grams) from the positive bag
        '''
        for sentence in self.positive_bag:
            kgrams = list()
            for k in range(1, self.no_of_grams + 1):
                kgrams.extend(self.get_kgrams(sentence, k))
            for kgram in kgrams:
                phrase = ' '.join(kgram)
                self.counts["positive phrases"] += 1
                if phrase not in self.phrase_occurrences:
                    self.phrase_occurrences[phrase] = [
                        0, 0
                    ]  #[word occurrence in positive class, word occurrence in negative class]
                self.phrase_occurrences[phrase][0] += 1

    def get_occurrences_from_negative_bag(self):
        '''
        counts the occurrences of k-grams (1 up to no_of_grams) from the negative bag
        '''
        for sentence in self.negative_bag:
            kgrams = list()
            for k in range(1, self.no_of_grams + 1):
                kgrams.extend(self.get_kgrams(sentence, k))
            for kgram in kgrams:
                phrase = ' '.join(kgram)
                self.counts["negative phrases"] += 1
                if phrase not in self.phrase_occurrences:
                    self.phrase_occurrences[phrase] = [0, 0]
                self.phrase_occurrences[phrase][1] += 1

    def get_kgrams(self, sentence, k=1):
        '''
        returns a list of k-grams from a given sentence
        '''
        grams = list()
        for i in range(len(sentence)):
            grams.append(sentence[i:i + k])
            if i + k >= len(sentence):
                break
        return grams

    def create_test_set(self):
        '''
        randomly selects test sentences from the positive and negative bags, producing an even split of positive and negative test sentences
        '''
        from numpy import random as np_random
        count = self.counts["test set"] // 2
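        # pop() removes each selected sentence from its bag, so the model is never
        # trained on its own test sentences; the class counts are refreshed below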
        while (count != 0):
            # np.random.randint excludes the high bound, so these draw valid bag indices
            index = np_random.randint(low=0, high=len(self.positive_bag))
            self.positve_test_bag.append(self.positive_bag.pop(index))
            index = np_random.randint(low=0, high=len(self.negative_bag))
            self.negative_test_bag.append(self.negative_bag.pop(index))
            count -= 1

        self.logger.info("test sentences selected")
        self.logger.info(
            "Total sentences for testing : " +
            str(len(self.positve_test_bag) + len(self.negative_test_bag)))
        self.logger.info("positive sentences for testing : " +
                         str(len(self.positve_test_bag)))
        self.logger.info("negative sentences for testing : " +
                         str(len(self.negative_test_bag)))

        self.counts["positive sentences"] = len(self.positive_bag)
        self.counts["negative sentences"] = len(self.negative_bag)
        self.counts["total sentences"] = len(self.positive_bag) + len(
            self.negative_bag)

    def load_data(self):
        '''
        loads the positive and negative sentences from filenames specified
        '''
        mixed_bag_paths = [
            'res\\dataset\\uci_dataset\\yelp_labelled.txt',
            'res\\dataset\\uci_dataset\\amazon_cells_labelled.txt',
            'res\\dataset\\uci_dataset\\imdb_labelled.txt'
        ]

        positive_bag_paths = [
            'res\\dataset\\polarity_dataset\\rt-polarity-pos.txt'
        ]
        negative_bag_paths = [
            'res\\dataset\\polarity_dataset\\rt-polarity-neg.txt'
        ]

        count_positive, count_negative = 0, 0
        for filename in mixed_bag_paths:
            for mixed_data in self.load_data_from_file(filename):
                sentence, label = mixed_data.split('\t')
                label = int(label)
                sentence = self.preprocess(sentence)
                if label == 1:  #if sentence is positive
                    self.positive_bag.append(sentence)
                    count_positive += 1
                else:
                    self.negative_bag.append(sentence)
                    count_negative += 1
        self.logger.debug("sentences from mixed bag imported")
        self.logger.debug("positive sentences : " + str(count_positive))
        self.logger.debug("negative sentences : " + str(count_negative))

        count_positive = 0
        for filename in positive_bag_paths:
            for sentence in self.load_data_from_file(filename):
                sentence = self.preprocess(sentence)
                self.positive_bag.append(sentence)
                count_positive += 1
        self.logger.debug("sentences from positive bag imported")
        self.logger.debug("positive sentences : " + str(count_positive))

        count_negative = 0
        for filename in negative_bag_paths:
            for sentence in self.load_data_from_file(filename):
                sentence = self.preprocess(sentence)
                self.negative_bag.append(sentence)
                count_negative += 1
        self.logger.debug("sentences from negative bag imported")
        self.logger.debug("negative sentences : " + str(count_negative))

        self.counts["positive sentences"] = len(self.positive_bag)
        self.counts["negative sentences"] = len(self.negative_bag)
        self.counts["total sentences"] = len(self.positive_bag) + len(
            self.negative_bag)

        self.logger.info("sentences imported")
        self.logger.info("Total sentences : " +
                         str(self.counts["total sentences"]))
        self.logger.info("positive sentences : " +
                         str(self.counts["positive sentences"]))
        self.logger.info("negative sentences : " +
                         str(self.counts["negative sentences"]))

    def load_data_from_file(self, filename, encoding="utf8"):
        '''
        load the data as a list from the specified filename
        '''
        with open(filename, encoding=encoding) as file:
            data = file.readlines()
        return data

    def preprocess(self, sentence):
        '''
        preprocess the sentence and return as a list of words
        '''
        sentence = self.tokenise(sentence)
        #sentence = self.remove_stop_words(sentence)
        return sentence

    def tokenise(self, sentence):
        '''
        convert the sentence to list of words
        '''
        sentence = self.clean(sentence)
        tokens = sentence.split(' ')
        filtered_tokens = list()
        for token in tokens:
            if len(token.strip()) != 0:
                filtered_tokens.append(token.strip())
        return filtered_tokens

    def clean(self, sentence):
        '''
        clean sentence by removing the ignored characters
        '''
        ignore_characters = '''\t\n&"`~@#$%^*;+=<>//.,()[]{}:;!?'''
        sentence = self.replace_characters(sentence, ignore_characters)
        return sentence.lower().strip()

    def replace_characters(self, text, characters):
        '''
        replaces the specified characters in text with blank space
        '''
        for char in characters:
            text = text.replace(char, ' ')
        return text

    def get_positive_test_bag(self):
        return self.positve_test_bag

    def get_negative_test_bag(self):
        return self.negative_test_bag

    def test_for_fish_guitar(self):
        positive_sentences = [
            "fish smoked fish", "fish line", "fish haul smoked"
        ]
        negative_sentences = ["guitar jazz line"]
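        # tiny hand-checkable corpus used to sanity-check the smoothed conditional
        # probabilities and the classify() decision rule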
        self.positive_bag = [
            sentence.split(" ") for sentence in positive_sentences
        ]
        self.negative_bag = [
            sentence.split(" ") for sentence in negative_sentences
        ]
        self.counts["total sentences"] = len(self.positive_bag) + len(
            self.negative_bag)
        self.counts["positive sentences"] = len(self.positive_bag)
        self.counts["negative sentences"] = len(self.negative_bag)

        self.get_occurrences_from_bags()
        self.get_conditional_probabilities()

        test_sentence = "line guitar jazz jazz"
        result = self.classify(sentence=test_sentence)
        self.logger.info(str(result))
        return result

    def find_accuracy(self):
        correct, wrong = 0, 0
        total = len(self.positve_test_bag) + len(self.negative_test_bag)

        _correct, _wrong = self.test_for_bag(self.positve_test_bag,
                                             actual_result=1)
        correct += _correct
        wrong += _wrong

        _correct, _wrong = self.test_for_bag(self.negative_test_bag,
                                             actual_result=0)
        correct += _correct
        wrong += _wrong

        self.accuracy = (correct / total) * 100
        self.logger.info("total test sentences : " + str(total))
        self.logger.info("correct output : " + str(correct))
        self.logger.info("wrong output : " + str(wrong))
        self.logger.info("accuracy (%) : " + str(int(self.accuracy)))

    def test_for_bag(self, bag, actual_result):
        self.logger.is_verbose = False
        correct, wrong = 0, 0
        for sentence in bag:
            sentence = ' '.join(sentence)
            result = self.classify(sentence=sentence)
            if result is None:
                self.logger.info("result is none : " + str(sentence))
                wrong += 1
                continue
            if result[1] == actual_result:
                correct += 1
            else:
                wrong += 1
        self.logger.is_verbose = True
        self.logger.debug("total test sentences in bag : " + str(len(bag)))
        self.logger.debug("correct output : " + str(correct))
        self.logger.debug("wrong output : " + str(wrong))
        self.logger.debug("accuracy (%) : " +
                          str(int((correct / len(bag)) * 100)))
        return correct, wrong
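

# A minimal usage sketch (not part of the original source). It assumes the project-local
# Logger class is importable and that the res\dataset\* files referenced in
# NaiveBayes.load_data exist on disk.
def _demo_naive_bayes():
    nb = NaiveBayes(verbose=True, test_set_count=500, no_of_grams=1)
    nb.ready()          # load the data, hold out a test set, and fit the model
    print(nb.classify("an absolute joy to watch"))
    nb.find_accuracy()  # logs accuracy over the held-out positive and negative test bags
    # nb.test_for_fish_guitar()  # tiny hand-checkable sanity test (replaces the training bags)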