class BagOfWordSentiment():
    '''
    Lexicon-driven sentiment classifier.

    Scores a sentence by matching k-gram phrases (longest first) against
    bags of positive/negative words, flipping polarity for negations
    ("not good") and restoring it for double negations ("not no good").
    Word bags are loaded from text files under res\\bag_of_words_dataset.
    '''

    def __init__(self, no_of_grams=4, verbose=True, no_of_testcases=1000):
        self.verbose = verbose
        self.logger = Logger('BagOfWordSentiment',
                             'logs\\bag_of_words.log',
                             is_verbose=self.verbose)
        self.no_of_grams = no_of_grams
        # plain sets hold exact words; the *_collection sets hold stems
        # (entries marked with '*' in the data files) matched by prefix
        self.double_negations, self.double_negations_collection = set(), set()
        self.negations, self.negation_collection = set(), set()
        self.positive_words, self.positive_word_collection = set(), set()
        self.negative_words, self.negative_word_collection = set(), set()
        self.no_of_testcases = no_of_testcases
        self.positve_test_bag = list()
        self.negative_test_bag = list()
        # lazily-filled cache so the stop-word file is read only once
        # instead of once per cleaned sentence
        self._stop_word_set = None

    def ready(self):
        '''loads the word-bag data; call once before classify()'''
        self.logger.info("Bag of words loading")
        self.load_data()
        self.logger.info("Bag of words ready")

    def classify(self, sentence):
        '''
        classifies the sentence as positive, negative or neutral using the
        bag of words method; returns (verdict, label, score) where label is
        1 / 0 / -1 respectively
        '''
        positive_score, negative_score = self.find_score(sentence)
        if positive_score > negative_score:
            self.logger.info("sentence - " + sentence + " - is positive")
            return ("positive", 1, positive_score)
        if positive_score < negative_score:
            self.logger.info("sentence - " + sentence + " - is negative")
            return ("negative", 0, negative_score)
        # equal scores (including 0-0): no polarity could be deduced
        self.logger.info("sentence - " + sentence + " - is neutral")
        return ("neutral", -1, positive_score)

    def _strip_phrase(self, sentence_str, phrase):
        '''removes a consumed phrase from the sentence string and re-tokenises'''
        return self.tokenise(sentence_str.replace(phrase, ' '))

    def find_score(self, sentence):
        '''finds positive and negative score for a given sentence'''
        positive_score, negative_score = 0, 0
        self.logger.info("sentence : " + sentence)
        sentence = self.tokenise(sentence)
        self.logger.info("tokenised sentence after cleaning : " + str(sentence))
        kgrams = list()
        # longest phrases first so e.g. "not good" is consumed before "good"
        for k in range(self.no_of_grams, 0, -1):
            kgrams.extend(self.get_kgrams(sentence, k))
        for kgram in kgrams:
            phrase = ' '.join(kgram)
            sentence = ' '.join(sentence)
            if phrase in sentence:
                self.logger.info("considering phrase '" + phrase + "' from '" +
                                 sentence + "'")
                # double negation keeps the original polarity
                contains_double_negation, remaining_phrase = \
                    self.is_double_negation(phrase)
                if contains_double_negation:
                    if self.is_positive(remaining_phrase):
                        positive_score += 1
                        sentence = self._strip_phrase(sentence, phrase)
                        self.logger.info(
                            "double negation of positive phrase : " + phrase)
                        continue
                    if self.is_negative(remaining_phrase):
                        negative_score += 1
                        sentence = self._strip_phrase(sentence, phrase)
                        self.logger.info(
                            "double negation of negative phrase : " + phrase)
                        continue
                # single negation flips the polarity
                contains_negation, remaining_phrase = self.is_negation(phrase)
                if contains_negation:
                    if self.is_positive(remaining_phrase):
                        negative_score += 1
                        sentence = self._strip_phrase(sentence, phrase)
                        self.logger.info("negation of positive phrase : " +
                                         phrase)
                        continue
                    if self.is_negative(remaining_phrase):
                        positive_score += 1
                        sentence = self._strip_phrase(sentence, phrase)
                        self.logger.info("negation of negative phrase : " +
                                         phrase)
                        continue
                # plain positive phrase
                if self.is_positive(phrase):
                    positive_score += 1
                    sentence = self._strip_phrase(sentence, phrase)
                    self.logger.info("positive phrase : " + phrase)
                    continue
                # plain negative phrase
                if self.is_negative(phrase):
                    negative_score += 1
                    sentence = self._strip_phrase(sentence, phrase)
                    self.logger.info("negative phrase : " + phrase)
                    continue
                self.logger.info("cannot deduce sentiment from phrase '" +
                                 phrase + "'")
            # keep `sentence` a token list for the next iteration's join
            sentence = self.tokenise(sentence)
        return positive_score, negative_score

    def is_double_negation(self, phrase):
        '''
        checks whether the phrase starts with a double negation; returns
        (found, phrase with the double-negation prefix stripped)
        '''
        for double_negation in self.double_negations:
            double_negation = double_negation + " "
            if phrase.startswith(double_negation):
                # slice so only the leading occurrence is stripped
                # (str.replace would remove every occurrence)
                return True, phrase[len(double_negation):]
        for double_negation in self.double_negations_collection:
            if phrase.startswith(double_negation):
                diff = len(phrase.split(" ")) - len(double_negation.split(" "))
                if diff <= 0:
                    return False, phrase
                return True, ' '.join(phrase.split(" ")[-diff:])
        return False, phrase

    def is_negation(self, phrase):
        '''
        checks whether the phrase starts with a negation; returns
        (found, phrase with the negation prefix stripped)
        '''
        for negation in self.negations:
            negation = negation + " "
            if phrase.startswith(negation):
                # slice so only the leading occurrence is stripped
                return True, phrase[len(negation):]
        for negation in self.negation_collection:
            if phrase.startswith(negation):
                diff = len(phrase.split(" ")) - len(negation.split(" "))
                if diff <= 0:
                    return False, phrase
                return True, ' '.join(phrase.split(" ")[-diff:])
        return False, phrase

    def is_positive(self, word):
        '''checks whether a word is in the bag of positive words'''
        if word in self.positive_words:
            return True
        return any(
            word.startswith(positive_word)
            for positive_word in self.positive_word_collection)

    def is_negative(self, word):
        '''checks whether a word is in the bag of negative words'''
        if word in self.negative_words:
            return True
        return any(
            word.startswith(negative_word)
            for negative_word in self.negative_word_collection)

    def get_kgrams(self, sentence, k=1):
        '''returns the list of overlapping k-grams from a tokenised sentence'''
        grams = list()
        for i in range(len(sentence)):
            grams.append(sentence[i:i + k])
            if i + k >= len(sentence):
                break
        return grams

    def load_data(self):
        '''loads the word-bag data files necessary for analysis'''
        double_negation_files = [
            'res\\bag_of_words_dataset\\double_negation.txt'
        ]
        negations_files = ['res\\bag_of_words_dataset\\negation.txt']
        positive_word_files = ['res\\bag_of_words_dataset\\positive_words.txt']
        negative_word_files = ['res\\bag_of_words_dataset\\negative_words.txt']
        self.double_negations, self.double_negations_collection = \
            self.get_words(self.load_data_from_files(double_negation_files))
        self.negations, self.negation_collection = self.get_words(
            self.load_data_from_files(negations_files))
        self.positive_words, self.positive_word_collection = self.get_words(
            self.load_data_from_files(positive_word_files))
        self.negative_words, self.negative_word_collection = self.get_words(
            self.load_data_from_files(negative_word_files))
        self.logger.info("words loaded")
        self.logger.info("double negations : " + str(
            len(self.double_negations) +
            len(self.double_negations_collection)))
        self.logger.info(
            "negations : " +
            str(len(self.negations) + len(self.negation_collection)))
        self.logger.info(
            "positive words : " +
            str(len(self.positive_words) + len(self.positive_word_collection)))
        self.logger.info(
            "negative words : " +
            str(len(self.negative_words) + len(self.negative_word_collection)))

    def get_words(self, input_words):
        '''
        cleans the input words and groups them into a set of exact words and
        a set of stems (entries that contained '*', matched by prefix)
        '''
        words = set()
        multiple_words = set()
        for word in input_words:
            word = word.replace('\n', '').replace('(1)', '').replace("'", '')
            word = word.replace('_', ' ').replace('-', ' ').strip().lower()
            if '*' in word:
                multiple_words.add(word.replace('*', '').strip())
                continue
            words.add(word)
        return words, multiple_words

    def tokenise(self, sentence):
        '''splits the cleaned sentence into a list of non-empty words'''
        sentence = self.clean(sentence)
        return [token for token in sentence.split(' ') if token.strip()]

    def clean(self, sentence):
        '''cleans the sentence by removing ignored characters and stop words'''
        ignore_characters = '''\t\n&"`~@#$%^*;+=<>//.,()[]{!}?:;_-'''
        sentence = sentence.lower().strip()
        sentence = self.remove_stop_words(sentence)
        sentence = self.replace_characters(sentence, ignore_characters)
        sentence = sentence.replace("'", '')
        return sentence.lower().strip()

    def remove_stop_words(self, sentence):
        '''removes stop words; the stop-word file is loaded once and cached'''
        stop_word_set = getattr(self, '_stop_word_set', None)
        if stop_word_set is None:
            stop_words = self.load_data_from_files(
                ['res\\bag_of_words_dataset\\refined_stop_words.txt'])
            stop_word_set = set(
                stop_word.replace('\n', '').replace('\t', '').strip()
                for stop_word in stop_words)
            self._stop_word_set = stop_word_set
        return ' '.join(word for word in sentence.split(" ")
                        if word not in stop_word_set)

    def replace_characters(self, text, characters):
        '''replaces the specified characters in text with blank spaces'''
        for char in characters:
            text = text.replace(char, ' ')
        return text

    def load_data_from_files(self, filenames, encoding="utf8"):
        '''loads the data as a list of lines from the specified filenames'''
        data = list()
        for filename in filenames:
            with open(filename, encoding=encoding) as file:
                data.extend(file.readlines())
        return data

    def find_accuracy(self):
        '''
        classifies the held-out test bags and returns
        (accuracy %, total, correct, wrong)
        '''
        self.load_test_cases()
        self.create_test_set()
        correct, wrong = 0, 0
        total = len(self.positve_test_bag) + len(self.negative_test_bag)
        _correct, _wrong = self.test_for_bag(self.positve_test_bag,
                                             actual_result=1)
        correct += _correct
        wrong += _wrong
        _correct, _wrong = self.test_for_bag(self.negative_test_bag,
                                             actual_result=0)
        correct += _correct
        wrong += _wrong
        self.accuracy = (correct / total) * 100
        self.logger.info("total test sentences : " + str(total))
        self.logger.info("correct output : " + str(correct))
        self.logger.info("wrong output : " + str(wrong))
        self.logger.info("accuracy (%) : " + str(int(self.accuracy)))
        return (self.accuracy, total, correct, wrong)

    def test_for_bag(self, bag, actual_result):
        '''classifies every sentence in bag; returns (correct, wrong) counts'''
        # silence per-sentence logging during the bulk run
        self.logger.is_verbose = False
        correct, wrong = 0, 0
        for sentence in bag:
            result = self.classify(sentence=sentence)
            if result[1] == actual_result:
                correct += 1
            else:
                wrong += 1
        self.logger.is_verbose = True
        self.logger.debug("total test sentences in bag : " + str(len(bag)))
        self.logger.debug("correct output : " + str(correct))
        self.logger.debug("wrong output : " + str(wrong))
        self.logger.debug("accuracy (%) : " +
                          str(int((correct / len(bag)) * 100)))
        return correct, wrong

    def create_test_set(self):
        '''
        randomly moves sentences from the positive and negative bags into
        equally-sized positive/negative test bags
        '''
        from numpy import random as np_random
        count = self.no_of_testcases // 2
        while count != 0:
            # np_random.randint: upper bound exclusive (the old
            # random_integers API was removed from modern numpy)
            index = np_random.randint(0, len(self.positive_bag))
            self.positve_test_bag.append(self.positive_bag.pop(index))
            index = np_random.randint(0, len(self.negative_bag))
            self.negative_test_bag.append(self.negative_bag.pop(index))
            count -= 1
        self.logger.info("test sentences selected")
        self.logger.info(
            "Total sentences for testing : " +
            str(len(self.positve_test_bag) + len(self.negative_test_bag)))
        self.logger.info("positive sentences for testing : " +
                         str(len(self.positve_test_bag)))
        self.logger.info("negative sentences for testing : " +
                         str(len(self.negative_test_bag)))

    def load_test_cases(self):
        '''loads the positive and negative sentences from the dataset files'''
        mixed_bag_paths = [
            'res\\dataset\\uci_dataset\\yelp_labelled.txt',
            'res\\dataset\\uci_dataset\\amazon_cells_labelled.txt',
            'res\\dataset\\uci_dataset\\imdb_labelled.txt'
        ]
        # the following training sets contain hard testcases
        positive_bag_paths = [
            'res\\dataset\\polarity_dataset\\rt-polarity-pos.txt'
        ]
        negative_bag_paths = [
            'res\\dataset\\polarity_dataset\\rt-polarity-neg.txt'
        ]
        # uncomment below two lines not to include difficult testcases
        # positive_bag_paths = []
        # negative_bag_paths = []
        self.positive_bag, self.negative_bag = list(), list()
        count_positive, count_negative = 0, 0
        for filename in mixed_bag_paths:
            for mixed_data in self.load_data_from_files([filename]):
                sentence, label = mixed_data.split('\t')
                label = int(label)
                if label == 1:  #if sentence is positive
                    self.positive_bag.append(sentence)
                    count_positive += 1
                else:
                    self.negative_bag.append(sentence)
                    count_negative += 1
        self.logger.debug("sentences from mixed bag imported")
        self.logger.debug("positive sentences : " + str(count_positive))
        self.logger.debug("negative sentences : " + str(count_negative))
        count_positive = 0
        for filename in positive_bag_paths:
            for sentence in self.load_data_from_files([filename]):
                self.positive_bag.append(sentence)
                count_positive += 1
        self.logger.debug("sentences from positive bag imported")
        self.logger.debug("positive sentences : " + str(count_positive))
        count_negative = 0
        for filename in negative_bag_paths:
            for sentence in self.load_data_from_files([filename]):
                self.negative_bag.append(sentence)
                count_negative += 1
        self.logger.debug("sentences from negative bag imported")
        self.logger.debug("negative sentences : " + str(count_negative))
        self.logger.debug("sentences imported")
        self.logger.debug("Total sentences : " +
                          str(len(self.positive_bag) + len(self.negative_bag)))
        self.logger.debug("positive sentences : " +
                          str(len(self.positive_bag)))
        self.logger.debug("negative sentences : " +
                          str(len(self.negative_bag)))
class NaiveBayers():
    '''
    Unigram Naive Bayes sentiment classifier with Laplace smoothing,
    trained on tab-separated "sentence\\tlabel" files.

    NOTE(review): the class name misspells "Bayes"; kept for backward
    compatibility with existing callers.
    '''

    def __init__(self, verbose=True, training_cases=2500, testcases=500):
        self.verbose = verbose
        self.training_cases = training_cases
        self.testcases = testcases
        self.training = list()
        self.test = list()
        # word -> [count in positive sentences, count in negative sentences]
        self.frequency = dict()
        self.stop_words = self.get_stop_words()
        self.positive_words = 0
        self.negative_words = 0
        self.positive_sentence_count = 0
        self.negative_sentence_count = 0
        self.total_sentences = 0
        self.logger = Logger('NaiveBayers', 'NaiveBayers.log')
        self.filenames = [
            'res\\benchmark\\yelp_labelled.txt',
            'res\\benchmark\\amazon_cells_labelled.txt',
            'res\\benchmark\\imdb_labelled.txt'
        ]

    def _print(self, message):
        '''prints only when verbose mode is on'''
        if self.verbose:
            print(message)

    def clean(self, sentence):
        '''removes ignored punctuation characters and lowercases'''
        ignore_characters = '''\t\n&"`~@#$%^*;+=<>//.,()[]{}:;!?'''
        sentence = self.replace_characters(sentence, ignore_characters)
        return sentence.lower().strip()

    def tokenise(self, sentence):
        '''splits the cleaned sentence into a list of non-empty words'''
        sentence = self.clean(sentence)
        return [token for token in sentence.split(' ') if token.strip()]

    def replace_characters(self, text, characters):
        '''replaces the specified characters in text with blank spaces'''
        for char in characters:
            text = text.replace(char, ' ')
        return text

    def get_data(self):
        '''reads the labelled files and splits into training / test lists'''
        data = list()
        for filename in self.filenames:
            self._print("Filename : " + filename)
            for datum in tqdm(self.load_data_from_file(filename)):
                sentence, label = datum.split('\t')
                label = int(label)
                sentence = self.clean(sentence)
                data.append([sentence, label])
        self.training = data[:self.training_cases]
        self.test = data[-self.testcases:]

    def load_data_from_file(self, filename, encoding="utf8"):
        '''loads the file contents as a list of lines'''
        with open(filename, encoding=encoding) as file:
            data = file.readlines()
        return data

    def get_kgrams(self, sentence, k=1):
        '''returns the list of overlapping k-grams from a tokenised sentence'''
        grams = list()
        for i in range(len(sentence)):
            grams.append(sentence[i:i + k])
            if i + k >= len(sentence):
                break
        return grams

    def train(self):
        '''trains on the labelled training split plus the polarity files'''
        self.find_frequency_unigrams()
        self.train_from_negative_sentences()
        self.train_from_positive_sentences()
        self.find_probablility_unigrams()
        self.logger.info("Training completed")
        self.logger.info("Number of positive sentences : " +
                         str(self.positive_sentence_count))
        self.logger.info("Number of negative sentences : " +
                         str(self.negative_sentence_count))

    def classify(self, sentence):
        '''
        classifies the sentence; returns ("positive", 1), ("negative", 0)
        or ("neutral", -1) on an exact probability tie
        '''
        sentence = self.preprocess(sentence)
        # class priors
        positive_probablity = self.positive_sentence_count / self.total_sentences
        negative_probablity = self.negative_sentence_count / self.total_sentences
        self.logger.debug("sentence : " + str(sentence))
        self.logger.debug("words considered : ")
        for word in sentence:
            word = word[0]
            # unseen words contribute a neutral factor of 1
            word_positive_probability, word_negative_probability = 1, 1
            if word in self.probablility:
                word_positive_probability, word_negative_probability = \
                    self.probablility[word]
            self.logger.debug("word : " + word +
                              " word_positive_probability : " +
                              str(word_positive_probability) +
                              " word_negative_probability : " +
                              str(word_negative_probability))
            positive_probablity *= word_positive_probability
            negative_probablity *= word_negative_probability
        self.logger.debug("positive_probablity : " + str(positive_probablity))
        self.logger.debug("negative_probablity : " + str(negative_probablity))
        if positive_probablity > negative_probablity:
            self.logger.debug("sentence is positive")
            return ("positive", 1)
        if negative_probablity > positive_probablity:
            self.logger.debug("sentence is negative")
            return ("negative", 0)
        # previously this fell through and returned None, which crashed
        # test_classifier's tuple unpacking on an exact tie
        self.logger.debug("sentence is neutral")
        return ("neutral", -1)

    def test_classifier(self):
        '''classifies the test split and logs accuracy'''
        correct, wrong = 0, 0
        total = len(self.test)
        for sentence, actual_label in self.test:
            verdict, label = self.classify(sentence)
            if label == actual_label:
                correct += 1
            else:
                wrong += 1
        self.logger.info("correct : " + str(correct))
        self.logger.info("wrong : " + str(wrong))
        self.logger.info("total : " + str(total))
        self.logger.info("accuracy : " + str(int((correct / total) * 100)))

    def get_stop_words(self):
        '''loads the english stop-word list as a set'''
        data = self.load_data_from_file('res\\eng_stop_words.txt')
        return set([datum.replace('\n', '') for datum in data])

    def remove_stop_words(self, sentence):
        '''filters stop words out of a tokenised sentence'''
        return [word for word in sentence if word not in self.stop_words]

    def find_probablility_unigrams(self):
        '''computes Laplace-smoothed per-class word probabilities'''
        self.probablility = dict()
        for word in self.frequency:
            positive_probablity = (self.frequency[word][0] + 1) / (
                self.positive_words + len(self.frequency))
            negative_probablity = (self.frequency[word][1] + 1) / (
                self.negative_words + len(self.frequency))
            self.probablility[word] = [
                positive_probablity, negative_probablity
            ]

    def preprocess(self, sentence):
        '''tokenises the sentence and wraps each token as a 1-gram'''
        sentence = self.tokenise(sentence)
        #sentence = self.remove_stop_words(sentence)
        sentence = self.get_kgrams(sentence, k=1)
        return sentence

    def train_from_negative_sentences(self):
        '''adds word counts from the negative-only polarity file'''
        negative_files = ['res\\rt-polaritydata\\rt-polarity-neg.txt']
        for filename in negative_files:
            for sentence in self.load_data_from_file(filename):
                sentence = self.preprocess(sentence)
                self.negative_sentence_count += 1
                # keep total in sync so class priors still sum to 1
                self.total_sentences += 1
                for word in sentence:
                    word = word[0]
                    if word not in self.frequency:
                        self.frequency[word] = [0, 0]
                    self.frequency[word][1] += 1
                    self.negative_words += 1

    def train_from_positive_sentences(self):
        '''adds word counts from the positive-only polarity file'''
        positive_files = ['res\\rt-polaritydata\\rt-polarity-pos.txt']
        for filename in positive_files:
            for sentence in self.load_data_from_file(filename):
                sentence = self.preprocess(sentence)
                self.positive_sentence_count += 1
                # keep total in sync so class priors still sum to 1
                self.total_sentences += 1
                for word in sentence:
                    word = word[0]
                    if word not in self.frequency:
                        self.frequency[word] = [0, 0]
                    self.frequency[word][0] += 1
                    self.positive_words += 1

    def find_frequency_unigrams(self):
        '''counts per-class word occurrences over the training split'''
        for sentence, label in self.training:
            self.total_sentences += 1
            sentence = self.preprocess(sentence)
            if label == 1:  #positive sentence
                self.positive_sentence_count += 1
                for word in sentence:
                    word = word[0]
                    if word not in self.frequency:
                        self.frequency[word] = [0, 0]
                    self.frequency[word][0] += 1
                    self.positive_words += 1
            elif label == 0:  #negative sentence
                self.negative_sentence_count += 1
                for word in sentence:
                    word = word[0]
                    if word not in self.frequency:
                        self.frequency[word] = [0, 0]
                    self.frequency[word][1] += 1
                    self.negative_words += 1
class NaiveBayes():
    '''
    Naive Bayes sentiment classifier over k-gram phrases with Laplace
    smoothing; trained from labelled sentence bags loaded in load_data().
    '''

    def __init__(self, verbose=True, test_set_count=500, no_of_grams=1):
        self.logger = Logger('NaiveBayes',
                             'logs\\NaiveBayes.log',
                             is_verbose=verbose)
        self.verbose = verbose
        self.counts = dict()
        self.positive_bag = []
        self.negative_bag = []
        self.positve_test_bag = []
        self.negative_test_bag = []
        self.counts["test set"] = test_set_count
        self.counts["positive phrases"] = 0
        self.counts["negative phrases"] = 0
        self.counts["total sentences"] = 0
        self.counts["positive sentences"] = 0
        self.counts["negative sentences"] = 0
        self.no_of_grams = no_of_grams
        # phrase -> [occurrences in positive class, occurrences in negative class]
        self.phrase_occurrences = dict()
        # phrase -> [P(phrase|positive), P(phrase|negative)]
        self.phrase_probabilities = dict()

    def ready(self):
        '''loads data, reserves the test set and trains the model'''
        self.logger.info("starting Naive Bayers classifier")
        self.load_data()
        self.create_test_set()
        self.fit()
        # self.find_accuracy()
        self.logger.info("Naive Bayers classifier ready.")

    def classify(self, sentence):
        '''
        classifies a given sentence; returns (verdict, label, score) with
        label 1 for positive, 0 for negative, -1 for neutral
        '''
        positive_probablity, negative_probablity = \
            self.find_conditional_probability(sentence)
        # NOTE(review): a score of exactly 1 would mean the prior itself was
        # 1 and no known phrase matched; these special cases invert the
        # verdict and look suspicious, but are preserved as-is — confirm
        # intent before changing.
        if positive_probablity == 1 and negative_probablity != 1:
            self.logger.info("sentence - " + sentence + " - is negative")
            return ("negative", 0, negative_probablity)
        if positive_probablity != 1 and negative_probablity == 1:
            self.logger.info("sentence - " + sentence + " - is positive")
            return ("positive", 1, positive_probablity)
        if positive_probablity > negative_probablity:
            self.logger.info("sentence - " + sentence + " - is positive")
            return ("positive", 1, positive_probablity)
        if negative_probablity > positive_probablity:
            self.logger.info("sentence - " + sentence + " - is negative")
            return ("negative", 0, negative_probablity)
        # equal scores: unable to classify
        self.logger.info("sentence - " + sentence + " - is neutral")
        self.logger.info("no sense can be deduced from this sentence")
        return ("neutral", -1, positive_probablity)

    def find_conditional_probability(self, sentence):
        '''
        multiplies the class priors by the conditional probability of every
        known phrase found in the sentence (longest phrases first, each
        occurrence counted); returns (positive score, negative score)
        '''
        sentence = self.preprocess(sentence)
        sentence_positive_probablity = self.counts[
            "positive sentences"] / self.counts["total sentences"]
        sentence_negative_probablity = self.counts[
            "negative sentences"] / self.counts["total sentences"]
        kgrams = list()
        for k in range(self.no_of_grams, 0, -1):
            kgrams.extend(self.get_kgrams(sentence, k))
        for kgram in kgrams:
            phrase = ' '.join(kgram)
            # join into a separate local: previously `sentence` itself was
            # re-joined each iteration, so on a non-matching k-gram it
            # stayed a string and the next join space-separated every
            # character, corrupting the sentence
            joined = ' '.join(sentence)
            if phrase in joined and phrase in self.phrase_probabilities:
                phrase_positive_probability, phrase_negative_probability = \
                    self.phrase_probabilities[phrase]
                count = joined.count(phrase)
                self.logger.info(phrase + " " +
                                 str(phrase_positive_probability) + " " +
                                 str(phrase_negative_probability) + " " +
                                 str(count))
                sentence_positive_probablity *= \
                    phrase_positive_probability**count
                sentence_negative_probablity *= \
                    phrase_negative_probability**count
                # consume the phrase so shorter grams cannot re-match it
                sentence = self.preprocess(joined.replace(phrase, ' '))
        return sentence_positive_probablity, sentence_negative_probablity

    def fit(self):
        '''trains the model with sentences in positive and negative bags'''
        self.logger.info("training started")
        self.logger.info("total sentences : " +
                         str(self.counts["total sentences"]))
        self.logger.info("positive sentences : " +
                         str(self.counts["positive sentences"]))
        self.logger.info("negative sentences : " +
                         str(self.counts["negative sentences"]))
        self.get_occurrences_from_bags()
        self.logger.info("calculated occurrences")
        self.logger.info("unique phrases : " +
                         str(len(self.phrase_occurrences)))
        self.logger.info("phrases in positive class : " +
                         str(self.counts["positive phrases"]))
        self.logger.info("phrases in negative class : " +
                         str(self.counts["negative phrases"]))
        self.get_conditional_probabilities()
        self.logger.info("conditional probality for phrases calculated")
        self.logger.info("training completed")

    def get_conditional_probabilities(self):
        '''
        calculates Laplace-smoothed P(phrase|positive) and
        P(phrase|negative) for every observed phrase
        '''
        total_unique_phrases = len(self.phrase_occurrences)
        for phrase in self.phrase_occurrences:
            positive_probablity = (self.phrase_occurrences[phrase][0] + 1) / (
                self.counts["positive phrases"] + total_unique_phrases)
            negative_probablity = (self.phrase_occurrences[phrase][1] + 1) / (
                self.counts["negative phrases"] + total_unique_phrases)
            self.phrase_probabilities[phrase] = [
                positive_probablity, negative_probablity
            ]

    def get_occurrences_from_bags(self):
        '''calculates the phrase occurrences from both bags'''
        self.get_occurrences_from_positive_bag()
        self.get_occurrences_from_negative_bag()

    def get_occurrences_from_positive_bag(self):
        '''counts 1..no_of_grams-gram occurrences from the positive bag'''
        for sentence in self.positive_bag:
            kgrams = list()
            for k in range(1, self.no_of_grams + 1):
                kgrams.extend(self.get_kgrams(sentence, k))
            for kgram in kgrams:
                phrase = ' '.join(kgram)
                self.counts["positive phrases"] += 1
                if phrase not in self.phrase_occurrences:
                    # [occurrence in positive class, occurrence in negative class]
                    self.phrase_occurrences[phrase] = [0, 0]
                self.phrase_occurrences[phrase][0] += 1

    def get_occurrences_from_negative_bag(self):
        '''counts 1..no_of_grams-gram occurrences from the negative bag'''
        for sentence in self.negative_bag:
            kgrams = list()
            for k in range(1, self.no_of_grams + 1):
                kgrams.extend(self.get_kgrams(sentence, k))
            for kgram in kgrams:
                phrase = ' '.join(kgram)
                self.counts["negative phrases"] += 1
                if phrase not in self.phrase_occurrences:
                    self.phrase_occurrences[phrase] = [0, 0]
                self.phrase_occurrences[phrase][1] += 1

    def get_kgrams(self, sentence, k=1):
        '''returns the list of overlapping k-grams from a tokenised sentence'''
        grams = list()
        for i in range(len(sentence)):
            grams.append(sentence[i:i + k])
            if i + k >= len(sentence):
                break
        return grams

    def create_test_set(self):
        '''
        randomly moves sentences from the positive and negative bags into
        equally-sized test bags, then refreshes the sentence counts
        '''
        from numpy import random as np_random
        count = self.counts["test set"] // 2
        while count != 0:
            # np_random.randint: upper bound exclusive (the old
            # random_integers API was removed from modern numpy)
            index = np_random.randint(0, len(self.positive_bag))
            self.positve_test_bag.append(self.positive_bag.pop(index))
            index = np_random.randint(0, len(self.negative_bag))
            self.negative_test_bag.append(self.negative_bag.pop(index))
            count -= 1
        self.logger.info("test sentences selected")
        self.logger.info(
            "Total sentences for testing : " +
            str(len(self.positve_test_bag) + len(self.negative_test_bag)))
        self.logger.info("positive sentences for testing : " +
                         str(len(self.positve_test_bag)))
        self.logger.info("negative sentences for testing : " +
                         str(len(self.negative_test_bag)))
        self.counts["positive sentences"] = len(self.positive_bag)
        self.counts["negative sentences"] = len(self.negative_bag)
        self.counts["total sentences"] = len(self.positive_bag) + len(
            self.negative_bag)

    def load_data(self):
        '''loads the positive and negative sentences from the dataset files'''
        mixed_bag_paths = [
            'res\\dataset\\uci_dataset\\yelp_labelled.txt',
            'res\\dataset\\uci_dataset\\amazon_cells_labelled.txt',
            'res\\dataset\\uci_dataset\\imdb_labelled.txt'
        ]
        positive_bag_paths = [
            'res\\dataset\\polarity_dataset\\rt-polarity-pos.txt'
        ]
        negative_bag_paths = [
            'res\\dataset\\polarity_dataset\\rt-polarity-neg.txt'
        ]
        count_positive, count_negative = 0, 0
        for filename in mixed_bag_paths:
            for mixed_data in self.load_data_from_file(filename):
                sentence, label = mixed_data.split('\t')
                label = int(label)
                sentence = self.preprocess(sentence)
                if label == 1:  #if sentence is positive
                    self.positive_bag.append(sentence)
                    count_positive += 1
                else:
                    self.negative_bag.append(sentence)
                    count_negative += 1
        self.logger.debug("sentences from mixed bag imported")
        self.logger.debug("positive sentences : " + str(count_positive))
        self.logger.debug("negative sentences : " + str(count_negative))
        count_positive = 0
        for filename in positive_bag_paths:
            for sentence in self.load_data_from_file(filename):
                self.positive_bag.append(self.preprocess(sentence))
                count_positive += 1
        self.logger.debug("sentences from positive bag imported")
        self.logger.debug("positive sentences : " + str(count_positive))
        count_negative = 0
        for filename in negative_bag_paths:
            for sentence in self.load_data_from_file(filename):
                self.negative_bag.append(self.preprocess(sentence))
                count_negative += 1
        self.logger.debug("sentences from negative bag imported")
        self.logger.debug("negative sentences : " + str(count_negative))
        self.counts["positive sentences"] = len(self.positive_bag)
        self.counts["negative sentences"] = len(self.negative_bag)
        self.counts["total sentences"] = len(self.positive_bag) + len(
            self.negative_bag)
        self.logger.info("sentences imported")
        self.logger.info("Total sentences : " +
                         str(self.counts["total sentences"]))
        self.logger.info("positive sentences : " +
                         str(self.counts["positive sentences"]))
        self.logger.info("negative sentences : " +
                         str(self.counts["negative sentences"]))

    def load_data_from_file(self, filename, encoding="utf8"):
        '''loads the data as a list of lines from the specified filename'''
        with open(filename, encoding=encoding) as file:
            data = file.readlines()
        return data

    def preprocess(self, sentence):
        '''preprocesses the sentence and returns it as a list of words'''
        sentence = self.tokenise(sentence)
        #sentence = self.remove_stop_words(sentence)
        return sentence

    def tokenise(self, sentence):
        '''converts the cleaned sentence to a list of non-empty words'''
        sentence = self.clean(sentence)
        return [
            token.strip() for token in sentence.split(' ') if token.strip()
        ]

    def clean(self, sentence):
        '''cleans the sentence by removing the ignored characters'''
        ignore_characters = '''\t\n&"`~@#$%^*;+=<>//.,()[]{}:;!?'''
        sentence = self.replace_characters(sentence, ignore_characters)
        return sentence.lower().strip()

    def replace_characters(self, text, characters):
        '''replaces the specified characters in text with blank spaces'''
        for char in characters:
            text = text.replace(char, ' ')
        return text

    def get_positive_test_bag(self):
        return self.positve_test_bag

    def get_negative_test_bag(self):
        return self.negative_test_bag

    def test_for_fish_guitar(self):
        '''sanity-check on the classic fish/guitar toy corpus'''
        positive_sentences = [
            "fish smoked fish", "fish line", "fish haul smoked"
        ]
        negative_sentences = ["guitar jazz line"]
        self.positive_bag = [
            sentence.split(" ") for sentence in positive_sentences
        ]
        self.negative_bag = [
            sentence.split(" ") for sentence in negative_sentences
        ]
        self.counts["total sentences"] = len(self.positive_bag) + len(
            self.negative_bag)
        self.counts["positive sentences"] = len(self.positive_bag)
        self.counts["negative sentences"] = len(self.negative_bag)
        self.get_occurrences_from_bags()
        self.get_conditional_probabilities()
        test_sentence = "line guitar jazz jazz"
        result = self.classify(sentence=test_sentence)
        self.logger.info(str(result))
        return result

    def find_accuracy(self):
        '''classifies the held-out test bags and logs overall accuracy'''
        correct, wrong = 0, 0
        total = len(self.positve_test_bag) + len(self.negative_test_bag)
        _correct, _wrong = self.test_for_bag(self.positve_test_bag,
                                             actual_result=1)
        correct += _correct
        wrong += _wrong
        _correct, _wrong = self.test_for_bag(self.negative_test_bag,
                                             actual_result=0)
        correct += _correct
        wrong += _wrong
        self.accuracy = (correct / total) * 100
        self.logger.info("total test sentences : " + str(total))
        self.logger.info("correct output : " + str(correct))
        self.logger.info("wrong output : " + str(wrong))
        self.logger.info("accuracy (%) : " + str(int(self.accuracy)))

    def test_for_bag(self, bag, actual_result):
        '''classifies every sentence in bag; returns (correct, wrong) counts'''
        # silence per-sentence logging during the bulk run
        self.logger.is_verbose = False
        correct, wrong = 0, 0
        for sentence in bag:
            sentence = ' '.join(sentence)
            result = self.classify(sentence=sentence)
            if result is None:
                self.logger.info("result is none : " + str(sentence))
                wrong += 1
                continue
            if result[1] == actual_result:
                correct += 1
            else:
                wrong += 1
        self.logger.is_verbose = True
        self.logger.debug("total test sentences in bag : " + str(len(bag)))
        self.logger.debug("correct output : " + str(correct))
        self.logger.debug("wrong output : " + str(wrong))
        self.logger.debug("accuracy (%) : " +
                          str(int((correct / len(bag)) * 100)))
        return correct, wrong