import random

import nltk
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC, NuSVC


class Classifier:
    def __init__(self, classifierName, posFile, negFile):
        self._name = classifierName

        # load the positive and negative corpora, one review per line
        pos = pd.read_table(posFile, delimiter='\n', header=None, names=["text"])
        pos['sentiment'] = 1  # 1 for positive
        neg = pd.read_table(negFile, delimiter='\n', header=None, names=["text"])
        neg['sentiment'] = 2  # 2 for negative

        pos_words = []
        for s in pos['text']:
            pos_words.extend(word_tokenize(str(s)))
        neg_words = []
        for s in neg['text']:
            neg_words.extend(word_tokenize(str(s)))

        all_words = []
        for w in pos_words:
            all_words.append(w.lower())
        for w in neg_words:
            all_words.append(w.lower())

        # keep 80% of the vocabulary as feature words
        all_words = nltk.FreqDist(all_words)
        self.word_features = list(all_words.keys())[:int(len(all_words) * 0.8)]

        documents = pos.values
        documents = np.concatenate((documents, neg.values), axis=0)

        # shuffle the documents
        random.shuffle(documents)

        # prepare X (text) and T (classification labels)
        self.X = documents[:, 0:1]
        self.T = documents[:, 1:2]

        if classifierName == 'NaiveBayesClassifier':
            self.classifier = nltk.NaiveBayesClassifier
        elif classifierName == 'MaxEntropy':
            self.classifier = nltk.MaxentClassifier
        elif classifierName == 'MultinomialNB':
            self.classifier = SklearnClassifier(MultinomialNB())
        elif classifierName == 'BernoulliNB':
            self.classifier = SklearnClassifier(BernoulliNB())
        elif classifierName == 'LogisticRegression':
            self.classifier = SklearnClassifier(LogisticRegression())
        elif classifierName == 'SGDClassifier':
            self.classifier = SklearnClassifier(SGDClassifier())
        elif classifierName == 'LinearSVC':
            self.classifier = SklearnClassifier(LinearSVC())
        elif classifierName == 'NuSVC':
            self.classifier = SklearnClassifier(NuSVC())
        else:
            raise ValueError('Not a valid classifier name')

    def find_features(self, document):
        # mark which of the known feature words appear in the document
        words = word_tokenize(document)
        features = {}
        for w in self.word_features:
            features[w] = (w in words)
        return features

    def train(self, Xtrain, numIterations=100,
              algorithm=nltk.classify.MaxentClassifier.ALGORITHMS[0]):
        print('Training the dataset')
        featuresets = [(self.find_features(rev), category)
                       for (rev, category) in Xtrain]
        if self._name == 'NaiveBayesClassifier':
            self.classifier = self.classifier.train(featuresets)
            self.classifier.show_most_informative_features(15)
        elif self._name == 'MaxEntropy':
            self.classifier = nltk.MaxentClassifier.train(
                featuresets, algorithm, max_iter=numIterations)
            self.classifier.show_most_informative_features(10)
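# Minimal usage sketch (an assumption, not part of the original file): the
# corpus paths "positive.txt" and "negative.txt" are hypothetical, and Xtrain
# is rebuilt from the X/T arrays the constructor prepares above.
if __name__ == "__main__":
    clf = Classifier('NaiveBayesClassifier', 'positive.txt', 'negative.txt')
    # pair each review text with its sentiment label for training
    Xtrain = [(str(clf.X[i][0]), int(clf.T[i][0])) for i in range(len(clf.X))]
    clf.train(Xtrain)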
# train, evaluate, and pickle the individual classifiers
classifier = nltk.NaiveBayesClassifier.train(training_set)
print("Original Naive Bayes Algo accuracy percent:",
      (nltk.classify.accuracy(classifier, testing_set)) * 100)
classifier.show_most_informative_features(15)

###############
save_classifier = open("originalnaivebayes5k.pickle", "wb")
pickle.dump(classifier, save_classifier)
save_classifier.close()

LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
LogisticRegression_classifier.train(training_set)
print("LogisticRegression_classifier accuracy percent:",
      (nltk.classify.accuracy(LogisticRegression_classifier, testing_set)) * 100)
# SklearnClassifier wrappers do not implement show_most_informative_features

save_classifier = open("LogisticRegression_classifier5k.pickle", "wb")
pickle.dump(LogisticRegression_classifier, save_classifier)
save_classifier.close()

LinearSVC_classifier = SklearnClassifier(LinearSVC())
LinearSVC_classifier.train(training_set)
print("LinearSVC_classifier accuracy percent:",
      (nltk.classify.accuracy(LinearSVC_classifier, testing_set)) * 100)

save_classifier = open("LinearSVC_classifier5k.pickle", "wb")
pickle.dump(LinearSVC_classifier, save_classifier)
save_classifier.close()

##NuSVC_classifier = SklearnClassifier(NuSVC())
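# Minimal sketch of reloading one of the pickled models later (assumed usage,
# not part of the original script). The reloaded classifier expects the same
# {word: True/False} feature dicts that were used to build training_set.
import pickle

with open("LinearSVC_classifier5k.pickle", "rb") as f:
    reloaded_clf = pickle.load(f)
# reloaded_clf.classify(feature_dict) then returns the predicted label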
import csv
import glob
import os
import pickle
import random

import nltk
from nltk.tokenize import word_tokenize
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

# Project-level dependencies (module paths assumed): Preprocessor, Document,
# and the POS-tag constants in `preprocess` are defined elsewhere in this repo.
# from preprocess import Preprocessor, Document
# import preprocess


class SentimentAnalyzer():
    # @param dir    folder where the training set resides, and where to put the test result
    # @param model  algorithm used for sentiment analysis
    def __init__(self, dir, model):
        self.dir = dir
        self.model = model
        self.preprocessor = Preprocessor(dir)

        # open existing files if a model has been built before; no need to reprocess
        self.classifier = pickle.load(open(f"{self.dir}/training/model/{model}_clf.pickle", "rb")) \
            if os.path.isfile(f"{self.dir}/training/model/{model}_clf.pickle") else None
        self.features = pickle.load(open(f"{self.dir}/training/model/features.pickle", "rb")) \
            if os.path.isfile(f"{self.dir}/training/model/features.pickle") else []
        self.classes = pickle.load(open(f"{self.dir}/training/model/classes.pickle", "rb")) \
            if os.path.isfile(f"{self.dir}/training/model/classes.pickle") else []
        self.most_common_words = [w.strip() for w in open(f"{self.dir}/training/model/most_common_words.txt", "r", encoding="utf8").readlines()] \
            if os.path.isfile(f"{self.dir}/training/model/most_common_words.txt") else []

    # remove most common words (top 1%) that appear in both positive and negative documents
    def _remove_most_common_words(self, documents):
        print("Define most common words...")
        most_common_words = set([])
        for cls in self.classes:
            docs = [d for d in documents if d.sentiment == cls]
            doc_words = [
                w for d in docs
                for w in word_tokenize(d.content.replace(".", ""))
            ]
            fdist = nltk.FreqDist(doc_words)
            if len(most_common_words) == 0:
                most_common_words = set([
                    w[0] for w in fdist.most_common(int(0.01 * len(doc_words)))
                ])
            else:
                most_common_words = set([
                    w[0] for w in fdist.most_common(int(0.01 * len(doc_words)))
                    if w[0] in most_common_words
                ])
        self.most_common_words = most_common_words
        with open(f"{self.dir}/training/model/most_common_words.txt", "w", encoding="utf8") as writer:
            writer.writelines([f"{w}\n" for w in self.most_common_words])

        ndocs = []
        doc_count = 0
        for d in documents:
            doc_count += 1
            ncontent = " ".join([
                w for w in word_tokenize(d.content)
                if w not in most_common_words
            ])
            ndocs.append(Document(d.name, ncontent, d.sentiment, d.location))
            print("\r", end="")
            print("Removing most common words progress",
                  int(doc_count / len(documents) * 100), "%", end="", flush=True)
        print("")
        return ndocs

    # only keep adjectives, adverbs, and nouns
    def _reduce_dimension_by_postag(self, documents):
        reduced_documents = []
        doc_count = 0
        for doc in documents:
            reduced_sentence = " ".join([
                p[0] for p in nltk.pos_tag(
                    word_tokenize(doc.content.replace(".", "")))
                if p[1] in preprocess.ADJ or p[1] in preprocess.ADV
                or p[1] in preprocess.NOUN
            ])
            if not reduced_sentence.isspace():
                reduced_documents.append(
                    Document(doc.name, reduced_sentence, doc.sentiment, doc.location))
            doc_count += 1
            print("\r", end="")
            print("Reducing dimension in progress",
                  int(doc_count * 100 / len(documents)), "%", end="", flush=True)
        print("")
        return reduced_documents

    def create_frequency_plot(self, words, top_k):
        p = nltk.FreqDist(words)
        p.plot(top_k)

    def _undersample(self, documents):
        # find the minimum number of documents in a class
        docs_by_class = []
        minclass_length = len(documents)
        for cls in self.classes:
            docs = [d for d in documents if d.sentiment == cls]
            docs_by_class.append(docs)
            if len(docs) < minclass_length:
                minclass_length = len(docs)
        # sample all classes based on the minimum number of documents
        undersampled_docs = []
        for docs in docs_by_class:
            random.shuffle(docs)
            undersampled_docs.extend(docs[:minclass_length])
        return undersampled_docs

    # preprocessing
    def prepare_documents(self):
        documents = []
        for file in os.listdir(f"{self.dir}/training/data"):
            documents.extend(
                pickle.load(open(f"{self.dir}/training/data/{file}", "rb")))
        if len(self.classes) == 0:
            self.classes = set([doc.sentiment for doc in documents])
            pickle.dump(
                self.classes,
                open(f"{self.dir}/training/model/classes.pickle", "wb"))

        print("Perform undersampling...")
        documents = self._undersample(documents)
        documents = self._reduce_dimension_by_postag(documents)
        documents = self._remove_most_common_words(documents)
        return documents

    def transform_into_featuresets(self, documents):
        self.features = set(
            [w for d in documents for w in set(word_tokenize(d.content))])
        pickle.dump(self.features,
                    open(f"{self.dir}/training/model/features.pickle", "wb"))
        print("Features length:", len(self.features))

        featuresets = []
        print("Transforming into featuresets....")
        doc_count = 0
        for doc in documents:
            # checking whether a word exists in a list takes significantly longer,
            # so we check membership in the feature set instead
            featuresets.append(({
                w: True
                for w in word_tokenize(doc.content) if w in self.features
            }, doc.sentiment))
            doc_count += 1
            print("\r", end='')
            print("Preparing featureset in progress",
                  int(doc_count * 100 / len(documents)), "%", end='', flush=True)
        print("")
        return featuresets

    def get_training_validation_set(self, featuresets, valid_ratio):
        if len(self.classes) == 0:
            self.classes = set([f[1] for f in featuresets])
            pickle.dump(
                self.classes,
                open(f"{self.dir}/training/model/classes.pickle", "wb"))
        trainingset = []
        validset = []
        for c in self.classes:
            subfeat = [f for f in featuresets if f[1] == c]
            random.shuffle(subfeat)
            trainct = int((1 - valid_ratio) * len(subfeat))
            trainingset.extend(subfeat[:trainct])
            validset.extend(subfeat[trainct:])
        return trainingset, validset

    def train(self, validation_ratio):
        os.makedirs(os.path.dirname(f"{self.dir}/training/model/"), exist_ok=True)
        documents = self.prepare_documents()
        featuresets = self.transform_into_featuresets(documents)
        trainset, validset = self.get_training_validation_set(
            featuresets, validation_ratio)

        print("Building classifier...")
        if self.model == "NB":
            self.classifier = nltk.NaiveBayesClassifier.train(trainset)
            self.classifier.show_most_informative_features(15)
        elif self.model == "MNB":
            self.classifier = SklearnClassifier(MultinomialNB()).train(trainset)
        elif self.model == "SVM":
            self.classifier = SklearnClassifier(SVC()).train(trainset)
        elif self.model == "LR":
            self.classifier = SklearnClassifier(LogisticRegression()).train(trainset)

        print("Accuracy per class")
        for cls in self.classes:
            print(f"{cls} accuracy:", (nltk.classify.accuracy(
                self.classifier, [v for v in validset if v[1] == cls])) * 100)
        print("Classifier accuracy percent:",
              (nltk.classify.accuracy(self.classifier, validset)) * 100)
        pickle.dump(
            self.classifier,
            open(f"{self.dir}/training/model/{self.model}_clf.pickle", "wb"))

    def show_most_informative_features(self, n):
        self.classifier.show_most_informative_features(n)

    def sentiment(self, text):
        # ensure the words are lemmatized properly so they are detected in self.features
        cleaned_text = self.preprocessor.basic_preprocess(text).replace(".", "")
        # no advanced preprocessing is needed because the features have already been determined
        feature = {
            w: True
            for w in word_tokenize(cleaned_text) if w in self.features
        }
        prob_dict = self.classifier.prob_classify(feature)
        cls = prob_dict.max()
        prob = prob_dict.prob(cls)
        return cls, prob

    def classify(self, test_dir):
        print("Start classifying...")
        if self.classifier is None:
            self.train(0.2)
        else:
            self.classifier.show_most_informative_features(15)

        files = [
            os.path.basename(x)
            for x in glob.glob(f"{self.dir}/{test_dir}/data/*.csv")
        ]
        done_files = [f.strip() for f in open(f"{self.dir}/testing/classify_done.txt", 'r').readlines()] \
            if os.path.isfile(f"{self.dir}/testing/classify_done.txt") else []
        tbp_files = [f for f in files if f not in done_files]

        headers = [
            "review_page", "review_title", "review_content", "review_star",
            "reviewer_location", "review_date", "crawled_date"
        ]
        os.makedirs(os.path.dirname(f"{self.dir}/{test_dir}/results/"), exist_ok=True)

        for file in tbp_files:
            with open(f"{self.dir}/{test_dir}/data/{file}", "r", encoding="utf8") as f:
                csvreader = csv.DictReader(f)
                with open(f"{self.dir}/{test_dir}/results/{file}", "w", encoding="utf8", newline="") \
                        as w:
                    csvwriter = csv.writer(w)
                    csvwriter.writerow(headers)
                    rowid = 0
                    rownum = self.preprocessor.count_lines(
                        f"{self.dir}/{test_dir}/data/{file}")
                    for row in csvreader:
                        review_page = row["review_page"]
                        review_title = row["review_title"]
                        review_content = row["review_content"]
                        cat = self.sentiment(
                            f"{row['review_title']}. {row['review_content']}")
                        review_star = "45" if cat[0] == "pos" else "20"
                        reviewer_location = row["user_location"]
                        review_date = row["review_date"]
                        crawled_date = "00000000"
                        csvwriter.writerow([
                            review_page, review_title, review_content,
                            review_star, reviewer_location, review_date,
                            crawled_date
                        ])
                        w.flush()
                        rowid += 1
                        print("\r", end='')
                        print("Classifying in progress",
                              int(rowid * 100 / rownum), "% for", file,
                              end='', flush=True)
            with open(f"{self.dir}/testing/classify_done.txt", "a", encoding="utf8") as writer:
                writer.write(f"{file}\n")
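# Minimal usage sketch (assumed, not part of the original file): the working
# directory "data/reviews" and the test subfolder "testing" are hypothetical;
# the class expects {dir}/training/data to hold the pickled training documents
# and {dir}/{test_dir}/data to hold the CSV files to label.
if __name__ == "__main__":
    analyzer = SentimentAnalyzer("data/reviews", "NB")
    analyzer.train(0.2)  # hold out 20% of the featuresets for validation
    print(analyzer.sentiment("The room was clean and the staff were friendly."))
    analyzer.classify("testing")  # writes labelled CSVs to {dir}/testing/results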