Example #1
import random

import nltk
import numpy as np
import pandas as pd
from nltk.classify.scikitlearn import SklearnClassifier
from nltk.tokenize import word_tokenize
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.svm import LinearSVC, NuSVC


class Classifier:
    
    def __init__(self, classifierName, posFile, negFile):
        
        self._name = classifierName
        
        pos = pd.read_table(posFile,delimiter='\n',header=None, names=["text"] )
        pos['sentiment'] = 1 #1 for positive

        neg = pd.read_table(negFile,delimiter='\n',header=None, names=["text"] )
        neg['sentiment'] = 2 #2 for negative
        
        pos_words=[]
        for s in pos['text']:
            pos_words.extend(word_tokenize(str(s)))

        neg_words=[]
        for s in neg['text']:
            neg_words.extend(word_tokenize(str(s)))

        all_words=[]
        for w in pos_words:
            all_words.append(w.lower())

        for w in neg_words:
            all_words.append(w.lower())

            
        all_words = nltk.FreqDist(all_words)

        self.word_features = list(all_words.keys())[:int(len(all_words)*0.8)]
        
        documents = pos.values
        documents = np.concatenate((documents, neg.values), axis=0)

        #shuffle the documents
        np.random.shuffle(documents)

        #prepare X and T, classification
        self.X = documents[:,0:1]
        self.T = documents[:,1:2]
        
        if classifierName == 'NaiveBayesClassifier':
            self.classifier = nltk.NaiveBayesClassifier
        elif classifierName == 'MaxEntropy':
            self.classifier = nltk.MaxentClassifier
        elif classifierName == 'MultinomialNB':
            self.classifier = SklearnClassifier(MultinomialNB())
        elif classifierName == 'BernoulliNB':
            self.classifier = SklearnClassifier(BernoulliNB())
        elif classifierName == 'LogisticRegression':
            self.classifier = SklearnClassifier(LogisticRegression())
        elif classifierName == 'SGDClassifier':
            self.classifier = SklearnClassifier(SGDClassifier())
        elif classifierName == 'LinearSVC':
            self.classifier = SklearnClassifier(LinearSVC())
        elif classifierName == 'NuSVC':
            self.classifier = SklearnClassifier(NuSVC())
        else:
            raise ValueError('Not a valid classifier name')
        
    
    # build a bag-of-words feature dict: True/False for each word in self.word_features
    def find_features(self, document):
        words = word_tokenize(document)
        features = {}
        for w in self.word_features:
            features[w] = (w in words)

        return features
    
    def train(self,Xtrain,numIterations = 100, algorithm = nltk.classify.MaxentClassifier.ALGORITHMS[0]):
        
        print('Training the dataset')
        
        featuresets = [(self.find_features(rev), category) for (rev, category) in Xtrain]
        
        if self._name == 'NaiveBayesClassifier':
            self.classifier = self.classifier.train(featuresets)
            self.classifier.show_most_informative_features(15)
        elif self._name == 'MaxEntropy':
            self.classifier = nltk.MaxentClassifier.train(featuresets, algorithm, max_iter=numIterations)
            self.classifier.show_most_informative_features(10)
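
A minimal usage sketch for the Classifier class above, assuming the NLTK punkt tokenizer is available and that pos.txt and neg.txt (hypothetical file names) hold one review per line:

if __name__ == '__main__':
    clf = Classifier('NaiveBayesClassifier', 'pos.txt', 'neg.txt')  # hypothetical corpus files
    # pair each review text with its label, forming the (rev, category) tuples train() expects
    labeled_reviews = list(zip(clf.X[:, 0], clf.T[:, 0]))
    clf.train(labeled_reviews)
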
Example #2
File: test8.py  Project: hutink/PFE

import pickle

import nltk
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, NuSVC

# training_set and testing_set are built earlier in test8.py
# (assumed to be lists of (feature_dict, label) pairs)
classifier = nltk.NaiveBayesClassifier.train(training_set)
print("Original Naive Bayes Algo accuracy percent:",
      (nltk.classify.accuracy(classifier, testing_set)) * 100)
classifier.show_most_informative_features(15)

###############
save_classifier = open("originalnaivebayes5k.pickle", "wb")
pickle.dump(classifier, save_classifier)
save_classifier.close()

LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
LogisticRegression_classifier.train(training_set)
print("LogisticRegression_classifier accuracy percent:",
      (nltk.classify.accuracy(LogisticRegression_classifier, testing_set)) * 100)

save_classifier = open("LogisticRegression_classifier5k.pickle", "wb")
pickle.dump(LogisticRegression_classifier, save_classifier)
save_classifier.close()

LinearSVC_classifier = SklearnClassifier(LinearSVC())
LinearSVC_classifier.train(training_set)
print("LinearSVC_classifier accuracy percent:",
      (nltk.classify.accuracy(LinearSVC_classifier, testing_set)) * 100)

save_classifier = open("LinearSVC_classifier5k.pickle", "wb")
pickle.dump(LinearSVC_classifier, save_classifier)
save_classifier.close()

##NuSVC_classifier = SklearnClassifier(NuSVC())
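
A hedged sketch of how the pickled classifiers saved above could be reloaded in a later session; the pickle file names match the ones written above, and the feature dictionary passed to classify() is assumed to be built the same way as the entries of training_set:

import pickle

with open("originalnaivebayes5k.pickle", "rb") as f:
    loaded_nb = pickle.load(f)

# feats would be a {word: True/False} dict built like the training features
# print(loaded_nb.classify(feats))
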
Example #3
import csv
import glob
import os
import pickle
import random

import nltk
from nltk.classify.scikitlearn import SklearnClassifier
from nltk.tokenize import word_tokenize
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

# Preprocessor, Document, and the preprocess module (ADJ/ADV/NOUN tag sets)
# are project-local helpers assumed to be importable from the surrounding project.


class SentimentAnalyzer():

    # @param dir folder where the training set resides, and where to put the test results
    # @param model algorithm used for sentiment analysis
    def __init__(self, dir, model):
        self.dir = dir
        self.model = model
        self.preprocessor = Preprocessor(dir)

        # load existing artifacts if a model has been built before, so there is no need to reprocess
        self.classifier = pickle.load(open(f"{self.dir}/training/model/{model}_clf.pickle", "rb")) \
            if os.path.isfile(f"{self.dir}/training/model/{model}_clf.pickle") else None
        self.features = pickle.load(open(f"{self.dir}/training/model/features.pickle", "rb")) \
            if os.path.isfile(f"{self.dir}/training/model/features.pickle") else []
        self.classes = pickle.load(open(f"{self.dir}/training/model/classes.pickle", "rb")) \
            if os.path.isfile(f"{self.dir}/training/model/classes.pickle") else []
        self.most_common_words = [w.strip() for w in open(f"{self.dir}/training/model/most_common_words.txt","r",encoding="utf8").readlines()] \
            if os.path.isfile(f"{self.dir}/training/model/most_common_words.txt") else []

    # remove most common words (top 1%) that appear in both positive and negative documents
    def _remove_most_common_words(self, documents):
        print("Define most common words...")

        most_common_words = set([])
        for cls in self.classes:
            docs = [d for d in documents if d.sentiment == cls]

            doc_words = [
                w for d in docs
                for w in word_tokenize(d.content.replace(".", ""))
            ]
            fdist = nltk.FreqDist(doc_words)

            if len(most_common_words) == 0:
                most_common_words = set([
                    w[0] for w in fdist.most_common(int(0.01 * len(doc_words)))
                ])
            else:
                most_common_words = set([
                    w[0] for w in fdist.most_common(int(0.01 * len(doc_words)))
                    if w[0] in most_common_words
                ])

        self.most_common_words = most_common_words
        with open(f"{self.dir}/training/model/most_common_words.txt",
                  "w",
                  encoding="utf8") as writer:
            writer.writelines([f"{w}\n" for w in self.most_common_words])

        ndocs = []
        doc_count = 0
        for d in documents:
            doc_count += 1
            ncontent = " ".join([
                w for w in word_tokenize(d.content)
                if w not in most_common_words
            ])

            ndocs.append(Document(d.name, ncontent, d.sentiment, d.location))
            print("\r", end="")
            print("Removing most common words progress",
                  int(doc_count / len(documents) * 100),
                  "%",
                  end="",
                  flush=True)
        print("")
        return ndocs

    # only keep adjectives, adverbs, and nouns
    def _reduce_dimension_by_postag(self, documents):
        reduced_documents = []

        doc_count = 0
        for doc in documents:
            reduced_sentence = " ".join([
                p[0] for p in nltk.pos_tag(
                    word_tokenize(doc.content.replace(".", "")))
                if p[1] in preprocess.ADJ or p[1] in preprocess.ADV
                or p[1] in preprocess.NOUN
            ])

            if reduced_sentence.strip():
                reduced_documents.append(
                    Document(doc.name, reduced_sentence, doc.sentiment,
                             doc.location))

            doc_count += 1
            print("\r", end="")
            print("Reducing dimension in progress",
                  int(doc_count * 100 / len(documents)),
                  "%",
                  end="",
                  flush=True)
        print("")

        return reduced_documents

    def create_frequency_plot(self, words, top_k):
        p = nltk.FreqDist(words)
        p.plot(top_k)

    def _undersample(self, documents):

        # find the minimum number of documents in a class

        docs_by_class = []
        minclass_length = len(documents)
        for cls in self.classes:
            docs = [d for d in documents if d.sentiment == cls]
            docs_by_class.append(docs)

            if len(docs) < minclass_length:
                minclass_length = len(docs)

        # sample all classes based on the minimum number of documents
        undersampled_docs = []
        for docs in docs_by_class:
            random.shuffle(docs)
            undersampled_docs.extend(docs[:minclass_length])

        return undersampled_docs

    # preprocessing
    def prepare_documents(self):
        documents = []

        for file in os.listdir(f"{self.dir}/training/data"):
            documents.extend(
                pickle.load(open(f"{self.dir}/training/data/{file}", "rb")))

        if len(self.classes) == 0:
            self.classes = set([doc.sentiment for doc in documents])
            pickle.dump(
                self.classes,
                open(f"{self.dir}/training/model/classes.pickle", "wb"))

        print("Perform undersampling...")
        documents = self._undersample(documents)

        documents = self._reduce_dimension_by_postag(documents)

        documents = self._remove_most_common_words(documents)

        return documents

    def transform_into_featuresets(self, documents):

        self.features = set(
            [w for d in documents for w in set(word_tokenize(d.content))])
        pickle.dump(self.features,
                    open(f"{self.dir}/training/model/features.pickle", "wb"))
        print("Features length:", len(self.features))
        featuresets = []

        print("Transforming into featuresets....")
        doc_count = 0
        for doc in documents:
            # membership checks against a list take significantly longer,
            # so self.features is kept as a set for fast lookups
            featuresets.append(({
                w: True
                for w in word_tokenize(doc.content) if w in self.features
            }, doc.sentiment))
            doc_count += 1

            print("\r", end='')
            print("Preparing featureset in progress",
                  int(doc_count * 100 / len(documents)),
                  "%",
                  end='',
                  flush=True)
        print("")

        return featuresets

    def get_training_validation_set(self, featuresets, valid_ratio):
        if len(self.classes) == 0:
            self.classes = set([f[1] for f in featuresets])
            pickle.dump(
                self.classes,
                open(f"{self.dir}/training/model/classes.pickle", "wb"))

        trainingset = []
        validset = []

        for c in self.classes:
            subfeat = [f for f in featuresets if f[1] == c]
            random.shuffle(subfeat)

            trainct = int((1 - valid_ratio) * len(subfeat))
            trainingset.extend(subfeat[:trainct])
            validset.extend(subfeat[trainct:])

        return trainingset, validset

    def train(self, validation_ratio):
        os.makedirs(os.path.dirname(f"{self.dir}/training/model/"),
                    exist_ok=True)

        documents = self.prepare_documents()
        featuresets = self.transform_into_featuresets(documents)
        trainset, validset = self.get_training_validation_set(
            featuresets, validation_ratio)

        print("Building classifier...")
        if self.model == "NB":
            self.classifier = nltk.NaiveBayesClassifier.train(trainset)
            self.classifier.show_most_informative_features(15)
        elif self.model == "MNB":
            self.classifier = SklearnClassifier(
                MultinomialNB()).train(trainset)
        elif self.model == "SVM":
            self.classifier = SklearnClassifier(SVC()).train(trainset)
        elif self.model == "LR":
            self.classifier = SklearnClassifier(
                LogisticRegression()).train(trainset)

        print("Accuracy per class")
        for cls in self.classes:
            print(f"{cls} accuracy:", (nltk.classify.accuracy(
                self.classifier, [v for v in validset if v[1] == cls])) * 100)
        print("Classifier accuracy percent:",
              (nltk.classify.accuracy(self.classifier, validset)) * 100)
        pickle.dump(
            self.classifier,
            open(f"{self.dir}/training/model/{self.model}_clf.pickle", "wb"))

    def show_most_informative_features(self, n):
        self.classifier.show_most_informative_features(n)

    def sentiment(self, text):
        # basic preprocessing lemmatizes the words so they can be matched against self.features
        cleaned_text = self.preprocessor.basic_preprocess(text).replace(
            ".", "")

        # no advanced preprocessing is needed because the features have already been determined
        feature = {
            w: True
            for w in word_tokenize(cleaned_text) if w in self.features
        }
        prob_dict = self.classifier.prob_classify(feature)

        cls = prob_dict.max()
        prob = prob_dict.prob(cls)

        return cls, prob

    def classify(self, test_dir):
        print("Start classifying...")

        if self.classifier is None:
            self.train(0.2)
        else:
            self.classifier.show_most_informative_features(15)

        files = [
            os.path.basename(x)
            for x in glob.glob(f"{self.dir}/{test_dir}/data/*.csv")
        ]
        done_files = [f.strip() for f in open(f"{self.dir}/testing/classify_done.txt", 'r').readlines()] \
            if os.path.isfile(f"{self.dir}/testing/classify_done.txt") else []
        tbp_files = [f for f in files if f not in done_files]

        headers = [
            "review_page", "review_title", "review_content", "review_star",
            "reviewer_location", "review_date", "crawled_date"
        ]
        os.makedirs(os.path.dirname(f"{self.dir}/{test_dir}/results/"),
                    exist_ok=True)

        for file in tbp_files:
            with open(f"{self.dir}/{test_dir}/data/{file}",
                      "r",
                      encoding="utf8") as f:
                csvreader = csv.DictReader(f)

                with open(f"{self.dir}/{test_dir}/results/{file}","w", encoding="utf8", newline="") \
                        as w:
                    csvwriter = csv.writer(w)
                    csvwriter.writerow(headers)

                    rowid = 0
                    rownum = self.preprocessor.count_lines(
                        f"{self.dir}/{test_dir}/data/{file}")
                    for row in csvreader:
                        review_page = row["review_page"]
                        review_title = row["review_title"]
                        review_content = row["review_content"]

                        cat = self.sentiment(
                            f"{row['review_title']}. {row['review_content']}")
                        review_star = "45" if cat[0] == "pos" else "20"

                        reviewer_location = row["user_location"]
                        review_date = row["review_date"]
                        crawled_date = "00000000"

                        csvwriter.writerow([
                            review_page, review_title, review_content,
                            review_star, reviewer_location, review_date,
                            crawled_date
                        ])

                        w.flush()

                        rowid += 1
                        print("\r", end='')
                        print("Classifying in progress",
                              int(rowid * 100 / rownum),
                              "% for",
                              file,
                              end='',
                              flush=True)

            with open(f"{self.dir}/testing/classify_done.txt",
                      "a",
                      encoding="utf8") as writer:
                writer.write(f"{file}\n")
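
A minimal usage sketch for the SentimentAnalyzer above; the "reviews" folder name and the sample sentence are hypothetical, and the pickled training documents under <dir>/training/data are assumed to exist as in the original project:

if __name__ == "__main__":
    analyzer = SentimentAnalyzer("reviews", "NB")  # hypothetical data folder, NLTK NaiveBayes model
    analyzer.train(0.2)                            # hold out 20% of the featuresets for validation
    label, prob = analyzer.sentiment("The room was clean and the staff were friendly.")
    print(label, prob)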