def get_corpus(self):
    """
    Load the corpus on first use and keep the classifier in sync with it.

    On the first call the corpus dictionary is created and filled from the
    zipped CSV file named after ``self.pathname``. Whenever the corpus is
    newer than its list copy, the copy is rebuilt and a new classifier is
    created from it.

    :returns: the list copy of the corpus values handed to the classifier
    """
    if self.corpus is None:
        self.corpus = Dictionary()
        self.corpus.updated = time.time()
        try:
            directory = Paths.get_root_dir(*CORPUS_PATH)
            infile = '{name}.csv'.format(name=self.pathname)
            path = join(directory, '{name}.csv.zip'.format(name=self.pathname))
            # Read the CSV stored inside the zip file. ZipFile opens the
            # path itself so the underlying file handle is closed when the
            # ``with`` block exits; a file object passed in by the caller
            # would be left open.
            with ZipFile(path, 'r') as zipfile:
                buffer = StringIO(zipfile.read(infile))
            reader = csv.reader(buffer)
            for row in reader:
                # row[2] is the key; row[0] and row[1] form the stored pair.
                # NOTE(review): the original comment listed the columns as
                # "category, doc, hash" -- confirm the order against the
                # code that writes this file.
                self.corpus[row[2]] = (row[0], row[1])
        except IOError:
            # Best effort: an unreadable or missing file leaves the corpus
            # empty instead of failing.
            pass
    if self.corpus_shadow is None or self.corpus_shadow.updated < self.corpus.updated:
        self.corpus_shadow = List(self.corpus.values())
        self.corpus_shadow.updated = time.time()
        # NaiveBayesClassifier is a possible alternative here.
        self.classifier = MaxEntClassifier(
            self.corpus_shadow, feature_extractor=extractor_base)
    return self.corpus_shadow
class TestMaxEntClassifier(unittest.TestCase):
    """Checks for MaxEntClassifier built from the module's ``train_set``."""

    def setUp(self):
        # unittest runs this before every test method, so each test gets
        # its own classifier instance.
        self.classifier = MaxEntClassifier(train_set)

    def test_classify(self):
        """The sample sentence is labelled 'positive'; train_set is kept."""
        sentence = "I feel happy this morning"
        predicted = self.classifier.classify(sentence)
        assert_equal(predicted, 'positive')
        stored_size = len(self.classifier.train_set)
        assert_equal(stored_size, len(train_set))

    def test_prob_classify(self):
        """'positive' is the most probable label for the sample sentence."""
        sentence = "I feel happy this morning"
        distribution = self.classifier.prob_classify(sentence)
        assert_equal(distribution.max(), 'positive')
        positive = distribution.prob("positive")
        negative = distribution.prob("negative")
        assert_true(positive > negative)
class _Classifier:
    """Thin wrapper that delegates every call to one concrete classifier."""

    def __init__(self, data):
        """Create the wrapped classifier from ``data``."""
        # Alternative backend, currently disabled: NaiveBayesClassifier(data)
        backend = MaxEntClassifier(data)
        self._classifier = backend

    def update(self, data):
        """
        Forward additional training data to the wrapped classifier.

        ``data`` needs to be an iterable (list or tuple) of iterables; each
        inner iterable needs to be in the format ('str', 'label').
        """
        self._classifier.update(data)

    def probability(self, text):
        """Return the result of the wrapped ``prob_classify`` for ``text``."""
        distribution = self._classifier.prob_classify(text)
        return distribution

    def labels(self):
        """Return the labels reported by the wrapped classifier."""
        known_labels = self._classifier.labels()
        return known_labels
def add_global_hook():
    """
    Build the shared classifier state and return a hook that exposes it.

    Two classifiers are created from the training part of the tweet data
    set and stored, together with the test part, in a ``web.storage``
    object. The returned wrapper assigns that object to ``web.ctx.globals``
    and then calls the handler it was given.

    :returns: callable taking a handler and returning the handler's result
    """
    bank = TweetBank(50)
    training_data, held_out = bank.data_set()

    shared = web.storage({
        "classifier_dictionary": {
            "Naive Bayes": NaiveBayesClassifier(training_data),
            "Maxent": MaxEntClassifier(training_data),
        },
        "test_set": held_out,
    })

    def _wrapper(handler):
        # Publish the shared state before delegating to the handler.
        web.ctx.globals = shared
        return handler()

    return _wrapper
def __init__(self, data):
    """Create the wrapped classifier from ``data``."""
    # Alternative backend, currently disabled: NaiveBayesClassifier(data)
    backend = MaxEntClassifier(data)
    self._classifier = backend
def setUp(self):
    """Create the classifier under test from ``train_set``."""
    fresh_classifier = MaxEntClassifier(train_set)
    self.classifier = fresh_classifier
from textblob.classifiers import MaxEntClassifier

# Build the classifier from the toy training CSV.
with open('data/train-toy.csv', 'r') as fp:
    cl = MaxEntClassifier(fp, format="csv")

# Report the accuracy on the toy test CSV. The parenthesised single-argument
# form of print behaves the same as a Python 2 statement and as a Python 3
# function call, whereas the bare statement is a syntax error on Python 3.
with open('data/test-toy.csv', 'r') as gp:
    print(cl.accuracy(gp, format="csv"))
print(len(words), len(tags))

# Pair every word with its tag: the first 800 pairs go to the training set
# and the following 200 to the test set. Slicing (instead of indexing up to a
# fixed 1000) keeps this from raising IndexError on shorter input.
labelled = list(zip(words, tags))[:1000]
train.extend(labelled[:800])
test.extend(labelled[800:])

print(train)
print(test)

naive = NaiveBayesClassifier(train)
dtc = DecisionTreeClassifier(train)
mec = MaxEntClassifier(train)

print("NaiveBayesClassifier Accuracy: {0}".format(naive.accuracy(test)))
print("DecisionTreeClassifier Accuracy: {0}".format(dtc.accuracy(test)))
print("MaxEntClassifier Accuracy: {0}".format(mec.accuracy(test)))

# Reuse the Naive Bayes model trained above rather than fitting an identical
# one a second time on the same data.
cl = naive
print("NaiveBayesClassifier Accuracy: {0}".format(cl.accuracy(test)))

for i, sample in enumerate(test):
    # Classify the word only: each test item is a (word, tag) pair, and
    # passing the whole pair would expose the expected tag to the classifier.
    tag = cl.classify(sample[0])
    pred_tags.append(tag)
    if tag == test_tags[i]:
        count += 1

print(len(pred_tags), len(test_tags))
print(count)
def search_department(job, train):
    """
    Print and return the most probable label for ``job``.

    A MaxEntClassifier is created from ``train`` on every call and asked
    for the probability distribution of ``job``.

    :param job: the item to classify
    :param train: training data passed to MaxEntClassifier
    :returns: the label with the highest probability
    """
    department = MaxEntClassifier(train).prob_classify(job).max()
    print(department)
    return department
# trains.append(train[i])
trains = train

# Train the classifier selected by ``choice`` and report its accuracy on
# ``test``.
if choice == "1":
    print("\n#NaiveBayesClassifier")
    cl1 = NaiveBayesClassifier(trains)
    print("Classifier: Naive Bayes -- Accuracy: ", cl1.accuracy(test), "\n")
elif choice == "2":
    print("\n#DecisionTreeClassifier")
    cl2 = DecisionTreeClassifier(trains)
    print("Classifier: Decision Tree -- Accuracy: ", cl2.accuracy(test), "\n")
elif choice == "3":
    print("\n#MaxEntClassifier")
    cl3 = MaxEntClassifier(trains)
    print("Classifier: Maximum Entropy -- Accuracy: ", cl3.accuracy(test), "\n")
elif choice == "4":
    print("\n#NLTKClassifier")
    cl4 = NLTKClassifier(trains)
    print("Classifier: NLTK -- Accuracy: ", cl4.accuracy(test), "\n")
else:
    print("Bad input!")

# Most repeated words (most important properties): order the entries of both
# dictionaries by their value, ascending.
totalDictPosSorted = sorted(totalDictPos.items(), key=lambda entry: entry[1])
totalDictNegSorted = sorted(totalDictNeg.items(), key=lambda entry: entry[1])
class FileCorpus(BaseCorpus):
    """
    Corpus stored in a compressed (zipped) CSV text file.

    The original author estimates that a corpus of 2^15 (32768) documents
    should be usable with this corpus class. Beyond that, a dedicated CFFI
    module would have to be developed.
    """

    # Attributes
    corpus = None  # Dictionary (a dict that accepts attribute assignment)
    corpus_shadow = None  # List copy (the NLTK classifiers work on lists)
    classifier = None  # NLTK classifier, initialised in get_corpus

    # Getter
    def get_corpus(self):
        """
        Load the corpus on first use and keep the classifier in sync with it.

        On the first call the corpus dictionary is created and filled from
        the zipped CSV file named after ``self.pathname``. Whenever the
        corpus is newer than its list copy, the copy is rebuilt and a new
        classifier is created from it.

        :returns: the list copy of the corpus values handed to the classifier
        """
        if self.corpus is None:
            self.corpus = Dictionary()
            self.corpus.updated = time.time()
            try:
                directory = Paths.get_root_dir(*CORPUS_PATH)
                infile = '{name}.csv'.format(name=self.pathname)
                path = join(directory, '{name}.csv.zip'.format(name=self.pathname))
                # Read the CSV stored inside the zip file. ZipFile opens the
                # path itself so the underlying file handle is closed when
                # the ``with`` block exits; a file object passed in by the
                # caller would be left open.
                with ZipFile(path, 'r') as zipfile:
                    buffer = StringIO(zipfile.read(infile))
                reader = csv.reader(buffer)
                for row in reader:
                    # Columns as written by save(): 0 = document,
                    # 1 = category, 2 = signature (used as the key).
                    # NOTE(review): keys read here are strings while train()
                    # stores the raw hash() value -- confirm which type the
                    # callers of retrain()/untrain() pass.
                    self.corpus[row[2]] = (row[0], row[1])
            except IOError:
                # Best effort: an unreadable or missing file leaves the
                # corpus empty instead of failing.
                pass
        if self.corpus_shadow is None or self.corpus_shadow.updated < self.corpus.updated:
            self.corpus_shadow = List(self.corpus.values())
            self.corpus_shadow.updated = time.time()
            # NaiveBayesClassifier is a possible alternative here.
            self.classifier = MaxEntClassifier(
                self.corpus_shadow, feature_extractor=extractor_base)
        return self.corpus_shadow

    def classify(self, document):
        """
        Return the most probable category for a document.

        :rtype: str
        """
        self.get_corpus()
        return self.classifier.classify(document)

    def classify_prob(self, document):
        """
        Return the category probabilities for a document.

        :rtype: nltk.probability.DictionaryProbDist
        """
        self.get_corpus()
        return self.classifier.prob_classify(document)

    # Actions
    def save(self):
        """
        Write the corpus to disk.

        Each row holds the document, its category and its signature, which
        is the three-column layout get_corpus() reads back.

        :rtype: bool
        :returns: True if the corpus was saved, False otherwise
        """
        if self.corpus is None:
            # Load the existing entries first so they are not overwritten by
            # an empty archive.
            self.get_corpus()
        directory = Paths.get_root_dir(*CORPUS_PATH)
        infile = '{name}.csv'.format(name=self.pathname)
        path = join(directory, '{name}.csv.zip'.format(name=self.pathname))
        # Build the CSV in memory before the archive is opened: opening in
        # 'w' mode truncates the file, so a failure while serialising must
        # not leave an empty archive behind.
        buffer = StringIO()
        writer = csv.writer(buffer, delimiter=",", encoding='utf-8')
        # Write from the corpus itself rather than from corpus_shadow: the
        # shadow is only refreshed by get_corpus() and carries no signatures.
        for signature, entry in self.corpus.items():
            writer.writerow((entry[0], entry[1], signature))
        # Write the CSV into the zip file
        try:
            with ZipFile(path, 'w', ZIP_DEFLATED) as zipfile:
                zipfile.writestr(infile, buffer.getvalue())
            return True
        except IOError:
            return False

    def train(self, document, category):
        """
        File a document under a category.

        :returns: signature of the document
        :rtype: long
        """
        self.get_corpus()
        document = format_base(document)
        # A list is passed because the listeners can modify it in place.
        document_shadow = [document]
        analyzer_default_format.send(
            FileCorpus, document_shadow, category
        )
        document = "".join(document_shadow)
        signature = hash(document)
        self.corpus[signature] = (document, category)
        self.corpus.updated = time.time()
        return signature

    def retrain(self, signature, category):
        """
        Change the category of a document that is already classified.

        :param signature: hash of the document to reclassify
        :param category: new category of the document
        :returns: True if the category changed, False if it was already set
        :raises KeyError: if ``signature`` is not in the corpus
        """
        self.get_corpus()
        document, current_category = self.corpus[signature]
        if current_category != category:
            self.corpus[signature] = (document, category)
            self.corpus.updated = time.time()
            return True
        return False

    def untrain(self, signature):
        """
        Remove a document from the corpus.

        :param signature: hash of the document
        :returns: True if a document was removed, False otherwise
        """
        self.get_corpus()
        extracted = self.corpus.pop(signature, None)
        if extracted is None:
            # Nothing changed, so leave the timestamp alone and avoid a
            # needless classifier rebuild on the next get_corpus().
            return False
        self.corpus.updated = time.time()
        return True

    # Overrides
    def __init__(self, pathname, *args, **kwargs):
        """
        Initialise the corpus with its file name.

        :param pathname: file name of the corpus, without directory or extension
        """
        self.pathname = pathname
# The two pre-processed variants of the training data, each with the banner
# printed before it is evaluated.
_preprocessed_stages = (
    ('\n After removing stop-words \n', training_array_without_sw),
    ('\n After stemming \n', training_array_stemmed_without_sw),
)

# Decision tree: raw data first, then each pre-processed variant.
for _banner, _training_data in (('Before pre-processing \n', training_array),) + _preprocessed_stages:
    print(_banner)
    cl = DecisionTreeClassifier(_training_data)
    classify_review(cl)

print('\n ************ NaiveBayesClassifier ********************\n')
for _banner, _training_data in (('Before pre-processing\n', training_array),) + _preprocessed_stages:
    print(_banner)
    cl = NaiveBayesClassifier(_training_data)
    classify_review(cl)

print('\n ************ MaxEntClassifier ********************\n')
# For the raw data the classifier is created before its banner is printed.
cl = MaxEntClassifier(training_array)
print('Before pre-processing\n')
classify_review(cl)
for _banner, _training_data in _preprocessed_stages:
    print(_banner)
    cl = MaxEntClassifier(_training_data)
    classify_review(cl)