def train(self):
    pos_train = [{} for f in self.pos_train_data]
    neg_train = [{} for f in self.neg_train_data]
    # Reading files
    for (j, lim) in zip(self.n, self.limit):
        all_grams = [ngrams.ngrams(j, f, self.negation) for f in self.pos_train_data]
        for i in range(len(self.pos_train_data)):
            pos_train[i].update(all_grams[i])
        featureslist = all_grams
        all_grams = [ngrams.ngrams(j, f, self.negation) for f in self.neg_train_data]
        for i in range(len(self.neg_train_data)):
            neg_train[i].update(all_grams[i])
        featureslist.extend(all_grams)
        # Collapsing, limiting ngrams
        self.features.update(ngrams.top_ngrams(ngrams.collapse_ngrams(featureslist), lim))
    # Creating Index
    self.classifier = self.clsf(restrictFeatures=self.features)
    print "# features: %s" % self.classifier.nfeatures
    if self.idf:
        print "Using TF-IDF"
        idf = ngrams.ngrams_to_idf(pos_train + neg_train)
        for i in range(len(pos_train)):
            for j in pos_train[i]:
                pos_train[i][j] = pos_train[i][j] * idf[j]
        for i in range(len(neg_train)):
            for j in neg_train[i]:
                neg_train[i][j] = neg_train[i][j] * idf[j]
    # Making classifier
    for i in pos_train:
        self.count += 1
        self.classifier.addFeatureVector(i, 1, binary=self.binary)
    for i in neg_train:
        self.classifier.addFeatureVector(i, -1, binary=self.binary)
    self.classifier.compile()
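# The `ngrams` helpers used by train() are treated as a black box above. A
# minimal sketch of the assumed contract follows: per-document n-gram
# counting, collapsing per-document counts into corpus totals, keeping the
# top-k n-grams, and document-frequency-based IDF. The bodies are
# illustrative assumptions (hence the _sketch suffix), not the project's
# actual implementation; negation tagging is omitted.
import math
from collections import Counter

def ngrams_sketch(n, text):
    # Map a document to {n-gram: count}; the real ngrams.ngrams also takes
    # a `negation` flag that tags tokens following a negation word.
    tokens = text.split()
    grams = zip(*[tokens[i:] for i in range(n)])
    return dict(Counter(" ".join(g) for g in grams))

def collapse_ngrams_sketch(gram_dicts):
    # Sum per-document count dicts into one corpus-wide count dict.
    total = Counter()
    for d in gram_dicts:
        total.update(d)
    return dict(total)

def top_ngrams_sketch(gram_counts, limit):
    # Keep only the `limit` most frequent n-grams.
    ranked = sorted(gram_counts.items(), key=lambda kv: kv[1], reverse=True)
    return dict(ranked[:limit])

def ngrams_to_idf_sketch(gram_dicts):
    # idf[g] = log(N / df(g)) over the training documents, matching the
    # weighting applied in the TF-IDF branch of train().
    n_docs = len(gram_dicts)
    df = Counter()
    for d in gram_dicts:
        df.update(set(d))
    return dict((g, math.log(float(n_docs) / df[g])) for g in df)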
import os

import matplotlib.pyplot as plt

import ngrams
from classifier import MaximumEntropyClassifier
# Indexes (train/test index bookkeeping) is assumed to be in scope,
# provided elsewhere in the project.

TRAIN_SIZE = 800
n = 1

print "Maximum Entropy"
pos = os.listdir("pos")
neg = os.listdir("neg")
ind = Indexes('r', 1, TRAIN_SIZE)
print "> determined Indices"
ind.next()

pos_grams = [ngrams.ngrams(n, open("pos/" + pos[i]).read())
             for i in ind.get_pos_train_ind()]
pos_collapsed_grams = ngrams.top_ngrams(ngrams.collapse_ngrams(pos_grams), 16165)
neg_grams = [ngrams.ngrams(n, open("neg/" + neg[i]).read())
             for i in ind.get_neg_train_ind()]
neg_collapsed_grams = ngrams.top_ngrams(ngrams.collapse_ngrams(neg_grams), 16165)
print "> collapsed grams"

trainingset = [([k], 'pos', v) for (k, v) in pos_collapsed_grams.iteritems()]
trainingset.extend([([k], 'neg', v) for (k, v) in neg_collapsed_grams.iteritems()])
m = MaximumEntropyClassifier(trainingset)
print "> created model"

pos_res = []
neg_res = []
pos_tests = [ngrams.ngrams(n, open("pos/" + pos[i]).read())
             for i in ind.get_pos_test_ind()]
pos_results = [m.classify(test) for test in pos_tests]
pos_correct = len([i for i in pos_results if i >= 0.5])
print "Positive: %s of %s, %s accuracy" % (pos_correct, len(pos_tests),
                                           float(pos_correct) / len(pos_tests))
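# The script above only scores the positive test set. A symmetric check on the
# held-out negative reviews could look like the sketch below; it assumes an
# Indexes.get_neg_test_ind() accessor analogous to get_pos_test_ind(), and that
# m.classify() returns the probability of the 'pos' label, as implied above.
neg_tests = [ngrams.ngrams(n, open("neg/" + neg[i]).read())
             for i in ind.get_neg_test_ind()]
neg_results = [m.classify(test) for test in neg_tests]
neg_correct = len([i for i in neg_results if i < 0.5])
print "Negative: %s of %s, %s accuracy" % (neg_correct, len(neg_tests),
                                           float(neg_correct) / len(neg_tests))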