示例#1
0
    def train(self):
        """Extract n-gram features from the training data and fit the classifier.

        For each ``(order, limit)`` pair taken from ``self.n`` and
        ``self.limit``, the n-grams of every positive and negative training
        document are computed with ``ngrams.ngrams`` and merged into that
        document's feature dict.  The n-grams of all documents for that order
        are passed through ``ngrams.collapse_ngrams`` and
        ``ngrams.top_ngrams`` (with the matching limit) and the result is
        merged into ``self.features``.

        A classifier is then created through ``self.clsf`` restricted to
        ``self.features``.  When ``self.idf`` is truthy every feature value is
        multiplied by its weight from ``ngrams.ngrams_to_idf``.  Finally the
        positive documents are added with label ``1`` (incrementing
        ``self.count`` once per positive document), the negative documents
        with label ``-1``, and the classifier is compiled.
        """
        positives = [{} for _ in self.pos_train_data]
        negatives = [{} for _ in self.neg_train_data]

        # One extraction pass per configured n-gram order.
        for order, cap in zip(self.n, self.limit):
            pos_grams = [
                ngrams.ngrams(order, doc, self.negation)
                for doc in self.pos_train_data
            ]
            for vector, grams in zip(positives, pos_grams):
                vector.update(grams)

            neg_grams = [
                ngrams.ngrams(order, doc, self.negation)
                for doc in self.neg_train_data
            ]
            for vector, grams in zip(negatives, neg_grams):
                vector.update(grams)

            # Feature selection looks at positive and negative documents
            # together, positives first.
            collapsed = ngrams.collapse_ngrams(pos_grams + neg_grams)
            self.features.update(ngrams.top_ngrams(collapsed, cap))

        # The classifier only sees the features selected above.
        self.classifier = self.clsf(restrictFeatures=self.features)
        print("# features: %s" % self.classifier.nfeatures)

        if self.idf:
            print("Using TF-IDF")
            weights = ngrams.ngrams_to_idf(positives + negatives)
            # Only existing keys are reassigned, so iterating the dict while
            # updating its values is safe.
            for vector in positives + negatives:
                for gram in vector:
                    vector[gram] = vector[gram] * weights[gram]

        # Feed the labelled vectors to the classifier: 1 = positive, -1 = negative.
        for vector in positives:
            self.count += 1
            self.classifier.addFeatureVector(vector, 1, binary=self.binary)
        for vector in negatives:
            self.classifier.addFeatureVector(vector, -1, binary=self.binary)
        self.classifier.compile()
示例#2
0
    def train(self):
        """Build feature vectors for the training documents and train on them.

        Steps, in order:

        1. For every n-gram order in ``self.n`` (paired with the entry of
           ``self.limit`` at the same position) compute the n-grams of each
           positive and each negative document via ``ngrams.ngrams`` and fold
           them into one dict per document.
        2. For the same order, run all of those n-gram dicts through
           ``ngrams.collapse_ngrams`` and ``ngrams.top_ngrams`` and merge the
           outcome into ``self.features``.
        3. Instantiate ``self.clsf`` with ``restrictFeatures=self.features``
           and store it as ``self.classifier``.
        4. If ``self.idf`` is set, scale each stored value by the weight
           returned from ``ngrams.ngrams_to_idf`` for that n-gram.
        5. Add positive vectors with label ``1`` (bumping ``self.count`` for
           each one), negative vectors with label ``-1``, then compile.
        """
        pos_vectors = [{} for _ in self.pos_train_data]
        neg_vectors = [{} for _ in self.neg_train_data]

        def extract_into(size, documents, vectors):
            # Compute the n-grams of every document, merge each result into
            # the matching per-document dict, and hand the raw list back.
            per_doc = [ngrams.ngrams(size, text, self.negation)
                       for text in documents]
            for index, grams in enumerate(per_doc):
                vectors[index].update(grams)
            return per_doc

        # Reading files
        for size, maximum in zip(self.n, self.limit):
            candidates = extract_into(size, self.pos_train_data, pos_vectors)
            candidates.extend(
                extract_into(size, self.neg_train_data, neg_vectors))

            # Collapsing, limiting ngrams
            selected = ngrams.top_ngrams(
                ngrams.collapse_ngrams(candidates), maximum)
            self.features.update(selected)

        # Creating Index
        self.classifier = self.clsf(restrictFeatures=self.features)
        print("# features: %s" % self.classifier.nfeatures)

        if self.idf:
            print("Using TF-IDF")
            idf = ngrams.ngrams_to_idf(pos_vectors + neg_vectors)
            for group in (pos_vectors, neg_vectors):
                for vector in group:
                    for gram in vector:
                        vector[gram] = vector[gram] * idf[gram]

        # Making classifier
        for vector in pos_vectors:
            self.count += 1
            self.classifier.addFeatureVector(vector, 1, binary=self.binary)
        for vector in neg_vectors:
            self.classifier.addFeatureVector(vector, -1, binary=self.binary)
        self.classifier.compile()
示例#3
0
import matplotlib.pyplot as plt
from classifier import MaximumEntropyClassifier

# Size argument handed to Indexes below.
TRAIN_SIZE = 800
# n-gram order used for both training and test documents.
n = 1


def _read_file(path):
    """Return the whole content of *path*, closing the file afterwards."""
    with open(path) as handle:
        return handle.read()


print("Maximum Entropy")
pos = os.listdir("pos")
neg = os.listdir("neg")

# NOTE(review): the meaning of the 'r' and 1 arguments is defined by Indexes,
# which is not visible here - confirm against its definition.
ind = Indexes('r', 1, TRAIN_SIZE)
print("> determined Indices")
ind.next()

# n-grams of every training document, then collapsed and limited per class.
# NOTE(review): 16165 is the limit passed to top_ngrams; its origin is not
# documented in this script.
pos_grams = [ngrams.ngrams(n, _read_file("pos/" + pos[i]))
             for i in ind.get_pos_train_ind()]
pos_collapsed_grams = ngrams.top_ngrams(
    ngrams.collapse_ngrams(pos_grams), 16165)
neg_grams = [ngrams.ngrams(n, _read_file("neg/" + neg[i]))
             for i in ind.get_neg_train_ind()]
neg_collapsed_grams = ngrams.top_ngrams(
    ngrams.collapse_ngrams(neg_grams), 16165)
print("> collapsed grams")

# One training entry per collapsed n-gram: ([ngram], label, value).
trainingset = [([k], 'pos', v) for (k, v) in pos_collapsed_grams.iteritems()]
trainingset.extend(
    [([k], 'neg', v) for (k, v) in neg_collapsed_grams.iteritems()])
m = MaximumEntropyClassifier(trainingset)
print("> created model")

# Not used in the visible part of the script; kept for any code that follows.
pos_res = []
neg_res = []
pos_tests = [ngrams.ngrams(n, _read_file("pos/" + pos[i]))
             for i in ind.get_pos_test_ind()]
pos_results = [m.classify(test) for test in pos_tests]
# A positive test document counts as correct when its result is >= 0.5.
pos_correct = len([i for i in pos_results if i >= 0.5])
print("Positive: %s of %s, %s accuracy" % (
    pos_correct, len(pos_tests), (float(pos_correct) / len(pos_tests))))
示例#4
0
# n-gram order used when extracting features from the documents.
n = 1


def _read_file(path):
    """Return the whole content of *path*, closing the file afterwards."""
    with open(path) as handle:
        return handle.read()


print("Maximum Entropy")
pos = os.listdir("pos")
neg = os.listdir("neg")

# NOTE(review): TRAIN_SIZE and Indexes are defined outside this snippet; the
# meaning of the 'r' and 1 arguments should be confirmed against Indexes.
ind = Indexes('r', 1, TRAIN_SIZE)
print("> determined Indices")
ind.next()

# Read each selected training document through _read_file so its handle is
# closed as soon as the text has been loaded.
pos_grams = [
    ngrams.ngrams(n, _read_file("pos/" + pos[i]))
    for i in ind.get_pos_train_ind()
]
# NOTE(review): 16165 is the limit passed to top_ngrams; its origin is not
# documented in this script.
pos_collapsed_grams = ngrams.top_ngrams(ngrams.collapse_ngrams(pos_grams),
                                        16165)
neg_grams = [
    ngrams.ngrams(n, _read_file("neg/" + neg[i]))
    for i in ind.get_neg_train_ind()
]
neg_collapsed_grams = ngrams.top_ngrams(ngrams.collapse_ngrams(neg_grams),
                                        16165)
print("> collapsed grams")

# One training entry per collapsed n-gram: ([ngram], label, value).
trainingset = [([k], 'pos', v) for (k, v) in pos_collapsed_grams.iteritems()]
trainingset.extend([([k], 'neg', v)
                    for (k, v) in neg_collapsed_grams.iteritems()])
m = MaximumEntropyClassifier(trainingset)
print("> created model")