Example #1
    def fit(self, X, y):
        # The smaller C, the stronger the regularization.
        # The more regularization, the more sparsity.
        self.transformer_ = LinearSVC(C=1000, penalty="l1",
                                      dual=False, tol=1e-3)
        X = self.transformer_.fit_transform(X, y)
        return LinearSVC.fit(self, X, y)
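
# A minimal sketch (not part of the original snippet) illustrating the comment
# above: with penalty="l1", a smaller C gives stronger regularization and hence
# more zero coefficients. The toy data and variable names below are assumptions
# used purely for illustration.
# from sklearn.datasets import make_classification
# from sklearn.svm import LinearSVC
# X_demo, y_demo = make_classification(n_samples=200, n_features=50, random_state=0)
# for C_value in (0.01, 1000):
#     svc = LinearSVC(C=C_value, penalty="l1", dual=False, tol=1e-3).fit(X_demo, y_demo)
#     print "C=%g -> %d non-zero coefficients" % (C_value, (svc.coef_ != 0).sum())
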
# Run 10 rounds, one per fold of the 10-fold cross-validation
for i, (train_index, test_index) in enumerate(kf):

    print "run %d" % (i+1)

    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    X_den_train, X_den_test = X_den[train_index], X_den[test_index]

    # feed models
    clf_mNB.fit(X_train, y_train)
    clf_ridge.fit(X_train, y_train)
    clf_SGD.fit(X_train, y_train)
    clf_lSVC.fit(X_train, y_train)
    clf_SVC.fit(X_train, y_train)

    # get prediction for this fold run
    prob_mNB    = clf_mNB.predict_proba(X_test)
    prob_ridge  = clf_ridge.decision_function(X_test)
    prob_SGD    = clf_SGD.decision_function(X_test)
    prob_lSVC   = clf_lSVC.decision_function(X_test)
    prob_SVC    = clf_SVC.predict_proba(X_test)

    # sum the classifiers' scores and append them as new rows of the z 2d-array
    z_temp = (prob_mNB + prob_ridge + prob_SGD + prob_lSVC + prob_SVC)
    z = np.append(z, z_temp, axis=0)


# remove the first row of z, which is left over from initialising z with zeros
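# A hedged sketch (not in the original, which is truncated here): this assumes z
# was initialised before the loop as a single row of zeros so that np.append had
# a base to stack onto, e.g. z = np.zeros((1, n_classes)) with an assumed class
# count; the placeholder row would then be dropped with:
# z = np.delete(z, 0, axis=0)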
Example #4
# # Feature selection for the L1 dataset
# select_chi2 = 1000
# print ("Extracting %d best features by a chi-squared test" % select_chi2)
# t0 = time()
# ch2 = SelectKBest(chi2, k = select_chi2)
# X_L1 = ch2.fit_transform(X_L1, y_L1)
# print "Done in %fs" % (time() - t0)
# print "L1:      n_samples: %d, n_features: %d" % X_L1.shape
# print

# Train L1 classifier
print "Training L1 Classifier..."
t0 = time()
clf = LinearSVC(loss='l2', penalty='l2', C=1000, dual=False, tol=1e-3)
print clf
clf.fit(X_L1, y_L1)
train_time = time() - t0
print "Train time: %0.3fs" % train_time
print

# Train L2 classifiers
print "Training L2 Classifiers..."
t0 = time()

# comment out all linearSVC
# clf_ca = LinearSVC(loss='l2', penalty='l2', C=1000, dual=False, tol=1e-3)
# clf_collect = LinearSVC(loss='l2', penalty='l2', C=256, dual=False, tol=1e-2)
# clf_cookies = LinearSVC(loss='l2', penalty='l2', C=1000, dual=False, tol=1e-3)
# clf_share = LinearSVC(loss='l2', penalty='l2', C=1000, dual=False, tol=1e-3)

# v0.2 adjusted the classifiers after test results
Example #6

# X = X.toarray()
# X_den = X.toarray()
n_samples, n_features = X_train.shape

###############################################################################
# Test classifier on test dataset

# clf = DecisionTreeClassifier(max_depth=14, min_split=5)
# clf = MultinomialNB(alpha=.01)
# clf = KNeighborsClassifier(n_neighbors=19)
# clf = RidgeClassifier(tol=1e-1)
clf = LinearSVC(loss='l2', penalty='l2', C=0.5, dual=False, tol=1e-3)
# clf = SVC(C=32, gamma=0.0625)
print clf

t0 = time()
clf.fit(X_train, y_train)
print (time()-t0)
t1 = time()
pred = clf.predict(X_test)
print (time()-t1)

pre_score = metrics.precision_score(y_test, pred)
rec_score = metrics.recall_score(y_test, pred)

print "average f1-score:   %0.2f" % (100*((2*pre_score*rec_score)/(pre_score+rec_score)))
print "average f5-score:   %0.2f" % (100*((1.25*pre_score*rec_score)/(0.25*pre_score+rec_score)))
print "average precision:  %0.5f" % pre_score
print "averege recall:     %0.5f" % rec_score
class SpectralFeatureAlignment():

    def __init__(self, dbDir, rawDataFolder, sourceDomain, targetDomain):
        self._dbDir = dbDir
        self._sourceDomain = sourceDomain
        self._rawDataFolder = rawDataFolder
        self._targetDomain = targetDomain
        self._tableName = sourceDomain + "to" + targetDomain
        self._connection = sqlite.connect(path.join(dbDir,sourceDomain))
        self._cursor = self._connection.cursor()
        self._lsvc = LinearSVC(C=10000)
        self._featuresWithSynsets = {}
        self._featuresWithoutSynsets = {}
        self._allSynsets = []

    def _getFeatures(self, maxDIFeatures=500, minFrequency=5):
        features = []
        self._cursor.execute("SELECT term FROM " +self._tableName+ " WHERE freqSource + freqTarget >= ?", [minFrequency])
        features = [a[0] for a in self._cursor.fetchall()]
        self._cursor.execute("SELECT term FROM mostinformatives") 
        mostInformatives = set([a[0] for a in self._cursor.fetchall()][30000:-30000])
        features = [feature for feature in features if feature not in mostInformatives]
        return sorted(features[:maxDIFeatures]), sorted(features[maxDIFeatures:])

    def _getSynsets(self, domainIndependentFeatures, minSyn):
        #unigramTagger = UnigramTagger(brown.tagged_sents(simplify_tags=True))
        #bigramTagger = BigramTagger(brown.tagged_sents(simplify_tags=True), backoff=unigramTagger)
        #taggedBigrams = [bigramTagger.tag(feature.split('_')) for feature in domainIndependentFeatures if "_" in feature and "<" not in feature]
        #tmp = ("PRO", "CNJ", "DET", "EX", "MOD", "P", "TO")
        #for x in taggedBigrams:
            #firstWord,firstTag = x[0]
            #secondWord,secondTag = x[1]
            #feature = "_".join((firstWord,secondWord))
            #if firstTag in tmp and secondTag not in tmp:
                #self._featuresWithSynsets[feature] = wn.synsets(secondWord)
            #elif firstTag not in tmp and secondTag in tmp:
                #self._featuresWithSynsets[feature] = wn.synsets(firstWord)


        Bigrams = [feature for feature in domainIndependentFeatures if "_" in feature and "<" not in feature]
        #filterWords = ("a", "and", "are", "be", "has", "have", "i", "is", "it", "of", "the", "to", "will", "had", "as", "my", "that", "was")
        stopwordList = set(stopwords.words("english")) - set(("no", "nor", "not"))
        for bigram in Bigrams:
            firstWord, secondWord = bigram.split("_")
            if firstWord in stopwordList and secondWord in stopwordList:
                pass
            elif firstWord in stopwordList:
                self._featuresWithSynsets[bigram] = wn.synsets(secondWord)
            elif secondWord in stopwordList:
                self._featuresWithSynsets[bigram] = wn.synsets(firstWord)

        self._featuresWithSynsets = {feature:[str(synset) for synset in synsets] for feature,synsets in self._featuresWithSynsets.items() if synsets}
        unigrams = [feature for feature in domainIndependentFeatures if "_" not in feature]
        for unigram in unigrams:
            synsets = wn.synsets(unigram)
            if synsets:
                self._featuresWithSynsets[unigram] = [str(synset) for synset in synsets]

        allSynsets = [synsets for sublist in self._featuresWithSynsets.values() for synsets in sublist]
        allSynsets = set([synset for synset in allSynsets if allSynsets.count(synset) >= minSyn])
        self._featuresWithSynsets = {feature:set(synsets) & allSynsets for feature,synsets in self._featuresWithSynsets.items() if set(synsets) & allSynsets}
        self._featuresWithoutSynsets = sorted(set(domainIndependentFeatures) - set(self._featuresWithSynsets.keys()))
        return sorted(allSynsets)

    def _createCooccurrenceMatrix(self, domainIndependentFeatures, domainDependentFeatures):
        domainIndependentFeaturesSet = set(domainIndependentFeatures)
        domainDependentFeaturesSet = set(domainDependentFeatures)
        numSyn = len(self._allSynsets)
        def __parseFile(filePath):
            with open(filePath, "r") as f:
                for review in f:
                        reviewFeatures = set([tupel.split(":")[0].decode("utf-8") for tupel in review.split()])
                        independentFeatures = reviewFeatures & domainIndependentFeaturesSet
                        dependentFeatures = reviewFeatures & domainDependentFeaturesSet
                        for dependentFeature in dependentFeatures:
                            rowIndex = bisect_left(domainDependentFeatures,dependentFeature)
                            for independentFeature in independentFeatures:
                                if independentFeature in self._featuresWithSynsets:
                                    for synset in self._featuresWithSynsets[independentFeature]:
                                        matrix[rowIndex, bisect_left(self._allSynsets,synset)] += 1
                                else:
                                    matrix[rowIndex, bisect_left(self._featuresWithoutSynsets,independentFeature)+numSyn] += 1
                        
        matrix = np.zeros((len(domainDependentFeatures), len(self._featuresWithoutSynsets)+numSyn))
        __parseFile(path.join(self._rawDataFolder, self._sourceDomain, "positive.review"))
        __parseFile(path.join(self._rawDataFolder, self._sourceDomain, "negative.review"))
        __parseFile(path.join(self._rawDataFolder, self._targetDomain, "positive.review"))
        __parseFile(path.join(self._rawDataFolder, self._targetDomain, "negative.review"))
        return sparse.coo_matrix(matrix)

    def _createSquareAffinityMatrix(self, cooccurrenceMatrix):
       height = np.size(cooccurrenceMatrix, 0) 
       width = np.size(cooccurrenceMatrix, 1) 
       topMatrix = sparse.coo_matrix((height,height))
       topMatrix = sparse.hstack((topMatrix,cooccurrenceMatrix))
       bottomMatrix = sparse.coo_matrix((width,width))
       bottomMatrix = sparse.hstack((cooccurrenceMatrix.transpose(), bottomMatrix))
       matrix = sparse.vstack((topMatrix, bottomMatrix))
       return matrix
   
    def _createDiagonalMatrix(self, squareAffinityMatrix):
        rows = range(squareAffinityMatrix.get_shape()[0])
        data = [0. if rowSum == 0 else np.sqrt(1.0 / rowSum) for rowSum in np.array(squareAffinityMatrix.sum(1)).reshape(-1,)]
        return sparse.coo_matrix((data,(rows,rows)),shape=(squareAffinityMatrix.get_shape()[0],squareAffinityMatrix.get_shape()[1]))

    def _createDocumentVectors(self,domainDependentFeatures, domainIndependentFeatures, domain):
        numDomainDep = len(domainDependentFeatures)
        numDomainIndep = len(domainIndependentFeatures)
        domainDepSet = set(domainDependentFeatures)
        domainIndepSet = set(domainIndependentFeatures)
        documentVectors = []
        classifications = []
        numSynsets = len(self._allSynsets)
        def __parseFile(filePath):
            with open(filePath,"r") as f:
                for review in f:
                    classification = 1 if "#label#:positive" in review else -1
                    reviewList = [tupel.split(":") for tupel in review.split() if "#label#" not in tupel]
                    reviewDict = {x[0].decode("utf-8"):int(x[1]) for x in reviewList}
                    reviewFeatures = set(reviewDict.keys())
                    domainDepReviewFeatures = domainDepSet & reviewFeatures
                    domainIndepReviewFeatures = domainIndepSet & reviewFeatures
                    domainDepValues,domainDepIndizes = [],[]
                    domainIndepValues, domainIndepIndizes = [],[]
                    for feature in domainIndepReviewFeatures:
                        if feature in self._featuresWithSynsets:
                            for synset in self._featuresWithSynsets[feature]:
                                domainIndepIndizes.append(bisect_left(self._allSynsets,synset))
                                domainIndepValues.append(1)
                        else:
                            domainIndepIndizes.append(bisect_left(self._featuresWithoutSynsets,feature)+numSynsets)
                            domainIndepValues.append(1)
                            #domainIndepValues.append(reviewDict[feature])
                    for feature in domainDepReviewFeatures:
                        #domainDepValues.append(reviewDict[feature])
                        domainDepValues.append(1)
                        domainDepIndizes.append(bisect_left(domainDependentFeatures,feature))
                    domainIndepVector = sparse.csr_matrix((domainIndepValues,(np.zeros(len(domainIndepIndizes)),domainIndepIndizes)),
                            shape=(1,len(self._featuresWithoutSynsets)+numSynsets))
                    domainDepVector = sparse.csr_matrix((domainDepValues,(np.zeros(len(domainDepIndizes)),domainDepIndizes)),shape=(1,numDomainDep))
                    documentVectors.append((domainIndepVector,domainDepVector))
                    classifications.append(classification)

        __parseFile(path.join(self._rawDataFolder, domain, "positive.review"))
        __parseFile(path.join(self._rawDataFolder, domain, "negative.review"))
        return documentVectors,classifications 

    def _trainClassifier(self, trainingVectors, classifications):
        self._lsvc.fit(sparse.vstack(trainingVectors),classifications)

    def _testClassifier(self,testVectors,classifications):
        return self._lsvc.score(sparse.vstack(testVectors),classifications)




    def go(self,K=100, Y=6, DI=500, minFreq=5, minSyn=10):
        print self._sourceDomain + " -> " + self._targetDomain
        domainIndependentFeatures, domainDependentFeatures = self._getFeatures(DI,minFreq)
        numDomainIndep = len(domainIndependentFeatures)
        numDomainDep = len(domainDependentFeatures)
        #print "number of independent features %i, number of dependent features %i" % (numDomainIndep, numDomainDep)
        #print "finding synsets..."
        self._allSynsets = self._getSynsets(domainIndependentFeatures, minSyn)
        print self._featuresWithSynsets
        for k,v in self._featuresWithSynsets.items():
            print str(k) + " : " + str(v)
        if not self._allSynsets:
            return
        #print "creating cooccurrenceMatrix..."
        a = self._createCooccurrenceMatrix(domainIndependentFeatures, domainDependentFeatures)
        #print "creating SquareAffinityMatrix..."
        a = self._createSquareAffinityMatrix(a)
        #print "creating DiagonalMatrix..."
        b = self._createDiagonalMatrix(a)
        #print "multiplying..." 
        c = b.dot(a)
        del a
        c = c.dot(b)
        del b
        #print "calculating eigenvalues and eigenvectors"
        eigenValues,eigenVectors = eigsh(c, k=K, which="LA")
        del c
        #print "building document vectors..."
        documentVectorsTraining,classifications = self._createDocumentVectors(domainDependentFeatures, domainIndependentFeatures,self._sourceDomain)
        documentVectorsTesting,classificationsTesting = self._createDocumentVectors(domainDependentFeatures, domainIndependentFeatures,self._targetDomain)
        #print "training and testing..."
        U  = [eigenVectors[:,x].reshape(np.size(eigenVectors,0),1) for x in eigenValues.argsort()[::-1]]
        U = np.concatenate(U,axis=1)[:numDomainDep]
        U = sparse.csr_matrix(U)
        clustering = [vector[1].dot(U).dot(Y).astype(np.float64) for vector in documentVectorsTraining]
        trainingVectors = [sparse.hstack((documentVectorsTraining[x][0],documentVectorsTraining[x][1],clustering[x])) for x in range(np.size(documentVectorsTraining,axis=0))]
        self._trainClassifier(trainingVectors,classifications)
        clustering = [vector[1].dot(U).dot(Y).astype(np.float64) for vector in documentVectorsTesting]
        testVectors = [sparse.hstack((documentVectorsTesting[x][0],documentVectorsTesting[x][1],clustering[x])) for x in range(np.size(documentVectorsTesting,axis=0))]
        print "accuracy: %.2f with K=%i AND DI=%i AND Y=%.1f AND minFreq=%i AND minSyn=%i" % (self._testClassifier(testVectors,classifications)*100,K,DI,Y,minFreq,minSyn)
class SpectralFeatureAlignment():

    def __init__(self, dbDir, rawDataFolder, sourceDomain, targetDomain):
        self._dbDir = dbDir
        self._sourceDomain = sourceDomain
        self._rawDataFolder = rawDataFolder
        self._targetDomain = targetDomain
        self._tableName = sourceDomain + "to" + targetDomain
        self._connection = sqlite.connect(path.join(dbDir,sourceDomain))
        self._cursor = self._connection.cursor()
        self._lsvc = LinearSVC(C=10000)

    def _getFeatures(self, maxDIFeatures=500, minFrequency=5):
        features = []
        self._cursor.execute("SELECT term FROM " +self._tableName+ " WHERE freqSource + freqTarget >= ?", [minFrequency])
        features = [a[0] for a in self._cursor.fetchall()]
        self._cursor.execute("SELECT term FROM mostinformatives") 
        mostInformatives = set([a[0] for a in self._cursor.fetchall()][30000:-30000])
        features = [feature for feature in features if feature not in mostInformatives]
        return sorted(features[:maxDIFeatures]), sorted(features[maxDIFeatures:])

    def _createCooccurrenceMatrix(self, domainIndependentFeatures, domainDependentFeatures):
        domainIndependentFeaturesSet = set(domainIndependentFeatures)
        domainDependentFeaturesSet = set(domainDependentFeatures)
        def __parseFile(filePath):
            with open(filePath, "r") as f:
                for review in f:
                        reviewFeatures = set([tupel.split(":")[0].decode("utf-8") for tupel in review.split()])
                        independentFeatures = reviewFeatures & domainIndependentFeaturesSet
                        dependentFeatures = reviewFeatures & domainDependentFeaturesSet
                        for dependentFeature in dependentFeatures:
                            rowIndex = bisect_left(domainDependentFeatures,dependentFeature)
                            for independentFeature in independentFeatures:
                                matrix[rowIndex, bisect_left(domainIndependentFeatures,independentFeature)] += 1
                        
        matrix = np.zeros((len(domainDependentFeatures), len(domainIndependentFeatures)))
        __parseFile(path.join(self._rawDataFolder, self._sourceDomain, "positive.review"))
        __parseFile(path.join(self._rawDataFolder, self._sourceDomain, "negative.review"))
        __parseFile(path.join(self._rawDataFolder, self._targetDomain, "positive.review"))
        __parseFile(path.join(self._rawDataFolder, self._targetDomain, "negative.review"))
        return sparse.coo_matrix(matrix)

    def _createSquareAffinityMatrix(self, cooccurrenceMatrix):
       height = np.size(cooccurrenceMatrix, 0) 
       width = np.size(cooccurrenceMatrix, 1) 
       topMatrix = sparse.coo_matrix((height,height))
       topMatrix = sparse.hstack((topMatrix,cooccurrenceMatrix))
       bottomMatrix = sparse.coo_matrix((width,width))
       bottomMatrix = sparse.hstack((cooccurrenceMatrix.transpose(), bottomMatrix))
       matrix = sparse.vstack((topMatrix, bottomMatrix))
       return matrix
   
    def _createDiagonalMatrix(self, squareAffinityMatrix):
        rows = range(squareAffinityMatrix.get_shape()[0])
        data = [0. if rowSum == 0 else np.sqrt(1.0 / rowSum) for rowSum in np.array(squareAffinityMatrix.sum(1)).reshape(-1,)]
        return sparse.coo_matrix((data,(rows,rows)),shape=(squareAffinityMatrix.get_shape()[0],squareAffinityMatrix.get_shape()[1]))

    def _createDocumentVectors(self,domainDependentFeatures, domainIndependentFeatures, domain):
        numDomainDep = len(domainDependentFeatures)
        numDomainIndep = len(domainIndependentFeatures)
        domainDepSet = set(domainDependentFeatures)
        domainIndepSet = set(domainIndependentFeatures)
        documentVectors = []
        classifications = []
        def __parseFile(filePath):
            with open(filePath,"r") as f:
                for review in f:
                    classification = 1 if "#label#:positive" in review else -1
                    reviewList = [tupel.split(":") for tupel in review.split() if "#label#" not in tupel]
                    reviewDict = {x[0].decode("utf-8"):int(x[1]) for x in reviewList}
                    reviewFeatures = set(reviewDict.keys())
                    domainDepReviewFeatures = domainDepSet & reviewFeatures
                    domainIndepReviewFeatures = domainIndepSet & reviewFeatures
                    domainDepValues,domainDepIndizes = [],[]
                    domainIndepValues, domainIndepIndizes = [],[]
                    for feature in domainIndepReviewFeatures:
                        #domainIndepValues.append(reviewDict[feature])
                        domainIndepValues.append(1)
                        domainIndepIndizes.append(bisect_left(domainIndependentFeatures,feature))
                    for feature in domainDepReviewFeatures:
                        #domainDepValues.append(reviewDict[feature])
                        domainDepValues.append(1)
                        domainDepIndizes.append(bisect_left(domainDependentFeatures,feature))
                    domainIndepVector = sparse.csr_matrix((domainIndepValues,(np.zeros(len(domainIndepIndizes)),domainIndepIndizes)),shape=(1,numDomainIndep))
                    domainDepVector = sparse.csr_matrix((domainDepValues,(np.zeros(len(domainDepIndizes)),domainDepIndizes)),shape=(1,numDomainDep))
                    documentVectors.append((domainIndepVector,domainDepVector))
                    classifications.append(classification)

        __parseFile(path.join(self._rawDataFolder, domain, "positive.review"))
        __parseFile(path.join(self._rawDataFolder, domain, "negative.review"))
        return documentVectors,classifications 

    def _trainClassifier(self, trainingVectors, classifications):
        self._lsvc.fit(sparse.vstack(trainingVectors),classifications)

    def _testClassifier(self,testVectors,classifications):
        return self._lsvc.score(sparse.vstack(testVectors),classifications)




    def go(self,K=100, Y=6, DI=500, minFreq=5):
        print self._sourceDomain + " -> " + self._targetDomain
        domainIndependentFeatures, domainDependentFeatures = self._getFeatures(DI,minFreq)
        numDomainIndep = len(domainIndependentFeatures)
        numDomainDep = len(domainDependentFeatures)
        #print "number of independent features %i, number of dependent features %i" % (numDomainIndep, numDomainDep)
        #print "creating cooccurrenceMatrix..."
        a = self._createCooccurrenceMatrix(domainIndependentFeatures, domainDependentFeatures)
        #print "creating SquareAffinityMatrix..."
        a = self._createSquareAffinityMatrix(a)
        #print "creating DiagonalMatrix..."
        b = self._createDiagonalMatrix(a)
        #print "multiplying..." 
        c = b.dot(a)
        del a
        c = c.dot(b)
        del b
        #print "calculating eigenvalues and eigenvectors"
        eigenValues,eigenVectors = eigsh(c, k=K, which="LA")
        del c
        #print "building document vectors..."
        documentVectorsTraining,classificationsTraining = self._createDocumentVectors(domainDependentFeatures, domainIndependentFeatures,self._sourceDomain)
        documentVectorsTesting,classificationsTesting = self._createDocumentVectors(domainDependentFeatures, domainIndependentFeatures,self._targetDomain)
        #print "training and testing..."
        U  = [eigenVectors[:,x].reshape(np.size(eigenVectors,0),1) for x in eigenValues.argsort()[::-1]]
        U = np.concatenate(U,axis=1)[:numDomainDep]
        U = sparse.csr_matrix(U)
        clustering = [vector[1].dot(U).dot(Y).astype(np.float64) for vector in documentVectorsTraining]
        trainingVectors = [sparse.hstack((documentVectorsTraining[x][0],documentVectorsTraining[x][1],clustering[x])) for x in range(np.size(documentVectorsTraining,axis=0))]
        clustering = [vector[1].dot(U).dot(Y).astype(np.float64) for vector in documentVectorsTesting]
        testVectors = [sparse.hstack((documentVectorsTesting[x][0],documentVectorsTesting[x][1],clustering[x])) for x in range(np.size(documentVectorsTesting,axis=0))]
        self._trainClassifier(trainingVectors, classificationsTraining)
        print "accuracy: %.2f with K=%i AND DI=%i AND Y=%.1f AND minFreq=%i" % (self._testClassifier(testVectors,classificationsTesting)*100,K,DI,Y,minFreq)
Example #9
clf_lsvc = LinearSVC(loss='l2', penalty='l2', C=1000, dual=False, tol=1e-3)
clf_svc = SVC(C=1024, kernel='rbf', degree=3, gamma=0.001, probability=True)
clf_rdg = RidgeClassifier(tol=1e-1)
clf_sgd = SGDClassifier(alpha=.0001, n_iter=50, penalty="l2")

# Logistic regression requires OneVsRestClassifier, which hides
# methods such as decision_function.
# Using it as a candidate for multilabel classification would require
# extra implementation effort.
# clf_lgr = OneVsRestClassifier(LogisticRegression(C=1000,penalty='l1'))
# kNN does not have decision function due to its nature
# clf_knn = KNeighborsClassifier(n_neighbors=13)

# train
clf_nb.fit(X, y)
clf_lsvc.fit(X, y)
clf_rdg.fit(X, y)
clf_svc.fit(X, y)
clf_sgd.fit(X, y)

print "Train time: %0.3fs" % (time() - t0)
print


# # predict by simply applying the classifier
# # this will not use the multi-label threshold
# predicted = clf_rdg.predict(X_new)
# for doc, category in zip(docs_new, predicted):
#     print '%r => %s' % (doc, data_train.target_names[int(category)])
#     print
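
# A hedged sketch (not in the original): the comment above implies that the
# multi-label case is instead handled by thresholding decision values; the
# threshold value here is an assumption chosen purely for illustration.
# threshold = 0.0
# scores = clf_rdg.decision_function(X_new)    # one score per class and document
# for doc, row in zip(docs_new, scores):
#     labels = [data_train.target_names[i] for i, s in enumerate(row) if s > threshold]
#     print '%r => %s' % (doc, ", ".join(labels))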