def fit(self, X, y):
    """Select features with an L1-penalised LinearSVC, then fit on the result.

    A separate ``LinearSVC(penalty="l1")`` is fitted on ``(X, y)`` and kept
    as ``self.transformer_``; its ``fit_transform`` output replaces ``X``
    before ``LinearSVC.fit`` is invoked on this instance.

    NOTE(review): calling ``LinearSVC.fit(self, ...)`` implies the enclosing
    class derives from LinearSVC -- the class header is not visible here.

    Returns whatever ``LinearSVC.fit`` returns.
    """
    # The smaller C, the stronger the regularization.
    # The more regularization, the more sparsity.
    selector = LinearSVC(C=1000, penalty="l1", dual=False, tol=1e-3)
    self.transformer_ = selector
    reduced = selector.fit_transform(X, y)
    return LinearSVC.fit(self, reduced, y)
# Test for 10 rounds using the results from 10 fold cross validations:
# one round per (train_index, test_index) pair yielded by `kf`.
for i, (train_index, test_index) in enumerate(kf):
    print("run %d" % (i+1))

    # Slice the sparse and dense views of the data for this fold.
    X_train = X[train_index]
    X_test = X[test_index]
    y_train = y[train_index]
    y_test = y[test_index]
    X_den_train = X_den[train_index]
    X_den_test = X_den[test_index]

    # feed models (same order as before: mNB, ridge, SGD, lSVC, SVC)
    for fold_model in (clf_mNB, clf_ridge, clf_SGD, clf_lSVC, clf_SVC):
        fold_model.fit(X_train, y_train)

    # get prediction for this fold run; two models expose probabilities,
    # the other three expose decision-function values
    prob_mNB = clf_mNB.predict_proba(X_test)
    prob_ridge = clf_ridge.decision_function(X_test)
    prob_SGD = clf_SGD.decision_function(X_test)
    prob_lSVC = clf_lSVC.decision_function(X_test)
    prob_SVC = clf_SVC.predict_proba(X_test)

    # add prob functions into the z 2d-array
    # NOTE(review): assumes all five outputs share one shape -- confirm.
    z_temp = prob_mNB + prob_ridge + prob_SGD + prob_lSVC + prob_SVC
    z = np.append(z, z_temp, axis=0)
# remove the first sub-1d-array of z, due to the creation with 0s
# # Feature selection for the L1 dataset (currently disabled)
# select_chi2 = 1000
# print ("Extracting %d best features by a chi-squared test" % select_chi2)
# t0 = time()
# ch2 = SelectKBest(chi2, k = select_chi2)
# X_L1 = ch2.fit_transform(X_L1, y_L1)
# print "Done in %fs" % (time() - t0)
# print "L1: n_samples: %d, n_features: %d" % X_L1.shape
# print

# Train L1 classifier; the reported time spans construction, the repr
# print and the fit call.
print("Training L1 Classifier...")
t0 = time()
clf = LinearSVC(C=1000, penalty='l2', loss='l2', tol=1e-3, dual=False)
print(clf)
clf.fit(X_L1, y_L1)
train_time = time() - t0
print("Train time: %0.3fs" % train_time)
print("")

# Train L2 classifiers; t0 is restarted so the timing that follows covers
# only this stage.
print("Training L2 Classifiers...")
t0 = time()
# comment out all linearSVC
# clf_ca = LinearSVC(loss='l2', penalty='l2', C=1000, dual=False, tol=1e-3)
# clf_collect = LinearSVC(loss='l2', penalty='l2', C=256, dual=False, tol=1e-2)
# clf_cookies = LinearSVC(loss='l2', penalty='l2', C=1000, dual=False, tol=1e-3)
# clf_share = LinearSVC(loss='l2', penalty='l2', C=1000, dual=False, tol=1e-3)
# v0.2 adjusted the classifiers after test results
# Chi-squared feature selection (currently disabled):
# select_chi2 = 1000
# print ("Extracting %d best features by a chi-squared test" % select_chi2)
# t0 = time()
# ch2 = SelectKBest(chi2, k = select_chi2)
# X_L1 = ch2.fit_transform(X_L1, y_L1)
# print "Done in %fs" % (time() - t0)
# print "L1: n_samples: %d, n_features: %d" % X_L1.shape
# print

# Train L1 classifier; the reported time spans construction, the repr
# print and the fit call.
print("Training L1 Classifier...")
t0 = time()
clf = LinearSVC(C=1000, penalty='l2', loss='l2', tol=1e-3, dual=False)
print(clf)
clf.fit(X_L1, y_L1)
train_time = time() - t0
print("Train time: %0.3fs" % train_time)
print("")

# Train L2 classifiers; t0 is restarted so the timing that follows covers
# only this stage.
print("Training L2 Classifiers...")
t0 = time()
# comment out all linearSVC
# clf_ca = LinearSVC(loss='l2', penalty='l2', C=1000, dual=False, tol=1e-3)
# clf_collect = LinearSVC(loss='l2', penalty='l2', C=256, dual=False, tol=1e-2)
# clf_cookies = LinearSVC(loss='l2', penalty='l2', C=1000, dual=False, tol=1e-3)
# clf_share = LinearSVC(loss='l2', penalty='l2', C=1000, dual=False, tol=1e-3)
# X = X.toarray()
# X_den = X.toarray()
n_samples, n_features = X_train.shape

###############################################################################
# Test classifier on test dataset

# Alternative classifiers that were tried; exactly one `clf` is active.
# clf = DecisionTreeClassifier(max_depth=14, min_split=5)
# clf = MultinomialNB(alpha=.01)
# clf = KNeighborsClassifier(n_neighbors=19)
# clf = RidgeClassifier(tol=1e-1)
clf = LinearSVC(loss='l2', penalty='l2', C=0.5, dual=False, tol=1e-3)
# clf = SVC(C=32, gamma=0.0625)
print(clf)

# Time the fit and the prediction separately (seconds are printed raw).
t0 = time()
clf.fit(X_train, y_train)
print(time()-t0)
t1 = time()
pred = clf.predict(X_test)
print(time()-t1)

pre_score = metrics.precision_score(y_test, pred)
rec_score = metrics.recall_score(y_test, pred)

# F-beta = (1 + beta^2) * P * R / (beta^2 * P + R).
# The first figure uses beta = 1; the second uses beta^2 = 0.25, i.e. beta = 0.5
# (it is printed under the existing "f5-score" label, kept unchanged so that
# anything parsing the output keeps working).
# When precision and recall are both 0 the denominators are 0; report 0.0
# instead of raising ZeroDivisionError / printing NaN.
f1_denominator = pre_score + rec_score
f05_denominator = 0.25*pre_score + rec_score
f1_value = (2*pre_score*rec_score)/f1_denominator if f1_denominator else 0.0
f05_value = (1.25*pre_score*rec_score)/f05_denominator if f05_denominator else 0.0

print("average f1-score: %0.2f" % (100*f1_value))
print("average f5-score: %0.2f" % (100*f05_value))
print("average precision: %0.5f" % pre_score)
print("averege recall: %0.5f" % rec_score)
class SpectralFeatureAlignment():
    """Cross-domain sentiment classification by spectral feature alignment.

    This variant merges domain-independent features that share a synset
    (looked up through ``wn.synsets``) into one column per synset.

    Pipeline run by :meth:`go`:

    1. split the vocabulary into domain-independent and domain-dependent
       features (:meth:`_getFeatures`);
    2. group domain-independent features by synset (:meth:`_getSynsets`);
    3. count co-occurrences of dependent features with independent columns
       over the reviews of both domains;
    4. build the symmetric affinity matrix, normalise it as D^-1/2 A D^-1/2
       and take its K largest eigenvectors;
    5. append the projection of each document's domain-dependent vector onto
       those eigenvectors (scaled by Y) to the document, train a LinearSVC
       on the source domain and score it on the target domain.

    NOTE(review): the SQLite connection opened in ``__init__`` is never
    closed by this class.
    """

    def __init__(self, dbDir, rawDataFolder, sourceDomain, targetDomain):
        """Open the feature database and create the classifier.

        dbDir         -- directory containing a SQLite file named after
                         ``sourceDomain``.
        rawDataFolder -- directory with one sub-folder per domain holding
                         "positive.review" and "negative.review".
        sourceDomain  -- domain used for training.
        targetDomain  -- domain used for testing.
        """
        self._dbDir = dbDir
        self._sourceDomain = sourceDomain
        self._rawDataFolder = rawDataFolder
        self._targetDomain = targetDomain
        # Table holding term frequencies for this source/target pair.
        self._tableName = sourceDomain + "to" + targetDomain
        self._connection = sqlite.connect(path.join(dbDir, sourceDomain))
        self._cursor = self._connection.cursor()
        self._lsvc = LinearSVC(C=10000)
        # feature -> set of synset names (filled by _getSynsets)
        self._featuresWithSynsets = {}
        # sorted list of independent features without a kept synset
        self._featuresWithoutSynsets = []
        # sorted list of kept synset names; its order defines column indices
        self._allSynsets = []

    def _getFeatures(self, maxDIFeatures=500, minFrequency=5):
        """Return ``(domainIndependent, domainDependent)`` feature lists.

        Terms are read from the pair table when their combined source and
        target frequency is at least ``minFrequency``. Terms found in the
        middle slice ``[30000:-30000]`` of the ``mostinformatives`` table
        are dropped. The first ``maxDIFeatures`` remaining terms (in the
        order the database returns them) are the domain-independent
        features, the rest are domain-dependent. Both lists are sorted so
        that ``bisect_left`` can be used for index lookup.

        NOTE(review): no ORDER BY is used, so the split relies on the
        table's storage order -- confirm that this is intended.
        """
        # A table name cannot be bound as a SQL parameter, hence the
        # concatenation; the name is built from the constructor arguments.
        self._cursor.execute("SELECT term FROM " + self._tableName + " WHERE freqSource + freqTarget >= ?", [minFrequency])
        candidates = [row[0] for row in self._cursor.fetchall()]
        self._cursor.execute("SELECT term FROM mostinformatives")
        informative = [row[0] for row in self._cursor.fetchall()]
        excluded = set(informative[30000:-30000])
        candidates = [term for term in candidates if term not in excluded]
        return sorted(candidates[:maxDIFeatures]), sorted(candidates[maxDIFeatures:])

    def _getSynsets(self, domainIndependentFeatures, minSyn):
        """Group domain-independent features by synset.

        Updates ``self._featuresWithSynsets`` (feature -> set of kept synset
        names) and ``self._featuresWithoutSynsets`` (sorted list of the
        remaining features).

        Only synsets that occur at least ``minSyn`` times over all features
        are kept. Returns the sorted list of kept synset names.

        A bigram ("a_b" without "<") is looked up through its non-stopword
        half when exactly one half is a stopword; bigrams with zero or two
        stopwords get no synsets. "no", "nor" and "not" are not treated as
        stopwords.

        NOTE: an earlier, commented-out variant picked the content word of a
        bigram with NLTK unigram/bigram POS taggers instead of a stopword
        list; it has been removed from the source.
        """
        from collections import Counter

        bigrams = [feature for feature in domainIndependentFeatures if "_" in feature and "<" not in feature]
        stopwordList = set(stopwords.words("english")) - set(("no", "nor", "not"))
        for bigram in bigrams:
            # Raises ValueError if a feature holds more than one "_".
            firstWord, secondWord = bigram.split("_")
            firstIsStop = firstWord in stopwordList
            secondIsStop = secondWord in stopwordList
            if firstIsStop and secondIsStop:
                continue
            if firstIsStop:
                self._featuresWithSynsets[bigram] = wn.synsets(secondWord)
            elif secondIsStop:
                self._featuresWithSynsets[bigram] = wn.synsets(firstWord)

        # Store synsets by their string form and drop empty lookups.
        self._featuresWithSynsets = {feature: [str(synset) for synset in synsets]
                                     for feature, synsets in self._featuresWithSynsets.items() if synsets}

        unigrams = [feature for feature in domainIndependentFeatures if "_" not in feature]
        for unigram in unigrams:
            synsets = wn.synsets(unigram)
            if synsets:
                self._featuresWithSynsets[unigram] = [str(synset) for synset in synsets]

        # Count every synset occurrence once (previously list.count() was
        # called per element, which is quadratic).
        synsetCounts = Counter(synset
                               for synsets in self._featuresWithSynsets.values()
                               for synset in synsets)
        allSynsets = set(synset for synset, count in synsetCounts.items() if count >= minSyn)

        # Keep only the frequent synsets per feature; features left with
        # none fall back to the "without synsets" group.
        kept = {}
        for feature, synsets in self._featuresWithSynsets.items():
            common = set(synsets) & allSynsets
            if common:
                kept[feature] = common
        self._featuresWithSynsets = kept
        self._featuresWithoutSynsets = sorted(set(domainIndependentFeatures) - set(kept.keys()))
        return sorted(allSynsets)

    def _createCooccurrenceMatrix(self, domainIndependentFeatures, domainDependentFeatures):
        """Count co-occurrences over all reviews of both domains.

        Rows follow ``domainDependentFeatures``. Columns are the kept
        synsets (in ``self._allSynsets`` order) followed by the features
        without synsets. A cell is incremented once per review in which the
        dependent feature and the independent feature (or a feature mapped
        to the synset) both appear.

        Returns a ``sparse.coo_matrix``.
        """
        domainIndependentFeaturesSet = set(domainIndependentFeatures)
        domainDependentFeaturesSet = set(domainDependentFeatures)
        numSyn = len(self._allSynsets)
        matrix = np.zeros((len(domainDependentFeatures), len(self._featuresWithoutSynsets) + numSyn))

        def parseFile(filePath):
            with open(filePath, "r") as f:
                for review in f:
                    # Each whitespace-separated token is "feature:count";
                    # only the feature name matters here.
                    reviewFeatures = set([tupel.split(":")[0].decode("utf-8") for tupel in review.split()])
                    independentFeatures = reviewFeatures & domainIndependentFeaturesSet
                    dependentFeatures = reviewFeatures & domainDependentFeaturesSet
                    for dependentFeature in dependentFeatures:
                        rowIndex = bisect_left(domainDependentFeatures, dependentFeature)
                        for independentFeature in independentFeatures:
                            if independentFeature in self._featuresWithSynsets:
                                for synset in self._featuresWithSynsets[independentFeature]:
                                    matrix[rowIndex, bisect_left(self._allSynsets, synset)] += 1
                            else:
                                matrix[rowIndex, bisect_left(self._featuresWithoutSynsets, independentFeature) + numSyn] += 1

        for domain in (self._sourceDomain, self._targetDomain):
            for fileName in ("positive.review", "negative.review"):
                parseFile(path.join(self._rawDataFolder, domain, fileName))
        return sparse.coo_matrix(matrix)

    def _createSquareAffinityMatrix(self, cooccurrenceMatrix):
        """Return the symmetric block matrix ``[[0, M], [M^T, 0]]``.

        For an ``h x w`` co-occurrence matrix M the result is
        ``(h + w) x (h + w)``; the first ``h`` rows belong to M's rows.
        """
        height, width = cooccurrenceMatrix.shape
        topMatrix = sparse.hstack((sparse.coo_matrix((height, height)), cooccurrenceMatrix))
        bottomMatrix = sparse.hstack((cooccurrenceMatrix.transpose(), sparse.coo_matrix((width, width))))
        return sparse.vstack((topMatrix, bottomMatrix))

    def _createDiagonalMatrix(self, squareAffinityMatrix):
        """Return the diagonal matrix of ``1 / sqrt(row sum)``.

        Rows whose sum is 0 get a 0 on the diagonal instead of a division
        by zero.
        """
        shape = squareAffinityMatrix.shape
        rows = range(shape[0])
        rowSums = np.array(squareAffinityMatrix.sum(1)).reshape(-1,)
        data = [0. if rowSum == 0 else np.sqrt(1.0 / rowSum) for rowSum in rowSums]
        return sparse.coo_matrix((data, (rows, rows)), shape=(shape[0], shape[1]))

    def _createDocumentVectors(self, domainDependentFeatures, domainIndependentFeatures, domain):
        """Vectorise every review of ``domain``.

        Returns ``(documentVectors, classifications)`` where each document
        vector is a pair ``(independentVector, dependentVector)`` of 1-row
        CSR matrices and each classification is 1 for a line containing
        "#label#:positive" and -1 otherwise. Positive reviews come first.

        Every present feature contributes the value 1 (the per-review count
        is parsed but not used). Several features mapping to the same
        synset column are summed by the CSR constructor.
        """
        numDomainDep = len(domainDependentFeatures)
        domainDepSet = set(domainDependentFeatures)
        domainIndepSet = set(domainIndependentFeatures)
        documentVectors = []
        classifications = []
        numSynsets = len(self._allSynsets)
        numIndepColumns = len(self._featuresWithoutSynsets) + numSynsets

        def parseFile(filePath):
            with open(filePath, "r") as f:
                for review in f:
                    classification = 1 if "#label#:positive" in review else -1
                    reviewList = [tupel.split(":") for tupel in review.split() if "#label#" not in tupel]
                    # int() also validates that every token carries a count.
                    reviewDict = {x[0].decode("utf-8"): int(x[1]) for x in reviewList}
                    reviewFeatures = set(reviewDict.keys())
                    domainDepReviewFeatures = domainDepSet & reviewFeatures
                    domainIndepReviewFeatures = domainIndepSet & reviewFeatures

                    domainIndepIndizes = []
                    for feature in domainIndepReviewFeatures:
                        if feature in self._featuresWithSynsets:
                            for synset in self._featuresWithSynsets[feature]:
                                domainIndepIndizes.append(bisect_left(self._allSynsets, synset))
                        else:
                            domainIndepIndizes.append(bisect_left(self._featuresWithoutSynsets, feature) + numSynsets)
                    domainIndepValues = [1] * len(domainIndepIndizes)

                    domainDepIndizes = [bisect_left(domainDependentFeatures, feature)
                                        for feature in domainDepReviewFeatures]
                    domainDepValues = [1] * len(domainDepIndizes)

                    domainIndepVector = sparse.csr_matrix(
                        (domainIndepValues, (np.zeros(len(domainIndepIndizes)), domainIndepIndizes)),
                        shape=(1, numIndepColumns))
                    domainDepVector = sparse.csr_matrix(
                        (domainDepValues, (np.zeros(len(domainDepIndizes)), domainDepIndizes)),
                        shape=(1, numDomainDep))
                    documentVectors.append((domainIndepVector, domainDepVector))
                    classifications.append(classification)

        parseFile(path.join(self._rawDataFolder, domain, "positive.review"))
        parseFile(path.join(self._rawDataFolder, domain, "negative.review"))
        return documentVectors, classifications

    def _appendClusterFeatures(self, documentVectors, U, Y):
        """Return one row per document: independent | dependent | Y * (dependent . U)."""
        augmented = []
        for independentVector, dependentVector in documentVectors:
            clustering = dependentVector.dot(U).dot(Y).astype(np.float64)
            augmented.append(sparse.hstack((independentVector, dependentVector, clustering)))
        return augmented

    def _trainClassifier(self, trainingVectors, classifications):
        """Fit the LinearSVC on the stacked training rows."""
        self._lsvc.fit(sparse.vstack(trainingVectors), classifications)

    def _testClassifier(self, testVectors, classifications):
        """Return the LinearSVC's ``score`` on the stacked test rows."""
        return self._lsvc.score(sparse.vstack(testVectors), classifications)

    def go(self, K=100, Y=6, DI=500, minFreq=5, minSyn=10):
        """Run the whole pipeline and print the target-domain accuracy.

        K       -- number of eigenvectors kept.
        Y       -- scalar weight applied to the projected features.
        DI      -- number of domain-independent features.
        minFreq -- minimum combined frequency of a feature.
        minSyn  -- minimum number of occurrences for a synset to be kept.

        Returns None. Returns early, without training, when no synset
        survives the ``minSyn`` filter.
        """
        print(self._sourceDomain + " -> " + self._targetDomain)
        domainIndependentFeatures, domainDependentFeatures = self._getFeatures(DI, minFreq)
        numDomainDep = len(domainDependentFeatures)

        self._allSynsets = self._getSynsets(domainIndependentFeatures, minSyn)
        print(self._featuresWithSynsets)
        for k, v in self._featuresWithSynsets.items():
            print(str(k) + " : " + str(v))
        if not self._allSynsets:
            return

        # Normalised affinity D^-1/2 * A * D^-1/2; intermediates are
        # released as soon as possible to limit peak memory.
        a = self._createCooccurrenceMatrix(domainIndependentFeatures, domainDependentFeatures)
        a = self._createSquareAffinityMatrix(a)
        b = self._createDiagonalMatrix(a)
        c = b.dot(a)
        del a
        c = c.dot(b)
        del b
        eigenValues, eigenVectors = eigsh(c, k=K, which="LA")
        del c

        documentVectorsTraining, classificationsTraining = self._createDocumentVectors(
            domainDependentFeatures, domainIndependentFeatures, self._sourceDomain)
        # The target labels used to be bound to a misspelt name, so the
        # classifier was scored against the source-domain labels.
        documentVectorsTesting, classificationsTesting = self._createDocumentVectors(
            domainDependentFeatures, domainIndependentFeatures, self._targetDomain)

        # Eigenvectors as columns, largest eigenvalue first; only the first
        # numDomainDep rows (the domain-dependent features, which form the
        # top block of the affinity matrix) are needed.
        order = eigenValues.argsort()[::-1]
        U = sparse.csr_matrix(eigenVectors[:, order][:numDomainDep])

        trainingVectors = self._appendClusterFeatures(documentVectorsTraining, U, Y)
        self._trainClassifier(trainingVectors, classificationsTraining)
        testVectors = self._appendClusterFeatures(documentVectorsTesting, U, Y)
        print("accuracy: %.2f with K=%i AND DI=%i AND Y=%.1f AND minFreq=%i AND minSyn=%i" % (self._testClassifier(testVectors, classificationsTesting)*100, K, DI, Y, minFreq, minSyn))
class SpectralFeatureAlignment():
    """Cross-domain sentiment classification by spectral feature alignment.

    Pipeline run by :meth:`go`:

    1. split the vocabulary into domain-independent and domain-dependent
       features (:meth:`_getFeatures`);
    2. count co-occurrences of dependent and independent features over the
       reviews of both domains;
    3. build the symmetric affinity matrix, normalise it as D^-1/2 A D^-1/2
       and take its K largest eigenvectors;
    4. append the projection of each document's domain-dependent vector onto
       those eigenvectors (scaled by Y) to the document, train a LinearSVC
       on the source domain and score it on the target domain.

    NOTE(review): the SQLite connection opened in ``__init__`` is never
    closed by this class.
    """

    def __init__(self, dbDir, rawDataFolder, sourceDomain, targetDomain):
        """Open the feature database and create the classifier.

        dbDir         -- directory containing a SQLite file named after
                         ``sourceDomain``.
        rawDataFolder -- directory with one sub-folder per domain holding
                         "positive.review" and "negative.review".
        sourceDomain  -- domain used for training.
        targetDomain  -- domain used for testing.
        """
        self._dbDir = dbDir
        self._sourceDomain = sourceDomain
        self._rawDataFolder = rawDataFolder
        self._targetDomain = targetDomain
        # Table holding term frequencies for this source/target pair.
        self._tableName = sourceDomain + "to" + targetDomain
        self._connection = sqlite.connect(path.join(dbDir, sourceDomain))
        self._cursor = self._connection.cursor()
        self._lsvc = LinearSVC(C=10000)

    def _getFeatures(self, maxDIFeatures=500, minFrequency=5):
        """Return ``(domainIndependent, domainDependent)`` feature lists.

        Terms are read from the pair table when their combined source and
        target frequency is at least ``minFrequency``. Terms found in the
        middle slice ``[30000:-30000]`` of the ``mostinformatives`` table
        are dropped. The first ``maxDIFeatures`` remaining terms (in the
        order the database returns them) are the domain-independent
        features, the rest are domain-dependent. Both lists are sorted so
        that ``bisect_left`` can be used for index lookup.

        NOTE(review): no ORDER BY is used, so the split relies on the
        table's storage order -- confirm that this is intended.
        """
        # A table name cannot be bound as a SQL parameter, hence the
        # concatenation; the name is built from the constructor arguments.
        self._cursor.execute("SELECT term FROM " + self._tableName + " WHERE freqSource + freqTarget >= ?", [minFrequency])
        candidates = [row[0] for row in self._cursor.fetchall()]
        self._cursor.execute("SELECT term FROM mostinformatives")
        informative = [row[0] for row in self._cursor.fetchall()]
        excluded = set(informative[30000:-30000])
        candidates = [term for term in candidates if term not in excluded]
        return sorted(candidates[:maxDIFeatures]), sorted(candidates[maxDIFeatures:])

    def _createCooccurrenceMatrix(self, domainIndependentFeatures, domainDependentFeatures):
        """Count co-occurrences over all reviews of both domains.

        Rows follow ``domainDependentFeatures``, columns follow
        ``domainIndependentFeatures`` (both must be sorted). A cell is
        incremented once per review in which both features appear.

        Returns a ``sparse.coo_matrix``.
        """
        domainIndependentFeaturesSet = set(domainIndependentFeatures)
        domainDependentFeaturesSet = set(domainDependentFeatures)
        matrix = np.zeros((len(domainDependentFeatures), len(domainIndependentFeatures)))

        def parseFile(filePath):
            with open(filePath, "r") as f:
                for review in f:
                    # Each whitespace-separated token is "feature:count";
                    # only the feature name matters here.
                    reviewFeatures = set([tupel.split(":")[0].decode("utf-8") for tupel in review.split()])
                    independentFeatures = reviewFeatures & domainIndependentFeaturesSet
                    dependentFeatures = reviewFeatures & domainDependentFeaturesSet
                    for dependentFeature in dependentFeatures:
                        rowIndex = bisect_left(domainDependentFeatures, dependentFeature)
                        for independentFeature in independentFeatures:
                            matrix[rowIndex, bisect_left(domainIndependentFeatures, independentFeature)] += 1

        for domain in (self._sourceDomain, self._targetDomain):
            for fileName in ("positive.review", "negative.review"):
                parseFile(path.join(self._rawDataFolder, domain, fileName))
        return sparse.coo_matrix(matrix)

    def _createSquareAffinityMatrix(self, cooccurrenceMatrix):
        """Return the symmetric block matrix ``[[0, M], [M^T, 0]]``.

        For an ``h x w`` co-occurrence matrix M the result is
        ``(h + w) x (h + w)``; the first ``h`` rows belong to M's rows.
        """
        height, width = cooccurrenceMatrix.shape
        topMatrix = sparse.hstack((sparse.coo_matrix((height, height)), cooccurrenceMatrix))
        bottomMatrix = sparse.hstack((cooccurrenceMatrix.transpose(), sparse.coo_matrix((width, width))))
        return sparse.vstack((topMatrix, bottomMatrix))

    def _createDiagonalMatrix(self, squareAffinityMatrix):
        """Return the diagonal matrix of ``1 / sqrt(row sum)``.

        Rows whose sum is 0 get a 0 on the diagonal instead of a division
        by zero.
        """
        shape = squareAffinityMatrix.shape
        rows = range(shape[0])
        rowSums = np.array(squareAffinityMatrix.sum(1)).reshape(-1,)
        data = [0. if rowSum == 0 else np.sqrt(1.0 / rowSum) for rowSum in rowSums]
        return sparse.coo_matrix((data, (rows, rows)), shape=(shape[0], shape[1]))

    def _createDocumentVectors(self, domainDependentFeatures, domainIndependentFeatures, domain):
        """Vectorise every review of ``domain``.

        Returns ``(documentVectors, classifications)`` where each document
        vector is a pair ``(independentVector, dependentVector)`` of 1-row
        CSR matrices and each classification is 1 for a line containing
        "#label#:positive" and -1 otherwise. Positive reviews come first.

        Every present feature contributes the value 1 (the per-review count
        is parsed but not used).
        """
        numDomainDep = len(domainDependentFeatures)
        numDomainIndep = len(domainIndependentFeatures)
        domainDepSet = set(domainDependentFeatures)
        domainIndepSet = set(domainIndependentFeatures)
        documentVectors = []
        classifications = []

        def parseFile(filePath):
            with open(filePath, "r") as f:
                for review in f:
                    classification = 1 if "#label#:positive" in review else -1
                    reviewList = [tupel.split(":") for tupel in review.split() if "#label#" not in tupel]
                    # int() also validates that every token carries a count.
                    reviewDict = {x[0].decode("utf-8"): int(x[1]) for x in reviewList}
                    reviewFeatures = set(reviewDict.keys())

                    domainIndepIndizes = [bisect_left(domainIndependentFeatures, feature)
                                          for feature in domainIndepSet & reviewFeatures]
                    domainIndepValues = [1] * len(domainIndepIndizes)
                    domainDepIndizes = [bisect_left(domainDependentFeatures, feature)
                                        for feature in domainDepSet & reviewFeatures]
                    domainDepValues = [1] * len(domainDepIndizes)

                    domainIndepVector = sparse.csr_matrix(
                        (domainIndepValues, (np.zeros(len(domainIndepIndizes)), domainIndepIndizes)),
                        shape=(1, numDomainIndep))
                    domainDepVector = sparse.csr_matrix(
                        (domainDepValues, (np.zeros(len(domainDepIndizes)), domainDepIndizes)),
                        shape=(1, numDomainDep))
                    documentVectors.append((domainIndepVector, domainDepVector))
                    classifications.append(classification)

        parseFile(path.join(self._rawDataFolder, domain, "positive.review"))
        parseFile(path.join(self._rawDataFolder, domain, "negative.review"))
        return documentVectors, classifications

    def _appendClusterFeatures(self, documentVectors, U, Y):
        """Return one row per document: independent | dependent | Y * (dependent . U)."""
        augmented = []
        for independentVector, dependentVector in documentVectors:
            clustering = dependentVector.dot(U).dot(Y).astype(np.float64)
            augmented.append(sparse.hstack((independentVector, dependentVector, clustering)))
        return augmented

    def _trainClassifier(self, trainingVectors, classifications):
        """Fit the LinearSVC on the stacked training rows."""
        self._lsvc.fit(sparse.vstack(trainingVectors), classifications)

    def _testClassifier(self, testVectors, classifications):
        """Return the LinearSVC's ``score`` on the stacked test rows."""
        return self._lsvc.score(sparse.vstack(testVectors), classifications)

    def go(self, K=100, Y=6, DI=500, minFreq=5):
        """Run the whole pipeline and print the target-domain accuracy.

        K       -- number of eigenvectors kept.
        Y       -- scalar weight applied to the projected features.
        DI      -- number of domain-independent features.
        minFreq -- minimum combined frequency of a feature.

        Returns None.
        """
        print(self._sourceDomain + " -> " + self._targetDomain)
        domainIndependentFeatures, domainDependentFeatures = self._getFeatures(DI, minFreq)
        numDomainDep = len(domainDependentFeatures)

        # Normalised affinity D^-1/2 * A * D^-1/2; intermediates are
        # released as soon as possible to limit peak memory.
        a = self._createCooccurrenceMatrix(domainIndependentFeatures, domainDependentFeatures)
        a = self._createSquareAffinityMatrix(a)
        b = self._createDiagonalMatrix(a)
        c = b.dot(a)
        del a
        c = c.dot(b)
        del b
        eigenValues, eigenVectors = eigsh(c, k=K, which="LA")
        del c

        documentVectorsTraining, classificationsTraining = self._createDocumentVectors(
            domainDependentFeatures, domainIndependentFeatures, self._sourceDomain)
        documentVectorsTesting, classificationsTesting = self._createDocumentVectors(
            domainDependentFeatures, domainIndependentFeatures, self._targetDomain)

        # Eigenvectors as columns, largest eigenvalue first; only the first
        # numDomainDep rows (the domain-dependent features, which form the
        # top block of the affinity matrix) are needed.
        order = eigenValues.argsort()[::-1]
        U = sparse.csr_matrix(eigenVectors[:, order][:numDomainDep])

        # Iterate the document lists directly instead of indexing them via
        # np.size(list, axis=0), which coerced the list of sparse-matrix
        # tuples into an ndarray just to obtain its length.
        trainingVectors = self._appendClusterFeatures(documentVectorsTraining, U, Y)
        testVectors = self._appendClusterFeatures(documentVectorsTesting, U, Y)
        self._trainClassifier(trainingVectors, classificationsTraining)
        print("accuracy: %.2f with K=%i AND DI=%i AND Y=%.1f AND minFreq=%i" % (self._testClassifier(testVectors, classificationsTesting)*100, K, DI, Y, minFreq))
# Candidate classifiers (clf_nb and the timer t0 are created before this
# fragment).
clf_lsvc = LinearSVC(C=1000, penalty='l2', loss='l2', tol=1e-3, dual=False)
clf_svc = SVC(C=1024, gamma=0.001, kernel='rbf', degree=3, probability=True)
clf_rdg = RidgeClassifier(tol=1e-1)
clf_sgd = SGDClassifier(penalty="l2", alpha=.0001, n_iter=50)
# Logistic regression requires OneVsRestClassifier which hides
# its methods such as decision_function
# It will require extra implementation efforts to use it as a candidate
# for multilabel classification
# clf_lgr = OneVsRestClassifier(LogisticRegression(C=1000,penalty='l1'))
# kNN does not have decision function due to its nature
# clf_knn = KNeighborsClassifier(n_neighbors=13)

# train every candidate on the full (X, y); order kept: nb, lsvc, rdg, svc, sgd
for candidate in (clf_nb, clf_lsvc, clf_rdg, clf_svc, clf_sgd):
    candidate.fit(X, y)
elapsed = time() - t0
print("Train time: %0.3fs" % elapsed)
print("")

# # predict by simply apply the classifier
# # this will not use the multi-label threshold
# predicted = clf_rdg.predict(X_new)
# for doc, category in zip(docs_new, predicted):
#     print '%r => %s' % (doc, data_train.target_names[int(category)])
# print