def fit(self, X, y):
     # The smaller C, the stronger the regularization.
     # The more regularization, the more sparsity.
     self.transformer_ = LinearSVC(C=1000, penalty="l1",
                                   dual=False, tol=1e-3)
     X = self.transformer_.fit_transform(X, y)
     return LinearSVC.fit(self, X, y)
Example #2
 def fit(self, X, y):
     # The smaller C, the stronger the regularization.
     # The more regularization, the more sparsity.
     self.transformer_ = LinearSVC(C=1000, penalty="l1",
                                   dual=False, tol=1e-3)
     X = self.transformer_.fit_transform(X, y)
     return LinearSVC.fit(self, X, y)
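# A minimal, self-contained sketch (not taken from the snippets above, and assuming a
# recent scikit-learn) of what the comment describes: with penalty="l1", a smaller C
# means stronger regularization and therefore more zero coefficients.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.svm import LinearSVC

X_demo, y_demo = make_classification(n_samples=200, n_features=50, n_informative=5,
                                     random_state=0)
for C in (0.01, 1000):
    sparse_svc = LinearSVC(C=C, penalty="l1", dual=False, tol=1e-3).fit(X_demo, y_demo)
    print("C=%g -> %d non-zero coefficients" % (C, np.count_nonzero(sparse_svc.coef_)))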
Example #3
def test_dense_vectorizer_pipeline_grid_selection():
    # raw documents
    data = JUNK_FOOD_DOCS + NOTJUNK_FOOD_DOCS
    # simulate iterables
    train_data = iter(data[1:-1])
    test_data = iter([data[0], data[-1]])

    # label junk food as -1, the others as +1
    y = np.ones(len(data))
    y[:6] = -1
    y_train = y[1:-1]
    y_test = np.array([y[0], y[-1]])

    pipeline = Pipeline([('vect', CountVectorizer()), ('svc', LinearSVC())])

    parameters = {'vect__analyzer__max_n': (1, 2), 'svc__loss': ('l1', 'l2')}

    # find the best parameters for both the feature extraction and the
    # classifier
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=1)

    # cross-validation doesn't work if the length of the data is not known,
    # hence use lists instead of iterators
    pred = grid_search.fit(list(train_data), y_train).predict(list(test_data))
    assert_array_equal(pred, y_test)

    # on this toy dataset the bigram representation, used in the last candidate of
    # the grid search, is considered the best estimator since all candidates
    # converge to 100% accuracy models
    assert_equal(grid_search.best_score, 1.0)
    best_vectorizer = grid_search.best_estimator.named_steps['vect']
    assert_equal(best_vectorizer.analyzer.max_n, 1)
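# The attributes used above (best_score, best_estimator, analyzer.max_n) belong to a very
# old scikit-learn API. A rough modern equivalent of the same pipeline grid search, as a
# sketch assuming a recent release and a tiny made-up corpus:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV

docs = ["the pizza burger beer", "the salad celery broccoli",
        "pizza beer fries", "salad broccoli tomato",
        "burger beer pizza", "celery salad broccoli",
        "beer pizza burger", "broccoli celery salad"]
labels = [-1, 1, -1, 1, -1, 1, -1, 1]

pipeline = Pipeline([('vect', CountVectorizer()), ('svc', LinearSVC())])
param_grid = {'vect__ngram_range': [(1, 1), (1, 2)], 'svc__C': [0.1, 1.0]}
grid_search = GridSearchCV(pipeline, param_grid, cv=2, n_jobs=1)
grid_search.fit(docs, labels)
print(grid_search.best_score_)
print(grid_search.best_estimator_.named_steps['vect'].ngram_range)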
 def __init__(self, dbDir, rawDataFolder, sourceDomain, targetDomain):
     self._dbDir = dbDir
     self._sourceDomain = sourceDomain
     self._rawDataFolder = rawDataFolder
     self._targetDomain = targetDomain
     self._tableName = sourceDomain + "to" + targetDomain
     self._connection = sqlite.connect(path.join(dbDir,sourceDomain))
     self._cursor = self._connection.cursor()
     self._lsvc = LinearSVC(C=10000)
    def train(labeled_featuresets, C=1e5):
        """
        :param labeled_featuresets: A list of classified featuresets,
            i.e., a list of tuples ``(featureset, label)``.
        """
        feat = [featureset for featureset, label in labeled_featuresets]
        feature_vectorizer = MVectorizer.DictsVectorizer()
        X = feature_vectorizer.fit_transform(feat)
        X = Normalizer().fit_transform(X)
        label_set = set( [label for featureset, label in labeled_featuresets] )
        label_vectorizer = dict( [(label,num) for num,label in enumerate(label_set)] )
        y = numpy.array([label_vectorizer[label] for featureset, label in labeled_featuresets])
        print "Training on %d examples with %d features..."%(X.shape[0],X.shape[1]),
        classifier = OneVsRestClassifier(LinearSVC(loss='l2', penalty='l2', dual=True, tol=1e-5, C=C, scale_C=True))
        classifier.fit(X,y)
        print "done"

        return scikit_classifier(feature_vectorizer,label_vectorizer,classifier)
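# MVectorizer.DictsVectorizer and scikit_classifier above are project-specific helpers, and
# scale_C no longer exists in recent scikit-learn. A hedged sketch of the same idea using
# only standard scikit-learn pieces (toy featuresets made up for illustration):
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import Normalizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC

labeled_featuresets = [({"contains(good)": 1}, "pos"),
                       ({"contains(awful)": 1}, "neg"),
                       ({"contains(great)": 1}, "pos"),
                       ({"contains(bad)": 1}, "neg")]
feats = [featureset for featureset, label in labeled_featuresets]
labels = [label for featureset, label in labeled_featuresets]
X = DictVectorizer().fit_transform(feats)      # dict features -> sparse matrix
X = Normalizer().fit_transform(X)              # L2-normalize each row
classifier = OneVsRestClassifier(LinearSVC(tol=1e-5, C=1e5))
classifier.fit(X, labels)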
Example #6
        print metrics.confusion_matrix(y_test, pred)

    print
    return score, train_time, test_time

for clf, name in ((RidgeClassifier(tol=1e-1), "Ridge Classifier"),
                  (KNeighborsClassifier(n_neighbors=10), "kNN")):
    print 80 * '='
    print name
    results = benchmark(clf)

for penalty in ["l2", "l1"]:
    print 80 * '='
    print "%s penalty" % penalty.upper()
    # Train Liblinear model
    liblinear_results = benchmark(LinearSVC(loss='l2', penalty=penalty, C=1000,
                                            dual=False, tol=1e-3))

    # Train SGD model
    sgd_results = benchmark(SGDClassifier(alpha=.0001, n_iter=50,
                                          penalty=penalty))

# Train SGD with Elastic Net penalty
print 80 * '='
print "Elastic-Net penalty"
sgd_results = benchmark(SGDClassifier(alpha=.0001, n_iter=50,
                                      penalty="elasticnet"))

# Train sparse Naive Bayes classifiers
print 80 * '='
print "Naive Bayes"
mnnb_results = benchmark(MultinomialNB(alpha=.01))
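# Only the tail of the benchmark() helper appears above. A hypothetical stand-in that is
# consistent with that tail (same return signature; X_train, y_train, X_test, y_test are
# assumed to be defined by the surrounding script):
from time import time
from sklearn import metrics

def benchmark(clf):
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    t0 = time()
    pred = clf.predict(X_test)
    test_time = time() - t0
    score = metrics.accuracy_score(y_test, pred)
    print(metrics.confusion_matrix(y_test, pred))
    return score, train_time, test_time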
Example #7
# # Feature selection for the L1 dataset
# select_chi2 = 1000
# print ("Extracting %d best features by a chi-squared test" % select_chi2)
# t0 = time()
# ch2 = SelectKBest(chi2, k = select_chi2)
# X_L1 = ch2.fit_transform(X_L1, y_L1)
# print "Done in %fs" % (time() - t0)
# print "L1:      n_samples: %d, n_features: %d" % X_L1.shape
# print


# Train L1 classifier
print "Training L1 Classifier..."
t0 = time()
clf = LinearSVC(loss='l2', penalty='l2', C=1000, dual=False, tol=1e-3)
print clf
clf.fit(X_L1, y_L1)
train_time = time() - t0
print "Train time: %0.3fs" % train_time
print


# Train L2 classifiers
print "Training L2 Classifiers..."
t0 = time()

# comment out all linearSVC
# clf_ca = LinearSVC(loss='l2', penalty='l2', C=1000, dual=False, tol=1e-3)
# clf_collect = LinearSVC(loss='l2', penalty='l2', C=256, dual=False, tol=1e-2)
# clf_cookies = LinearSVC(loss='l2', penalty='l2', C=1000, dual=False, tol=1e-3)
    # X: feature matrix; y: result array; z_k: prediction result array for k's model
    # 

    # Setup 10 fold cross validation
    fold_num = 10
    kf = KFold(n_samples, k=fold_num, indices=True)

    # set number of neighbors for kNN
    n_neighb = 19

    # Brute-force implementation
    clf_bNB     = BernoulliNB(alpha=.01)
    clf_mNB     = MultinomialNB(alpha=.01)
    clf_kNN     = KNeighborsClassifier(n_neighbors=n_neighb)
    clf_ridge   = RidgeClassifier(tol=1e-1)
    clf_lSVC    = LinearSVC(loss='l2', penalty='l2', C=1000, dual=False, tol=1e-3)
    clf_SVC     = SVC(C=1000, gamma=0.0625, probability=True)
    # clf_SGD     = SGDClassifier(alpha=.0001, n_iter=50, penalty="l2")

    ###############################################################################
    # Stacking
    # 
    # initialize empty y and z

    n_categories = len(set(y))
    # z = np.array([[0, 0]], dtype=float)
    z = np.array([[0, 0, 0]], dtype=float)
    # z = np.array([[0, 0, 0, 0]], dtype=float)
    # z = np.zeros( (n_samples, n_categories) , dtype=float)

    # Test for 10 rounds using the results from 10 fold cross validations
Example #9
 def predict(self, X):
     X = self.transformer_.transform(X)
     return LinearSVC.predict(self, X)
Example #10
File: classify.py, Project: vene/misc-nlp
from preprocess import get_clf, load_data, preprocess_data
from sklearn.metrics import classification_report
from sklearn.cross_validation import KFold, LeaveOneOut
from sklearn.grid_search import GridSearchCV

if __name__ == '__main__':
    filename = 'inf-all-labeled.txt'

    X, y = load_data(filename)
    n = len(X)
    scores = np.empty((5, 2, 2), dtype=np.float)
    best_C = np.empty((5, 2, 2), dtype=np.float)
    for i, ngrams in enumerate((2, 3, 4, 5, 6)):
        for j, suffix in enumerate(('', '$')):
            for k, binarize in enumerate((True, False)):
                print "ngrams=%d, suffix=%s, binarize=%s" % (ngrams, suffix,
                                                             binarize)
                X_new = preprocess_data(X,
                                        n=ngrams,
                                        suffix=suffix,
                                        binarize=binarize)
                grid = GridSearchCV(
                    estimator=LinearSVC(),
                    n_jobs=4,
                    verbose=False,
                    param_grid={'C': (0.01, 0.03, 0.1, 0.3, 1, 1.3)},
                    cv=LeaveOneOut(n, indices=True))
                grid.fit(X_new, y)
                scores[i, j, k] = grid.best_score
                best_C[i, j, k] = grid.best_estimator.C
Example #11
    for i, n in enumerate((2, 3, 4, 5, 6)):
        for j, suffix in enumerate(('', '$')):
            for k, binarize in enumerate((True, False)):
                print "%d-%d-%d out of 411" % (i, j, k)
                X_sg_p, v_sg = preprocess.preprocess_data(X_sg,
                                                          suffix=suffix,
                                                          n=n,
                                                          return_vect=True,
                                                          binarize=binarize)
                X_pl_p, v_pl = preprocess.preprocess_data(X_pl,
                                                          suffix=suffix,
                                                          n=n,
                                                          return_vect=True,
                                                          binarize=binarize)

                grid1 = GridSearchCV(estimator=LinearSVC(),
                                     n_jobs=-1,
                                     verbose=True,
                                     param_grid={'C': np.logspace(-2, 2, 5)},
                                     cv=KFold(len(X_sg), k=10, indices=True))
                grid1.fit(X_sg_p, y_sg)
                scores_sg[i, j, k] = grid1.best_score
                best_C_sg = grid1.best_estimator.C
                clf = grid1.best_estimator

                X_sg_n_p = v_sg.transform(X_sg_n)
                y_sg_n = clf.predict(X_sg_n_p)
                predict_sg[i, j, k] = (y_sg_n == 0).mean()

                grid2 = GridSearchCV(estimator=LinearSVC(),
                                     n_jobs=-1,
Example #12
        return unicode_content.lower()

    def __repr__(self):
        return "LowerCasePreprocessor()"

analyzer1 = CharNGramAnalyzer(
    min_n=1,
    max_n=3,
    preprocessor=LowerCasePreprocessor(),
)

# Build a vectorizer / classifier pipeline using the previous analyzer
clf = Pipeline([
    ('vec', CountVectorizer(analyzer=analyzer1)),
    ('tfidf', TfidfTransformer(use_idf=False)),
    ('clf', LinearSVC(loss='l2', penalty='l1', dual=False, C=100)),
])

# Fit the pipeline on the training set
clf.fit(twenty_train.data,twenty_train.target)

# Predict the outcome on the testing set
y_predicted = clf.predict(doc_test)


# Predict the result on some short new sentences:
sentences = [
    u'This is a language detection test.',
    u'Ceci est un test de d\xe9tection de la langue.',
    u'Dies ist ein Test, um die Sprache zu erkennen.',
]
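# CharNGramAnalyzer and LowerCasePreprocessor come from a very old scikit-learn. In recent
# releases the same character n-gram pipeline can be expressed directly on the vectorizer;
# a sketch (not the original code):
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.svm import LinearSVC

clf_modern = Pipeline([
    ('vec', CountVectorizer(analyzer='char', ngram_range=(1, 3), lowercase=True)),
    ('tfidf', TfidfTransformer(use_idf=False)),
    ('clf', LinearSVC(penalty='l1', dual=False, C=100)),
])
# clf_modern.fit(twenty_train.data, twenty_train.target) would then mirror the fit above.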
# # Feature selection for the L1 dataset
# select_chi2 = 1000
# print ("Extracting %d best features by a chi-squared test" % select_chi2)
# t0 = time()
# ch2 = SelectKBest(chi2, k = select_chi2)
# X_L1 = ch2.fit_transform(X_L1, y_L1)
# print "Done in %fs" % (time() - t0)
# print "L1:      n_samples: %d, n_features: %d" % X_L1.shape
# print


# Train L1 classifier
print "Training L1 Classifier..."
t0 = time()
clf = LinearSVC(loss='l2', penalty='l2', C=1000, dual=False, tol=1e-3)
print clf
clf.fit(X_L1, y_L1)
train_time = time() - t0
print "Train time: %0.3fs" % train_time
print


# Train L2 classifiers
print "Training L2 Classifiers..."
t0 = time()

# comment out all linearSVC
# clf_ca = LinearSVC(loss='l2', penalty='l2', C=1000, dual=False, tol=1e-3)
# clf_collect = LinearSVC(loss='l2', penalty='l2', C=256, dual=False, tol=1e-2)
# clf_cookies = LinearSVC(loss='l2', penalty='l2', C=1000, dual=False, tol=1e-3)
Example #14
"Like most web-based services, Ning automatically receives and records information on our server logs from your browser when you use the Ning Platform. We may use a variety of methods, including clear GIFs (also known as web beacons), and cookies to collect this information. The information that we collect with these automated methods may include, for example, your IP address, Ning cookie information, a unique device or user ID, browser type, system type, the content and pages that you access on the Ning Platform, and the referring URL (i.e., the page from which you navigated to the Ning Platform).",
"Other Information We Receive and Store : When you register to use MailChimp, we store 'cookies,' which are strings of code, on your computer. We also use electronic images known as Web beacons. With those cookies, we are aware of and collect information concerning when you visit our Website, when you use MailChimp, your browser type and version, your operating system and platform and other similar information. With Web beacons, we can determine when you open email we send you, and collect other data. You may turn off all cookies that have been placed on your computer by following the instructions on your browser on how to block cookies that have been placed on your computer. However, if you block our cookies it will be more difficult, and maybe impossible, to use the Services",
"EMC strives to keep your personal information accurate. We have implemented technology, management processes and policies to maintain data integrity. We will provide you with access to your information when reasonable, or in accordance with relevant laws, including making reasonable effort to provide you with online access and the opportunity to change your information. To protect your privacy and security, we will take steps to verify your identity before granting access or making changes to your personal information. To access and/or correct information, you can do so online or notify us via the appropriate method below depending on which site is at issue",
"Your information to our service providers. We use service providers who help us to provide you with our services. We give relevant persons working for some of these providers access to your information, but only to the extent necessary for them to perform their services for us. We also implement reasonable contractual and technical protections to ensure the confidentiality of your personal information and data is maintained, used only for the provision of their services to us, and handled in accordance with this privacy policy. Examples of service providers include payment processors, email service providers, and web traffic analytics tools",
"Some Microsoft sites allow you to choose to share your personal information with select Microsoft partners so that they can contact you about their products, services or offers. Other sites, such as MSN instead may give you a separate choice as to whether you wish to receive communications from Microsoft about a partner's particular offering (without transferring your personal information to the third party). See the Communication Preferences section below for more information.",
]

X_new = vectorizer.transform(docs_new)


# Train classifiers
print "Training Classifiers..."
t0 = time()

clf_nb = MultinomialNB()
clf_lsvc = LinearSVC(loss='l2', penalty='l2', C=1000, dual=False, tol=1e-3)
clf_svc = SVC(C=1024, kernel='rbf', degree=3, gamma=0.001, probability=True)
clf_rdg = RidgeClassifier(tol=1e-1)
clf_sgd = SGDClassifier(alpha=.0001, n_iter=50, penalty="l2")

# Logistic regression requires OneVsRestClassifier which hides
# its methods such as decision_function
# It will require extra implementation efforts to use it as a candidate
# for multilabel classification
# clf_lgr = OneVsRestClassifier(LogisticRegression(C=1000,penalty='l1'))
# kNN does not have decision function due to its nature
# clf_knn = KNeighborsClassifier(n_neighbors=13)

# train
clf_nb.fit(X, y)
clf_lsvc.fit(X, y)
class SpectralFeatureAlignment():

    def __init__(self, dbDir, rawDataFolder, sourceDomain, targetDomain):
        self._dbDir = dbDir
        self._sourceDomain = sourceDomain
        self._rawDataFolder = rawDataFolder
        self._targetDomain = targetDomain
        self._tableName = sourceDomain + "to" + targetDomain
        self._connection = sqlite.connect(path.join(dbDir,sourceDomain))
        self._cursor = self._connection.cursor()
        self._lsvc = LinearSVC(C=10000)

    def _getFeatures(self, maxDIFeatures=500, minFrequency=5):
        features = []
        self._cursor.execute("SELECT term FROM " +self._tableName+ " WHERE freqSource + freqTarget >= ?", [minFrequency])
        features = [a[0] for a in self._cursor.fetchall()]
        self._cursor.execute("SELECT term FROM mostinformatives") 
        mostInformatives = set([a[0] for a in self._cursor.fetchall()][30000:-30000])
        features = [feature for feature in features if feature not in mostInformatives]
        return sorted(features[:maxDIFeatures]), sorted(features[maxDIFeatures:])

    def _createCooccurrenceMatrix(self, domainIndependentFeatures, domainDependentFeatures):
        domainIndependentFeaturesSet = set(domainIndependentFeatures)
        domainDependentFeaturesSet = set(domainDependentFeatures)
        def __parseFile(filePath):
            with open(filePath, "r") as f:
                for review in f:
                        reviewFeatures = set([tupel.split(":")[0].decode("utf-8") for tupel in review.split()])
                        independentFeatures = reviewFeatures & domainIndependentFeaturesSet
                        dependentFeatures = reviewFeatures & domainDependentFeaturesSet
                        for dependentFeature in dependentFeatures:
                            rowIndex = bisect_left(domainDependentFeatures,dependentFeature)
                            for independentFeature in independentFeatures:
                                matrix[rowIndex, bisect_left(domainIndependentFeatures,independentFeature)] += 1
                        
        matrix = np.zeros((len(domainDependentFeatures), len(domainIndependentFeatures)))
        __parseFile(path.join(self._rawDataFolder, self._sourceDomain, "positive.review"))
        __parseFile(path.join(self._rawDataFolder, self._sourceDomain, "negative.review"))
        __parseFile(path.join(self._rawDataFolder, self._targetDomain, "positive.review"))
        __parseFile(path.join(self._rawDataFolder, self._targetDomain, "negative.review"))
        return sparse.coo_matrix(matrix)

    def _createSquareAffinityMatrix(self, cooccurrenceMatrix):
       height = np.size(cooccurrenceMatrix, 0) 
       width = np.size(cooccurrenceMatrix, 1) 
       topMatrix = sparse.coo_matrix((height,height))
       topMatrix = sparse.hstack((topMatrix,cooccurrenceMatrix))
       bottomMatrix = sparse.coo_matrix((width,width))
       bottomMatrix = sparse.hstack((cooccurrenceMatrix.transpose(), bottomMatrix))
       matrix = sparse.vstack((topMatrix, bottomMatrix))
       return matrix
   
    def _createDiagonalMatrix(self, squareAffinityMatrix):
        rows = range(squareAffinityMatrix.get_shape()[0])
        data = [0. if rowSum == 0 else np.sqrt(1.0 / rowSum) for rowSum in np.array(squareAffinityMatrix.sum(1)).reshape(-1,)]
        return sparse.coo_matrix((data,(rows,rows)),shape=(squareAffinityMatrix.get_shape()[0],squareAffinityMatrix.get_shape()[1]))

    def _createDocumentVectors(self,domainDependentFeatures, domainIndependentFeatures, domain):
        numDomainDep = len(domainDependentFeatures)
        numDomainIndep = len(domainIndependentFeatures)
        domainDepSet = set(domainDependentFeatures)
        domainIndepSet = set(domainIndependentFeatures)
        documentVectors = []
        classifications = []
        def __parseFile(filePath):
            with open(filePath,"r") as f:
                for review in f:
                    classification = 1 if "#label#:positive" in review else -1
                    reviewList = [tupel.split(":") for tupel in review.split() if "#label#" not in tupel]
                    reviewDict = {x[0].decode("utf-8"):int(x[1]) for x in reviewList}
                    reviewFeatures = set(reviewDict.keys())
                    domainDepReviewFeatures = domainDepSet & reviewFeatures
                    domainIndepReviewFeatures = domainIndepSet & reviewFeatures
                    domainDepValues,domainDepIndizes = [],[]
                    domainIndepValues, domainIndepIndizes = [],[]
                    for feature in domainIndepReviewFeatures:
                        #domainIndepValues.append(reviewDict[feature])
                        domainIndepValues.append(1)
                        domainIndepIndizes.append(bisect_left(domainIndependentFeatures,feature))
                    for feature in domainDepReviewFeatures:
                        #domainDepValues.append(reviewDict[feature])
                        domainDepValues.append(1)
                        domainDepIndizes.append(bisect_left(domainDependentFeatures,feature))
                    domainIndepVector = sparse.csr_matrix((domainIndepValues,(np.zeros(len(domainIndepIndizes)),domainIndepIndizes)),shape=(1,numDomainIndep))
                    domainDepVector = sparse.csr_matrix((domainDepValues,(np.zeros(len(domainDepIndizes)),domainDepIndizes)),shape=(1,numDomainDep))
                    documentVectors.append((domainIndepVector,domainDepVector))
                    classifications.append(classification)

        __parseFile(path.join(self._rawDataFolder, domain, "positive.review"))
        __parseFile(path.join(self._rawDataFolder, domain, "negative.review"))
        return documentVectors,classifications 

    def _trainClassifier(self, trainingVectors, classifications):
        self._lsvc.fit(sparse.vstack(trainingVectors),classifications)

    def _testClassifier(self,testVectors,classifications):
        return self._lsvc.score(sparse.vstack(testVectors),classifications)




    def go(self,K=100, Y=6, DI=500, minFreq=5):
        print self._sourceDomain + " -> " + self._targetDomain
        domainIndependentFeatures, domainDependentFeatures = self._getFeatures(DI,minFreq)
        numDomainIndep = len(domainIndependentFeatures)
        numDomainDep = len(domainDependentFeatures)
        #print "number of independent features %i, number of dependent features %i" % (numDomainIndep, numDomainDep)
        #print "creating cooccurrenceMatrix..."
        a = self._createCooccurrenceMatrix(domainIndependentFeatures, domainDependentFeatures)
        #print "creating SquareAffinityMatrix..."
        a = self._createSquareAffinityMatrix(a)
        #print "creating DiagonalMatrix..."
        b = self._createDiagonalMatrix(a)
        #print "multiplying..." 
        c = b.dot(a)
        del a
        c = c.dot(b)
        del b
        #print "calculating eigenvalues and eigenvectors"
        eigenValues,eigenVectors = eigsh(c, k=K, which="LA")
        del c
        #print "building document vectors..."
        documentVectorsTraining,classificationsTraining = self._createDocumentVectors(domainDependentFeatures, domainIndependentFeatures,self._sourceDomain)
        documentVectorsTesting,classificationsTesting = self._createDocumentVectors(domainDependentFeatures, domainIndependentFeatures,self._targetDomain)
        #print "training and testing..."
        U  = [eigenVectors[:,x].reshape(np.size(eigenVectors,0),1) for x in eigenValues.argsort()[::-1]]
        U = np.concatenate(U,axis=1)[:numDomainDep]
        U = sparse.csr_matrix(U)
        clustering = [vector[1].dot(U).dot(Y).astype(np.float64) for vector in documentVectorsTraining]
        trainingVectors = [sparse.hstack((documentVectorsTraining[x][0],documentVectorsTraining[x][1],clustering[x])) for x in range(np.size(documentVectorsTraining,axis=0))]
        clustering = [vector[1].dot(U).dot(Y).astype(np.float64) for vector in documentVectorsTesting]
        testVectors = [sparse.hstack((documentVectorsTesting[x][0],documentVectorsTesting[x][1],clustering[x])) for x in range(np.size(documentVectorsTesting,axis=0))]
        self._trainClassifier(trainingVectors, classificationsTraining)
        print "accuracy: %.2f with K=%i AND DI=%i AND Y=%.1f AND minFreq=%i" % (self._testClassifier(testVectors,classificationsTesting)*100,K,DI,Y,minFreq)
Example #16
print

# # Feature selection for the L1 dataset
# select_chi2 = 1000
# print ("Extracting %d best features by a chi-squared test" % select_chi2)
# t0 = time()
# ch2 = SelectKBest(chi2, k = select_chi2)
# X_L1 = ch2.fit_transform(X_L1, y_L1)
# print "Done in %fs" % (time() - t0)
# print "L1:      n_samples: %d, n_features: %d" % X_L1.shape
# print

# Train L1 classifier
print "Training L1 Classifier..."
t0 = time()
clf = LinearSVC(loss='l2', penalty='l2', C=1000, dual=False, tol=1e-3)
print clf
clf.fit(X_L1, y_L1)
train_time = time() - t0
print "Train time: %0.3fs" % train_time
print

# Train L2 classifiers
print "Training L2 Classifiers..."
t0 = time()

# comment out all linearSVC
# clf_ca = LinearSVC(loss='l2', penalty='l2', C=1000, dual=False, tol=1e-3)
# clf_collect = LinearSVC(loss='l2', penalty='l2', C=256, dual=False, tol=1e-2)
# clf_cookies = LinearSVC(loss='l2', penalty='l2', C=1000, dual=False, tol=1e-3)
# clf_share = LinearSVC(loss='l2', penalty='l2', C=1000, dual=False, tol=1e-3)
Example #17
def find_best_lsvc(**params):
    parameters = {'C': [0.01, 0.1, 1, 10, 100, 1000]}
    return GridSearchCV(LinearSVC(**params), parameters)
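# Hypothetical usage of the helper above (X and y stand for a feature matrix and labels;
# with a recent scikit-learn the selected parameters are exposed as best_params_):
search = find_best_lsvc(dual=False, tol=1e-3)
search.fit(X, y)
print(search.best_params_)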
Example #18
# N: number for training examples; K: number of models in level 0
# X: feature matrix; y: result array; z_k: prediction result array for k's model
# 

# Setup 10 fold cross validation
fold_num = 10
kf = KFold(n_samples, k=fold_num, indices=True)

# set number of neighbors for kNN
n_neighb = 19

# Brute-force implementation
clf_mNB = MultinomialNB(alpha=.01)
clf_kNN = KNeighborsClassifier(n_neighbors=n_neighb)
clf_ridge = RidgeClassifier(tol=1e-1)
clf_lSVC = LinearSVC(loss='l2', penalty='l2', C=0.5, dual=False, tol=1e-3)
clf_SVC = SVC(C=32, gamma=0.0625)
# clf_SGD = SGDClassifier(alpha=.0001, n_iter=50, penalty="l2")

# empty ndarrays for predication results z_kn
z_mNB = np.array([], dtype=np.int32)
z_kNN = np.array([], dtype=np.int32)
z_ridge = np.array([], dtype=np.int32)
z_lSVC = np.array([], dtype=np.int32)
z_SVC = np.array([], dtype=np.int32)


###############################################################################
# Stacking
# 
# initialize empty y and z
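# A minimal sketch of the level-0 stacking step this setup is building toward, assuming a
# recent scikit-learn (X and y stand for the feature matrix and labels prepared earlier):
# each base model's out-of-fold predictions become one column of the meta-feature matrix z.
import numpy as np
from sklearn.model_selection import cross_val_predict
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

level0 = [MultinomialNB(alpha=.01), LinearSVC(C=0.5, dual=False, tol=1e-3)]
z = np.column_stack([cross_val_predict(clf, X, y, cv=fold_num) for clf in level0])
# A level-1 classifier can then be trained on z (optionally stacked next to X).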
class SpectralFeatureAlignment():

    def __init__(self, dbDir, rawDataFolder, sourceDomain, targetDomain):
        self._dbDir = dbDir
        self._sourceDomain = sourceDomain
        self._rawDataFolder = rawDataFolder
        self._targetDomain = targetDomain
        self._tableName = sourceDomain + "to" + targetDomain
        self._connection = sqlite.connect(path.join(dbDir,sourceDomain))
        self._cursor = self._connection.cursor()
        self._lsvc = LinearSVC(C=10000)
        self._featuresWithSynsets = {}
        self._featuresWithoutSynsets = {}
        self._allSynsets = []

    def _getFeatures(self, maxDIFeatures=500, minFrequency=5):
        features = []
        self._cursor.execute("SELECT term FROM " +self._tableName+ " WHERE freqSource + freqTarget >= ?", [minFrequency])
        features = [a[0] for a in self._cursor.fetchall()]
        self._cursor.execute("SELECT term FROM mostinformatives") 
        mostInformatives = set([a[0] for a in self._cursor.fetchall()][30000:-30000])
        features = [feature for feature in features if feature not in mostInformatives]
        return sorted(features[:maxDIFeatures]), sorted(features[maxDIFeatures:])

    def _getSynsets(self, domainIndependentFeatures, minSyn):
	#unigramTagger = UnigramTagger(brown.tagged_sents(simplify_tags=True))
        #bigramTagger = BigramTagger(brown.tagged_sents(simplify_tags=True), backoff=unigramTagger)
	#taggedBigrams = [bigramTagger.tag(feature.split('_')) for feature in domainIndependentFeatures if "_" in feature and "<" not in feature]
        #tmp = ("PRO", "CNJ", "DET", "EX", "MOD", "P", "TO")
        #for x in taggedBigrams:
            #firstWord,firstTag = x[0]
            #secondWord,secondTag = x[1]
            #feature = "_".join((firstWord,secondWord))
            #if firstTag in tmp and secondTag not in tmp:
                #self._featuresWithSynsets[feature] = wn.synsets(secondWord)
            #elif firstTag not in tmp and secondTag in tmp:
                #self._featuresWithSynsets[feature] = wn.synsets(firstWord)


        Bigrams = [feature for feature in domainIndependentFeatures if "_" in feature and "<" not in feature]
        #filterWords = ("a", "and", "are", "be", "has", "have", "i", "is", "it", "of", "the", "to", "will", "had", "as", "my", "that", "was")
        stopwordList = set(stopwords.words("english")) - set(("no", "nor", "not"))
        for bigram in Bigrams:
            firstWord, secondWord = bigram.split("_")
            if firstWord in stopwordList and secondWord in stopwordList:
                pass
            elif firstWord in stopwordList:
                self._featuresWithSynsets[bigram] = wn.synsets(secondWord)
            elif secondWord in stopwordList:
                self._featuresWithSynsets[bigram] = wn.synsets(firstWord)

        self._featuresWithSynsets = {feature:[str(synset) for synset in synsets] for feature,synsets in self._featuresWithSynsets.items() if synsets}
        unigrams = [feature for feature in domainIndependentFeatures if "_" not in feature]
        for unigram in unigrams:
            synsets = wn.synsets(unigram)
            if synsets:
                self._featuresWithSynsets[unigram] = [str(synset) for synset in synsets]

        allSynsets = [synsets for sublist in self._featuresWithSynsets.values() for synsets in sublist]
        allSynsets = set([synset for synset in allSynsets if allSynsets.count(synset) >= minSyn])
        self._featuresWithSynsets = {feature:set(synsets) & allSynsets for feature,synsets in self._featuresWithSynsets.items() if set(synsets) & allSynsets}
        self._featuresWithoutSynsets = sorted(set(domainIndependentFeatures) - set(self._featuresWithSynsets.keys()))
        return sorted(allSynsets)

    def _createCooccurrenceMatrix(self, domainIndependentFeatures, domainDependentFeatures):
        domainIndependentFeaturesSet = set(domainIndependentFeatures)
        domainDependentFeaturesSet = set(domainDependentFeatures)
        numSyn = len(self._allSynsets)
        def __parseFile(filePath):
            with open(filePath, "r") as f:
                for review in f:
                        reviewFeatures = set([tupel.split(":")[0].decode("utf-8") for tupel in review.split()])
                        independentFeatures = reviewFeatures & domainIndependentFeaturesSet
                        dependentFeatures = reviewFeatures & domainDependentFeaturesSet
                        for dependentFeature in dependentFeatures:
                            rowIndex = bisect_left(domainDependentFeatures,dependentFeature)
                            for independentFeature in independentFeatures:
                                if independentFeature in self._featuresWithSynsets:
                                    for synset in self._featuresWithSynsets[independentFeature]:
                                        matrix[rowIndex, bisect_left(self._allSynsets,synset)] += 1
                                else:
                                    matrix[rowIndex, bisect_left(self._featuresWithoutSynsets,independentFeature)+numSyn] += 1
                        
        matrix = np.zeros((len(domainDependentFeatures), len(self._featuresWithoutSynsets)+numSyn))
        __parseFile(path.join(self._rawDataFolder, self._sourceDomain, "positive.review"))
        __parseFile(path.join(self._rawDataFolder, self._sourceDomain, "negative.review"))
        __parseFile(path.join(self._rawDataFolder, self._targetDomain, "positive.review"))
        __parseFile(path.join(self._rawDataFolder, self._targetDomain, "negative.review"))
        return sparse.coo_matrix(matrix)

    def _createSquareAffinityMatrix(self, cooccurrenceMatrix):
       height = np.size(cooccurrenceMatrix, 0) 
       width = np.size(cooccurrenceMatrix, 1) 
       topMatrix = sparse.coo_matrix((height,height))
       topMatrix = sparse.hstack((topMatrix,cooccurrenceMatrix))
       bottomMatrix = sparse.coo_matrix((width,width))
       bottomMatrix = sparse.hstack((cooccurrenceMatrix.transpose(), bottomMatrix))
       matrix = sparse.vstack((topMatrix, bottomMatrix))
       return matrix
   
    def _createDiagonalMatrix(self, squareAffinityMatrix):
        rows = range(squareAffinityMatrix.get_shape()[0])
        data = [0. if rowSum == 0 else np.sqrt(1.0 / rowSum) for rowSum in np.array(squareAffinityMatrix.sum(1)).reshape(-1,)]
        return sparse.coo_matrix((data,(rows,rows)),shape=(squareAffinityMatrix.get_shape()[0],squareAffinityMatrix.get_shape()[1]))

    def _createDocumentVectors(self,domainDependentFeatures, domainIndependentFeatures, domain):
        numDomainDep = len(domainDependentFeatures)
        numDomainIndep = len(domainIndependentFeatures)
        domainDepSet = set(domainDependentFeatures)
        domainIndepSet = set(domainIndependentFeatures)
        documentVectors = []
        classifications = []
        numSynsets = len(self._allSynsets)
        def __parseFile(filePath):
            with open(filePath,"r") as f:
                for review in f:
                    classification = 1 if "#label#:positive" in review else -1
                    reviewList = [tupel.split(":") for tupel in review.split() if "#label#" not in tupel]
                    reviewDict = {x[0].decode("utf-8"):int(x[1]) for x in reviewList}
                    reviewFeatures = set(reviewDict.keys())
                    domainDepReviewFeatures = domainDepSet & reviewFeatures
                    domainIndepReviewFeatures = domainIndepSet & reviewFeatures
                    domainDepValues,domainDepIndizes = [],[]
                    domainIndepValues, domainIndepIndizes = [],[]
                    for feature in domainIndepReviewFeatures:
                        if feature in self._featuresWithSynsets:
                            for synset in self._featuresWithSynsets[feature]:
                                domainIndepIndizes.append(bisect_left(self._allSynsets,synset))
                                domainIndepValues.append(1)
                        else:
                            domainIndepIndizes.append(bisect_left(self._featuresWithoutSynsets,feature)+numSynsets)
                            domainIndepValues.append(1)
                            #domainIndepValues.append(reviewDict[feature])
                    for feature in domainDepReviewFeatures:
                        #domainDepValues.append(reviewDict[feature])
                        domainDepValues.append(1)
                        domainDepIndizes.append(bisect_left(domainDependentFeatures,feature))
                    domainIndepVector = sparse.csr_matrix((domainIndepValues,(np.zeros(len(domainIndepIndizes)),domainIndepIndizes)),
                            shape=(1,len(self._featuresWithoutSynsets)+numSynsets))
                    domainDepVector = sparse.csr_matrix((domainDepValues,(np.zeros(len(domainDepIndizes)),domainDepIndizes)),shape=(1,numDomainDep))
                    documentVectors.append((domainIndepVector,domainDepVector))
                    classifications.append(classification)

        __parseFile(path.join(self._rawDataFolder, domain, "positive.review"))
        __parseFile(path.join(self._rawDataFolder, domain, "negative.review"))
        return documentVectors,classifications 

    def _trainClassifier(self, trainingVectors, classifications):
        self._lsvc.fit(sparse.vstack(trainingVectors),classifications)

    def _testClassifier(self,testVectors,classifications):
        return self._lsvc.score(sparse.vstack(testVectors),classifications)




    def go(self,K=100, Y=6, DI=500, minFreq=5, minSyn=10):
        print self._sourceDomain + " -> " + self._targetDomain
        domainIndependentFeatures, domainDependentFeatures = self._getFeatures(DI,minFreq)
        numDomainIndep = len(domainIndependentFeatures)
        numDomainDep = len(domainDependentFeatures)
        #print "number of independent features %i, number of dependent features %i" % (numDomainIndep, numDomainDep)
        #print "finding synsets..."
        self._allSynsets = self._getSynsets(domainIndependentFeatures, minSyn)
        print self._featuresWithSynsets
        for k,v in self._featuresWithSynsets.items():
            print str(k) + " : " + str(v)
        if not self._allSynsets:
            return
        #print "creating cooccurrenceMatrix..."
        a = self._createCooccurrenceMatrix(domainIndependentFeatures, domainDependentFeatures)
        #print "creating SquareAffinityMatrix..."
        a = self._createSquareAffinityMatrix(a)
        #print "creating DiagonalMatrix..."
        b = self._createDiagonalMatrix(a)
        #print "multiplying..." 
        c = b.dot(a)
        del a
        c = c.dot(b)
        del b
        #print "calculating eigenvalues and eigenvectors"
        eigenValues,eigenVectors = eigsh(c, k=K, which="LA")
        del c
        #print "building document vectors..."
        documentVectorsTraining,classificationsTraining = self._createDocumentVectors(domainDependentFeatures, domainIndependentFeatures,self._sourceDomain)
        documentVectorsTesting,classificationsTesting = self._createDocumentVectors(domainDependentFeatures, domainIndependentFeatures,self._targetDomain)
        #print "training and testing..."
        U  = [eigenVectors[:,x].reshape(np.size(eigenVectors,0),1) for x in eigenValues.argsort()[::-1]]
        U = np.concatenate(U,axis=1)[:numDomainDep]
        U = sparse.csr_matrix(U)
        clustering = [vector[1].dot(U).dot(Y).astype(np.float64) for vector in documentVectorsTraining]
        trainingVectors = [sparse.hstack((documentVectorsTraining[x][0],documentVectorsTraining[x][1],clustering[x])) for x in range(np.size(documentVectorsTraining,axis=0))]
        self._trainClassifier(trainingVectors,classificationsTraining)
        clustering = [vector[1].dot(U).dot(Y).astype(np.float64) for vector in documentVectorsTesting]
        testVectors = [sparse.hstack((documentVectorsTesting[x][0],documentVectorsTesting[x][1],clustering[x])) for x in range(np.size(documentVectorsTesting,axis=0))]
        print "accuracy: %.2f with K=%i AND DI=%i AND Y=%.1f AND minFreq=%i AND minSyn=%i" % (self._testClassifier(testVectors,classifications)*100,K,DI,Y,minFreq,minSyn)
Example #20
    print
    return score, train_time, test_time


for clf, name in ((RidgeClassifier(tol=1e-1), "Ridge Classifier"),
                  (KNeighborsClassifier(n_neighbors=10), "kNN")):
    print 80 * '='
    print name
    results = benchmark(clf)

for penalty in ["l2", "l1"]:
    print 80 * '='
    print "%s penalty" % penalty.upper()
    # Train Liblinear model
    liblinear_results = benchmark(
        LinearSVC(loss='l2', penalty=penalty, C=1000, dual=False, tol=1e-3))

    # Train SGD model
    sgd_results = benchmark(
        SGDClassifier(alpha=.0001, n_iter=50, penalty=penalty))

# Train SGD with Elastic Net penalty
print 80 * '='
print "Elastic-Net penalty"
sgd_results = benchmark(
    SGDClassifier(alpha=.0001, n_iter=50, penalty="elasticnet"))

# Train sparse Naive Bayes classifiers
print 80 * '='
print "Naive Bayes"
mnnb_results = benchmark(MultinomialNB(alpha=.01))
Example #21
X_train = X_train.toarray()
X_test = X_test.toarray()


# X = X.toarray()
# X_den = X.toarray()
n_samples, n_features = X_train.shape

###############################################################################
# Test classifier on test dataset

# clf = DecisionTreeClassifier(max_depth=14, min_split=5)
# clf = MultinomialNB(alpha=.01)
# clf = KNeighborsClassifier(n_neighbors=19)
# clf = RidgeClassifier(tol=1e-1)
clf = LinearSVC(loss='l2', penalty='l2', C=0.5, dual=False, tol=1e-3)
# clf = SVC(C=32, gamma=0.0625)
print clf

t0 = time()
clf.fit(X_train, y_train)
print (time()-t0)
t1 = time()
pred = clf.predict(X_test)
print (time()-t1)

pre_score = metrics.precision_score(y_test, pred)
rec_score = metrics.recall_score(y_test, pred)

print "average f1-score:   %0.2f" % (100*((2*pre_score*rec_score)/(pre_score+rec_score)))
print "average f5-score:   %0.2f" % (100*((1.25*pre_score*rec_score)/(0.25*pre_score+rec_score)))
# 

# Setup 10 fold cross validation
fold_num = 10
kf = KFold(n_samples, k=fold_num, indices=True)

# set number of neighbors for kNN
n_neighb = 13

# Brute-force implementation
clf_bNB     = BernoulliNB(alpha=.01)
clf_mNB     = MultinomialNB(alpha=.01)
clf_kNN     = KNeighborsClassifier(n_neighbors=n_neighb)
clf_ridge   = RidgeClassifier(tol=1e-1)
clf_SGD     = SGDClassifier(alpha=.0001, n_iter=50, penalty="l2")
clf_lSVC    = LinearSVC(loss='l2', penalty='l2', C=1000, dual=False, tol=1e-3)
clf_SVC     = SVC(C=1024, kernel='rbf', degree=3, gamma=0.001, probability=True)


###############################################################################
# Stacking
# 
# initialize empty y and z

print 'X_den shape: ', X_den.shape
print 'y shape:     ', y.shape

n_categories = len(set(y))
z = np.array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=float)
# z = np.zeros( (n_samples, n_categories) , dtype=float)
Example #23
categories = ['HUM', 'LOC', 'NUM', 'ENTY', 'DESC', 'ABBR']

train = load_files('coarse/',
                   categories=categories,
                   shuffle=True,
                   random_state=42)
# save train pickle
filehandler = open('pickle_training_coarse.pkl', 'wb')
pickle.dump(train, filehandler)
filehandler.close()

text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LinearSVC()),
])

_ = text_clf.fit(train.data, train.target)

# save text_clf pickle
filehandler = open('pickle_clf_coarse.pkl', 'wb')
pickle.dump(text_clf, filehandler)
filehandler.close()

#new = ['Where is the Amazon river located?',
#       'Where can I get a good sandwhich',
#       'In what state was Columbus born?',
#       'What is the best cheese?']

text = """
 def predict(self, X):
     X = self.transformer_.transform(X)
     return LinearSVC.predict(self, X)
Example #25
# split ~140k into ~100k training and ~40k test
ff_train, ff_val = split_dataframe(test_ff)

print("Training...")

t1 = time()

vectorizer = CountVectorizer()
train_counts = vectorizer.fit_transform(ff_train["TitlePlusBody"])
tfidf_transformer = TfidfTransformer(use_idf=False)

# 98190x285052
train_tfidf_table = tfidf_transformer.fit_transform(train_counts)

clf = LinearSVC().fit(train_tfidf_table, ff_train["OpenStatus"])

print("Testing...")

test_counts = vectorizer.transform(ff_val["TitlePlusBody"])
test_tfidf_table = tfidf_transformer.transform(test_counts)

predict = clf.predict(test_tfidf_table)
print("np.mean: %f" % (np.mean(predict == ff_val["OpenStatus"])))

linear_decisions = clf.decision_function(test_tfidf_table)
predicted_probs = (1 / (1 + np.exp(- linear_decisions))) ** 3.5
print("MCLL: %f" % (mcll(predicted_probs, ff_val["OpenStatus"].values)))

t2 = time()
print("done in %d seconds" % (t2 - t1))