Example #1
import logging
import time

from sklearn.svm import SVC

log = logging.getLogger(__name__)


def classify(features, results, test_features, test_results, C, gamma):
    """Train an RBF-kernel SVM and return the test error rate in percent."""
    cli = "%s@%s" % (C, gamma)
    st = time.time()
    log.info("Classifier begins")
    classifier = SVC(C=C, gamma=gamma, kernel="rbf")
    classifier.fit(features, results)
    st2 = time.time()
    prediction = classifier.predict(test_features)
    log.info("id: %s Training time: %s, Prediction time: %s"
             % (cli, st2 - st, time.time() - st2))
    # Count misclassified test samples.
    error = 0
    for index, value in enumerate(prediction):
        if test_results[index] != value:
            error += 1
    return (error / float(len(test_results))) * 100
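A brief usage sketch (the toy dataset below is an illustrative assumption, not part of the original example):

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# Toy data purely for illustration.
X, y = make_classification(n_samples=200, n_features=10, random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0)
print(classify(X_tr, y_tr, X_te, y_te, C=10, gamma=0.01))  # error rate in %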
Example #2
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC


def find_best_svc(**params):
    """Build a grid search over common SVC hyperparameters (returned unfitted)."""
    parameters = {
        'C': [0.01, 0.1, 1, 10, 100, 1000],
        'kernel': ['poly', 'rbf', 'sigmoid'],
        'gamma': [0.0001, 0.001, 0.01, 0.1, 0.5],
    }
    return GridSearchCV(SVC(**params), parameters)
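The returned GridSearchCV is fitted like any estimator; a minimal usage sketch (X_train and y_train are assumed to exist already):

search = find_best_svc(probability=True)  # extra kwargs are passed straight to SVC
search.fit(X_train, y_train)
print(search.best_params_, search.best_score_)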
Example #3
import sklearn.metrics
from sklearn.datasets import load_svmlight_file
from sklearn.svm import SVC


def classify(train_file, test_file):
    """
    Train a model and test it.

    train_file: file that the model is trained on
    test_file: file that is used to test the model
    """
    X_train, y_train = load_svmlight_file(train_file)
    # Pass n_features so the test matrix gets the same width as the train matrix.
    X_test, y_test = load_svmlight_file(test_file, n_features=X_train.shape[1])
    # Modern scikit-learn's SVC accepts sparse matrices directly (it replaces
    # the old SparseSVC alias), so no .todense() conversion is needed.
    clf = SVC(kernel="linear", C=0.2)
    # clf = LogisticRegression(C=1.0, penalty='l1', tol=1e-6)
    clf.fit(X_train, y_train)
    y_predict = clf.predict(X_test)
    print(sklearn.metrics.classification_report(y_test, y_predict))
    print(sklearn.metrics.confusion_matrix(y_test, y_predict))
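For illustration only, a hypothetical pair of svmlight-format files (one line per sample: label followed by index:value pairs) run through classify:

toy = "1 1:0.5 2:1.0\n-1 1:-0.5 2:-1.0\n"
with open("toy_train.svm", "w") as f:
    f.write(toy * 5)   # repeat the two rows so each class has several samples
with open("toy_test.svm", "w") as f:
    f.write(toy)
classify("toy_train.svm", "toy_test.svm")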
Example #4
import numpy as np
from sklearn.model_selection import KFold
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import RidgeClassifier, SGDClassifier
from sklearn.svm import LinearSVC, SVC

# Setup 10 fold cross validation
fold_num = 10
kf = KFold(n_splits=fold_num)

# set number of neighbors for kNN
n_neighb = 13

# Base classifiers, one per model family
clf_bNB     = BernoulliNB(alpha=.01)
clf_mNB     = MultinomialNB(alpha=.01)
clf_kNN     = KNeighborsClassifier(n_neighbors=n_neighb)
clf_ridge   = RidgeClassifier(tol=1e-1)
clf_SGD     = SGDClassifier(alpha=.0001, max_iter=50, penalty="l2")
clf_lSVC    = LinearSVC(loss='squared_hinge', penalty='l2', C=1000, dual=False, tol=1e-3)
clf_SVC     = SVC(C=1024, kernel='rbf', degree=3, gamma=0.001, probability=True)


###############################################################################
# Stacking
#
# initialize empty y and z

print('X_den shape: ', X_den.shape)
print('y shape:     ', y.shape)

n_categories = len(set(y))
# One placeholder row of zeros, one column per category
# (the original hard-coded a 1 x 14 row).
z = np.zeros((1, n_categories), dtype=float)
# z = np.zeros((n_samples, n_categories), dtype=float)

# Test for 10 rounds using the results from 10 fold cross validations
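The example stops at this comment; a minimal sketch of the per-fold accumulation it appears to describe, reusing the names above (X_den, y, kf, clf_mNB, z) under modern scikit-learn — the loop body itself is an assumption, not recovered source:

for train_idx, test_idx in kf.split(X_den):
    clf_mNB.fit(X_den[train_idx], y[train_idx])
    # Stack this fold's class-probability rows under those collected so far.
    z = np.vstack((z, clf_mNB.predict_proba(X_den[test_idx])))
z = z[1:]  # drop the placeholder row of zeros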
Example #5
# X: feature matrix; y: result array; z_k: prediction result array for model k

import numpy as np
from sklearn.model_selection import KFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.utils import shuffle

# Setup 10 fold cross validation
fold_num = 10
kf = KFold(n_splits=fold_num)

# set number of neighbors for kNN
n_neighb = 19

# Base classifiers, one per model family
clf_mNB = MultinomialNB(alpha=.01)
clf_kNN = KNeighborsClassifier(n_neighbors=n_neighb)
clf_ridge = RidgeClassifier(tol=1e-1)
clf_lSVC = LinearSVC(loss='squared_hinge', penalty='l2', C=0.5, dual=False, tol=1e-3)
clf_SVC = SVC(C=32, gamma=0.0625)
# clf_SGD = SGDClassifier(alpha=.0001, max_iter=50, penalty="l2")

# empty ndarrays for prediction results z_kn
z_mNB = np.array([], dtype=np.int32)
z_kNN = np.array([], dtype=np.int32)
z_ridge = np.array([], dtype=np.int32)
z_lSVC = np.array([], dtype=np.int32)
z_SVC = np.array([], dtype=np.int32)


###############################################################################
# Stacking
#
# initialize empty y and z
X_orig = X
y_orig = y

# Note: NBs are not working
# clf = DecisionTreeClassifier(max_depth=16, min_samples_split=5)
# clf = BernoulliNB(alpha=.1)  # used for grading classification
# clf = MultinomialNB(alpha=.01)
# clf = RandomForestClassifier(n_estimators=20, max_depth=None, min_samples_split=2, random_state=42)
# clf = OneVsRestClassifier(LogisticRegression(penalty='l1'))
# nn_num = math.ceil(n_samples / 30)
# clf = KNeighborsClassifier(n_neighbors=nn_num)
# clf = RidgeClassifier(tol=1e-1)
# clf = SGDClassifier(alpha=.0001, max_iter=50, penalty="l1")
# clf = LinearSVC(loss='squared_hinge', penalty='l2', C=0.5, dual=False, tol=1e-3)
clf = SVC(C=32, gamma=0.0625)
# print(clf)

num_run = 10

# lists to hold all n*k data
f1_total = []
f5_total = []
acc_total = []
pre_total = []
rec_total = []

# 10 runs of KFold; reshuffle the data with a different seed each run
for i in range(num_run):

    X, y = shuffle(X_orig, y_orig, random_state=(i + 60))
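The example is truncated inside this loop; a sketch of how one shuffled run could continue, assuming `from sklearn import metrics` and macro averaging (the bookkeeping below is inferred from the *_total lists above, not recovered source):

    for train_idx, test_idx in kf.split(X):
        clf.fit(X[train_idx], y[train_idx])
        y_pred = clf.predict(X[test_idx])
        acc_total.append(metrics.accuracy_score(y[test_idx], y_pred))
        pre_total.append(metrics.precision_score(y[test_idx], y_pred, average='macro'))
        rec_total.append(metrics.recall_score(y[test_idx], y_pred, average='macro'))
        f1_total.append(metrics.f1_score(y[test_idx], y_pred, average='macro'))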
Example #7
"Other Information We Receive and Store : When you register to use MailChimp, we store 'cookies,' which are strings of code, on your computer. We also use electronic images known as Web beacons. With those cookies, we are aware of and collect information concerning when you visit our Website, when you use MailChimp, your browser type and version, your operating system and platform and other similar information. With Web beacons, we can determine when you open email we send you, and collect other data. You may turn off all cookies that have been placed on your computer by following the instructions on your browser on how to block cookies that have been placed on your computer. However, if you block our cookies it will be more difficult, and maybe impossible, to use the Services",
"EMC strives to keep your personal information accurate. We have implemented technology, management processes and policies to maintain data integrity. We will provide you with access to your information when reasonable, or in accordance with relevant laws, including making reasonable effort to provide you with online access and the opportunity to change your information. To protect your privacy and security, we will take steps to verify your identity before granting access or making changes to your personal information. To access and/or correct information, you can do so online or notify us via the appropriate method below depending on which site is at issue",
"Your information to our service providers. We use service providers who help us to provide you with our services. We give relevant persons working for some of these providers access to your information, but only to the extent necessary for them to perform their services for us. We also implement reasonable contractual and technical protections to ensure the confidentiality of your personal information and data is maintained, used only for the provision of their services to us, and handled in accordance with this privacy policy. Examples of service providers include payment processors, email service providers, and web traffic analytics tools",
"Some Microsoft sites allow you to choose to share your personal information with select Microsoft partners so that they can contact you about their products, services or offers. Other sites, such as MSN instead may give you a separate choice as to whether you wish to receive communications from Microsoft about a partner's particular offering (without transferring your personal information to the third party). See the Communication Preferences section below for more information.",
]

X_new = vectorizer.transform(docs_new)
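`vectorizer` must already be fitted on the training corpus for transform to work; a sketch of one way it could have been built (TfidfVectorizer and the train_docs/train_labels names are assumptions, not recovered source):

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5)
X = vectorizer.fit_transform(train_docs)  # hypothetical training documents
y = train_labels                          # hypothetical matching labels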


from time import time

from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import RidgeClassifier, SGDClassifier

# Train classifiers
print("Training Classifiers...")
t0 = time()

clf_nb = MultinomialNB()
clf_lsvc = LinearSVC(loss='squared_hinge', penalty='l2', C=1000, dual=False, tol=1e-3)
clf_svc = SVC(C=1024, kernel='rbf', degree=3, gamma=0.001, probability=True)
clf_rdg = RidgeClassifier(tol=1e-1)
clf_sgd = SGDClassifier(alpha=.0001, max_iter=50, penalty="l2")

# Logistic regression requires OneVsRestClassifier, which hides methods
# such as decision_function; using it as a candidate for multilabel
# classification would take extra implementation effort.
# clf_lgr = OneVsRestClassifier(LogisticRegression(C=1000, penalty='l1'))
# kNN has no decision function, by the nature of the algorithm.
# clf_knn = KNeighborsClassifier(n_neighbors=13)

# train
clf_nb.fit(X, y)
clf_lsvc.fit(X, y)
clf_rdg.fit(X, y)
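The snippet ends before the remaining fits and any prediction; a sketch of how it might continue, combining two decision_function score matrices by simple summation (the combination rule is an assumption; both matrices are (n_samples, n_classes) with columns in classes_ order for a multiclass problem):

clf_svc.fit(X, y)
clf_sgd.fit(X, y)

# Per-class decision scores for the new documents, summed across two models.
scores = clf_rdg.decision_function(X_new) + clf_lsvc.decision_function(X_new)
print(clf_rdg.classes_[scores.argmax(axis=1)])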