def runModel(X, y, model_name):
    nFolders = 5
    accs = []
    precs = []
    recalls = []
    F1s = []

    n = X.shape[0]
    for exp in range(0, nFolders):
        print '\n\n============================================================================================\nexperiment', exp
        ### 2.1 split training and testing data
        start = (int)((1 - (exp + 1) * 1.0 / nFolders) * n)
        end = (int)((1 - exp * 1.0 / nFolders) * n)
        #print n, start, end
        X_train, y_train, X_test, y_test = splitTrainTest(X, y, start, end)
        print 'Running', model_name
        if model_name == 'SVM':
            ### 2.2 build classifier
            clf = LinearSVC(penalty="l1", dual=False, tol=1e-7)
            clf.fit(X_train, y_train)
            ### 2.3 predict
            y_pred = clf.predict(X_test)
        if model_name == 'SVM_new':
            ### 2.2 build classifier
            clf = svm.SVC(C=1.0, gamma=1.0, class_weight='auto')
            clf.fit(X_train, y_train)
            ### 2.3 predict
            y_pred = clf.predict(X_test)
        elif model_name == 'NaiveBayes':
            clf = GaussianNB()
            clf.fit(X_train.todense(), y_train)
            y_pred = clf.predict(X_test.todense())
        elif model_name == 'LogisticRegression':
            clf = LogisticRegression(C=1.0, penalty='l1', tol=0.01)
            clf.fit(X_train.toarray(), y_train)
            y_pred = clf.predict(X_test.toarray())
        else:
            raise Exception("The model name is incorrect!!!")
        ### 2.4 eval
        acc, prec, recall, F1 = eval(y_test, y_pred)
        print 'Acc = ', acc
        print 'Precision =', prec
        print 'Recall=', recall
        print 'F1 =', F1
        accs.append(acc)
        precs.append(prec)
        recalls.append(recall)
        F1s.append(F1)

    print '\n\n\n'
    print 'avg Acc = ', sum(accs) / len(accs)
    print 'avg Precision = ', sum(precs) / len(precs)
    print 'avg Recall = ', sum(recalls) / len(recalls)
    print 'avg F1 = ', sum(F1s) / len(F1s)
    return sum(accs) / len(accs), sum(precs) / len(precs), sum(recalls) / len(
        recalls), sum(F1s) / len(F1s)
def runModel(X, y, model_name):
    nFolders = 5
    accs = []
    precs = []
    recalls = []
    F1s = []

    n = X.shape[0]
    for exp in range(0, nFolders):
        print '\n\n============================================================================================\nexperiment' , exp
        ### 2.1 split training and testing data
        start = (int)((1-(exp+1) * 1.0/nFolders)*n)
        end = (int)((1-exp * 1.0/nFolders)*n)
        #print n, start, end
        X_train, y_train, X_test, y_test = splitTrainTest(X, y, start, end)
        print 'Running', model_name
        if model_name == 'SVM':
            ### 2.2 build classifier
            clf = LinearSVC(penalty="l1", dual=False, tol=1e-7)
            clf.fit(X_train, y_train)
            ### 2.3 predict
            y_pred = clf.predict(X_test)
        if model_name == 'SVM_new':
            ### 2.2 build classifier
            clf = svm.SVC(C = 1.0, gamma = 1.0, class_weight = 'auto')
            clf.fit(X_train, y_train)
            ### 2.3 predict
            y_pred = clf.predict(X_test)
        elif model_name == 'NaiveBayes':
            clf = GaussianNB()
            clf.fit(X_train.todense(), y_train)
            y_pred = clf.predict(X_test.todense())
        elif model_name == 'LogisticRegression':
            clf = LogisticRegression(C=1.0, penalty='l1', tol=0.01)
            clf.fit(X_train.toarray(), y_train)
            y_pred = clf.predict(X_test.toarray())
        else:
            raise Exception("The model name is incorrect!!!")
        ### 2.4 eval
        acc, prec, recall, F1 = eval(y_test, y_pred)
        print 'Acc = ', acc;
        print 'Precision =', prec;
        print 'Recall=', recall;
        print 'F1 =',  F1
        accs.append(acc)
        precs.append(prec)
        recalls.append(recall)
        F1s.append(F1)

    print '\n\n\n'
    print 'avg Acc = ', sum(accs)/len(accs)
    print 'avg Precision = ', sum(precs)/len(precs)
    print 'avg Recall = ', sum(recalls)/len(recalls)
    print 'avg F1 = ', sum(F1s)/len(F1s)
    return sum(accs)/len(accs), sum(precs)/len(precs),  sum(recalls)/len(recalls), sum(F1s)/len(F1s)
class Expander_LDA_multiclass(Expander_LDA_cossim):
    """
    Expander that feeds dense LDA vectors of labelled articles into a
    multi-class linear SVM, which then decides where the LDA vector of a
    test text belongs.
    """

    def __init__(self, ldaModelAll, expander_type=AcronymExpanderEnum.LDA_multiclass):
        # Parent initialisation first, then attach a fresh classifier.
        Expander_LDA_cossim.__init__(self, ldaModelAll, expander_type)
        self.classifier = LinearSVC()

    def transform(self, X):
        """Return the parent's transform output with every item densified."""
        dense_rows = []
        for sparse_row in Expander_LDA_cossim.transform(self, X):
            dense_rows.append(self._getDenseVector(sparse_row))
        return dense_rows

    def _getDenseVector(self, sparse_vec):
        """Expand one sparse vector to length ``self.ldaModel.num_topics``."""
        topic_count = self.ldaModel.num_topics
        return sparse2full(sparse_vec, topic_count)

    def fit(self, X_train, y_train):
        """Fit the wrapped classifier on the given vectors and labels."""
        svc = self.classifier
        svc.fit(X_train, y_train)

    def predict(self, X_test, acronym):
        """Return ``(labels, confidences)`` for ``X_test``.

        ``acronym`` is accepted but not used here. Confidences are derived
        from the classifier's decision function by
        ``_getConfidencesFromDecisionFunction``, which is defined outside
        this class.
        """
        svc = self.classifier
        labels = svc.predict(X_test)
        margins = svc.decision_function(X_test)
        return labels, self._getConfidencesFromDecisionFunction(labels, margins)
예제 #4
0
파일: linear_svc.py 프로젝트: sreev/lale
class LinearSVCImpl():
    """Thin wrapper that stores its hyperparameters and delegates to ``SKLModel``."""

    def __init__(self, penalty='l2', loss='squared_hinge', dual=True,
                 tol=0.0001, C=1.0, multi_class='ovr', fit_intercept=True,
                 intercept_scaling=1, class_weight='balanced', verbose=0,
                 random_state=None, max_iter=1000):
        # Keep the exact keyword set so it can be forwarded unchanged.
        self._hyperparams = dict(
            penalty=penalty,
            loss=loss,
            dual=dual,
            tol=tol,
            C=C,
            multi_class=multi_class,
            fit_intercept=fit_intercept,
            intercept_scaling=intercept_scaling,
            class_weight=class_weight,
            verbose=verbose,
            random_state=random_state,
            max_iter=max_iter,
        )
        self._wrapped_model = SKLModel(**self._hyperparams)

    def fit(self, X, y=None):
        """Fit the wrapped model; ``y`` is forwarded only when given. Returns ``self``."""
        fit_args = (X,) if y is None else (X, y)
        self._wrapped_model.fit(*fit_args)
        return self

    def predict(self, X):
        """Delegate prediction to the wrapped model."""
        model = self._wrapped_model
        return model.predict(X)

    def decision_function(self, X):
        """Delegate the decision function to the wrapped model."""
        model = self._wrapped_model
        return model.decision_function(X)
def classification_linear_svm(tweets,
                              train_index,
                              test_index,
                              labels_train,
                              random_state=None):
    """Label the test tweets with a linear SVM trained on TF-IDF features.

    The tweets selected by ``train_index`` are used to fit a
    ``TfidfVectorizer`` (custom ``tokenize`` tokenizer, no lowercasing, word
    analyzer) and a one-vs-rest ``LinearSVC``; the tweets selected by
    ``test_index`` are transformed with the same vectorizer and classified.

    Returns the predicted labels for the test tweets.
    """
    # Representation
    vectorizer = TfidfVectorizer(tokenizer=tokenize,
                                 lowercase=False,
                                 analyzer='word')

    train_docs = []
    for position in train_index:
        train_docs.append(tweets[position])

    test_docs = []
    for position in test_index:
        test_docs.append(tweets[position])

    # Vocabulary and IDF weights come from the training tweets only.
    train_features = vectorizer.fit_transform(train_docs)
    test_features = vectorizer.transform(test_docs)

    svm_model = LinearSVC(multi_class="ovr", random_state=random_state)
    print("Start SVM training")
    svm_model = svm_model.fit(train_features, labels_train)
    print("Finish SVM training")

    return svm_model.predict(test_features)
def applyModelandfit(tweet_list, tweet_label_list,all_tweet,model_name,filename):
    X, y, tweet_id_list = buildMatrixTrainAndTest(tweet_list, tweet_label_list, all_tweet)
    X_train = X[:len(y),:]
    y_train = y
    X_test =  X[len(y):,:]
    tweet_id_list_test = tweet_id_list[len(y):]
    print "number of training tweets are ", X_train.shape, len(y_train)
    if model_name == 'SVM':
        clf = LinearSVC(penalty="l1", dual=False, tol=1e-7)
        clf.fit(X_train, y_train)
    elif model_name == 'NaiveBayes':
        clf = GaussianNB()
        clf.fit(X_train.todense(), y_train)
    elif model_name == 'LogisticRegression':
        clf = LogisticRegression(C=1.0, penalty='l1', tol=0.01)
        clf.fit(X_train.toarray(), y_train)
    else:
        raise Exception("The model name is incorrect!!!")

    y_pred = clf.predict(X_test)
    print 'length of predict data is ', len(y_pred)
    with open(RESULT_FOLDER+'/'+filename+'_c.csv','wb') as fp:
        writer = csv.writer(fp, delimiter =",",quoting=csv.QUOTE_MINIMAL)
        for i, tweetid in enumerate(tweet_id_list_test):
            writer.writerow([tweetid, all_tweet[tweetid], y_pred[i]])
예제 #7
0
def runModel(X, y, S_data, model_name):

    f = open('r_' + "_" + model_name + '.txt', 'w')
    auc_score_all = []
    fold = S_data[1]
    Index_gen = S_data[0]
    label = ['0.0', '1.0']
    # note that python does not copy the generator,
    # so when it's in the end of the for loop the generator for S_data is also extruded!
    print 'Running', model_name
    for exp in range(0, fold):
        print "=" * 80, "\n", "experiment =", exp
        # getting one fold of indices from the index generator
        train_index, test_index = Index_gen.next()
        X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # fitting the model
        # 1 fit a model
        # 2 prediction
        if model_name == 'SVM':
            # LinearSVC take care of the multi class response by using one vs others method
            clf = LinearSVC(random_state=0).fit(X_train, y_train)
            y_pred = clf.predict(X_test)
        elif model_name == 'NaiveBayes':
            clf = GaussianNB()
            clf.fit(X_train.to_dense(), y_train)
            y_pred = clf.predict(X_test.to_dense())
        elif model_name == 'LogisticRegression':
            clf = LogisticRegression(C=1.0, penalty='l1', tol=0.01)
            clf.fit(X_train.as_matrix(), y_train)
            y_pred = clf.predict(X_test.as_matrix())
        else:
            raise Exception("The model name is incorrect!!!")
        ### 2.4 eval
        auc_score = roc_auc_score(y_test, y_pred, average=None)
        auc_score_all.append(auc_score)

    auc_ave = mean(array(auc_score_all), 0)

    print >> f, model_name, '\n', "=" * 80
    print >> f, 'avg auc = ', auc_ave
예제 #8
0
파일: rbm_probs.py 프로젝트: qyx268/plato
def get_svm_score(w, b_h, dataset):
    """
    Score a linear SVM that is trained on the hidden representation of a trained RBM.

    The hidden representation of an input batch is ``sigm(input.dot(w) + b_h)``.

    :param w: Weights
    :param b_h: Hidden biases
    :param dataset: A Dataset object
    :return: A scalar score
    """
    def to_hidden(visible):
        # Project visible inputs onto the hidden units.
        return sigm(visible.dot(w)+b_h)

    hidden_train = to_hidden(dataset.training_set.input)
    svm_model = LinearSVC()
    svm_model.fit(hidden_train, dataset.training_set.target)

    hidden_test = to_hidden(dataset.test_set.input)
    guesses = svm_model.predict(hidden_test)
    return percent_correct(dataset.test_set.target, guesses)
예제 #9
0
def get_svm_score(w, b_h, dataset):
    """
    Given a trained RBM, fit a linear SVM on the hidden activations of the
    training set and report how well it classifies the test set.

    :param w: Weights
    :param b_h: Hidden biases
    :param dataset: A Dataset object
    :return: A scalar score
    """
    # Train on the hidden representation sigm(x.w + b_h) of the training inputs.
    train_part = dataset.training_set
    train_activations = sigm(train_part.input.dot(w)+b_h)
    linear_svm = LinearSVC()
    linear_svm.fit(train_activations, train_part.target)

    # Evaluate on the same projection of the test inputs.
    test_part = dataset.test_set
    test_activations = sigm(test_part.input.dot(w)+b_h)
    test_predictions = linear_svm.predict(test_activations)
    return percent_correct(test_part.target, test_predictions)
예제 #10
0
def single_model_tuning(modelname, fold_nr):
    """
    The thread function that can be used for finding the best model hyperparameters, for a single, non-ensemble model,
    for a fixed preprocessor, this method requires the data to be split in folds first.

    parameters:
    :param str modelname: The name of the model to test.
    :param int fold_nr: The number of the fold.
    :return list<dict> results: A list of dictionaries containing the parameter setting and the mae.
    """
    # Init a best mae so far (for printing purposes)
    best = 10
    try:
        log('Fold: ' + str(fold_nr) + ': Loaded the cached preprocessed data.')
        X_train, X_val, y_train, y_val, rev_val = load_fold(fold_nr)
    except IOError:
        log('Fold: ' + str(fold_nr) + 'run "python kfold_prepr.py" first')
    results = []

    # Tune a model based on the command line argument
    if modelname == 'log':
        par = ParameterGrid({
            'logistic__C': np.logspace(-5.0, 5.0, num=11),
            'logistic__tol': np.logspace(-5.0, 5.0, num=11)
        })
        for a in list(par):
            logistic = LogisticRegression(solver='sag',
                                          n_jobs=NUM_THEADS,
                                          C=a['logistic__C'],
                                          tol=a['logistic__tol'])
            logistic.fit(X_train, y_train)
            predictions_val = logistic.predict(X_val)
            mae = mean_absolute_error(predictions_val, y_val)
            results.append({
                'logistic__C': a['logistic__C'],
                'logistic__tol': a['logistic__tol'],
                'mae': mae
            })
    elif modelname == 'ridge':
        par = ParameterGrid({'ridge__alpha': np.logspace(-5.0, 5.0, num=11)})
        for a in list(par):
            ridge = OrdinalRidge(a['ridge__alpha'])
            ridge.fit(X_train, y_train)
            predictions_val = ridge.predict(X_val)
            mae = mean_absolute_error(predictions_val, y_val)
            results.append({'ridge__alpha': a['ridge__alpha'], 'mae': mae})
    elif modelname == 'svc':
        par = ParameterGrid({
            'svc__C': np.logspace(-5.0, 5.0, num=11),
            'svc__tol': np.logspace(-5.0, 5.0, num=11)
        })
        for a in list(par):
            svc = LinearSVC(C=a['svc__C'], tol=a['svc__tol'])
            svc.fit(X_train, y_train)
            predictions_val = svc.predict(X_val)
            mae = mean_absolute_error(predictions_val, y_val)
            results.append({
                'svc__C': a['svc__C'],
                'svc__tol': a['svc__tol'],
                'mae': mae
            })
    elif modelname == 'lad':
        par = ParameterGrid({
            'lad__C': np.logspace(-5.0, 5.0, num=11),
            'lad__tol': np.logspace(-5.0, 5.0, num=11)
        })
        for a in list(par):
            svr_ = svm.LinearSVR(loss='squared_epsilon_insensitive')
            svr = LAD(svr_)  # use mord for rounding and clipping
            svr.fit(X_train, y_train)
            predictions_val = svr.predict(X_val)
            mae = mean_absolute_error(predictions_val, y_val)
            results.append({
                'lad__C': a['lad__C'],
                'lad__tol': a['lad__tol'],
                'mae': mae
            })
    elif modelname == 'final':
        # This is the tuning of the final ensemble, with fixing 0 rating predictions
        par = ParameterGrid({
            'logistic_lbfgs__C':
            np.logspace(-5.0, 5.0, num=11),
            'logistic_lbfgs__tol':
            np.logspace(-5.0, 5.0, num=11),
            'logistic_lbfgs_multinom__C':
            np.logspace(-5.0, 5.0, num=11),
            'logistic_lbfgs_multinom__tol':
            np.logspace(-5.0, 5.0, num=11),
            'logistic_sag_balanced__C':
            np.logspace(-5.0, 5.0, num=11),
            'logistic_sag_balanced__tol':
            np.logspace(-5.0, 5.0, num=11)
        })

        ensemble = VotingClassifier(estimators=[
            ('logistic_lbfgs',
             LogisticRegression(solver='lbfgs',
                                n_jobs=NUM_THEADS,
                                C=5,
                                tol=0.01)),
            ('logistic_lbfgs_multinom',
             LogisticRegression(solver='lbfgs',
                                n_jobs=NUM_THEADS,
                                C=5,
                                tol=0.01,
                                multi_class='multinomial')),
            ('logistic_sag_balanced',
             LogisticRegression(solver='sag',
                                n_jobs=NUM_THEADS,
                                C=5,
                                tol=0.01,
                                class_weight='balanced')),
        ],
                                    voting='soft',
                                    weights=[1, 1, 1])

        for a in list(par):
            ensemble.set_params(**a)
            ensemble.fit(X_train, y_train)
            predictions_val = ensemble.predict(X_val)
            predictions_val = fix_zero_predictions(predictions_val, rev_val)
            mae = mean_absolute_error(predictions_val, y_val)
            temp = a
            temp['mae'] = mae
            if mae < best:
                print temp
                best = mae
            results.append(temp)
    elif modelname == 'lbfgs_bal':
        clf = LogisticRegression(solver='lbfgs',
                                 n_jobs=NUM_THEADS,
                                 class_weight='balanced')
        par = ParameterGrid({
            'C': np.logspace(-1.0, 1.0, num=5),
            'tol': np.logspace(-3.0, -1.0, num=3)
        })
        for a in list(par):
            clf.set_params(**a)
            clf.fit(X_train, y_train)
            predictions_val = clf.predict(X_val)
            predictions_val = fix_zero_predictions(predictions_val, rev_val)
            mae = mean_absolute_error(predictions_val, y_val)
            temp = a
            temp['mae'] = mae
            if mae < best:
                print temp
                best = mae
            results.append(temp)
    elif modelname == 'lbfgs_multi':
        clf = LogisticRegression(solver='lbfgs',
                                 n_jobs=NUM_THEADS,
                                 multi_class='multinomial')
        par = ParameterGrid({
            'C': np.logspace(-5.0, 5.0, num=11),
            'tol': np.logspace(-5.0, 5.0, num=11)
        })
        for a in list(par):
            clf.set_params(**a)
            clf.fit(X_train, y_train)
            predictions_val = clf.predict(X_val)
            predictions_val = fix_zero_predictions(predictions_val, rev_val)
            mae = mean_absolute_error(predictions_val, y_val)
            temp = a
            temp['mae'] = mae
            if mae < best:
                print temp
                best = mae
            results.append(temp)
    elif modelname == 'sag_bal':
        clf = LogisticRegression(solver='sag',
                                 n_jobs=NUM_THEADS,
                                 class_weight='balanced')
        par = ParameterGrid({
            'C': np.logspace(-5.0, 5.0, num=11),
            'tol': np.logspace(-5.0, 5.0, num=11)
        })
        for a in list(par):
            clf.set_params(**a)
            clf.fit(X_train, y_train)
            predictions_val = clf.predict(X_val)
            predictions_val = fix_zero_predictions(predictions_val, rev_val)
            mae = mean_absolute_error(predictions_val, y_val)
            temp = a
            temp['mae'] = mae
            if mae < best:
                print temp
                best = mae
            results.append(temp)
    elif modelname == 'nb':
        clf = MultinomialNB()
        par = ParameterGrid(
            {'alpha': [0, 0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 1, 1.5]})
        for a in list(par):
            clf.set_params(**a)
            clf.fit(X_train, y_train)
            predictions_val = clf.predict(X_val)
            predictions_val = fix_zero_predictions(predictions_val, rev_val)
            mae = mean_absolute_error(predictions_val, y_val)
            temp = a
            temp['mae'] = mae
            if mae < best:
                print temp
                best = mae
            results.append(temp)
    else:
        print "model name not defined"
        return None
    return results
예제 #11
0
print "tain time: ", round(train_r1 - train_r0, 3), "s"
print "prediction time: ", round(test_r1 - test_r0, 3), "s"
print "#################################"
'''
# Script section: fit a LinearSVC on the prepared feature/label sets, time
# the fit and the prediction, and print accuracy and timings.
#SVC lib_linear
print("lib_linear")
clf_lib=LinearSVC()

# training
# train_l0 / train_l1 bracket the fit call (time() is presumably time.time
# -- confirm against the file's imports).
train_l0 = time()
clf_lib.fit(features_train, labels_train)
train_l1 = time()

# prediction or testing
# test_l0 / test_l1 bracket the predict call on the held-out features.
test_l0 = time()
predict = clf_lib.predict(features_test)
test_l1 = time()

# score() is evaluated on the test features and labels.
print "accuracy: ", clf_lib.score(features_test, labels_test)
print "#################################"
# NOTE(review): "tain" in the label below looks like a typo for "train".
print "tain time: ", round(train_l1 - train_l0, 3), "s"
print "prediction time: ", round(test_l1 - test_l0, 3), "s"

######################################
# Single-sentence check: the classifier is RE-FITTED on whatever
# preprocess_input returns for this one text, then used to predict it.
text = 'اى هبل'
Ifeatures_train,Ifeatures_test,Ilabels_train=preprocess_input([text])
clf_lib.fit(Ifeatures_train,Ilabels_train)
# str(...)[1] takes the second character of the prediction array's string
# form. NOTE(review): with the Python 2 print statement used elsewhere in
# this script, the parenthesised pair is printed as a tuple.
print ("prediction of ",str(clf_lib.predict(Ifeatures_test))[1])

# Earlier variants of the prediction print, kept disabled:
#print "prediction of ", clf.predict(preprocess_input(text))
# print  str(clf.predict(Ifeatures_test))[1]
예제 #12
0
# Disabled experiment, kept for reference: split the first 10 entries of
# dataset.letra into tokens and label-encode the first one.
#test_encode = []
#cont = 0
#while cont < 10:
#    test_encode.append(dataset.letra[cont].split())
#    cont += 1

#teste = label_encoder.transform(test_encode[0])

# Routines that feed the OneHotEncoder
onehot = OneHotEncoder()

# Reshape both integer-encoded arrays into single-column 2-D arrays before
# they are passed to the encoder.
int_encoded_fit = int_encoded_fit.reshape(len(int_encoded_fit), 1)
int_encoded_pred = int_encoded_pred.reshape(len(int_encoded_pred), 1)

# The encoder is fitted on the training column only and then applied
# unchanged to the prediction column.
letra_fit = onehot.fit_transform(int_encoded_fit)
letra_pred = onehot.transform(int_encoded_pred)

# Using the SVM (clf is created before this section)
clf.fit(letra_fit, label_train)

prediction = clf.predict(letra_pred)

# Report weighted-average recall/precision/F1 and plain accuracy against
# label_test.
print()
print("Recall {}".format(
    recall_score(label_test, prediction, average='weighted')))
print("Precision {}".format(
    precision_score(label_test, prediction, average='weighted')))
print("F1 {}".format(f1_score(label_test, prediction, average='weighted')))
print("Accuracy {}".format(accuracy_score(label_test, prediction)))
예제 #13
0
print "tain time: ", round(train_r1 - train_r0, 3), "s"
print "prediction time: ", round(test_r1 - test_r0, 3), "s"
print "#################################"
'''
# Script section: train a LinearSVC on the prepared data, measure how long
# fitting and predicting take, and print accuracy plus both timings.
#SVC lib_linear
print("lib_linear")
clf_lib = LinearSVC()

# training
# Timestamps taken right before and after fit() (time() is presumably
# time.time -- confirm against the file's imports).
train_l0 = time()
clf_lib.fit(features_train, labels_train)
train_l1 = time()

# prediction or testing
# Timestamps taken right before and after predict() on the test features.
test_l0 = time()
predict = clf_lib.predict(features_test)
test_l1 = time()

# Accuracy comes from score() on the test features and labels.
print "accuracy: ", clf_lib.score(features_test, labels_test)
print "#################################"
# NOTE(review): "tain" in the label below looks like a typo for "train".
print "tain time: ", round(train_l1 - train_l0, 3), "s"
print "prediction time: ", round(test_l1 - test_l0, 3), "s"

######################################
# Single-sentence check: clf_lib is fitted AGAIN on the output of
# preprocess_input for this one text before predicting it.
text = 'اى هبل'
Ifeatures_train, Ifeatures_test, Ilabels_train = preprocess_input([text])
clf_lib.fit(Ifeatures_train, Ilabels_train)
# Index [1] picks the second character of the prediction array's string
# form. NOTE(review): under the Python 2 print statement used above, this
# parenthesised pair is printed as a tuple.
print("prediction of ", str(clf_lib.predict(Ifeatures_test))[1])

# Earlier variants of the prediction print, kept disabled:
#print "prediction of ", clf.predict(preprocess_input(text))
# print  str(clf.predict(Ifeatures_test))[1]
예제 #14
0
        
    return docs, t_docs, t_docsCategories


# Script section: load the documents, vectorise them with TF-IDF, train
# three classifiers on the labelled documents and print each classifier's
# predictions for the unlabelled ones.
# data[0] is used as the test documents, data[1] as the training documents
# and data[2] as the training labels (presumably the (docs, t_docs,
# t_docsCategories) tuple returned just above -- confirm it is readData).
data = readData('hackerrank/documentClassification.txt')
X_train = np.array(data[1])
y_train = np.array(data[2])
X_test = np.array(data[0])
print("Extracting features from the training dataset using a sparse vectorizer")
# Disabled alternative vectorizer:
#vectorizer = HashingVectorizer(stop_words='english', non_negative=True)
vectorizer = TfidfVectorizer(min_df=2, 
 ngram_range=(1, 2), 
 stop_words='english', 
 strip_accents='unicode', 
 norm='l2')
# The vectorizer is fitted on the training documents only.
X_train = vectorizer.fit_transform(X_train)
# Disabled alternative configuration:
#vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
#                                 stop_words='english')
#X2_train = vectorizer.fit_transform(data_train.data)
# The test documents reuse the fitted vectorizer unchanged.
X_test = vectorizer.transform(X_test)

# Three classifiers trained on the same features and labels.
nb_classifier = MultinomialNB().fit(X_train, y_train)
svm_classifier = LinearSVC().fit(X_train, y_train)
maxent_classifier = LogisticRegression().fit(X_train, y_train)

# NOTE(review): y_nb_predicted is reused for all three classifiers, so
# after this section it holds the logistic-regression (maxent) predictions,
# not the naive Bayes ones.
y_nb_predicted = nb_classifier.predict(X_test)
print(y_nb_predicted)
y_nb_predicted = svm_classifier.predict(X_test)
print(y_nb_predicted)
y_nb_predicted = maxent_classifier.predict(X_test)
print(y_nb_predicted)