Example #1
def get_performance(test_df, X_std, y):
    Xtest = test_df.loc[:, 'x.1':'x.10'].values
    ytest = test_df.loc[:, 'y'].values

    X_std_test = StandardScaler().fit_transform(Xtest)

    lda_model = LDA()
    lda_model.fit(X_std, y)

    qda_model = QDA()
    qda_model.fit(X_std, y)

    knn_model = KNeighborsClassifier(n_neighbors=10)
    knn_model.fit(X_std, y)

    print "KNN SCORE"
    print knn_model.score(X_std_test, ytest)
    print "LDA SCORE"
    print lda_model.score(X_std_test, ytest)
    print "QDA SCORE"
    print qda_model.score(X_std_test, ytest)

    knn_scores_training = []
    knn_scores_test = []

    for i in range(1, 12):
        knn_model = KNeighborsClassifier(n_neighbors=i)
        knn_model.fit(X_std, y)
        # training score comes from the training data, test score from the held-out data
        knn_scores_training.append(knn_model.score(X_std, y))
        knn_scores_test.append(knn_model.score(X_std_test, ytest))

    plt.plot(range(1, 12), knn_scores_training, 'r--')
    plt.plot(range(1, 12), knn_scores_test, 'b--')
    plt.axis([1, 11, 0.3, 1.1])
    plt.show()
Example #2
def get_QDA(Xtrain, Xtest, Ytrain, Ytest):
    qda = QDA()
    qda.fit(Xtrain,Ytrain)
#    predLabels = qda.predict(Xtest)
#    print("Classification Rate Test QDA: " + str(np.mean(Ytest==predLabels)*100) + " %")
    scores = np.empty(2)
    scores[0] = qda.score(Xtrain,Ytrain)
    scores[1] = qda.score(Xtest,Ytest)
    print('QDA, train: {0:.02f}% '.format(scores[0]*100))
    print('QDA, test: {0:.02f}% '.format(scores[1]*100))
    return qda
Example #4
def get_LDA_performance(test_df, X_std, y):
    X_test = test_df.loc[:, 'x.1':'x.10'].values
    X_std_test = StandardScaler().fit_transform(X_test)
    y_test = test_df.loc[:, 'y'].values

    lda_scores_training = []
    lda_scores_test = []

    qda_scores_training = []
    qda_scores_test = []

    knn_scores_training = []
    knn_scores_test = []

    for d in range(1, 11):
        lda = LDA(n_components=d)
        Xred_lda_training = lda.fit_transform(X_std, y)
        Xred_lda_test = lda.transform(X_std_test)

        lda_model = LDA()
        lda_model.fit(Xred_lda_training, y)

        qda_model = QDA()
        qda_model.fit(Xred_lda_training, y)

        knn_model = KNeighborsClassifier(n_neighbors=10)
        knn_model.fit(Xred_lda_training, y)

        lda_scores_training.append(1 - lda_model.score(Xred_lda_training, y))
        lda_scores_test.append(1 - lda_model.score(Xred_lda_test, y_test))

        qda_scores_training.append(1 - qda_model.score(Xred_lda_training, y))
        qda_scores_test.append(1 - qda_model.score(Xred_lda_test, y_test))

        knn_scores_training.append(1 - knn_model.score(Xred_lda_training, y))
        knn_scores_test.append(1 - knn_model.score(Xred_lda_test, y_test))

    plt.plot(range(1, 11), lda_scores_training, 'r--', label="Train data")
    plt.plot(range(1, 11), lda_scores_test, 'b--', label="Test data")
    plt.title("LDA on LDA-reduced data")
    plt.xlabel('d (number of LDA components)')
    plt.ylabel('Error rate')
    plt.legend()
    plt.show()

    plt.plot(range(1, 11), qda_scores_training, 'r--', label="Train data")
    plt.plot(range(1, 11), qda_scores_test, 'b--', label="Test data")
    plt.title("QDA on LDA-reduced data")
    plt.legend()
    plt.show()

    plt.plot(range(1, 11), knn_scores_training, 'r--', label="Train data")
    plt.plot(range(1, 11), knn_scores_test, 'b--', label="Test data")
    plt.title("KNN on LDA-reduced data")
    plt.legend()
    plt.show()
Example #5
def performQDAClass(X_train, y_train, X_test, y_test):
    """
    QDA Classification
    """
    clf = QDA()
    clf.fit(X_train, y_train)
    accuracy = clf.score(X_test, y_test)
    return accuracy
Example #6
def performSVMClass(X_train, y_train, X_test, y_test, parameters, fout, savemodel):
	"""
	Binary classification; despite the function's SVM name, this version fits
	QDA (parameters, fout and savemodel are unused)
	"""
	clf = QDA()
	clf.fit(X_train, y_train)

	accuracy = clf.score(X_test, y_test)
	return accuracy
Example #7
def performQDAClass(X_train, y_train, X_test, y_test):
    """
    Gradient Tree Boosting binary Classification
    """
    clf = QDA()
    clf.fit(X_train, y_train)
    accuracy = clf.score(X_test, y_test)
    #auc = roc_auc_score(y_test, clf.predict(X_test))
    return accuracy
Example #8
def Call_QDA_Classi(X_train, y_train, X_test, y_test):
    """
    QDA Classification
    """
    clf = QDA()
    """
    print("QDA  Classification ",clf.get_params().keys())
    ['priors', 'reg_param', 'tol', 'store_covariances']
    """
    clf.fit(X_train, y_train)
    accuracy = clf.score(X_test, y_test)
    return accuracy
Example #9
File: ch4.py  Project: syting/esl
def table_4_1():
    """Reproduces table 4.1 in ESLii showing the training and test error rates
    for classifying vowels using different classification techniques. The
    sklearn implementation of logistic regression uses OvA instead of a true
    multinomial fit, which likely accounts for the worse results
    """
    vowels_train = eslii.read_vowel_data()
    train_X = vowels_train[vowels_train.columns[1:]]
    train_y = vowels_train['y']
    vowels_test = eslii.read_vowel_data(train=False)
    test_X = vowels_test[vowels_test.columns[1:]]
    test_y = vowels_test['y']

    lda = LDA().fit(train_X, train_y)
    print "Linear discriminant analysis:  {:.2f} {:.2f}".format(
        1 - lda.score(train_X, train_y), 1 - lda.score(test_X, test_y))
    qda = QDA().fit(train_X, train_y)
    print "Quadratic discriminant analysis:  {:.2f} {:.2f}".format(
        1 - qda.score(train_X, train_y), 1 - qda.score(test_X, test_y))
    lr = LogisticRegression(C=1e30).fit(train_X, train_y)
    print "Logistic regression:  {:.2f} {:.2f}".format(
        1 - lr.score(train_X, train_y), 1 - lr.score(test_X, test_y))
Example #10
def performQDAClass(X_train, y_train, X_test, y_test, parameters, fout, savemodel):
    """
    Quadratic Discriminant Analysis binary Classification
    """
    def replaceTiny(x):
        # clamp entries with absolute value below 0.0001 (QDA is sensitive
        # to near-singular covariance estimates); must return the value
        return 0.0001 if abs(x) < 0.0001 else x

    X_train = X_train.applymap(replaceTiny)  # elementwise over the DataFrame
    X_test = X_test.applymap(replaceTiny)
    
    clf = QDA()
    clf.fit(X_train, y_train)

    if savemodel:
        fname_out = '{}-{}.pickle'.format(fout, datetime.now())
        with open(fname_out, 'wb') as f:
            cPickle.dump(clf, f, -1)    
    
    accuracy = clf.score(X_test, y_test)
    
    return accuracy
Example #12
def trainQDA(XTrain, YTrain, XValid, YValid):
    qda = QDA()
    qda.fit(XTrain, YTrain)    
    print('QDA score : %f' % (qda.score(XValid, YValid)))
Example #13
############ LDA #####################
# Build and fit the LDA model
lda_model = LDA()
lda_model.fit(X_std,y)
# Score on the training set and the test set
print lda_model.score(X_std,y)
print lda_model.score(X_std_test,ytest)


############ QDA #####################
# Build and fit the QDA model
qda_model = QDA()
qda_model.fit(X_std,y)
# Score on the training set and the test set
print qda_model.score(X_std,y)
print qda_model.score(X_std_test,ytest)

# ############ KNN #####################
# # Build and fit the KNN model
# knn_model = KNeighborsClassifier(n_neighbors=10)
# knn_model.fit(X_std,y)
# # Score on the training set and the test set
# print knn_model.score(X_std,y)
# print knn_model.score(X_std_test,ytest)
#
#
# score_training=[]
# score_test=[]
# Lclasses=range(1,len_training_set+1)
# # KNN behavior
Example #14
def main():
    #Define our connection string
    conn_string = "host='localhost' dbname='CRAWL4J' user='******' password='******'"
    # print the connection string we will use to connect
    print "Connecting to database\n    ->%s" % (conn_string)
 
    # get a connection, if a connect cannot be made an exception will be raised here
    conn = psycopg2.connect(conn_string)
 
    # conn.cursor will return a cursor object, you can use this cursor to perform queries
    cursor = conn.cursor()
 
    # execute our Query
    # X = np.asarray(predictors_list);
    
    my_request = "select url, whole_text, title, h1, short_description, status_code, depth, outlinks_size, inlinks_size, nb_breadcrumbs, nb_aggregated_ratings, nb_ratings_values, nb_prices, nb_availabilities, nb_reviews, nb_reviews_count, nb_images, nb_search_in_url, nb_add_in_text, nb_filter_in_text, nb_search_in_text, nb_guide_achat_in_text, nb_product_info_in_text, nb_livraison_in_text, nb_garanties_in_text, nb_produits_similaires_in_text, nb_images_text, width_average, height_average, page_rank, page_type, concurrent_name, last_update, semantic_hits, semantic_title, inlinks_semantic, inlinks_semantic_count  from arbocrawl_results  where concurrent_name = (%s) "; 
    #url 0, whole_text 1, title 2, h1 3, short_description 4, status_code 5, depth 6, outlinks_size 7, inlinks_size 8, nb_breadcrumbs 9, nb_aggregated_ratings 10, nb_ratings_values 11, nb_prices 12, nb_availabilities 13, nb_reviews 14, nb_reviews_count 15, nb_images 16, nb_search_in_url 17, nb_add_in_text 18, nb_filter_in_text 19, nb_search_in_text 20, nb_guide_achat_in_text 21, nb_product_info_in_text 22, nb_livraison_in_text 23, nb_garanties_in_text 24, nb_produits_similaires_in_text 25, nb_images_text 26, width_average 27, height_average 28, page_rank 29, page_type 30, concurrent_name 31, last_update 32, semantic_hits 33, semantic_title 34, inlinks_semantic 35, inlinks_semantic_count 36  from arbocrawl_results 
    catPred=["PAGE DEPTH AT SITE LEVEL","NUMBER OF OUTGOING LINKS","NUMBER OF INCOMING LINKS","NUMBER OF ITEMTYPE http://data-vocabulary.org/Breadcrumb","NUMBER OF ITEMPROP aggregateRating","NUMBER OF ITEMPROP ratingValue","NUMBER OF ITEMPROP price","NUMBER OF ITEMPROP availability","NUMBER OF ITEMPROP review","NUMBER OF ITEMPROP reviewCount","NUMBER OF ITEMPROP image","NUMBER OF OCCURENCES FOUND IN URL of search + recherche + Recherche + Search","NUMBER OF OCCURENCES FOUND IN PAGE TEXT ajout + ajouter + Ajout + Ajouter","NUMBER OF OCCURENCES FOUND IN PAGE TEXT filtre + facette + Filtre + Facette + filtré + filtrés","NUMBER OF OCCURENCES FOUND IN PAGE TEXT Ma recherche + Votre recherche + résultats pour + résultats associés","NUMBER OF OCCURENCES FOUND IN PAGE TEXT guide d""achat + Guide d""achat","NUMBER OF OCCURENCES FOUND IN PAGE TEXT caractéristique + Caractéristique + descriptif + Descriptif +information + Information","NUMBER OF OCCURENCES FOUND IN PAGE TEXT livraison + Livraison + frais de port + Frais de port","NUMBER OF OCCURENCES FOUND IN PAGE TEXT garantie + Garantie +assurance + Assurance","NUMBER OF OCCURENCES FOUND IN PAGE TEXT Produits Similaires + produits similaires + Meilleures Ventes + meilleures ventes +Meilleures ventes + Nouveautés + nouveautés + Nouveauté + nouveauté","NUMBER OF HTML TAG img IN THE PAGE","AVERAGE WIDTH OF HTML TAG img IN THE PAGE","AVERAGE HEIGHT OF HTML TAG img IN THE PAGE"];
    semPred =["PAGE TEXT", "PAGE TITLE", "PAGE H1", "PAGE SHORT DESCRIPTION","TEN BEST TF/IDF HITS FOR THE PAGE","TITLE TF/IDF","PAGE INCOMING LINKS ANCHOR SEMANTIC"];

    print "Executing the following request to fetch data for Cdiscount-maison from the ARBOCRAWL_RESULTS table : " + my_request
    print"Page-type predictors : "+ ', '.join(catPred)
    print"Semantic predictors : " + ', '.join(semPred)
    

    # fetching training data from Cdiscount-maison
    my_filtered_request = "select url, whole_text, title, h1, short_description, status_code, depth, outlinks_size, inlinks_size, nb_breadcrumbs, nb_aggregated_ratings, nb_ratings_values, nb_prices, nb_availabilities, nb_reviews, nb_reviews_count, nb_images, nb_search_in_url, nb_add_in_text, nb_filter_in_text, nb_search_in_text, nb_guide_achat_in_text, nb_product_info_in_text, nb_livraison_in_text, nb_garanties_in_text, nb_produits_similaires_in_text, nb_images_text, width_average, height_average, page_rank, page_type, concurrent_name, last_update, semantic_hits, semantic_title, inlinks_semantic, inlinks_semantic_count  from arbocrawl_results  where page_type !='Unknown' and concurrent_name = (%s) "; 
    cursor.execute(my_filtered_request,("Cdiscount-maison",)); 
    # retrieve the records from the database
    records = cursor.fetchall()
    url_list = [item[0] for item in records];
    semantic_list =  [(item[1],item[2],item[3],item[4],item[33],item[34],item[35]) for item in records];
    predictor_list = [(item[6],item[7],item[8],item[9],item[10],item[11],item[12],item[13],item[14],item[15],item[16],item[17],item[18],item[19],item[20],item[21],item[22],item[23],item[24],item[25],item[26],item[27],item[28]) for item in records];
    output_list    = [item[30] for item in records];
    y=[assign_enumerated_value(output) for output in output_list]
    X= np.asanyarray(predictor_list);
    y= np.asanyarray(y);
    print type(X)
    print X.shape
    print type(y)
    print y.shape
    
    # fetching the data to predict
    my_to_predict_request = "select url, whole_text, title, h1, short_description, status_code, depth, outlinks_size, inlinks_size, nb_breadcrumbs, nb_aggregated_ratings, nb_ratings_values, nb_prices, nb_availabilities, nb_reviews, nb_reviews_count, nb_images, nb_search_in_url, nb_add_in_text, nb_filter_in_text, nb_search_in_text, nb_guide_achat_in_text, nb_product_info_in_text, nb_livraison_in_text, nb_garanties_in_text, nb_produits_similaires_in_text, nb_images_text, width_average, height_average, page_rank, page_type, concurrent_name, last_update, semantic_hits, semantic_title, inlinks_semantic, inlinks_semantic_count  from arbocrawl_results  where concurrent_name != (%s) "; 
    cursor.execute(my_to_predict_request,("Cdiscount-maison",)); 
    # retrieve the records from the database
    records_to_validate = cursor.fetchall()
    url_to_validate_list = [item[0] for item in records_to_validate];
    semantic_to_validate_list =  [(item[1],item[2],item[3],item[4],item[33],item[34],item[35]) for item in records_to_validate];
    predictor_to_validate_list = [(item[6],item[7],item[8],item[9],item[10],item[11],item[12],item[13],item[14],item[15],item[16],item[17],item[18],item[19],item[20],item[21],item[22],item[23],item[24],item[25],item[26],item[27],item[28]) for item in records_to_validate];
    output_to_validate_list    = [item[30] for item in records_to_validate];
    
    Xval= np.asanyarray(predictor_to_validate_list);
    print type(Xval)
    print Xval.shape
    # filter out the rows of Xval containing NaN (row-wise, so Xval stays 2-D)
    Xval = Xval[~np.isnan(Xval).any(axis=1)]
    print Xval.shape
 
    # transforming the predictors / rescaling the predictors
    # we don't need to do that
    #X = StandardScaler().fit_transform(X)
    #Xval = StandardScaler().fit_transform(Xval)
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4)
    single_tree = DecisionTreeClassifier(max_depth=5)
    single_tree.fit(X_train, y_train)
    single_tree_score = single_tree.score(X_test, y_test)
    print "Single tree score " + str(single_tree_score)
    
    random_forest = RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1)
    random_forest.fit(X_train, y_train)
    random_forest_score = random_forest.score(X_test, y_test)
    print "Random forest score " + str(random_forest_score)
    
    kneighbors =  KNeighborsClassifier(3)
    kneighbors.fit(X_train, y_train)
    kneighbors_score = kneighbors.score(X_test, y_test)
    print "K-Neighbors score " + str(kneighbors_score)
    
    adaboost =  AdaBoostClassifier()
    adaboost.fit(X_train, y_train)
    adaboost_score = adaboost.score(X_test, y_test)
    print "Ada boost score " + str(adaboost_score)

    gaussian_nb =  GaussianNB()
    gaussian_nb.fit(X_train, y_train)
    gaussian_nb_score = gaussian_nb.score(X_test, y_test)
    print "gaussian mixtures score " + str(gaussian_nb_score)
    
    lda =  LDA()
    lda.fit(X_train, y_train)
    lda_nb_score = lda.score(X_test, y_test)
    print "linear discriminant score " + str(lda_nb_score)
    
    qda =  QDA()
    qda.fit(X_train, y_train)
    qda_nb_score = qda.score(X_test, y_test)
    print "quadratic discriminant score " + str(qda_nb_score)
    
    #SVC(kernel="linear", C=0.025),
    #SVC(gamma=2, C=1),


    # we now predict the dataset from the other web sites with the best scoring trained classifier
    y_val_predicted = random_forest.predict(Xval);
    pprint.pprint(y_val_predicted);
Example #15
lda_preds = lda.predict(test_weekly_x)
lda_score = lda.score(test_weekly_x, test_weekly_y)
conf_matrix = confusion_matrix(test_weekly_y, lda_preds)

print "\nLDA Results"
print "Confusion Matrix:"
print conf_matrix
print "Fraction of Correct Predictions: " + str(lda_score)

#%% QDA using sklearn
from sklearn.qda import QDA

qda = QDA()
qda.fit(train_weekly_x, train_weekly_y)
qda_preds = qda.predict(test_weekly_x)
qda_score = qda.score(test_weekly_x, test_weekly_y)
conf_matrix = confusion_matrix(test_weekly_y, qda_preds)

print "\nQDA Results"
print "Confusion Matrix:"
print conf_matrix
print "Fraction of Correct Predictions: " + str(qda_score)

#%% KNN using sklearn
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(train_weekly_x, train_weekly_y)
knn_preds = knn.predict(test_weekly_x)
knn_score = knn.score(test_weekly_x, test_weekly_y)
conf_matrix = confusion_matrix(test_weekly_y, knn_preds)
Example #16
def main():
    #Define our connection string
    conn_string = "host='localhost' dbname='CRAWL4J' user='******' password='******'"
    # print the connection string we will use to connect
    print "Connecting to database\n    ->%s" % (conn_string)
 
    # get a connection, if a connect cannot be made an exception will be raised here
    conn = psycopg2.connect(conn_string)
    
    # fetching training data from Cdiscount-maison
    cdiscount_maison_request = "select url, whole_text, title, h1, short_description, status_code, depth, outlinks_size, inlinks_size, nb_breadcrumbs, nb_aggregated_ratings, nb_ratings_values, nb_prices, nb_availabilities, nb_reviews, nb_reviews_count, nb_images, nb_search_in_url, nb_add_in_text, nb_filter_in_text, nb_search_in_text, nb_guide_achat_in_text, nb_product_info_in_text, nb_livraison_in_text, nb_garanties_in_text, nb_produits_similaires_in_text, nb_images_text, width_average, height_average, page_rank, page_type, concurrent_name, last_update, semantic_hits, semantic_title, inlinks_semantic, inlinks_semantic_count  from arbocrawl_results  where page_type !='Unknown' and concurrent_name = 'Cdiscount-maison' "; 
    catPred=["PAGE DEPTH AT SITE LEVEL","NUMBER OF OUTGOING LINKS","NUMBER OF INCOMING LINKS","NUMBER OF ITEMTYPE http://data-vocabulary.org/Breadcrumb","NUMBER OF ITEMPROP aggregateRating","NUMBER OF ITEMPROP ratingValue","NUMBER OF ITEMPROP price","NUMBER OF ITEMPROP availability","NUMBER OF ITEMPROP review","NUMBER OF ITEMPROP reviewCount","NUMBER OF ITEMPROP image","NUMBER OF OCCURENCES FOUND IN URL of search + recherche + Recherche + Search","NUMBER OF OCCURENCES FOUND IN PAGE TEXT ajout + ajouter + Ajout + Ajouter","NUMBER OF OCCURENCES FOUND IN PAGE TEXT filtre + facette + Filtre + Facette + filtré + filtrés","NUMBER OF OCCURENCES FOUND IN PAGE TEXT Ma recherche + Votre recherche + résultats pour + résultats associés","NUMBER OF OCCURENCES FOUND IN PAGE TEXT guide d""achat + Guide d""achat","NUMBER OF OCCURENCES FOUND IN PAGE TEXT caractéristique + Caractéristique + descriptif + Descriptif +information + Information","NUMBER OF OCCURENCES FOUND IN PAGE TEXT livraison + Livraison + frais de port + Frais de port","NUMBER OF OCCURENCES FOUND IN PAGE TEXT garantie + Garantie +assurance + Assurance","NUMBER OF OCCURENCES FOUND IN PAGE TEXT Produits Similaires + produits similaires + Meilleures Ventes + meilleures ventes +Meilleures ventes + Nouveautés + nouveautés + Nouveauté + nouveauté","NUMBER OF HTML TAG img IN THE PAGE","AVERAGE WIDTH OF HTML TAG img IN THE PAGE","AVERAGE HEIGHT OF HTML TAG img IN THE PAGE"];
    semPred =["PAGE TEXT", "PAGE TITLE", "PAGE H1", "PAGE SHORT DESCRIPTION","TEN BEST TF/IDF HITS FOR THE PAGE","TITLE TF/IDF","PAGE INCOMING LINKS ANCHOR SEMANTIC"];

    print "Executing the following request to fetch data for Cdiscount-maison from the ARBOCRAWL_RESULTS table : " + cdiscount_maison_request
    print"Page-type predictors : "+ ', '.join(catPred)
    print"Semantic predictors : " + ', '.join(semPred)

    df = pd.read_sql(cdiscount_maison_request, conn)
    
  
    url_list = df.url.values
    semantic_columns = ["url","title","h1","short_description","semantic_hits", "semantic_title", "inlinks_semantic"];
    semantic_predictors = df[list(semantic_columns)].values;
    
    classifying_columns = ["depth", "outlinks_size", "inlinks_size", "nb_breadcrumbs", "nb_aggregated_ratings", "nb_ratings_values", "nb_prices", "nb_availabilities", "nb_reviews", "nb_reviews_count", "nb_images", "nb_search_in_url", "nb_add_in_text", "nb_filter_in_text", "nb_search_in_text", "nb_guide_achat_in_text", "nb_product_info_in_text", "nb_livraison_in_text", "nb_garanties_in_text", "nb_produits_similaires_in_text", "nb_images_text", "width_average","height_average"]
    classifying_predictors = df[list(classifying_columns)].values;
    X= np.asanyarray(classifying_predictors);
    y = df.page_type.values;

    print type(X)
    print X.shape
    print type(y)
    print y.shape
    
    # fetching the data to predict
    to_predict_request = "select url, whole_text, title, h1, short_description, status_code, depth, outlinks_size, inlinks_size, nb_breadcrumbs, nb_aggregated_ratings, nb_ratings_values, nb_prices, nb_availabilities, nb_reviews, nb_reviews_count, nb_images, nb_search_in_url, nb_add_in_text, nb_filter_in_text, nb_search_in_text, nb_guide_achat_in_text, nb_product_info_in_text, nb_livraison_in_text, nb_garanties_in_text, nb_produits_similaires_in_text, nb_images_text, width_average, height_average, page_rank, page_type, concurrent_name, last_update, semantic_hits, semantic_title, inlinks_semantic, inlinks_semantic_count  from arbocrawl_results  where concurrent_name != 'Cdiscount-maison' "; 
    df_to_predict = pd.read_sql(to_predict_request, conn)
    # df_to_predict.dropna()
    # df_to_predict.replace([np.inf, -np.inf], np.nan).dropna(subset=list(classifying_columns), how="all")
    # df_to_predict.dropna(subset=list(classifying_columns), how="all", with_inf=True)
    # indexnan = sum(np.isnan(Xval))
    # indexinfinite = np.isfinite(Xval)
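    # A hedged sketch, not in the original: what the commented attempts above
    # were reaching for, i.e. replace +/-Inf with NaN and drop rows missing
    # any classifying column, so the Xval built below stays 2-D and finite.
    df_to_predict = df_to_predict.replace([np.inf, -np.inf], np.nan)
    df_to_predict = df_to_predict.dropna(subset=list(classifying_columns))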
    classifying_predictors_to_predict = df_to_predict[list(classifying_columns)].values;
    Xval= np.asanyarray(classifying_predictors_to_predict);
    print type(Xval)
    print Xval.shape
    
    url_val_list = df_to_predict.url.values
    print type(url_val_list)
    print url_val_list.shape
    
    # we must here filter the NaN / Infinity in Xval values
    #print np.isnan(Xval)
    #Xval = Xval[~np.isnan(Xval)]
    #print Xval.shape
 
    # transforming the predictors / rescaling the predictors
    # we don't need to do that
    #X = StandardScaler().fit_transform(X)
    #Xval = StandardScaler().fit_transform(Xval)
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4)
    single_tree = DecisionTreeClassifier(max_depth=5)
    single_tree.fit(X_train, y_train)
    single_tree_score = single_tree.score(X_test, y_test)
    print "Single tree score " + str(single_tree_score)
    
    random_forest = RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1)
    random_forest.fit(X_train, y_train)
    random_forest_score = random_forest.score(X_test, y_test)
    print "Random forest score " + str(random_forest_score)
    
    kneighbors =  KNeighborsClassifier(3)
    kneighbors.fit(X_train, y_train)
    kneighbors_score = kneighbors.score(X_test, y_test)
    print "K-Neighbors score " + str(kneighbors_score)
    
    adaboost =  AdaBoostClassifier()
    adaboost.fit(X_train, y_train)
    adaboost_score = adaboost.score(X_test, y_test)
    print "Ada boost score " + str(adaboost_score)

    gaussian_nb =  GaussianNB()
    gaussian_nb.fit(X_train, y_train)
    gaussian_nb_score = gaussian_nb.score(X_test, y_test)
    print "gaussian mixtures score " + str(gaussian_nb_score)
    
    lda =  LDA()
    lda.fit(X_train, y_train)
    lda_nb_score = lda.score(X_test, y_test)
    print "linear discriminant score " + str(lda_nb_score)
    
    qda =  QDA()
    qda.fit(X_train, y_train)
    qda_nb_score = qda.score(X_test, y_test)
    print "quadratic discriminant score " + str(qda_nb_score)
    
    #SVC(kernel="linear", C=0.025),
    #SVC(gamma=2, C=1),
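    # A hedged aside, not in the original: rank the held-out scores to see
    # which trained classifier actually did best before committing to one
    # (the prediction below uses the random forest).
    scored = [('single tree', single_tree_score), ('random forest', random_forest_score),
              ('k-neighbors', kneighbors_score), ('adaboost', adaboost_score),
              ('gaussian nb', gaussian_nb_score), ('lda', lda_nb_score), ('qda', qda_nb_score)]
    best_name, best_score = max(scored, key=lambda t: t[1])
    print "Best classifier on the held-out split: " + best_name + " " + str(best_score)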


    # we now predict the dataset from the other web sites with the best scoring trained classifier
    y_val_predicted = random_forest.predict(Xval);
    print type(y_val_predicted)
    print y_val_predicted.shape
    
    print type(url_val_list)
    print url_val_list.shape
    
    url_validation_list = url_val_list.tolist()
    y_val_predicted_list = y_val_predicted.tolist()

#    displaying the classified data    
#    pprint.pprint(y_val_predicted_list)
#    pprint.pprint(url_validation_list)
    classified_values = zip(url_validation_list, y_val_predicted_list)
    print "Updating the database with the classification results"
    update_database_with_page_type(conn, classified_values)
    conn.close()
Example #17
File: qda.py  Project: ash567/ml_contest
import numpy as np
import sklearn
from sklearn.qda import QDA

trainX = np.genfromtxt('train_X.csv', delimiter = ',')
trainY = np.genfromtxt('train_Y.csv')

clf = QDA()
clf.fit(trainX, trainY)

print clf.score(trainX, trainY)

Example #18
for i in range(0,9): 
	probas[i]=probas[i]/528

yhat_apriori = np.argmax(probas) + 1

print "Clase: %d"%yhat_apriori

######## Question (g) ############################################################

lda_model = LDA()
lda_model.fit(X_std,y)
print "Score LDA train: %f"%lda_model.score(X_std,y)
print "Score LDA test: %f"%lda_model.score(X_std_test,ytest)
qda_model = QDA()
qda_model.fit(X_std,y)
print "Score QDA train: %f"%qda_model.score(X_std,y)
print "Score QDA test: %f"%qda_model.score(X_std_test,ytest)
knn_model = KNeighborsClassifier(n_neighbors=10)
knn_model.fit(X_std,y)
print "Score KNN train: %f"%knn_model.score(X_std,y)
print "Score KNN test: %f"%knn_model.score(X_std_test,ytest)

values_train = []
values_test = []
for i in range(1, 12):
	knn_model = KNeighborsClassifier(n_neighbors=i)
	knn_model.fit(X_std,y)
	values_train.append(knn_model.score(X_std,y))

for i in range(1, 12):
	knn_model = KNeighborsClassifier(n_neighbors=i)
Example #19
def Call_QDA_Classi(X_train, y_train, X_test, y_test):

    clf = QDA()
    clf.fit(X_train, y_train)
    accuracy = clf.score(X_test, y_test)
    return accuracy
Example #20
# KNN
knn1 = KNeighborsClassifier(n_neighbors=2)
knn1 = knn1.fit(X_train, y_train)
knn1.score(X_train, y_train)
knnpredict = knn1.predict(X_test)
print knnpredict
confusion_matrix(y_test, knnpredict)
print metrics.accuracy_score(y_test, knnpredict)
knn2 = KNeighborsClassifier(n_neighbors=10)
knn2 = knn2.fit(X_train, y_train)
knn2.score(X_train, y_train)
knnpredict1 = knn2.predict(X_test)
print knnpredict1
confusion_matrix(y_test, knnpredict1)
print metrics.accuracy_score(y_test, knnpredict1)

# QDA

qda1 = QDA()
qda1 = qda1.fit(X_train, y_train)
qda1.score(X_train, y_train)
qdapredict = qda1.predict(X_test)
print qdapredict
confusion_matrix(y_test, qdapredict)
print metrics.accuracy_score(y_test, qdapredict)

# Strategies to improve the test accuracy
# We can tune the algorithms: machine learning algorithms are driven by hyperparameters, and these influence the outcome of learning. The objective is to find the optimum value for each parameter, checking the impact each one has on the model's accuracy.
# Applying ensemble methods such as bagging and boosting can also improve the accuracy of the model; a minimal sketch of both ideas follows.
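A hedged sketch of both strategies, assuming the X_train/y_train/X_test/y_test split above; the grid bounds and estimator count are illustrative, and the grid_search import path matches the older sklearn used throughout these examples (newer versions move it to sklearn.model_selection).

from sklearn.grid_search import GridSearchCV  # sklearn.model_selection in newer sklearn
from sklearn.ensemble import BaggingClassifier

# tune n_neighbors by 5-fold cross-validation on the training set
grid = GridSearchCV(KNeighborsClassifier(), {'n_neighbors': range(1, 21)}, cv=5)
grid.fit(X_train, y_train)
print grid.best_params_
print metrics.accuracy_score(y_test, grid.predict(X_test))

# bagging: vote 25 copies of the tuned KNN, each fit on a bootstrap resample
bag = BaggingClassifier(KNeighborsClassifier(**grid.best_params_), n_estimators=25)
bag.fit(X_train, y_train)
print metrics.accuracy_score(y_test, bag.predict(X_test))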
Example #21
data = np.load("sd.npy")
truth = np.load("truth.npy")

testdata = np.load("sd_test.npy")
testtruth = np.load("truth_test.npy")

print(len(data))

clf = QDA()
clf.fit(data,truth)

output=open("qda.pkl",'wb')

pickle.dump(clf,output)

output.close()

print(clf.score(data,truth))
print(clf.score(testdata,testtruth))

s = np.where(truth == 2)[0]
st = np.where(testtruth == 2)[0]
g = np.where(truth == 1)[0]
gt = np.where(testtruth == 1)[0]
print("Stars")
print(clf.score(data[s],truth[s]))
print(clf.score(testdata[st],testtruth[st]))
print("Galaxies")
print(clf.score(data[g],truth[g]))
print(clf.score(testdata[gt],testtruth[gt]))
    
from sklearn.lda import LDA
from sklearn.qda import QDA
from sklearn.neighbors import KNeighborsClassifier
Xtest = test_df.loc[:,'x.1':'x.10'].values
ytest = test_df.loc[:,'y'].values
X_std_test = StandardScaler().fit_transform(Xtest)

lda_model = LDA()
lda_model.fit(X_std,y)
print lda_model.score(X_std,y)
print lda_model.score(X_std_test,ytest)

qda_model = QDA()
qda_model.fit(X_std,y)
print qda_model.score(X_std,y)
print qda_model.score(X_std_test,ytest)

knn_model = KNeighborsClassifier(n_neighbors=10)
knn_model.fit(X_std,y)
print knn_model.score(X_std,y)
print knn_model.score(X_std_test,ytest)

plt.figure(figsize=(12, 8))
train_scores = []
test_scores = []
for k in range(1,21):
    knn_model = KNeighborsClassifier(n_neighbors=k)
    knn_model.fit(X_std,y)
    train_scores += [knn_model.score(X_std,y)]
    test_scores += [knn_model.score(X_std_test,ytest)]