Exemplo n.º 1
def test_main():
    """Cross-validate a k-nearest-neighbours text classifier on the 'ds3' corpus.

    Loads the documents under ``ds3`` with ``sklearn.datasets.load_files``,
    turns them into bag-of-words counts via ``bagOfWords``, reweights the
    counts with TF-IDF and hands the resulting matrix, the targets and a
    ``KNeighborsClassifier`` to ``cross_validation`` with ``cv=5``.

    Returns:
        None.  NOTE(review): the value returned by ``cross_validation`` is
        not used here; presumably that helper reports the scores itself --
        confirm.
    """
    # Earlier runs pointed at 'ds2' and 'dataset'; those assignments were
    # immediately overwritten, so only the effective value is kept.
    directory = 'ds3'

    # Load the dataset from disk.
    files = sklearn.datasets.load_files(directory)

    # Bag-of-words representation of the raw documents.
    word_counts = bagOfWords(files.data)

    # TF-IDF weighting, fitted and applied on the same count matrix.
    tf_transformer = sklearn.feature_extraction.text.TfidfTransformer(use_idf=True)
    X = tf_transformer.fit_transform(word_counts)

    # Cross validation.  Alternatives tried previously: MultinomialNB,
    # LinearSVC, and weights='distance' for the k-NN classifier.
    clf = sklearn.neighbors.KNeighborsClassifier(n_neighbors=5, weights='uniform')
    cross_validation(X, files.target, clf, cv=5)
    def read(self, filename):
        """Read a two-line file and split its contents via ``cross_validation``.

        The first line of ``filename`` is passed as ``x`` and the second as
        ``y`` to ``cross_validation``; the four values it yields are returned
        as a tuple ``(trainx, trainy, testx, testy)``.

        Raises:
            ValueError: if the file does not contain exactly two lines, or
                if ``cross_validation`` does not yield exactly four values.
        """
        with open(filename, 'r') as handle:
            lines = handle.readlines()

        # Unpacking enforces that exactly two lines were read.
        x, y = lines

        train_x, train_y, test_x, test_y = cross_validation(x, y)
        return train_x, train_y, test_x, test_y
Exemplo n.º 4
# Cross-validate a random forest on the combined training and validation data.
#
# NOTE(review): this snippet is truncated -- the RandomForestClassifier(...) and
# StratifiedKFold(...) calls are cut off after their first argument, so the
# block does not parse as shown and their remaining arguments are unknown.
def cross_validation_Halfaker(training, validation):
    logger.debug("Cross validation...")
    # Pool both sets; the pooled data is what gets cross-validated below.
    data = training.append(validation)
    # NOTE(review): presumably balances the classes (ratio 1) -- confirm
    # against the undersample() implementation.
    data = data.undersample(1)
    logger.debug("Data size: %d" % len(data))
    # Logged as the vandalism count: the sum of the labels from getY().
    logger.debug("Vandalism: %d" % data.getY().sum())

    clf = sklearn.ensemble.RandomForestClassifier(verbose=0,

    cv = sklearn.cross_validation.StratifiedKFold(data.getY(),
    # Evaluate the same classifier and folds under two different scorers.
    cross_validation(clf, data, 'roc_auc', cv)
    cross_validation(clf, data, SCORERS['pr_auc'], cv)
Exemplo n.º 7
def video_cross_validation(video_list):
    """Run leave-one-out cross-validation over ``video_list`` and print averages.

    Each element is used once as the test item while all remaining elements
    form the training set, and ``cross_validation(train, test)`` is called
    for every such split.  The module-level ``*_accuracy_array``,
    ``*_precision_array`` and ``*_recall_array`` lists (NB, SVM, LDA) are
    reset to empty lists before the first round.  NOTE(review):
    ``cross_validation`` is presumably what appends the per-round metrics to
    those lists -- confirm.

    Only the NB and LDA averages are printed; the SVM lists are reset but not
    summarised.  An average over an empty list is reported as NaN instead of
    raising ``ZeroDivisionError``.

    The ``print`` calls take a single pre-formatted string so the output is
    the same under Python 2 and Python 3.

    Args:
        video_list: list of items to split; must support slicing and ``+``.

    Returns:
        None.  Results are written to stdout.
    """
    global NB_accuracy_array, NB_precision_array, NB_recall_array
    global SVM_accuracy_array, SVM_precision_array, SVM_recall_array
    global LDA_accuracy_array, LDA_precision_array, LDA_recall_array

    NB_accuracy_array, NB_precision_array, NB_recall_array = [], [], []
    SVM_accuracy_array, SVM_precision_array, SVM_recall_array = [], [], []
    LDA_accuracy_array, LDA_precision_array, LDA_recall_array = [], [], []

    def _mean(values):
        # Convert before dividing so integer metrics are not truncated on
        # Python 2; NaN marks "no rounds recorded".
        if not values:
            return float('nan')
        return float(sum(values)) / len(values)

    for i, test in enumerate(video_list):
        train = video_list[:i] + video_list[i + 1:]

        print('\n-------------------------------------------------------------\nRound  {} :'.format(i))
        cross_validation(train, test)

    NB_acc = _mean(NB_accuracy_array)
    LDA_acc = _mean(LDA_accuracy_array)

    NB_prec = _mean(NB_precision_array)
    LDA_prec = _mean(LDA_precision_array)

    NB_rec = _mean(NB_recall_array)
    LDA_rec = _mean(LDA_recall_array)

    print('\nTotal Results: \n- NB accuracy = {} % NB precision {} % NB recall {} % \n- LDA accuracy = {} % LDA precision {} % LDA recall {} % '.format(NB_acc, NB_prec, NB_rec, LDA_acc, LDA_prec, LDA_rec))
Exemplo n.º 8
# Fit a grid search over a CategorySelectionClassifier and return the fitted
# search object.
#
# The search is fitted with `topics` as the samples and `classes` as the
# targets.
# NOTE(review): this snippet is truncated -- the CategorySelectionClassifier(...)
# and GridSearchCV(...) calls are cut off, so the block does not parse as shown
# and how the remaining parameters are forwarded is not visible here.
def train_topic_classifier_cv(topics, classes, full_selection, max_depth,
                              features, classifier_fn, instance_weight_fn,
                              cross_validation, evaluation_measure, param_grid,
                              classifier_params) -> GridSearchCV:
    tuned_clf = CategorySelectionClassifier(
    clf = GridSearchCV(estimator=tuned_clf,
    clf.fit(topics, classes)
    return clf
Exemplo n.º 11
    # Load the labelled training data from the two paths given on the command
    # line.  NOTE(review): the enclosing function header is missing from this
    # excerpt.
    training_data = Preparation.read_in(sys.argv[1], sys.argv[2])


    # Unzip labels and feature vectors into separate sequences.
    # Assumes each item of training_data is a (label, features) pair --
    # confirm against Preparation.read_in.
    labels, feature_vectors = zip(*training_data)

    vectorizer = DictVectorizer()

    # Fit the vectorizer and encode the feature vectors in one step.
    X = vectorizer.fit_transform(feature_vectors)
    y = labels

    # Train a classifier
    # NOTE(review): only the progress message is visible; the actual
    # cross-validation call is not part of this excerpt.
    print("Starting the cross-validation...")


# Print instructions
# NOTE(review): the header of the function that owns the usage prints below
# is missing from this excerpt.



    print("python3 cross_validation.py spam_data.json ham_data.json \n")
    print("spam_data.json: labeled spam data in JSON format.")
    print("ham_data.json: labeled ham data in JSON format.\n\n")
# Model hyperparameters.
# NOTE(review): no consumer of these constants is visible in this excerpt.
C = 2.0  # SVM regularization parameter
degree = 6  # Polynomial degree (presumably for a polynomial kernel -- confirm)
Exemplo n.º 19
    # Print every MSE value next to its label; `names` must list the labels in
    # the same order as the MSE values enumerated here.
    # NOTE(review): the loop variable `mse` has the same name as the `mse(...)`
    # function called at module level; the enclosing function header is
    # missing from this excerpt -- confirm this loop does not run at module
    # scope, where it would rebind that name.
    for i, mse in enumerate(
        [MSEId, MSERegr, MSERegr_ey, MSERegr_log, MSE1NN, MSE2NN, MSE3NN]):
        print(names[i] + ":  ", mse)

N = 31
path = 'data_all/'

# Load the per-subject records.  NOTE: pickle.load can execute arbitrary code,
# so this file must come from a trusted source.
with open(path + "data_all.pkl", 'rb') as f:
    data_all = pickle.load(f)

# Accumulators for the pooled cross-validation outputs of every record.
allX, allId, allY = [], [], []
allRegr, allRegr_ey, allRegr_log = [], [], []
all1NN, all2NN, all3NN = [], [], []

for data in data_all:
    # cross_validation yields nine parallel result sequences per record.
    (resultsX, resultsId, resultsY,
     resultsRegr, resultsRegr_ey, resultsRegr_log,
     results1NN, results2NN, results3NN) = cross_validation(
         data['stimulus'], data['converted'])

    allX.extend(resultsX)
    allId.extend(resultsId)
    allY.extend(resultsY)
    allRegr.extend(resultsRegr)
    allRegr_ey.extend(resultsRegr_ey)
    allRegr_log.extend(resultsRegr_log)
    all1NN.extend(results1NN)
    all2NN.extend(results2NN)
    all3NN.extend(results3NN)

# The identity baseline is scored against allX; every other model against allY.
MSEId = mse(allX, allId)
MSERegr = mse(allY, allRegr)
MSERegr_ey = mse(allY, allRegr_ey)
MSERegr_log = mse(allY, allRegr_log)
MSE1NN = mse(allY, all1NN)
MSE2NN = mse(allY, all2NN)
MSE3NN = mse(allY, all3NN)