Example #1
import json
import pickle
import random

from sklearn import svm

# PolitenessFeatureVectorizer and documents2feature_vectors are assumed to be
# provided by the surrounding project.


def train_svm(documents):
    """
    :param documents: politeness-annotated training data
    :type documents: list of dicts;
        each document contains a 'text' field holding its raw text

    returns a fitted SVC, which can be serialized using pickle
    """
    # Generate and persist the list of unigrams and bigrams
    documents = PolitenessFeatureVectorizer.preprocess(documents)
    with open("features.json", "w") as w:
        json.dump(documents, w)
    print("DUMPED")

    PolitenessFeatureVectorizer.generate_bow_features(documents)

    # Shuffle the training data before vectorizing
    random.shuffle(documents)
    X, y = documents2feature_vectors(documents)

    print("Fitting")
    clf = svm.SVC(C=0.02, kernel='linear', probability=True)
    # Optional leave-one-out cross-validation
    # (requires: from sklearn.model_selection import LeaveOneOut, cross_val_score):
    # loocv = LeaveOneOut()
    # scores = cross_val_score(clf, X, y, cv=loocv)
    clf.fit(X, y)
    # print(scores.mean())
    # print(scores)

    return clf
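
A minimal usage sketch; the corpus and model paths here are illustrative
placeholders, and `documents` follows the format described in the docstring:

with open("annotated-corpus.json") as f:
    documents = json.load(f)  # list of dicts, each with a 'text' field

clf = train_svm(documents)
with open("politeness-svm.p", "wb") as f:
    pickle.dump(clf, f)  # per the docstring, the fitted SVC can be pickled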
Example #2
import numpy as np
from sklearn import svm
from sklearn.model_selection import cross_val_score

# get_features, get_subsets, and tf_train are project-local helpers
# defined elsewhere in this codebase.


def cross_validator(n_folds, params, requests):
    X, y = get_features(requests)

    total_accuracy = 0.0
    count = 0
    fold_length = len(requests) // n_folds
    curr_fold = 1
    while count < len(requests) - fold_length + 1:
        print("Fold # %d" % curr_fold)

        # Train on everything outside the current fold
        if count == 0:
            train_requests = requests[fold_length:]
        else:
            end = count + fold_length
            train_requests = np.concatenate((
                requests[0:count], requests[end:]))
        PolitenessFeatureVectorizer.generate_bow_features(train_requests)

        params["X_train"], params["X_val"] = get_subsets(X, count, fold_length)
        params["y_train"], params["y_val"] = get_subsets(y, count, fold_length)

        count += fold_length
        curr_fold += 1
        total_accuracy += tf_train(params)

    # Average the per-fold accuracies
    print("****** Average accuracy over all folds:", total_accuracy / n_folds)
    run_summary = str(params["training_epochs"])
    run_summary += "/" + str(params["n_hidden_1"])
    if params["n_layers"] == 2:
        run_summary += "/" + str(params["n_hidden_2"])
    print(run_summary)
    print("----------------------------------------")
    return total_accuracy / n_folds
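
A calling sketch; only the `params` keys read above are known from this code,
so the values are illustrative, and `annotated_requests` stands in for however
the request data is actually loaded:

params = {
    "training_epochs": 100,
    "n_layers": 2,
    "n_hidden_1": 64,
    "n_hidden_2": 32,
}
# requests must support numpy-style slicing and concatenation
avg_accuracy = cross_validator(n_folds=10, params=params,
                               requests=np.array(annotated_requests))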
def indomain(documents_stack, documents_wiki):
    print("In Domain")

    # `bow` is assumed to be defined at module level elsewhere in this file
    PolitenessFeatureVectorizer.generate_bow_features(documents_stack, bow)

    X_stack, y_stack = documents2feature_vectors(documents_stack)

    print("Fitting")
    clf = svm.SVC(C=0.02, kernel='linear', probability=True)
    scores = cross_val_score(clf, X_stack, y_stack, cv=10)
    print("In-domain results for Stack Exchange")
    print(scores)
    print(np.mean(scores))

    print("------------------------------------------------------")

    PolitenessFeatureVectorizer.generate_bow_features(documents_wiki, bow)
    X_wiki, y_wiki = documents2feature_vectors(documents_wiki)
    print("Fitting")
    clf = svm.SVC(C=0.02, kernel='linear', probability=True)
    scores = cross_val_score(clf, X_wiki, y_wiki, cv=10)
    print("In-domain results for Wikipedia")
    print(scores)

    print("Mean:", np.mean(scores))
Example #4
import pickle
import random

from sklearn import svm
from sklearn.metrics import classification_report


def train_svm(documents, ntesting=500):
    """
    :param documents: politeness-annotated training data
    :type documents: list of dicts;
        each document must be preprocessed and contain
        'sentences', 'parses', and 'score' fields

    :param ntesting: number of docs to reserve for testing
    :type ntesting: int

    returns a fitted SVC, which can be serialized using pickle
    """
    # Generate and persist the list of unigrams and bigrams
    PolitenessFeatureVectorizer.generate_bow_features(documents)

    # Shuffle, then hold out the last ntesting documents for testing
    random.shuffle(documents)
    testing = documents[-ntesting:]
    documents = documents[:-ntesting]

    # Save the held-out test set for later evaluation
    with open("testing-data.p", "wb") as f:
        pickle.dump(testing, f)

    X, y = documents2feature_vectors(documents)
    Xtest, ytest = documents2feature_vectors(testing)

    print("Fitting")
    clf = svm.SVC(C=0.02, kernel='linear', probability=True)
    clf.fit(X, y)

    # Evaluate on the held-out test set
    y_pred = clf.predict(Xtest)
    print(classification_report(ytest, y_pred))

    return clf
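
A follow-up sketch: because the held-out split is pickled above, the model can
be re-evaluated later without re-splitting (how `documents` is loaded is
assumed, as in the earlier example):

clf = train_svm(documents, ntesting=500)

with open("testing-data.p", "rb") as f:
    testing = pickle.load(f)
Xtest, ytest = documents2feature_vectors(testing)
print(clf.score(Xtest, ytest))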
def crossdomain(documents_stack, documents_wiki):
    print("Cross Domain")
    # documents_stack = stack_data.values()
    # documents_wiki = wiki_data.values()

    # BOW vocabulary built from the Stack Exchange corpus
    PolitenessFeatureVectorizer.generate_bow_features(documents_stack, bow)

    X_stack, y_stack = documents2feature_vectors(documents_stack)
    X_wiki, y_wiki = documents2feature_vectors(documents_wiki)

    print("Fitting")
    clf = svm.SVC(C=0.02, kernel='linear', probability=True)
    # clf = RandomForestClassifier(n_estimators=50)
    clf.fit(X_stack, y_stack)
    y_pred = clf.predict(X_wiki)
    print("Trained on Stack Exchange, predicting on Wikipedia")
    # print(classification_report(y_wiki, y_pred))
    print(clf.score(X_wiki, y_wiki))

    print("------------------------------------------------------")

    # Repeat with the BOW vocabulary built from the Wikipedia corpus
    PolitenessFeatureVectorizer.generate_bow_features(documents_wiki, bow)

    X_stack, y_stack = documents2feature_vectors(documents_stack)
    X_wiki, y_wiki = documents2feature_vectors(documents_wiki)

    print("Fitting")
    clf = svm.SVC(C=0.02, kernel='linear', probability=True)
    # clf = RandomForestClassifier(n_estimators=50)
    clf.fit(X_wiki, y_wiki)
    y_pred = clf.predict(X_stack)
    print("Trained on Wikipedia, predicting on Stack Exchange")
    # print(classification_report(y_stack, y_pred))
    print(clf.score(X_stack, y_stack))
    print("------------------------------------------------------")