Example #1
import json
import random

from sklearn import svm

# Project-local helpers assumed importable in this module:
# PolitenessFeatureVectorizer and documents2feature_vectors.


def train_svm(documents):
    """
    :param documents: politeness-annotated training data
    :type documents: list of dicts;
        each document must contain a 'text' field holding its raw text

    returns a fitted SVC, which can be serialized using pickle
    """
    # Generate and persist list of unigrams, bigrams
    documents = PolitenessFeatureVectorizer.preprocess(documents)
    with open("features.json", "w") as w:
        json.dump(documents, w)
    print("DUMPED")

    PolitenessFeatureVectorizer.generate_bow_features(documents)

    # Shuffle the training documents; order does not affect the batch SVM fit
    random.shuffle(documents)
    X, y = documents2feature_vectors(documents)

    print "Fitting"
    clf = svm.SVC(C=0.02, kernel='linear', probability=True)
 #   loocv = LeaveOneOut()
 #   scores = cross_val_score(clf, X, y, cv=loocv)
    clf.fit(X, y)

#    print(scores.mean())
#    print scores

    return clf
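
A minimal round-trip sketch for the serialization step mentioned in the docstring. Here train_documents stands in for a politeness-annotated list of dicts and politeness-svm.pkl is an arbitrary path; both names are illustrative, not part of the original module.

import pickle

clf = train_svm(train_documents)  # train_documents: annotated corpus (illustrative)

# Persist the fitted classifier...
with open("politeness-svm.pkl", "wb") as f:
    pickle.dump(clf, f)

# ...and load it back later, e.g. in the scoring process
with open("politeness-svm.pkl", "rb") as f:
    clf = pickle.load(f)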
Example #2
import numpy as np
from scipy.sparse import csr_matrix

# Assumed module-level state, defined elsewhere in the original module:
# `vectorizer`, a PolitenessFeatureVectorizer, and `clf`, the fitted SVC.


def score(request):
    """
    :param request: pre-processed request document to score
    :type request: dict with a 'text' field

    returns class probabilities as a dict:
        {
            'polite': float,
            'impolite': float
        }
    """
    # vectorizer returns a {feature-name: value} dict
    features = vectorizer.features(request)
    fv = [features[f] for f in sorted(features.keys())]
    # Single-row sparse matrix; sorting the feature names keeps the column order stable
    X = csr_matrix(np.asarray([fv]))
    probs = clf.predict_proba(X)
    # Massage return format (the mapping assumes class 1 = polite, class 0 = impolite)
    probs = {"polite": probs[0][1], "impolite": probs[0][0]}
    return probs
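

# The sorted() call above matters: predict_proba expects the columns of X in
# the same order used when the training matrix was built, and sorting the
# feature names gives a stable order. Toy illustration only; the feature
# names below are invented, not produced by the real vectorizer.
_toy_features = {"feature==please": 1, "feature==gratitude": 0, "feature==2nd_person": 1}
_ordered = sorted(_toy_features.keys())          # ['feature==2nd_person', 'feature==gratitude', 'feature==please']
_toy_row = [_toy_features[k] for k in _ordered]  # [1, 0, 1]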


if __name__ == "__main__":
    # Sample classification of requests

    from test_documents import TEST_DOCUMENTS
    acc = 0
    TEST_DOCUMENTS = PolitenessFeatureVectorizer.preprocess(TEST_DOCUMENTS)
    for doc in TEST_DOCUMENTS:

        probs = score(doc)

        print("====================")
        print("Text: ", doc['text'])
        print("\tP(polite) = %.3f" % probs['polite'])
        print("\tP(impolite) = %.3f" % probs['impolite'])
        print("\n")