def train_svm(documents):
    """
    Train a politeness classifier on annotated documents.

    :param documents: politeness-annotated training data; list of dicts,
        each containing a 'text' field with the text of the document.
    :type documents: list

    returns a fitted sklearn.svm.SVC, which can be serialized using cPickle
    """
    # Generate and persist list of unigrams, bigrams
    documents = PolitenessFeatureVectorizer.preprocess(documents)
    with open("features.json", "w") as w:
        json.dump(documents, w)
    print("DUMPED")
    PolitenessFeatureVectorizer.generate_bow_features(documents)

    # Shuffle so any downstream train/test split is not order-biased
    # (original comment: "For good luck").
    random.shuffle(documents)
    X, y = documents2feature_vectors(documents)

    print("Fitting")
    # Linear kernel; probability=True so the fitted model can report
    # P(polite)/P(impolite) via predict_proba. Small C keeps the margin soft.
    clf = svm.SVC(C=0.02, kernel='linear', probability=True)
    clf.fit(X, y)
    return clf
        }
    """
    # vectorizer returns {feature-name: value} dict
    features = vectorizer.features(request)
    # Sort feature names so the vector ordering is deterministic and matches
    # the ordering used at training time (Python 2 dict.iterkeys).
    fv = [features[f] for f in sorted(features.iterkeys())]
    # Single-row sparse matrix
    X = csr_matrix(np.asarray([fv]))
    # NOTE(review): clf is presumably an SVC fitted with probability=True,
    # loaded elsewhere in this module -- confirm; predict_proba requires it.
    probs = clf.predict_proba(X)
    # Massage return format: column 1 is the positive ("polite") class,
    # column 0 the negative ("impolite") class.
    probs = {"polite": probs[0][1], "impolite": probs[0][0]}
    return probs


if __name__ == "__main__":
    """
    Sample classification of requests
    """
    from test_documents import TEST_DOCUMENTS

    # NOTE(review): acc is initialized but never updated or printed.
    acc = 0
    # Pre-process the sample documents the same way as training data.
    TEST_DOCUMENTS = PolitenessFeatureVectorizer.preprocess(TEST_DOCUMENTS)
    for doc in TEST_DOCUMENTS:
        # score() returns {'polite': float, 'impolite': float}
        probs = score(doc)
        print("====================")
        print("Text: ", doc['text'])
        print("\tP(polite) = %.3f" % probs['polite'])
        print("\tP(impolite) = %.3f" % probs['impolite'])
        print("\n")