def train_svm(documents):
    """
    Preprocess the documents, persist them as JSON and fit a linear SVC.

    :param documents- politeness-annotated training data
    :type documents- list of dicts
        each document contains a 'text' field with the text of it.

    The preprocessed documents are written to "features.json" in the
    current working directory, and the preprocessed list is shuffled in
    place before it is vectorized.

    returns fitted SVC, which can be serialized using cPickle
    """
    # Work on the preprocessed form from here on and keep a copy on disk.
    documents = PolitenessFeatureVectorizer.preprocess(documents)
    with open("features.json", "w") as features_file:
        json.dump(documents, features_file)
    print("DUMPED")

    # Generate and persist list of unigrams, bigrams
    PolitenessFeatureVectorizer.generate_bow_features(documents)

    # For good luck
    random.shuffle(documents)
    features, labels = documents2feature_vectors(documents)

    print("Fitting")
    classifier = svm.SVC(C=0.02, kernel='linear', probability=True)
    # A leave-one-out cross_val_score run used to be sketched here as
    # commented-out code; the classifier is simply fitted on all documents.
    classifier.fit(features, labels)
    return classifier
def cross_validator(n_folds, params, requests):
    """
    Run n-fold cross-validation and return the mean of the fold accuracies.

    :param n_folds- number of folds to evaluate
    :type n_folds- int
    :param params- training configuration handed to tf_train. The keys
        "X_train", "X_val", "y_train" and "y_val" are overwritten for every
        fold; "training_epochs", "n_hidden_1", "n_layers" and (when
        n_layers == 2) "n_hidden_2" are read for the summary line.
    :type params- dict
    :param requests- the annotated requests to split into folds
    :type requests- sliceable sequence (list or numpy array)

    Each fold holds out ``len(requests) // n_folds`` consecutive requests.
    When the length is not a multiple of n_folds, the trailing remainder is
    never held out and always stays in the training portion.

    raises ValueError if there are fewer requests than folds (an empty fold
    would otherwise never advance through the data).

    returns the sum of the values returned by tf_train divided by n_folds
    """
    X, y = get_features(requests)
    # NOTE(review): X and y are computed once, before the per-fold
    # generate_bow_features calls below - confirm that get_subsets/tf_train
    # do not need them recomputed after the features are regenerated.

    # Floor division keeps the fold size an integer on Python 2 and 3 alike.
    fold_length = len(requests) // n_folds
    if fold_length < 1:
        raise ValueError(
            "cannot split %d requests into %d folds"
            % (len(requests), n_folds))

    total_accuracy = 0.
    # Iterate exactly n_folds times so the divisor used for the average
    # always matches the number of folds that were actually evaluated.
    for fold in range(n_folds):
        start = fold * fold_length
        end = start + fold_length
        print("Fold # %d" % (fold + 1))

        # Training portion = everything outside [start, end).
        if start == 0:
            train_requests = requests[fold_length:]
        else:
            train_requests = np.concatenate((
                requests[0:start], requests[end:]))
        PolitenessFeatureVectorizer.generate_bow_features(train_requests)

        params["X_train"], params["X_val"] = get_subsets(X, start, fold_length)
        params["y_train"], params["y_val"] = get_subsets(y, start, fold_length)
        total_accuracy += tf_train(params)

    # take average of all accuracies
    average_accuracy = total_accuracy / n_folds
    print("****** Average Accuracy for all folds:  %s" % average_accuracy)

    # One-line summary of the configuration: epochs/hidden1[/hidden2]
    summary_parts = [str(params["training_epochs"]), str(params["n_hidden_1"])]
    if params["n_layers"] == 2:
        summary_parts.append(str(params["n_hidden_2"]))
    print("/".join(summary_parts))
    print("----------------------------------------")
    return average_accuracy
def _cross_validate_domain(domain, documents):
    """Print 10-fold cross-validation scores of a linear SVC on one domain."""
    # `bow` is not a parameter here; it is resolved from the enclosing
    # module scope, exactly as in the original inline code.
    PolitenessFeatureVectorizer.generate_bow_features(documents, bow)
    X, y = documents2feature_vectors(documents)
    print("Fitting")
    clf = svm.SVC(C=0.02, kernel='linear', probability=True)
    scores = cross_validation.cross_val_score(clf, X, y, cv=10)
    print("In domain for %s" % domain)
    print(scores)
    print("Mean: %s" % np.mean(scores))


def indomain(documents_stack, documents_wiki):
    """
    Report in-domain cross-validation results for both corpora.

    :param documents_stack- annotated documents of the "stack" domain
    :param documents_wiki- annotated documents of the "wiki" domain

    For each corpus in turn the bag-of-words features are regenerated from
    that corpus, the documents are vectorized, and the per-fold scores of a
    10-fold cross-validated linear SVC plus their mean are printed.

    returns None; all results go to standard output
    """
    print("In Domain")
    _cross_validate_domain("stack", documents_stack)
    print("------------------------------------------------------")
    _cross_validate_domain("wiki", documents_wiki)
def train_svm(documents, ntesting=500):
    """
    Fit a linear SVC on all but the last `ntesting` shuffled documents.

    :param documents- politeness-annotated training data
    :type documents- list of dicts
        each document must be preprocessed and have
        'sentences' and 'parses' and 'score' fields.
    :param ntesting- number of docs to reserve for testing
    :type ntesting- int

    Side effects: `documents` is shuffled in place, the held-out documents
    are pickled to "testing-data.p" in the current working directory, and a
    classification report for the held-out documents is printed (skipped
    when nothing is held out).

    raises ValueError if ntesting is negative

    returns fitted SVC, which can be serialized using cPickle
    """
    if ntesting < 0:
        raise ValueError("ntesting must be non-negative, got %r" % (ntesting,))

    # Generate and persist list of unigrams, bigrams
    # NOTE(review): this runs on the full list, before the held-out split -
    # confirm that including the test documents here is intended.
    PolitenessFeatureVectorizer.generate_bow_features(documents)

    # For good luck
    random.shuffle(documents)

    # An explicit split index avoids the `[-0:]` / `[:-0]` trap, where
    # ntesting=0 would hold out every document and train on none.
    split = max(len(documents) - ntesting, 0)
    testing = documents[split:]
    documents = documents[:split]

    # SAVE FOR NOW
    # Binary mode plus a context manager: the handle is always closed and
    # the pickle bytes are written untranslated on every platform.
    with open("testing-data.p", 'wb') as testing_file:
        cPickle.dump(testing, testing_file)

    X, y = documents2feature_vectors(documents)

    print("Fitting")
    clf = svm.SVC(C=0.02, kernel='linear', probability=True)
    clf.fit(X, y)

    # Test
    if len(testing) > 0:
        Xtest, ytest = documents2feature_vectors(testing)
        y_pred = clf.predict(Xtest)
        print(classification_report(ytest, y_pred))

    return clf
def _train_and_score(train_documents, test_documents, message):
    """Fit a linear SVC on one corpus and print its score on the other."""
    # The bag-of-words features come from the training corpus only; both
    # corpora are then vectorized against them. `bow` is resolved from the
    # enclosing module scope, exactly as in the original inline code.
    PolitenessFeatureVectorizer.generate_bow_features(train_documents, bow)
    X_train, y_train = documents2feature_vectors(train_documents)
    X_test, y_test = documents2feature_vectors(test_documents)

    print("Fitting")
    clf = svm.SVC(C=0.02, kernel='linear', probability=True)
    clf.fit(X_train, y_train)

    print(message)
    # clf.score predicts internally, so no separate clf.predict pass is
    # needed (the original computed one and never used the result).
    print(clf.score(X_test, y_test))
    print("------------------------------------------------------")


def crossdomain(documents_stack, documents_wiki):
    """
    Report cross-domain results in both directions.

    :param documents_stack- annotated documents of the "stack" domain
    :param documents_wiki- annotated documents of the "wiki" domain

    First trains on stack and scores on wiki, then trains on wiki and
    scores on stack. In each direction the bag-of-words features are
    regenerated from the training corpus before both corpora are
    vectorized.

    returns None; all results go to standard output
    """
    print("Cross Domain")
    _train_and_score(
        documents_stack, documents_wiki,
        "Trained on Stack and results predicted for wiki")
    _train_and_score(
        documents_wiki, documents_stack,
        "Trained on wiki and results predicted for stack")