def cross_validator(n_folds, params, requests):
    """
    Run contiguous-block cross-validation and return the mean fold accuracy.

    The requests are cut into consecutive validation blocks of
    ``len(requests) // n_folds`` items.  For every block the bag-of-words
    features are regenerated from the remaining requests, the train /
    validation subsets are written into ``params`` and ``tf_train(params)``
    is called; its return value is accumulated as that fold's accuracy.

    :param n_folds- requested number of folds (determines the block size)
    :type n_folds- int
    :param params- training configuration handed to ``tf_train``.  The keys
        "X_train", "X_val", "y_train" and "y_val" are overwritten in place on
        every fold; "training_epochs", "n_hidden_1", "n_layers" and (when
        "n_layers" == 2) "n_hidden_2" are read for the summary line.
    :type params- dict
    :param requests- request documents; must support ``len`` and slicing
    :type requests- sequence

    returns the mean of the ``tf_train`` results over the folds that ran

    raises ValueError when the block size would be smaller than one request
    (previously this case looped forever)
    """
    # Explicit floor division: identical to "/" on ints in Python 2, and
    # still an int if the module is ever run with true division.
    fold_length = len(requests) // n_folds
    if fold_length < 1:
        # A zero/negative step never advances `count`, so the loop below
        # would never terminate.
        raise ValueError(
            "cannot split %d requests into %d folds"
            % (len(requests), n_folds))

    # NOTE(review): X and y are computed once, before any per-fold call to
    # generate_bow_features; whether the per-fold vocabulary is meant to
    # influence them cannot be seen from this function -- confirm.
    X, y = get_features(requests)

    total_accuracy = 0.
    folds_run = 0
    count = 0
    # Only full-size blocks are validated; a trailing remainder shorter than
    # fold_length is used for training only.
    while count < len(requests) - fold_length + 1:
        print("Fold # %d" % (folds_run + 1))

        if count == 0:
            # First block: everything after it is training data.
            train_requests = requests[fold_length:]
        else:
            end = count + fold_length
            train_requests = np.concatenate((
                requests[0:count], requests[end:]))

        PolitenessFeatureVectorizer.generate_bow_features(train_requests)
        params["X_train"], params["X_val"] = get_subsets(
            X, count, fold_length)
        params["y_train"], params["y_val"] = get_subsets(
            y, count, fold_length)

        count += fold_length
        folds_run += 1
        total_accuracy += tf_train(params)

    # Average over the folds that actually ran.  This can differ from
    # n_folds (e.g. 10 requests with n_folds=4 gives 5 blocks of 2), in which
    # case dividing by n_folds would overstate the accuracy.
    average_accuracy = total_accuracy / folds_run

    # "%s %s" reproduces the space-separated output of the Python 2
    # two-argument print statement.
    print("%s %s" % ("****** Average Accuracy for all folds: ",
                     average_accuracy))

    # One-line summary of the network configuration: epochs/hidden sizes.
    summary = str(params["training_epochs"])
    summary += "/" + str(params["n_hidden_1"])
    if params["n_layers"] == 2:
        summary += "/" + str(params["n_hidden_2"])
    print(summary)
    print("----------------------------------------")

    return average_accuracy
def train_svm(documents):
    """
    Preprocess the documents, cache them on disk and fit a linear SVM.

    :param documents- politeness-annotated training data
    :type documents- list of dicts
        each document contains a 'text' field with the text of it.

    returns fitted SVC, which can be serialized using cPickle
    """
    # Preprocess first and keep a JSON copy of the result in the working
    # directory ("features.json").
    preprocessed = PolitenessFeatureVectorizer.preprocess(documents)
    with open("features.json", "w") as json_file:
        json.dump(preprocessed, json_file)
    print("DUMPED")

    # Build the unigram / bigram feature lists from the preprocessed data.
    PolitenessFeatureVectorizer.generate_bow_features(preprocessed)

    # Shuffle in place before vectorizing ("for good luck").
    random.shuffle(preprocessed)
    features, labels = documents2feature_vectors(preprocessed)

    print("Fitting")
    classifier = svm.SVC(C=0.02, kernel='linear', probability=True)
    classifier.fit(features, labels)
    return classifier
def train_svm(documents, ntesting=500):
    """
    Fit a linear SVM on all but the last `ntesting` shuffled documents and
    print a classification report for the held-out ones.

    :param documents- politeness-annotated training data
    :type documents- list of dicts
        each document must be preprocessed and
        'sentences' and 'parses' and 'score' fields.
        The list is shuffled in place.

    :param ntesting- number of docs to reserve for testing;
        0 trains on everything and skips the evaluation step
    :type ntesting- int

    returns fitted SVC, which can be serialized using cPickle

    raises ValueError if ntesting is negative
    """
    if ntesting < 0:
        raise ValueError("ntesting must be >= 0, got %r" % (ntesting,))

    # Generate and persist list of unigrams, bigrams
    PolitenessFeatureVectorizer.generate_bow_features(documents)

    # For good luck
    random.shuffle(documents)

    # Split by explicit index.  Slicing with -ntesting breaks for
    # ntesting == 0: documents[-0:] is the whole list and documents[:-0] is
    # empty, i.e. everything would be held out and nothing trained on.
    split = max(len(documents) - ntesting, 0)
    testing = documents[split:]
    documents = documents[:split]

    # SAVE FOR NOW
    # Context manager so the handle is closed (and flushed) deterministically;
    # binary mode is what the pickle documentation recommends.
    with open("testing-data.p", 'wb') as testing_file:
        cPickle.dump(testing, testing_file)

    X, y = documents2feature_vectors(documents)
    if testing:
        Xtest, ytest = documents2feature_vectors(testing)

    print("Fitting")
    clf = svm.SVC(C=0.02, kernel='linear', probability=True)
    clf.fit(X, y)

    # Test (only when something was actually held out)
    if testing:
        y_pred = clf.predict(Xtest)
        print(classification_report(ytest, y_pred))

    return clf
def indomain(documents_stack, documents_wiki):
    """
    Print 10-fold cross-validation scores of a linear SVM within each domain.

    For each document set in turn (stack first, then wiki) the bag-of-words
    features are regenerated from that set, the documents are vectorized and
    a fresh SVC is scored with ``cross_validation.cross_val_score(cv=10)``.
    Results are only printed; nothing is returned.

    :param documents_stack- documents of the "stack" domain
    :param documents_wiki- documents of the "wiki" domain
    """
    # NOTE(review): `bow` is not defined in this function; presumably a
    # module-level setting -- confirm.
    print("In Domain")

    # ---- stack domain ----
    PolitenessFeatureVectorizer.generate_bow_features(documents_stack, bow)
    stack_features, stack_labels = documents2feature_vectors(documents_stack)
    print("Fitting")
    stack_model = svm.SVC(C=0.02, kernel='linear', probability=True)
    stack_scores = cross_validation.cross_val_score(
        stack_model, stack_features, stack_labels, cv=10)
    print("In doman for stack")
    print(stack_scores)
    print(np.mean(stack_scores))
    print("------------------------------------------------------")

    # ---- wiki domain ----
    PolitenessFeatureVectorizer.generate_bow_features(documents_wiki, bow)
    wiki_features, wiki_labels = documents2feature_vectors(documents_wiki)
    print("Fitting")
    wiki_model = svm.SVC(C=0.02, kernel='linear', probability=True)
    wiki_scores = cross_validation.cross_val_score(
        wiki_model, wiki_features, wiki_labels, cv=10)
    print("In doman for wiki")
    print(wiki_scores)
    # "%s %s" reproduces the space-separated output of the Python 2
    # two-argument print statement.
    print("%s %s" % ("Mean: ", np.mean(wiki_scores)))
def documents2feature_vectors(documents):
    """
    Turn documents into a sparse feature matrix and a binary label vector.

    Every document is passed through ``PolitenessFeatureVectorizer.features``;
    the sorted feature names of the first document that yields any fix the
    column order for all rows.  A document is labelled 1 (polite) when its
    'score' is greater than 0.0 and 0 otherwise.

    :param documents- iterable of dicts, each with a 'score' entry
    returns (X, y): X is a csr_matrix with one row per document,
        y a numpy array of 0/1 labels
    """
    extractor = PolitenessFeatureVectorizer()
    feature_names = None
    rows, labels = [], []

    for doc in documents:
        doc_features = extractor.features(doc)
        if not feature_names:
            # Column order: alphabetical feature names.
            feature_names = sorted(doc_features.keys())
        row = [doc_features[name] for name in feature_names]

        # If politeness score > 0.0, the doc is polite, class=1
        if doc['score'] > 0.0:
            label = 1
        else:
            label = 0

        rows.append(row)
        labels.append(label)

    return csr_matrix(np.asarray(rows)), np.asarray(labels)
def get_features(requests):
    """
    Turn requests into a dense feature matrix and one-hot politeness labels.

    Every request is passed through ``PolitenessFeatureVectorizer.features``
    (described in the original comments as a {feature-name: bool_value}
    dict); the sorted feature names of the first request that yields any fix
    the column order for all rows.

    :param requests- iterable of dicts, each with a 'score' entry
    returns (X, y): X is a dense numpy array with one row per request,
        y is a (len(requests), 2) float array where column 1 is set for
        score > 0.0 (polite) and column 0 is set otherwise
    """
    extractor = PolitenessFeatureVectorizer()
    feature_names = None
    rows, classes = [], []

    for request in requests:
        # unigram, bigram features + politeness strategy features
        # for this specific document
        request_features = extractor.features(request)
        if not feature_names:
            feature_names = sorted(request_features.keys())
        rows.append([request_features[name] for name in feature_names])

        # If politeness score > 0.0, the doc is polite, class = 1
        classes.append(1 if request['score'] > 0.0 else 0)

    # Dense on purpose (the sparse csr_matrix variant was disabled in the
    # original code).
    X = np.asarray(rows)

    # One-hot encode: row i gets a 1 in the column named by its class.
    class_ids = np.asarray(classes, dtype=int)
    one_hot = np.zeros((len(class_ids), 2))
    one_hot[np.arange(len(class_ids)), class_ids] = 1

    return X, one_hot
def _train_and_score_svc(X_train, y_train, X_test, y_test):
    """Fit a fresh linear SVC on the training data and return the value of
    its ``score`` method on the test data."""
    clf = svm.SVC(C=0.02, kernel='linear', probability=True)
    # clf = RandomForestClassifier(n_estimators=50)
    clf.fit(X_train, y_train)
    return clf.score(X_test, y_test)


def crossdomain(documents_stack, documents_wiki):
    """
    Print cross-domain scores of a linear SVM in both directions.

    First the bag-of-words features are generated from the stack documents,
    a classifier is trained on stack and scored on wiki; then the features
    are regenerated from the wiki documents and the roles are swapped.
    Results are only printed; nothing is returned.

    :param documents_stack- documents of the "stack" domain
    :param documents_wiki- documents of the "wiki" domain
    """
    # NOTE(review): `bow` is not defined in this function; presumably a
    # module-level setting -- confirm.
    print("Cross Domain")

    # ---- train on stack, evaluate on wiki ----
    PolitenessFeatureVectorizer.generate_bow_features(documents_stack, bow)
    X_stack, y_stack = documents2feature_vectors(documents_stack)
    X_wiki, y_wiki = documents2feature_vectors(documents_wiki)
    print("Fitting")
    # The former standalone clf.predict() call fed only an unused variable
    # and repeated the prediction that score() performs itself, so it has
    # been dropped.
    wiki_score = _train_and_score_svc(X_stack, y_stack, X_wiki, y_wiki)
    print("Trained on Stack and results predicted for wiki")
    print(wiki_score)
    print("------------------------------------------------------")

    # ---- train on wiki, evaluate on stack ----
    # Both sets are vectorized again after the bag-of-words features have
    # been regenerated from the wiki documents.
    PolitenessFeatureVectorizer.generate_bow_features(documents_wiki, bow)
    X_stack, y_stack = documents2feature_vectors(documents_stack)
    X_wiki, y_wiki = documents2feature_vectors(documents_wiki)
    print("Fitting")
    stack_score = _train_and_score_svc(X_wiki, y_wiki, X_stack, y_stack)
    print("Trained on wiki and results predicted for stack")
    print(stack_score)
    print("------------------------------------------------------")
% (name, expected_v, package.__version__)) #### from features.vectorizer import PolitenessFeatureVectorizer #### # Serialized model filename MODEL_FILENAME = os.path.join(os.path.split(__file__)[0], 'politeness-svm.p') #### # Load model, initialize vectorizer clf = cPickle.load(open(MODEL_FILENAME)) vectorizer = PolitenessFeatureVectorizer() def score(request): """ :param request - The request document to score :type request - dict with 'sentences' and 'parses' field sample (taken from test2.py)-- { 'sentences': [ "Have you found the answer for your question?", "If yes would you please share it?" ], 'parses': [ ["csubj(found-3, Have-1)", "dobj(Have-1, you-2)", "root(ROOT-0, found-3)", "det(answer-5, the-4)", "dobj(found-3, answer-5)", "poss(question-8, your-7)", "prep_for(found-3, question-8)"], ["prep_if(would-3, yes-2)", "root(ROOT-0, would-3)", "nsubj(would-3, you-4)", "ccomp(would-3, please-5)", "nsubj(it-7, share-6)", "xcomp(please-5, it-7)"]
sys.exit(2) #### from features.vectorizer import PolitenessFeatureVectorizer #### # Serialized model filename MODEL_FILENAME = os.path.join(os.path.split(__file__)[0], 'politeness-svm.p') #### # Load model, initialize vectorizer clf = cPickle.load(open(MODEL_FILENAME)) vectorizer = PolitenessFeatureVectorizer() def score(request): """ :param request - The request document to score :type requests - list of dict with 'text' field sample -- { 'text': [ "Have you found the answer for your question? If yes would you please share it?" ], } returns class probabilities as a dict {
# from test_documents import TEST_DOCUMENTS
import json
from nltk import word_tokenize, sent_tokenize
from features.vectorizer import PolitenessFeatureVectorizer
from dependency_parse import get_parses
import pprint
from run_model import score_text

vectorizer = PolitenessFeatureVectorizer()

# Raw request texts used as fixtures; each one is run through get_parses()
# below, in this order.
_SAMPLE_REQUESTS = (
    "Have you found the answer for your question? If yes would you please share it?",
    "So you think this is correct. Are you sure?",
    "Sorry :) I dont want to hack the system!! :) is there another way?",
    "What are you trying to do? Why can't you just store the \"Range\"?",
    "That is weird. Why can't you just store the \"Range\"?",
    "This was supposed to have been moved to <url> per the cfd. why wasn't it moved?",
    "You are wrong. But the approach is correct",
    # "",
)

# Parsed documents, one per sample request.
TEST_DOCUMENTS = []
for _request_text in _SAMPLE_REQUESTS:
    TEST_DOCUMENTS.append(get_parses(_request_text))