Example #1
0
import numpy as np

from features.vectorizer import PolitenessFeatureVectorizer


def cross_validator(n_folds, params, requests):
    # get_features, get_subsets and tf_train are helpers defined
    # elsewhere in this module
    X, y = get_features(requests)

    total_accuracy = 0.
    count = 0
    # Integer fold size; any remainder at the end of `requests`
    # never serves as a validation fold
    fold_length = len(requests) // n_folds
    curr_fold = 1
    while count < len(requests) - fold_length + 1:
        print "Fold # %d" % curr_fold

        # Training set = everything outside the current validation fold
        if count == 0:
            train_requests = requests[fold_length:]
        else:
            end = count + fold_length
            train_requests = np.concatenate((
                requests[0:count], requests[end:]))
        # Rebuild the bag-of-words vocabulary from the training folds only
        PolitenessFeatureVectorizer.generate_bow_features(train_requests)

        params["X_train"], params["X_val"] = get_subsets(X, count, fold_length)
        params["y_train"], params["y_val"] = get_subsets(y, count, fold_length)

        count += fold_length
        curr_fold += 1
        total_accuracy += tf_train(params)
    # take average of all accuracies
    print "****** Average Accuracy for all folds: ", total_accuracy/n_folds
    # Log the hyper-parameter combination as epochs/hidden-layer sizes
    temp = str(params["training_epochs"])
    temp += "/" + str(params["n_hidden_1"])
    if params["n_layers"] == 2:
        temp += "/" + str(params["n_hidden_2"])
    print temp
    print "----------------------------------------"
    return total_accuracy/n_folds
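A minimal invocation sketch for cross_validator; the params keys mirror those read in the function body above, while the values and the requests corpus are illustrative assumptions:

params = {
    "training_epochs": 50,  # illustrative hyper-parameter values
    "n_layers": 2,
    "n_hidden_1": 128,
    "n_hidden_2": 64,
}
# `requests` is assumed to be the politeness-annotated corpus,
# loaded elsewhere
avg_acc = cross_validator(10, params, requests)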
Example #2
0
import json
import random

from sklearn import svm

from features.vectorizer import PolitenessFeatureVectorizer


def train_svm(documents):
    """
    :param documents - politeness-annotated training data
    :type documents - list of dicts;
        each document contains a 'text' field with its raw text

    returns a fitted SVC, which can be serialized using cPickle
    """
    # Generate and persist the list of unigrams and bigrams
    documents = PolitenessFeatureVectorizer.preprocess(documents)
    with open("features.json", "w") as w:
        json.dump(documents, w)
    print "DUMPED"

    PolitenessFeatureVectorizer.generate_bow_features(documents)

    # Shuffle so the training order is random
    random.shuffle(documents)
    X, y = documents2feature_vectors(documents)

    print "Fitting"
    clf = svm.SVC(C=0.02, kernel='linear', probability=True)
 #   loocv = LeaveOneOut()
 #   scores = cross_val_score(clf, X, y, cv=loocv)
    clf.fit(X, y)

#    print(scores.mean())
#    print scores

    return clf
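The commented-out lines above sketch a leave-one-out evaluation; run standalone it would look roughly like this (a minimal sketch; LeaveOneOut and cross_val_score live in sklearn.model_selection in current scikit-learn):

from sklearn import svm
from sklearn.model_selection import LeaveOneOut, cross_val_score

def loocv_accuracy(X, y):
    # One fold per document: unbiased but slow on large corpora
    clf = svm.SVC(C=0.02, kernel='linear', probability=True)
    scores = cross_val_score(clf, X, y, cv=LeaveOneOut())
    return scores.mean()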
Example #3
0
import random
import cPickle

from sklearn import svm
from sklearn.metrics import classification_report

from features.vectorizer import PolitenessFeatureVectorizer


def train_svm(documents, ntesting=500):
    """
    :param documents - politeness-annotated training data
    :type documents - list of dicts;
        each document must be preprocessed and contain
        'sentences', 'parses' and 'score' fields

    :param ntesting - number of docs to reserve for testing
    :type ntesting - int

    returns a fitted SVC, which can be serialized using cPickle
    """
    # Generate and persist the list of unigrams and bigrams
    PolitenessFeatureVectorizer.generate_bow_features(documents)

    # Shuffle before splitting so the held-out set is random
    random.shuffle(documents)
    testing = documents[-ntesting:]
    documents = documents[:-ntesting]

    # Save the held-out set for later evaluation
    # (binary mode so the pickle is portable)
    with open("testing-data.p", "wb") as f:
        cPickle.dump(testing, f)

    X, y = documents2feature_vectors(documents)
    Xtest, ytest = documents2feature_vectors(testing)

    print "Fitting"
    clf = svm.SVC(C=0.02, kernel='linear', probability=True)
    clf.fit(X, y)

    # Test
    y_pred = clf.predict(Xtest)
    print(classification_report(ytest, y_pred))

    return clf
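Per the docstring, the fitted SVC can be persisted with cPickle; a minimal round-trip sketch, assuming `documents` is the preprocessed corpus (the filename matches the politeness-svm.p used by the scoring module below):

import cPickle

# `documents` is assumed to be loaded and preprocessed elsewhere
clf = train_svm(documents, ntesting=500)
with open("politeness-svm.p", "wb") as f:
    cPickle.dump(clf, f)
# ...later, at scoring time:
with open("politeness-svm.p", "rb") as f:
    clf = cPickle.load(f)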
Example #4
0
def indomain(documents_stack, documents_wiki):
    print "In Domain"

    # `bow` is assumed to be a module-level bag-of-words configuration
    PolitenessFeatureVectorizer.generate_bow_features(documents_stack, bow)

    X_stack, y_stack = documents2feature_vectors(documents_stack)
    
    print "Fitting"
    clf = svm.SVC(C=0.02, kernel='linear', probability=True)
    scores = cross_validation.cross_val_score(clf, X_stack, y_stack, cv=10)
    print "In doman for stack"
    print scores
    print np.mean(scores)

    print "------------------------------------------------------"

    PolitenessFeatureVectorizer.generate_bow_features(documents_wiki, bow)
    X_wiki, y_wiki = documents2feature_vectors(documents_wiki)
    print "Fitting"
    clf = svm.SVC(C=0.02, kernel='linear', probability=True)
    scores = cross_validation.cross_val_score(clf, X_wiki, y_wiki, cv=10)
    print "In doman for wiki"
    print scores

    print "Mean: ", np.mean(scores)
Example #5
0
import numpy as np
from scipy.sparse import csr_matrix

from features.vectorizer import PolitenessFeatureVectorizer


def documents2feature_vectors(documents):
    vectorizer = PolitenessFeatureVectorizer()
    fks = None
    X, y = [], []
    for d in documents:
        fs = vectorizer.features(d)
        if fks is None:
            # Fix a canonical feature ordering from the first document
            fks = sorted(fs.keys())
        fv = [fs[f] for f in fks]
        # If politeness score > 0.0, the doc is polite, class=1
        label = 1 if d['score'] > 0.0 else 0
        X.append(fv)
        y.append(label)
    X = csr_matrix(np.asarray(X))
    y = np.asarray(y)
    return X, y
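A minimal usage sketch; the document below reuses the sample sentences from the scoring docstring later in this file, with the parse lists truncated for brevity and the score value illustrative:

doc = {
    'sentences': ["Have you found the answer for your question?",
                  "If yes would you please share it?"],
    'parses': [["root(ROOT-0, found-3)"], ["root(ROOT-0, would-3)"]],
    'score': 0.5,  # > 0.0, so this document gets class 1 (polite)
}
X, y = documents2feature_vectors([doc])  # X: 1 x n_features csr_matrix, y: array([1])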
Example #7
0
import numpy as np

from features.vectorizer import PolitenessFeatureVectorizer


def get_features(requests):
    vectorizer = PolitenessFeatureVectorizer()
    fks = None
    X, y = [], []
    for req in requests:
        # Unigram/bigram features + politeness strategy features for this
        # document; the vectorizer returns a {feature-name: 0/1} dict
        fs = vectorizer.features(req)
        if fks is None:
            # Fix a canonical feature ordering from the first document
            fks = sorted(fs.keys())
        # Flatten the dict into an ordered feature vector
        fv = [fs[k] for k in fks]
        # If politeness score > 0.0, the doc is polite, class = 1
        label = 1 if req['score'] > 0.0 else 0
        X.append(fv)
        y.append(label)
    # Dense feature matrix (the sparse variant is kept for reference)
    # X = csr_matrix(np.asarray(X))
    X = np.asarray(X)
    # One-hot encode the labels for the downstream classifier:
    # [1, 0] = impolite, [0, 1] = polite
    y = np.asarray(y)
    y_ = np.zeros((len(y), 2))
    for i in range(len(y)):
        if y[i] == 1:
            y_[i][1] = 1
        else:
            y_[i][0] = 1
    y = y_
    return X, y
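The label-encoding loop at the end of get_features can be written as one vectorized step; an equivalent sketch, assuming y holds 0/1 integers as above:

import numpy as np

y = np.asarray([1, 0, 1])
y_onehot = np.eye(2)[y]  # row i is [0., 1.] if y[i] == 1 else [1., 0.]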
Example #8
0
def crossdomain(documents_stack, documents_wiki):
    print "Cross Domain"
    # documents_stack = stack_data.values()
    # documents_wiki = wiki_data.values()
    # Build the BOW vocabulary from the training (stack) domain;
    # `bow` is assumed to be defined at module scope
    PolitenessFeatureVectorizer.generate_bow_features(documents_stack, bow)

    X_stack, y_stack = documents2feature_vectors(documents_stack)
    X_wiki, y_wiki = documents2feature_vectors(documents_wiki)

    print "Fitting"
    clf = svm.SVC(C=0.02, kernel='linear', probability=True)
    # clf = RandomForestClassifier(n_estimators=50)
    clf.fit(X_stack, y_stack)
    y_pred = clf.predict(X_wiki)
    print "Trained on Stack and results predicted for wiki" 
    # Test
    #print(classification_report(y_wiki, y_pred))
    print(clf.score(X_wiki, y_wiki))

    print "------------------------------------------------------"

    # Rebuild the BOW vocabulary from the wiki domain for the reverse direction
    PolitenessFeatureVectorizer.generate_bow_features(documents_wiki, bow)

    X_stack, y_stack = documents2feature_vectors(documents_stack)
    X_wiki, y_wiki = documents2feature_vectors(documents_wiki)

    print "Fitting"
    clf = svm.SVC(C=0.02, kernel='linear', probability=True)
    # clf = RandomForestClassifier(n_estimators=50)
    clf.fit(X_wiki, y_wiki)
    y_pred = clf.predict(X_stack)
    print "Trained on wiki and results predicted for stack" 
    # Test
    #print(classification_report(y_stack, y_pred))
    print(clf.score(X_stack, y_stack))
    print "------------------------------------------------------"
Example #9
0
            % (name, expected_v, package.__version__))

####

from features.vectorizer import PolitenessFeatureVectorizer

####
# Serialized model filename

MODEL_FILENAME = os.path.join(os.path.split(__file__)[0], 'politeness-svm.p')

####
# Load model, initialize vectorizer

clf = cPickle.load(open(MODEL_FILENAME, 'rb'))
vectorizer = PolitenessFeatureVectorizer()


def score(request):
    """
    :param request - The request document to score
    :type request - dict with 'sentences' and 'parses' fields
        sample (taken from test2.py)--
        {
            'sentences': [
                "Have you found the answer for your question?", 
                "If yes would you please share it?"
            ],
            'parses': [
                ["csubj(found-3, Have-1)", "dobj(Have-1, you-2)", "root(ROOT-0, found-3)", "det(answer-5, the-4)", "dobj(found-3, answer-5)", "poss(question-8, your-7)", "prep_for(found-3, question-8)"], 
                ["prep_if(would-3, yes-2)", "root(ROOT-0, would-3)", "nsubj(would-3, you-4)", "ccomp(would-3, please-5)", "nsubj(it-7, share-6)", "xcomp(please-5, it-7)"]
Example #10
0
    sys.exit(2)

####

from features.vectorizer import PolitenessFeatureVectorizer

####
# Serialized model filename

MODEL_FILENAME = os.path.join(os.path.split(__file__)[0], 'politeness-svm.p')

####
# Load model, initialize vectorizer

clf = cPickle.load(open(MODEL_FILENAME, 'rb'))
vectorizer = PolitenessFeatureVectorizer()


def score(request):
    """
    :param request - The request document to score
    :type request - dict with a 'text' field
        sample --
        {
            'text': [
                "Have you found the answer for your question? If yes would you please share it?"
            ],
        }

    returns class probabilities as a dict
        {
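A minimal call sketch for this score variant, reusing the sample from its docstring; the keys of the returned probability dict are truncated above, so 'polite'/'impolite' here are an assumption:

doc = {
    'text': [
        "Have you found the answer for your question? If yes would you please share it?"
    ],
}
probs = score(doc)
print probs  # e.g. {'polite': 0.63, 'impolite': 0.37} -- assumed key names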
Example #11
0
# from test_documents import TEST_DOCUMENTS
import json
from nltk import word_tokenize, sent_tokenize
from features.vectorizer import PolitenessFeatureVectorizer
from dependency_parse import get_parses
import pprint
from run_model import score_text

vectorizer = PolitenessFeatureVectorizer()
TEST_DOCUMENTS = []
TEST_DOCUMENTS.append(
    get_parses(
        "Have you found the answer for your question? If yes would you please share it?"
    ))
TEST_DOCUMENTS.append(
    get_parses("So you think this is correct. Are you sure?"))
TEST_DOCUMENTS.append(
    get_parses(
        "Sorry :) I dont want to hack the system!! :) is there another way?"))
TEST_DOCUMENTS.append(
    get_parses(
        "What are you trying to do?  Why can't you just store the \"Range\"?"))
TEST_DOCUMENTS.append(
    get_parses("That is weird.  Why can't you just store the \"Range\"?"))
TEST_DOCUMENTS.append(
    get_parses(
        "This was supposed to have been moved to &lt;url&gt; per the cfd. why wasn't it moved?"
    ))
TEST_DOCUMENTS.append(get_parses("You are wrong. But the approach is correct"))

# TEST_DOCUMENTS.append(get_parses(""))
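score_text and pprint are imported above but never used in this snippet; a minimal driver, assuming score_text accepts one parsed document as returned by get_parses:

for doc in TEST_DOCUMENTS:
    # Print each test document's politeness class probabilities
    pprint.pprint(score_text(doc))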