Example #1
def validate(test_set):
    """Evaluate the trained model on a labeled test set."""
    win_size = 10
    ep_test, _ = get_labeled(test_set, True)
    # Build a context window around each tagged name and collect labels.
    X_test = [window(ep.description, win_size) for ep in ep_test]
    y_true = [ep.guest for ep in ep_test]

    # Encode the text the same way as at training time
    # (tokenizer, model, and model_path are module-level globals).
    X_test = tokenizer.texts_to_sequences(X_test)
    X_test = pad_sequences(X_test, maxlen=2 * win_size + 1)

    y_predicted = model.predict(X_test)

    # Round the sigmoid outputs to hard 0/1 labels before logging.
    log_results(np.rint(y_predicted), y_true, model_path)
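For context, a hedged sketch of how validate might be driven. The pickled (model, tokenizer) pair mirrors the loading code in Example #5; the file names and the format of the test_set argument are assumptions, not shown in these snippets.

# Hypothetical driver; file names are made up.
import pickle

with open("model.pickle", "rb") as f:        # same format as Example #5
    model, tokenizer = pickle.load(f)
model_path = "model.pickle"                  # used by log_results above

validate("episodes_test.json")               # hypothetical test-set file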
Example #2
from sklearn import linear_model
from sklearn.feature_extraction.text import TfidfVectorizer

def train_and_validate(ep_train, ep_test):
    # Build context windows and labels for both splits.
    X_train = [window(ep.description, 10) for ep in ep_train]
    y_train = [ep.guest for ep in ep_train]
    X_test = [window(ep.description, 10) for ep in ep_test]
    y_test = [ep.guest for ep in ep_test]

    print("Fitting on corpus")
    vec = TfidfVectorizer()
    vec.fit(X_train)

    print("Transforming")
    X_train = vec.transform(X_train)
    X_test = vec.transform(X_test)
    classifier = linear_model.LogisticRegression(
        penalty="l2", dual=True, solver="liblinear"
    )
    print("Fitting model")
    model = classifier.fit(X_train, y_train)

    print("Predicting")
    y_predicted = model.predict(X_test)

    # `comment` is expected to be set at module level.
    log_results(y_predicted, y_test, comment)
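A minimal sketch of how this tf-idf baseline might be invoked, using the same 80/20 split as make_xy in Example #3. The load_episodes helper and the comment string are assumptions.

# Hypothetical driver; load_episodes() is assumed, not shown in the snippets.
import random

episodes = load_episodes("episodes.json")    # hypothetical loader
random.shuffle(episodes)
split = int(len(episodes) * 0.8)
comment = "tf-idf + logistic regression baseline"
train_and_validate(episodes[:split], episodes[split:])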
Example #3
def make_xy(episodes):
    # Shuffle, then hold out the last 20% for validation.
    random.shuffle(episodes)
    split_index = int(len(episodes) * 0.8)

    # `win_size` is expected to be set at module level.
    X = [window(ep.description, win_size) for ep in episodes]
    # Alternative feature: X = [ep.focus_sentence() for ep in episodes]
    y = [ep.guest for ep in episodes]
    episodes = None  # release the episode list once features are built

    X_train = X[:split_index]
    y_train = y[:split_index]

    X_val = X[split_index:]
    y_val = y[split_index:]
    return X_train, y_train, X_val, y_val
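The window helper is imported from the episode module (see Example #5) but never shown. A hedged sketch of what it plausibly does, inferred only from pad_sequences(..., maxlen=2 * win_size + 1) in the other snippets: keep up to win_size tokens on each side of a focus token. The focus-token convention below is an assumption.

# Hedged sketch only: the real episode.window() is not in these snippets.
# Assumption: the description marks the name of interest with a placeholder
# such as "<name>", and the window keeps +/- win_size tokens around it
# (hence maxlen = 2 * win_size + 1 after padding).
def window(description, win_size, focus_token="<name>"):
    tokens = description.split()
    try:
        i = tokens.index(focus_token)
    except ValueError:
        i = len(tokens) // 2          # fall back to the middle of the text
    lo = max(0, i - win_size)
    hi = i + win_size + 1
    return " ".join(tokens[lo:hi])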
Example #4
def get_best(episodes, model, tokenizer, n):
    """
    Predict unlabeled samples and return the most confident ones,
    up to n per class, with their labels hardened to 0 or 1.
    """
    # `win_size` and `max_length` are expected at module level.
    X = [window(ep.description, win_size) for ep in episodes]
    X = tokenizer.texts_to_sequences(X)
    X = pad_sequences(X, maxlen=max_length)
    y_predicted = model.predict(X)
    X = None  # release the encoded batch

    # Attach the predicted probability to each episode.
    for ep, y in zip(episodes, y_predicted):
        ep.guest = float(y[0])

    # Keep at most n high-confidence episodes per class and harden
    # their labels; everything else goes back into the unlabeled pool.
    best = []
    non_best = []
    n_T = 0  # topics accepted
    n_G = 0  # guests accepted
    for ep in episodes:
        if ep.guest > 0.95 and n_G < n:
            n_G += 1
            ep.guest = 1
            best.append(ep)
        elif ep.guest < 0.05 and n_T < n:
            n_T += 1
            ep.guest = 0
            best.append(ep)
        else:
            non_best.append(ep)

    random.shuffle(best)
    random.shuffle(non_best)

    return best, non_best
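get_best is the selection step of a self-training loop: confidently predicted episodes are pseudo-labeled and fed back into training. A hedged sketch of that outer loop; the loaders, the retrain step, and the round and batch counts are assumptions, not the author's code.

# Hypothetical self-training loop around get_best().
unlabeled = load_unlabeled("episodes_unlabeled.json")  # hypothetical loader
labeled = load_labeled("episodes_labeled.json")        # hypothetical loader

for _ in range(5):                       # assumed number of rounds
    pseudo, unlabeled = get_best(unlabeled, model, tokenizer, n=100)
    if not pseudo:
        break                            # nothing confident left to add
    labeled.extend(pseudo)
    model = retrain(model, labeled)      # hypothetical retraining step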
Example #5
"""
Used for demonstrating the model by typing made-up samples.
"""
import sys
import pickle

import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from tag_names import tag_names
from episode import Episode, window

model_file_name = sys.argv[1]

with open(model_file_name, 'rb') as f:
    model, tokenizer = pickle.load(f)

win_size = 10
while True:
    text = input("Type an episode description: \n")
    # Mark person names in the raw text, then score each tagged name.
    tagged_text = tag_names(text)
    ep = Episode("", tagged_text)
    for name_text, name in ep.tokenize():
        name_text = window(name_text, win_size)
        name_text = tokenizer.texts_to_sequences([name_text])
        name_text = pad_sequences(name_text, maxlen=2 * win_size + 1)
        y = model.predict(name_text)
        if y[0][0] > 0.5:
            print(name, "is a Guest", str(np.round(y[0][0] * 100)), "%")
        else:
            print(name, "is a Topic", str(np.round((1 - y[0][0]) * 100)), "%")
    print()
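The same scoring pipeline can be exercised non-interactively, which is handy for smoke tests. A minimal sketch, assuming the pickled pair loaded above; the sample description is made up.

# Non-interactive variant of the demo loop; the sample text is invented.
sample = tag_names("This week we talk to Jane Doe about beekeeping.")
ep = Episode("", sample)
for snippet, name in ep.tokenize():
    seq = tokenizer.texts_to_sequences([window(snippet, 10)])
    seq = pad_sequences(seq, maxlen=21)   # 2 * win_size + 1 with win_size=10
    print(name, float(model.predict(seq)[0][0]))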