# Module-level imports used by the snippets below.
import random

import numpy as np
from keras.preprocessing.sequence import pad_sequences


def validate(test_set):
    """Evaluate the trained model on a labeled test set.

    Relies on `model`, `tokenizer`, `model_path`, `get_labeled`, `window`
    and `log_results` defined elsewhere in the project.
    """
    win_size = 10
    ep_test, _ = get_labeled(test_set, True)
    # Build a fixed-size token window around each tagged name.
    X_test = [window(ep.description, win_size) for ep in ep_test]
    y_true = [ep.guest for ep in ep_test]
    X_test = tokenizer.texts_to_sequences(X_test)
    X_test = pad_sequences(X_test, maxlen=2 * win_size + 1)
    y_predicted = model.predict(X_test)
    # Round the sigmoid outputs to hard 0/1 labels before logging.
    log_results(np.rint(y_predicted), y_true, model_path)
from sklearn import linear_model
from sklearn.feature_extraction.text import TfidfVectorizer


def train_and_validate(ep_train, ep_test, comment=""):
    """Train the TF-IDF + logistic regression baseline and log test results.

    `comment` is a free-text tag passed through to `log_results`.
    """
    X_train = [window(ep.description, 10) for ep in ep_train]
    y_train = [ep.guest for ep in ep_train]
    X_test = [window(ep.description, 10) for ep in ep_test]
    y_test = [ep.guest for ep in ep_test]
    print("Fitting on corpus")
    vec = TfidfVectorizer()
    vec.fit(X_train)
    print("Transforming")
    X_train = vec.transform(X_train)
    X_test = vec.transform(X_test)
    classifier = linear_model.LogisticRegression(
        penalty="l2", dual=True, solver="liblinear"
    )
    print("Fitting model")
    model = classifier.fit(X_train, y_train)
    print("Predicting")
    y_predicted = model.predict(X_test)
    log_results(y_predicted, y_test, comment)
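# A minimal usage sketch for the baseline above, assuming labeled episodes
# come from the project's get_labeled() helper. The file name and the 80/20
# split here are illustrative placeholders, not the project's actual setup.
if __name__ == "__main__":
    episodes, _ = get_labeled("episodes_labeled.json", True)  # assumed input file
    split = int(len(episodes) * 0.8)
    train_and_validate(
        episodes[:split], episodes[split:], comment="tfidf+logreg baseline"
    )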
def make_xy(episodes):
    """Shuffle the labeled episodes and split them 80/20 into train/validation."""
    random.shuffle(episodes)
    split_index = int(len(episodes) * 0.8)
    # `win_size` is module-level; an alternative representation is kept below.
    X = [window(ep.description, win_size) for ep in episodes]
    # X = [ep.focus_sentence() for ep in episodes]
    y = [ep.guest for ep in episodes]
    episodes = None  # free the episode objects; only X and y are needed now
    X_train = X[:split_index]
    y_train = y[:split_index]
    X_val = X[split_index:]
    y_val = y[split_index:]
    return X_train, y_train, X_val, y_val
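# Sketch of how make_xy's output might be wired into the Keras preprocessing
# pipeline that validate() and get_best() assume. `labeled_episodes` is a
# placeholder for whatever list the caller has; Tokenizer, fit_on_texts,
# texts_to_sequences and pad_sequences are the real Keras APIs used above.
from keras.preprocessing.text import Tokenizer

X_train, y_train, X_val, y_val = make_xy(labeled_episodes)
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
X_train = pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen=2 * win_size + 1)
X_val = pad_sequences(tokenizer.texts_to_sequences(X_val), maxlen=2 * win_size + 1)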
def get_best(episodes, model, tokenizer, n):
    """Predict unlabeled samples and return the most confident ones.

    Episodes scored above 0.95 (guest) or below 0.05 (topic) are pseudo-labeled
    with a hard 1/0, up to n per class; everything else is returned separately.
    `win_size` and `max_length` are module-level.
    """
    X = [window(ep.description, win_size) for ep in episodes]
    X = tokenizer.texts_to_sequences(X)
    X = pad_sequences(X, maxlen=max_length)
    y_predicted = model.predict(X)
    X = None  # free the padded matrix
    for ep, y in zip(episodes, y_predicted):
        ep.guest = float(y[0])
    best = []
    non_best = []
    n_T = 0  # topics accepted so far
    n_G = 0  # guests accepted so far
    for ep in episodes:
        if ep.guest > 0.95 and n_G < n:  # cap at n per class
            n_G += 1
            ep.guest = 1
            best.append(ep)
        elif ep.guest < 0.05 and n_T < n:
            n_T += 1
            ep.guest = 0
            best.append(ep)
        else:
            non_best.append(ep)
    random.shuffle(best)
    random.shuffle(non_best)
    return best, non_best
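# A sketch of the self-training loop that get_best() is built for: repeatedly
# pseudo-label the most confident unlabeled episodes and fold them into the
# labeled pool before refitting. `retrain` stands in for the project's actual
# model-fitting code and is an assumption, not a function defined here.
def self_train(model, tokenizer, labeled, unlabeled, n=100, rounds=5):
    for _ in range(rounds):
        confident, unlabeled = get_best(unlabeled, model, tokenizer, n)
        if not confident:
            break  # nothing cleared the 0.95/0.05 confidence thresholds
        labeled.extend(confident)
        model = retrain(model, tokenizer, labeled)  # assumed helper
    return model, labeled, unlabeled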
"""
Used for demonstrating the model by typing made-up samples.

Run with the path to a pickled (model, tokenizer) pair as the first argument.
"""
import pickle
import sys

import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tag_names import tag_names
from episode import Episode, window

model_file_name = sys.argv[1]
with open(model_file_name, 'rb') as f:
    model, tokenizer = pickle.load(f)

win_size = 10
while True:
    text = input("Type an episode description: \n")
    tagged_text = tag_names(text)
    ep = Episode("", tagged_text)
    for text, name in ep.tokenize():
        # Same windowing and padding as at training time.
        text = window(text, win_size)
        text = tokenizer.texts_to_sequences([text])
        text = pad_sequences(text, maxlen=2 * win_size + 1)
        y = model.predict(text)
        if y[0][0] > 0.5:
            print(name, "is a Guest", str(np.round(y[0][0] * 100)), "%")
        else:
            print(name, "is a Topic", str(np.round((1 - y[0][0]) * 100)), "%")
    print()