def train_model(modeltype, delta):
    assert modeltype in ["gated_recurrent", "lstm_recurrent"]
    print "Begin Training"

    df_imdb_reviews = pd.read_csv('../data/imdb_review_data.tsv', escapechar='\\', delimiter='\t')
    X = clean(df_imdb_reviews['review'].values)
    y = df_imdb_reviews['sentiment'].values

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

    print "Tokenize"
    tokenizer = Tokenizer(min_df=10, max_features=100000)
    X_train = tokenizer.fit_transform(X_train)
    X_train = [[float(token) for token in seq] for seq in X_train]
    X_test = tokenizer.transform(X_test)
    X_test = [[float(token) for token in seq] for seq in X_test]
    print "Number of features: {}".format(tokenizer.n_features)

    print "Training model"
    if modeltype == "gated_recurrent":
        layers = [
            Embedding(size=256, n_features=tokenizer.n_features),
            GatedRecurrent(size=512, activation='tanh', gate_activation='steeper_sigmoid',
                           init='orthogonal', seq_output=True, p_drop=0.5),
            Dense(size=1, activation='sigmoid', init='orthogonal')
        ]
    else:
        layers = [
            Embedding(size=256, n_features=tokenizer.n_features),
            LstmRecurrent(size=512, activation='tanh', gate_activation='steeper_sigmoid',
                          init='orthogonal', seq_output=True, p_drop=0.5),
            Dense(size=1, activation='sigmoid', init='orthogonal')
        ]

    # bce is the classification loss for binary classification with a sigmoid output
    model = RNN(layers=layers, cost='bce', updater=Adadelta(lr=delta))
    model.fit(X_train, y_train, n_epochs=20)

    with open('../data/{}_tokenizer_delta_{}_pdrop_0.5.pkl'.format(modeltype, delta), 'wb') as f:
        pickle.dump(tokenizer, f)
    with open('../data/{}_model_delta_{}_pdrop_0.5.pkl'.format(modeltype, delta), 'wb') as f:
        pickle.dump(model, f)

    try:
        y_pred_te = model.predict(X_test).flatten() >= 0.5
        y_pred_tr = model.predict(X_train).flatten() >= 0.5
        print 'Test Accuracy: {}'.format(accuracy_score(y_test, y_pred_te))
        print 'Test Precision: {}'.format(precision_score(y_test, y_pred_te))
        print 'Test Recall: {}'.format(recall_score(y_test, y_pred_te))
        print 'Train Accuracy: {}'.format(accuracy_score(y_train, y_pred_tr))
        print 'Train Precision: {}'.format(precision_score(y_train, y_pred_tr))
        print 'Train Recall: {}'.format(recall_score(y_train, y_pred_tr))
    except Exception:
        print "Unable to perform metrics"

    return tokenizer, model
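# Hedged usage sketch (not part of the original function): train one model of each type;
# the delta values below are arbitrary examples, not values taken from the source.
if __name__ == '__main__':
    for delta in [0.5, 1.0]:
        train_model("gated_recurrent", delta)
        train_model("lstm_recurrent", delta)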
trX, teX, trY, teY = load_gender_data(ntrain=10000)  # Can increase up to 250K or so

tokenizer = Tokenizer(min_df=10, max_features=50000)
print trX[1:2]  # see a blog example
trX = tokenizer.fit_transform(trX)
teX = tokenizer.transform(teX)
print tokenizer.inverse_transform(trX[1:2])  # see what words are kept
print tokenizer.n_features

layers = [
    Embedding(size=128, n_features=tokenizer.n_features),
    GatedRecurrent(size=256, activation='tanh', gate_activation='steeper_sigmoid',
                   init='orthogonal', seq_output=False),
    Dense(size=1, activation='sigmoid', init='orthogonal')  # sigmoid for binary classification
]

# bce is the classification loss for binary classification with a sigmoid output
model = RNN(layers=layers, cost='bce')

for i in range(2):
    model.fit(trX, trY, n_epochs=1)

    tr_preds = model.predict(trX[:len(teY)])
    te_preds = model.predict(teX)

    tr_acc = metrics.accuracy_score(trY[:len(teY)], tr_preds > 0.5)
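    # Hedged follow-up (not in the original snippet): mirror the train-set check with a
    # held-out accuracy, using the same sklearn.metrics call, and report both per pass.
    te_acc = metrics.accuracy_score(teY, te_preds > 0.5)
    print "pass {}: train acc {:.4f}, test acc {:.4f}".format(i + 1, tr_acc, te_acc)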
#################
# training data have to be lemmatized using MorphoDiTa !!!!!!!!!
#################

trX, teX, trY, teY = load_data(ntrain=9000, ntest=1000)
print len(trX), len(trY), len(teX), len(teY)

tokenizer = Tokenizer(min_df=10, max_features=50000)
trX = tokenizer.fit_transform(trX)
pickle.dump(tokenizer, open('tokenizer.pkl', 'wb'))
print "number of tokenized training texts: " + str(len(trX))
teX = tokenizer.transform(teX)
print "number of features: " + str(tokenizer.n_features)

layers = [
    Embedding(size=256, n_features=tokenizer.n_features),
    GatedRecurrent(size=725),
    Dense(size=10, activation='softmax')
]

model = RNN(layers=layers, cost='cce')
model.fit(trX, trY, n_epochs=10)

save(model, 'modelEcho.pkl')

tr_preds = model.predict(trX)
te_preds = model.predict(teX)

data = pd.DataFrame(trY)
data.to_csv('data/trY.vec')
data = pd.DataFrame(tr_preds)
data.to_csv('data/tr_preds.vec')
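# Hedged follow-up (not in the original snippet): a quick accuracy check on the softmax
# outputs, assuming trY/teY hold integer class ids matching the 10 output units.
import numpy as np
print "train acc: " + str(np.mean(np.asarray(trY) == np.argmax(tr_preds, axis=1)))
print "test acc: " + str(np.mean(np.asarray(teY) == np.argmax(te_preds, axis=1)))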
import numpy as np

from passage.models import RNN
from passage.updates import NAG, Regularizer
from passage.layers import Generic, GatedRecurrent, Dense
from passage.utils import load, save

from load import load_mnist

trX, teX, trY, teY = load_mnist()

# Use generic layer - RNN processes a size 28 vector at a time scanning from left to right
layers = [
    Generic(size=28),
    GatedRecurrent(size=512, p_drop=0.2),
    Dense(size=10, activation='softmax', p_drop=0.5)
]

# A bit of l2 helps with generalization, higher momentum helps convergence
updater = NAG(momentum=0.95, regularizer=Regularizer(l2=1e-4))

# Linear iterator for real valued data, cce cost for softmax
model = RNN(layers=layers, updater=updater, iterator='linear', cost='cce')
model.fit(trX, trY, n_epochs=20)

tr_preds = model.predict(trX[:len(teY)])
te_preds = model.predict(teX)

tr_acc = np.mean(trY[:len(teY)] == np.argmax(tr_preds, axis=1))
te_acc = np.mean(teY == np.argmax(te_preds, axis=1))
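# Hedged follow-up (not part of the original snippet): report the scores and persist the
# trained network with the save helper imported above; the filename is an assumed example.
print 'train accuracy: {:.4f}, test accuracy: {:.4f}'.format(tr_acc, te_acc)
save(model, 'mnist_gru_model.pkl')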
train_tokens = [[1, 2, 4, 3, 6], [1, 2, 3], [3, 1, 2, 4, 3]]
num_feats = len(set(flatten(train_tokens)))


def get_labels(id):
    if id == 3:
        return [1, 0]
    else:
        return [0, 1]

seq_labels = map(lambda l: map(get_labels, l), train_tokens)

layers = [
    Embedding(size=128, n_features=num_feats),
    GatedRecurrent(size=128, seq_output=True),
    Dense(size=num_feats, activation='softmax')
]

#iterator = SortedPadded(y_pad=True, y_dtype=intX)
#iterator = SortedPadded(y_dtype=intX)

#model = RNN(layers=layers, cost='seq_cce', iterator=iterator, Y=T.imatrix())
model = RNN(layers=layers, cost='seq_cce')
#model.fit(train_tokens, [1,0,1])
model.fit(train_tokens, train_tokens)

#model.predict(tokenizer.transform(["Frogs are awesome", "frogs are amphibious"]))
model.predict(train_tokens)
save(model, 'save_test.pkl')
model = load('save_test.pkl')
def train_and_save_passage_tokenizer_and_rnn_model(x_train, y_train, x_test, character_model=False):
    """Train and save Passage tokenizer and Passage RNN model.

    x_train and x_test should each be a series that's already been pre-processed:
    html->text, lowercase, removed punct/#s

    x_train+x_test are used to build the tokenizer.

    Note that the character-based RNN is a work in progress and not actually implemented as of now.
    """
    # Note that we assume we have train/test reviews that had been preprocessed: html->text,
    # lowercased, removed punct/#s
    # Note in https://github.com/IndicoDataSolutions/Passage/blob/master/examples/sentiment.py they only
    # extract text from html, lowercase and strip (no punctuation removal)

    # Tokenization: Assign each word in the reviews an ID to be used in all reviews
    tokenizer = Tokenizer(min_df=10, max_features=100000, character=character_model)

    train_reviews_list = x_train.tolist()
    tokenizer.fit(train_reviews_list + x_test.tolist())

    # Tokenize training reviews (so we can fit the RNN model on them)
    train_reviews_tokenized = tokenizer.transform(train_reviews_list)

    # Based on https://github.com/vinhkhuc/kaggle-sentiment-popcorn/blob/master/scripts/passage_nn.py
    # which is based on https://github.com/IndicoDataSolutions/Passage/blob/master/examples/sentiment.py

    # RNN Network:
    # - Each tokenized review is converted into a sequence of words, where each word has an
    #   embedding representation (256)
    # - RNN layer (GRU) attempts to find patterns in the sequence of words
    # - Final dense layer is used as a logistic classifier to turn the RNN output into a
    #   probability/prediction
    if not character_model:
        layers = [
            Embedding(size=256, n_features=tokenizer.n_features),
            # May replace with LstmRecurrent for LSTM layer
            GatedRecurrent(size=512, activation='tanh', gate_activation='steeper_sigmoid',
                           init='orthogonal', seq_output=False, p_drop=0.75),
            Dense(size=1, activation='sigmoid', init='orthogonal')
        ]
    else:
        # Character-level RNN
        # Idea is to convert character tokenizations into one-hot encodings, in which case
        # the embedding layer is no longer needed
        train_reviews_tokenized = map(
            lambda r_indexes: pd.get_dummies(r_indexes, columns=range(tokenizer.n_features + 1)).values,
            train_reviews_tokenized)

        layers = [
            # May replace with LstmRecurrent for LSTM layer
            GatedRecurrent(size=100, activation='tanh', gate_activation='steeper_sigmoid',
                           init='orthogonal', seq_output=False, p_drop=0.75),
            Dense(size=1, activation='sigmoid', init='orthogonal')
        ]

    # RNN classifier uses Binary Cross-Entropy as the cost function
    classifier = RNN(layers=layers, cost='bce', updater=Adadelta(lr=0.5))

    NUM_EPOCHS = 10  # 10 epochs may take 10+ hours to run depending on machine
    classifier.fit(train_reviews_tokenized, y_train.tolist(), n_epochs=NUM_EPOCHS)

    # Store model and tokenizer
    if character_model:
        passage.utils.save(classifier, PASSAGE_CHAR_RNN_MODEL)
        _ = joblib.dump(tokenizer, PASSAGE_CHAR_TOKENIZER, compress=9)
    else:
        passage.utils.save(classifier, PASSAGE_RNN_MODEL)
        _ = joblib.dump(tokenizer, PASSAGE_TOKENIZER, compress=9)
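# Hedged companion sketch (not part of the original function): reload the artifacts saved
# above and score new reviews; PASSAGE_RNN_MODEL / PASSAGE_TOKENIZER are the same
# module-level path constants referenced in the save step.
def load_passage_model_and_predict(x_test):
    tokenizer = joblib.load(PASSAGE_TOKENIZER)
    classifier = passage.utils.load(PASSAGE_RNN_MODEL)
    test_reviews_tokenized = tokenizer.transform(x_test.tolist())
    # predict() returns sigmoid probabilities; threshold at 0.5 for a hard label
    return classifier.predict(test_reviews_tokenized).flatten() >= 0.5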
from passage.models import RNN
from passage.utils import save, load

print("Loading data...")

num_training = int((1.0 - 0.2) * len(xs))
X_train, y_train, X_test, y_test = xs[:num_training], ys[:num_training], xs[num_training:], ys[num_training:]

num_feats = generator.max_id() + 1

layers = [
    Embedding(size=8, n_features=num_feats),
    #LstmRecurrent(size=32),
    #NOTE - to use a deep RNN, you need all but the final layers with seq_output=True
    #GatedRecurrent(size=64, seq_output=True),
    GatedRecurrent(size=64, direction='backward' if REVERSE else 'forward'),
    Dense(size=1, activation='sigmoid'),
]  # emb 128, gru 32/64 is good - 0.70006 causer

print("Creating Model")
model = RNN(layers=layers, cost='bce')


def find_cutoff(y_test, predictions):
    scale = 100.0

    min_val = round(min(predictions))
    max_val = round(max(predictions))
    diff = max_val - min_val
    inc = diff / scale
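    # Hedged sketch (not in the original, which breaks off here): one plausible continuation -
    # step from min_val to max_val in `inc` increments and keep the cutoff that maximizes
    # accuracy against y_test. Purely illustrative, not recovered from the source.
    best_cutoff, best_acc = min_val, -1.0
    for step in range(int(scale) + 1):
        cutoff = min_val + step * inc
        preds_at_cutoff = [1 if p >= cutoff else 0 for p in predictions]
        acc = sum(1 for p, y in zip(preds_at_cutoff, y_test) if p == y) / float(len(y_test))
        if acc > best_acc:
            best_cutoff, best_acc = cutoff, acc
    return best_cutoff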