def build_model(weights=None, embedding_size=256, recurrent_gate_size=512, n_features=5, dropout=0.4):
    """
    build_model
    Inputs:
        weights - Path to a weights file to load, or None if the model should be built from scratch
        embedding_size - Size of the embedding layer
        recurrent_gate_size - Size of the gated recurrent layer
        n_features - Number of features for the embedding layer
        dropout - Dropout value
    Returns:
        A model object ready for training (or evaluation if a previous model was loaded via `weights`)
    """
    # vvvvv
    # Modify this if you want to change the structure of the network!
    # ^^^^^
    model_layers = [
        Embedding(size=embedding_size, n_features=n_features),
        GatedRecurrent(size=recurrent_gate_size, p_drop=dropout),
        Dense(size=1, activation='sigmoid', p_drop=dropout)
    ]
    model = RNN(layers=model_layers, cost='BinaryCrossEntropy', verbose=2, updater='Adam')
    if weights:
        # Load the previously saved model instead of training from scratch.
        model = load(weights)
    return model
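# A minimal usage sketch (not from the original source): wire build_model to a Passage Tokenizer,
# train, save, and later rebuild from the saved weights. Variable and file names are illustrative
# assumptions; Tokenizer, fit_transform, save, and load come from passage, as in the other
# snippets in this collection.
tokenizer = Tokenizer(min_df=10, max_features=100000)
train_tokens = tokenizer.fit_transform(train_texts)      # train_texts: raw strings (assumed)
model = build_model(n_features=tokenizer.n_features)
model.fit(train_tokens, train_labels, n_epochs=10)       # train_labels: 0/1 array (assumed)
save(model, 'rnn_weights.pkl')
model = build_model(weights='rnn_weights.pkl')           # the weights path is handed to load()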
def rnn(train_text, train_label):
    tokenizer = Tokenizer()
    train_tokens = tokenizer.fit_transform(train_text)
    layers = [
        Embedding(size=50, n_features=tokenizer.n_features),
        GatedRecurrent(size=128),
        Dense(size=1, activation='sigmoid')
    ]
    # print "train_tokens=", train_tokens
    model = RNN(layers=layers, cost='BinaryCrossEntropy')
    model.fit(train_tokens, train_label)
    return model
def main(ptrain, ntrain, ptest, ntest, out, modeltype):
    assert modeltype in ["gated_recurrent", "lstm_recurrent"]

    print("Using the %s model ..." % modeltype)
    print("Loading data ...")
    trX, trY = load_data(ptrain, ntrain)
    teX, teY = load_data(ptest, ntest)

    tokenizer = Tokenizer(min_df=10, max_features=100000)
    trX = tokenizer.fit_transform(trX)
    teX = tokenizer.transform(teX)

    print("Training ...")
    if modeltype == "gated_recurrent":
        layers = [
            Embedding(size=256, n_features=tokenizer.n_features),
            GatedRecurrent(size=512, activation='tanh', gate_activation='steeper_sigmoid',
                           init='orthogonal', seq_output=False, p_drop=0.75),
            Dense(size=1, activation='sigmoid', init='orthogonal')
        ]
    else:
        layers = [
            Embedding(size=256, n_features=tokenizer.n_features),
            LstmRecurrent(size=512, activation='tanh', gate_activation='steeper_sigmoid',
                          init='orthogonal', seq_output=False, p_drop=0.75),
            Dense(size=1, activation='sigmoid', init='orthogonal')
        ]
    model = RNN(layers=layers, cost='bce', updater=Adadelta(lr=0.5))
    model.fit(trX, trY, n_epochs=10)

    # Predict the probabilities of positive labels
    print("Predicting ...")
    pr_teX = model.predict(teX).flatten()

    predY = np.ones(len(teY))
    predY[pr_teX < 0.5] = -1

    with open(out, "w") as f:
        for lab, pos_pr, neg_pr in zip(predY, pr_teX, 1 - pr_teX):
            f.write("%d %f %f\n" % (lab, pos_pr, neg_pr))
def train_RNN(tokenizer, tokens, labels):
    """
    INPUT: Trained tokenizer class, the array of tokenized critic reviews, and the corresponding label array.
    Returns a trained Recurrent Neural Network class object.
    """
    layers = [
        Embedding(size=256, n_features=tokenizer.n_features),
        GatedRecurrent(size=512, activation='tanh', gate_activation='steeper_sigmoid',
                       init='orthogonal', seq_output=False, p_drop=0.75),
        Dense(size=1, activation='sigmoid', init='orthogonal')
    ]
    model = RNN(layers=layers, cost='bce', updater=Adadelta(lr=0.5))
    path_snapshots = 'model_snapshots'
    print "Begin fitting RNN"
    model.fit(tokens, labels, n_epochs=12)
    return model
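# A hedged sketch (not in the original): path_snapshots above is defined but never used. Passage's
# fit() accepts path and snapshot_freq keywords (used in a later snippet in this collection), so
# periodic snapshots could be written during fitting like this; the snapshot file name and
# frequency are assumptions.
#
#   model.fit(tokens, labels, n_epochs=12,
#             path=path_snapshots + '/rnn_snapshot.pkl', snapshot_freq=3)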
def build_model(weights=None, embedding_size=128, recurrent_gate_size=256, n_features=5, dropout=0.1):
    """
    build_model
    Inputs:
        weights - Path to a weights file to load, or None if the model should be built from scratch
        embedding_size - Size of the embedding layer
        recurrent_gate_size - Size of the gated recurrent layer
        n_features - Number of features for the embedding layer
        dropout - Dropout value
    Returns:
        A model object ready for training (or evaluation if a previous model was loaded via `weights`)
    """
    # vvvvv
    # Modify this if you want to change the structure of the network!
    # ^^^^^
    model_layers = [
        Embedding(size=embedding_size, n_features=n_features),
        GatedRecurrent(size=recurrent_gate_size, p_drop=dropout),
        Dense(size=1, activation='sigmoid', p_drop=dropout)
    ]
    # Note: the embedding size is configured on the Embedding layer above, not on RNN itself.
    args = {
        'layers': model_layers,
        'cost': 'BinaryCrossEntropy',
        'verbose': 2,
        'updater': Adadelta(lr=0.5)
    }
    model = RNN(**args)
    if weights:
        # Load the previously saved weights file instead of training from scratch.
        print "Loading previously created weights file: ", weights
        model = load(weights)
    return model
import numpy as np

from passage.models import RNN
from passage.updates import NAG, Regularizer
from passage.layers import Generic, GatedRecurrent, Dense
from passage.utils import load, save

from load import load_mnist

trX, teX, trY, teY = load_mnist()

# Use a Generic input layer - the RNN processes a size-28 vector at a time, scanning from left to right
layers = [
    Generic(size=28),
    GatedRecurrent(size=512, p_drop=0.2),
    Dense(size=10, activation='softmax', p_drop=0.5)
]

# A bit of l2 helps with generalization, higher momentum helps convergence
updater = NAG(momentum=0.95, regularizer=Regularizer(l2=1e-4))

# Linear iterator for real-valued data, cce cost for softmax
model = RNN(layers=layers, updater=updater, iterator='linear', cost='cce')
model.fit(trX, trY, n_epochs=20)

tr_preds = model.predict(trX[:len(teY)])
te_preds = model.predict(teX)

tr_acc = np.mean(trY[:len(teY)] == np.argmax(tr_preds, axis=1))
te_acc = np.mean(teY == np.argmax(te_preds, axis=1))
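# A small follow-up sketch (not in the original): report the accuracies computed above and
# persist the model with passage.utils.save, which is imported but unused here. The output
# file name is an illustrative assumption.
print('train accuracy %.4f, test accuracy %.4f' % (tr_acc, te_acc))
save(model, 'mnist_gru.pkl')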
from passage.utils import save, load print("Loading data...") num_training = int((1.0 - 0.2) * len(xs)) X_train, y_train, X_test, y_test = xs[:num_training], ys[:num_training], xs[num_training:], ys[num_training:] num_feats = generator.max_id() + 1 layers = [ Embedding(size=128, n_features=num_feats), #LstmRecurrent(size=32), #NOTE - to use a deep RNN, you need all but the final layers with seq_ouput=True #GatedRecurrent(size=128, seq_output=True), #GatedRecurrent(size=256, direction= 'backward' if REVERSE else 'forward'), GatedRecurrent(size=128, seq_output=True), GatedRecurrent(size=128), #Dense(size=64, activation='sigmoid'), Dense(size=len(lst_freq_tags), activation='sigmoid'), ] #emd 128, gru 32/64 is good - 0.70006 causer print("Creating Model") model = RNN(layers=layers, cost='bce') def find_cutoff(y_test, predictions): scale = 20.0 min_val = round(min(predictions)) max_val = round(max(predictions))
import sys

# ---

# ---

print 'loading dataset'
d = Dataset(settings['FN_DATASET'], settings['FN_VOCABULARY'])
d.load()

print 'generating labeled training set'
train_text, train_labels = d.getNextWordPredTrainset(10)
#for t, l in zip(train_text, train_labels):
#    print t, '->', l

tokenizer = Tokenizer()
train_tokens = tokenizer.fit_transform(train_text)
save(train_tokens, settings['FN_TRAINED_TOKENIZER'])

layers = [
    Embedding(size=128, n_features=tokenizer.n_features),
    GatedRecurrent(size=128),
    Dense(size=1, activation='sigmoid')
]

model = RNN(layers=layers, cost='BinaryCrossEntropy')
model.fit(train_tokens, train_labels)
save(model, settings['FN_MODEL_NEXTWORDPRED'])
from passage.models import RNN
from passage.utils import load, save

from load import load_gender_data

trX, teX, trY, teY = load_gender_data(ntrain=10000)  # Can increase up to 250K or so

tokenizer = Tokenizer(min_df=10, max_features=50000)
print trX[1]  # see a blog example
trX = tokenizer.fit_transform(trX)
teX = tokenizer.transform(teX)
print tokenizer.n_features

layers = [
    Embedding(size=128, n_features=tokenizer.n_features),
    GatedRecurrent(size=256, activation='tanh', gate_activation='steeper_sigmoid',
                   init='orthogonal', seq_output=False),
    Dense(size=1, activation='sigmoid', init='orthogonal')  # sigmoid for binary classification
]

model = RNN(layers=layers, cost='bce')  # bce is the classification loss for binary classification with sigmoid output

for i in range(2):
    model.fit(trX, trY, n_epochs=1)
    tr_preds = model.predict(trX[:len(teY)])
    te_preds = model.predict(teX)
    tr_acc = metrics.accuracy_score(trY[:len(teY)], tr_preds > 0.5)
    te_acc = metrics.accuracy_score(teY, te_preds > 0.5)
    print i, tr_acc, te_acc

save(model, 'save_test.pkl')  # How to save
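# A hedged follow-up sketch (not in the original): reload the saved model with passage.utils.load
# (imported above) and check that it reproduces the held-out accuracy computed during training.
model2 = load('save_test.pkl')  # How to load in a new script
te_preds2 = model2.predict(teX)
print metrics.accuracy_score(teY, te_preds2 > 0.5)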
tr_data = pd.read_csv('labeledTrainData.tsv', delimiter='\t')
trX = clean(tr_data['review'].values)
trY = tr_data['sentiment'].values
print("Training data loaded and cleaned.")

tokenizer = Tokenizer(min_df=10, max_features=100000)
trX = tokenizer.fit_transform(trX)
print("Training data tokenized.")

layers = [
    Embedding(size=256, n_features=tokenizer.n_features),
    GatedRecurrent(size=512, activation='tanh', gate_activation='steeper_sigmoid',
                   init='orthogonal', seq_output=False, p_drop=0.75),
    Dense(size=1, activation='sigmoid', init='orthogonal')
]

model = RNN(layers=layers, cost='bce', updater=Adadelta(lr=0.5))
model.fit(trX, trY, n_epochs=10)

te_data = pd.read_csv('testData.tsv', delimiter='\t')
ids = te_data['id'].values
teX = clean(te_data['review'].values)
teX = tokenizer.transform(teX)
pr_teX = model.predict(teX).flatten()

pd.DataFrame(np.asarray([ids,
from passage.utils import save, load print("Loading data...") num_training = int((1.0 - TEST_SPLIT) * len(xs)) X_train, y_train, X_test, y_test = xs[:num_training], ys[:num_training], xs[ num_training:], ys[num_training:] num_feats = generator.max_id() + 1 layers = [ Embedding(size=64, n_features=num_feats), #LstmRecurrent(size=32), #NOTE - to use a deep RNN, you need all but the final layers with seq_ouput=True #GatedRecurrent(size=64, seq_output=True), GatedRecurrent(size=64, direction='backward' if REVERSE else 'forward'), #LstmRecurrent(size=128), Dense(size=1, activation='sigmoid'), ] #emd 64, gru 64 is good - 0.70833 causer (0 prev sents) print("Creating Model") model = RNN(layers=layers, cost='bce') def find_cutoff(y_test, predictions): scale = 100.0 min_val = round(min(predictions)) max_val = round(max(predictions))
def train(X, y):
    from keras.layers.embeddings import Embedding
    from keras.layers.recurrent import LSTM, GRU, SimpleRNN
    from keras.layers.core import Dense
    from keras.models import Sequential
    from keras.layers.core import Dropout
    from keras.preprocessing.text import Tokenizer
    from keras.preprocessing.sequence import pad_sequences
    from math import e

    vocab = 10000
    tokenizer = Tokenizer(nb_words=vocab)
    tokenizer.fit_on_texts(X)
    X = tokenizer.texts_to_sequences(X)
    """
    index_word = {v: k for k, v in tokenizer.word_index.items()}
    for i in range(1, 10001):
        print str(i) + "," + index_word[i]
    return
    """
    maxlen = 50

    # Drop empty sequences along with their targets
    X1 = []
    y1 = []
    for thing, target in zip(X, y):
        if len(thing) != 0:
            X1.append(thing)
            y1.append(target)
    X = X1
    y = y1

    KERAS = False
    if KERAS:
        X = pad_sequences(X, maxlen=maxlen)

    from random import shuffle
    xy = zip(X, y)
    shuffle(xy)
    X_s, y_s = zip(*xy)
    X_train, y_train, X_test, y_test = X_s[:-1000], y_s[:-1000], X_s[-1000:], y_s[-1000:]

    embedding_size = 256
    dropout = .3
    batch_size = 256
    recurrent_gate_size = 512
    """
    model = Sequential()
    model.add(Embedding(vocab, embedding_size, mask_zero=True))
    model.add(Dropout(dropout))
    model.add(LSTM(recurrent_gate_size))
    model.add(Dropout(dropout))
    model.add(Dense(1))
    print "building model..."
    model.compile(loss="msle", optimizer="rmsprop")
    print "fitting model"
    #model.load_weights("mymodel")
    model.fit(np.asarray(X_train), np.asarray(y_train), nb_epoch=30, verbose=1,
              batch_size=batch_size, validation_data=(np.asarray(X_test), np.asarray(y_test)))
    model.save_weights("mymodel")
    """
    from passage.preprocessing import Tokenizer, LenFilter
    from passage.layers import Embedding, GatedRecurrent, Dense, OneHot, LstmRecurrent
    from passage.models import RNN
    from passage.utils import save, load
    from passage.iterators import Padded

    layers = [
        #OneHot(n_features=5),
        Embedding(size=embedding_size, n_features=vocab),
        #GatedRecurrent(size=recurrent_gate_size, seq_output=True, p_drop=dropout),
        #LstmRecurrent(size=recurrent_gate_size, p_drop=dropout),
        GatedRecurrent(size=recurrent_gate_size, p_drop=dropout),
        Dense(size=8, activation='softmax', p_drop=dropout)
    ]

    print >> sys.stderr, "learning model"
    model_iterator = Padded()
    model = load("mymodel.final.pkl")
    #model = RNN(layers=layers, cost='CategoricalCrossEntropy', verbose=2, updater="Adam")
    filter = LenFilter(max_len=maxlen)
    model.fit(np.asarray(X_train), np.asarray(y_train), batch_size=batch_size, n_epochs=1000,
              path="mymodel.pkl", snapshot_freq=49, len_filter=filter)
    save(model, "mymodel.final.pkl")

    #print "test cost"
    #print model._cost(np.asarray(X_test), np.asarray(y_test))
    print "test accuracy"
    passage_batch_predict(np.asarray(X_train), np.asarray(y_train), model)

    exit = False
    print "enter a sentence"
    while not exit:
        text = raw_input()
        if text == "exit":
            break
        else:
            tokens = tokenizer.texts_to_sequences([text])
            # texts_to_sequences returns one token list per input text; skip empty tokenizations
            if len(tokens[0]) == 0:
                print "Sentence too strange, try again"
                continue
            if KERAS:
                tokens = pad_sequences(tokens, maxlen=maxlen)
            prediction = np.argmax(model.predict(tokens)[0])
            try:
                print e**(prediction - 2)
            except Exception:
                pass
def train_model(modeltype, delta):
    assert modeltype in ["gated_recurrent", "lstm_recurrent"]
    print "Begin Training"

    df_imdb_reviews = pd.read_csv('../data/imdb_review_data.tsv', escapechar='\\', delimiter='\t')
    X = clean(df_imdb_reviews['review'].values)
    y = df_imdb_reviews['sentiment'].values
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

    print "Tokenize"
    tokenizer = Tokenizer(min_df=10, max_features=100000)
    X_train = tokenizer.fit_transform(X_train)
    X_train = [[float(x) for x in y] for y in X_train]
    X_test = tokenizer.transform(X_test)
    X_test = [[float(x) for x in y] for y in X_test]
    print "Number of features: {}".format(tokenizer.n_features)

    print "Training model"
    if modeltype == "gated_recurrent":
        layers = [
            Embedding(size=256, n_features=tokenizer.n_features),
            GatedRecurrent(size=512, activation='tanh', gate_activation='steeper_sigmoid',
                           init='orthogonal', seq_output=True, p_drop=0.5),
            Dense(size=1, activation='sigmoid', init='orthogonal')
        ]
    else:
        layers = [
            Embedding(size=256, n_features=tokenizer.n_features),
            LstmRecurrent(size=512, activation='tanh', gate_activation='steeper_sigmoid',
                          init='orthogonal', seq_output=True, p_drop=0.5),
            Dense(size=1, activation='sigmoid', init='orthogonal')
        ]

    # bce is the classification loss for binary classification with sigmoid output
    model = RNN(layers=layers, cost='bce', updater=Adadelta(lr=delta))
    model.fit(X_train, y_train, n_epochs=20)

    with open('../data/{}_tokenizer_delta_{}_pdrop_0.5.pkl'.format(modeltype, delta), 'w') as f:
        pickle.dump(tokenizer, f)
    with open('../data/{}_model_delta_{}._pdrop_0.5.pkl'.format(modeltype, delta), 'w') as f:
        pickle.dump(model, f)

    try:
        y_pred_te = model.predict(X_test).flatten() >= 0.5
        y_pred_tr = model.predict(X_train).flatten() >= 0.5
        print 'Test Accuracy: {}'.format(accuracy_score(y_test, y_pred_te))
        print 'Test Precision: {}'.format(precision_score(y_test, y_pred_te))
        print 'Test Recall: {}'.format(recall_score(y_test, y_pred_te))
        print 'Train Accuracy: {}'.format(accuracy_score(y_train, y_pred_tr))
        print 'Train Precision: {}'.format(precision_score(y_train, y_pred_tr))
        print 'Train Recall: {}'.format(recall_score(y_train, y_pred_tr))
    except:
        print "Unable to perform metrics"

    return tokenizer, model
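# A hedged companion sketch (not in the original): reload the pickled tokenizer and model and
# score unseen review text. The modeltype/delta values baked into the file names and the
# new_reviews variable are illustrative assumptions.
with open('../data/gated_recurrent_tokenizer_delta_0.5_pdrop_0.5.pkl') as f:
    tokenizer = pickle.load(f)
with open('../data/gated_recurrent_model_delta_0.5._pdrop_0.5.pkl') as f:
    model = pickle.load(f)
probs = model.predict(tokenizer.transform(new_reviews)).flatten()
pred_labels = probs >= 0.5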
#################
# training data have to be lemmatized using Morphodita !!!!!!!!!
#################

trX, teX, trY, teY = load_data(ntrain=9000, ntest=1000)
print len(trX), len(trY), len(teX), len(teY)

tokenizer = Tokenizer(min_df=10, max_features=50000)
trX = tokenizer.fit_transform(trX)
pickle.dump(tokenizer, open('tokenizer.pkl', 'wb'))
print "number of tokens: " + str(len(trX))
teX = tokenizer.transform(teX)
print "number of features: " + str(tokenizer.n_features)

layers = [
    Embedding(size=256, n_features=tokenizer.n_features),
    GatedRecurrent(size=725),
    Dense(size=10, activation='softmax')
]

model = RNN(layers=layers, cost='cce')
model.fit(trX, trY, n_epochs=10)

save(model, 'modelEcho.pkl')

tr_preds = model.predict(trX)
te_preds = model.predict(teX)

data = pd.DataFrame(trY)
data.to_csv('data/trY.vec')
data = pd.DataFrame(tr_preds)
data.to_csv('data/tr_preds.vec')
def train_and_save_passage_tokenizer_and_rnn_model(x_train, y_train, x_test, character_model=False):
    """Train and save Passage tokenizer and Passage RNN model.

    x_train and x_test should each be a series that has already been pre-processed:
    html->text, lowercased, punctuation/numbers removed.

    x_train + x_test are used to build the tokenizer.

    Note that the character-based RNN is a work in progress and not actually implemented as of now.
    """
    # Note that we assume we have train/test reviews that have been preprocessed:
    # html->text, lowercased, punctuation/numbers removed.
    # Note in https://github.com/IndicoDataSolutions/Passage/blob/master/examples/sentiment.py they only
    # extract text from html, lowercase and strip (no punctuation removal).

    # Tokenization: assign each word in the reviews an ID to be used in all reviews
    tokenizer = Tokenizer(min_df=10, max_features=100000, character=character_model)
    train_reviews_list = x_train.tolist()
    tokenizer.fit(train_reviews_list + x_test.tolist())

    # Tokenize training reviews (so we can fit the RNN model on them)
    train_reviews_tokenized = tokenizer.transform(train_reviews_list)

    # Based on https://github.com/vinhkhuc/kaggle-sentiment-popcorn/blob/master/scripts/passage_nn.py which is based
    # on https://github.com/IndicoDataSolutions/Passage/blob/master/examples/sentiment.py
    # RNN Network:
    # - Each tokenized review is converted into a sequence of words, where each word has an embedding representation (256)
    # - RNN layer (GRU) attempts to find patterns in the sequence of words
    # - Final dense layer is used as a logistic classifier to turn the RNN output into a probability/prediction
    if not character_model:
        layers = [
            Embedding(size=256, n_features=tokenizer.n_features),
            # May replace with LstmRecurrent for an LSTM layer
            GatedRecurrent(size=512, activation='tanh', gate_activation='steeper_sigmoid',
                           init='orthogonal', seq_output=False, p_drop=0.75),
            Dense(size=1, activation='sigmoid', init='orthogonal')
        ]
    else:
        # Character-level RNN
        # The idea is to convert character tokenizations into one-hot encodings, in which case
        # the embedding layer is no longer needed
        train_reviews_tokenized = map(
            lambda r_indexes: pd.get_dummies(
                r_indexes, columns=range(tokenizer.n_features + 1)).values,
            train_reviews_tokenized)
        layers = [
            # May replace with LstmRecurrent for an LSTM layer
            GatedRecurrent(size=100, activation='tanh', gate_activation='steeper_sigmoid',
                           init='orthogonal', seq_output=False, p_drop=0.75),
            Dense(size=1, activation='sigmoid', init='orthogonal')
        ]

    # The RNN classifier uses binary cross-entropy as the cost function
    classifier = RNN(layers=layers, cost='bce', updater=Adadelta(lr=0.5))

    NUM_EPOCHS = 10  # 10 epochs may take 10+ hours to run depending on the machine
    classifier.fit(train_reviews_tokenized, y_train.tolist(), n_epochs=NUM_EPOCHS)

    # Store model and tokenizer
    if character_model:
        passage.utils.save(classifier, PASSAGE_CHAR_RNN_MODEL)
        _ = joblib.dump(tokenizer, PASSAGE_CHAR_TOKENIZER, compress=9)
    else:
        passage.utils.save(classifier, PASSAGE_RNN_MODEL)
        _ = joblib.dump(tokenizer, PASSAGE_TOKENIZER, compress=9)
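# A hedged companion sketch (not in the original): load the stored word-level tokenizer and RNN
# back and predict probabilities for new pre-processed reviews. new_reviews is an illustrative
# assumption; PASSAGE_RNN_MODEL / PASSAGE_TOKENIZER are the constants used above.
tokenizer = joblib.load(PASSAGE_TOKENIZER)
classifier = passage.utils.load(PASSAGE_RNN_MODEL)
probs = classifier.predict(tokenizer.transform(new_reviews)).flatten()
predictions = (probs >= 0.5).astype(int)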
def main():
    x = T.tensor3('features')
    m = T.matrix('features_mask')
    y = T.imatrix('targets')

    x = x + m.mean() * 0
    embedding_size = 300
    glove_version = "glove.6B.300d.txt"
    #embedding_size = 50
    #glove_version = "vectors.6B.50d.txt"

    wstd = 0.02

    # vaguely normalize
    x = x / 3.0 - .5

    #gloveMapping = Linear(
    #    input_dim=embedding_size,
    #    output_dim=128,
    #    weights_init=Orthogonal(),
    #    biases_init=Constant(0.0),
    #    name="gloveMapping"
    #)
    #gloveMapping.initialize()
    #o = gloveMapping.apply(x)
    #o = Rectifier(name="gloveRec").apply(o)

    rnn_in = x.dimshuffle(1, 0, 2)

    class Stub(object):
        def output(self, dropout_active=False):
            return rnn_in

    l_in = Stub()
    l_in.size = 300

    layer = GatedRecurrentPassage(size=300, gate_activation='sigmoid')
    layer.connect(l_in)

    from blocks.roles import add_role, WEIGHT, INITIAL_STATE
    print layer.params
    [add_role(l, WEIGHT) for l in layer.params]

    rnn_out = layer.output()
    o = rnn_out
    #o = rnn_out[-1, :, :]
    #o = rnn_out[:, -1, :]
    #o = rnn_out.mean(axis=1)

    #print rnn_last_out.eval({
    #    x: np.ones((3, 101, 300), dtype=theano.config.floatX),
    #    m: np.ones((3, 101), dtype=theano.config.floatX)})
    #raw_input()
    #o = rnn_out.mean(axis=1)

    score_layer = Linear(
        input_dim=300,
        output_dim=1,
        weights_init=IsotropicGaussian(std=wstd),
        biases_init=Constant(0.),
        name="linear2")
    score_layer.initialize()
    o = score_layer.apply(o)

    probs = Sigmoid().apply(o)

    cost = - (y * T.log(probs) + (1 - y) * T.log(1 - probs)).mean()
    cost.name = 'cost'

    misclassification = (y * (probs < 0.5) + (1 - y) * (probs > 0.5)).mean()
    misclassification.name = 'misclassification'

    #print rnn_in.shape.eval(
    #    {x: np.ones((45, 111, embedding_size), dtype=theano.config.floatX)})
    #print rnn_out.shape.eval(
    #    {x: np.ones((45, 111, embedding_size), dtype=theano.config.floatX),
    #     m: np.ones((45, 111), dtype=theano.config.floatX)})
    #print (m).sum(axis=1).shape.eval({
    #    m: np.ones((45, 111), dtype=theano.config.floatX)})
    #print (m).shape.eval({
    #    m: np.ones((45, 111), dtype=theano.config.floatX)})
    #raw_input()

    # =================

    cg = ComputationGraph([cost])
    #cg = apply_dropout(cg, variables=dropout_variables, drop_prob=0.5)
    params = cg.parameters
    print params
    print "Len params", len(params)

    algorithm = GradientDescent(
        cost=cg.outputs[0],
        params=params,
        step_rule=CompositeRule([
            StepClipping(threshold=4),
            AdaM(),
            #NAG(lr=0.1, momentum=0.9),
            #AdaDelta(),
        ])
    )

    # ========

    print "setting up data"
    ports = {
        'gpu0_train': 5557,
        'gpu0_test': 5558,
        'gpu1_train': 5559,
        'gpu1_test': 5560,
    }

    #batch_size = 16
    batch_size = 32

    def start_server(port, which_set):
        fuel.server.logger.setLevel('WARN')
        dataset = IMDBText(which_set, sorted=True)
        n_train = dataset.num_examples
        #scheme = ShuffledScheme(examples=n_train, batch_size=batch_size)
        scheme = BatchwiseShuffledScheme(examples=n_train, batch_size=batch_size)
        stream = DataStream(
            dataset=dataset,
            iteration_scheme=scheme)

        print "loading glove"
        glove = GloveTransformer(glove_version, data_stream=stream)

        padded = Padding(
            data_stream=glove,
            #mask_sources=('features',)
            mask_sources=('features',)
        )

        fuel.server.start_server(padded, port=port, hwm=20)

    train_port = ports[theano.config.device + '_train']
    train_p = Process(target=start_server, args=(train_port, 'train'))
    train_p.start()

    test_port = ports[theano.config.device + '_test']
    test_p = Process(target=start_server, args=(test_port, 'test'))
    test_p.start()