Example #1
def RNN(data_train, labels_train, data_test, labels_test, n_features):
        
    """
    Adapted from Passage's sentiment.py at
    https://github.com/IndicoDataSolutions/Passage/blob/master/examples/sentiment.py
    License: MIT
    """ 
    import numpy as np
    import pandas as pd
    
    from passage.models import RNN
    from passage.updates import Adadelta
    from passage.layers import Embedding, GatedRecurrent, Dense
    from passage.preprocessing import Tokenizer
    
    layers = [
        Embedding(size=128, n_features=n_features),
        GatedRecurrent(size=128, activation='tanh', gate_activation='steeper_sigmoid', init='orthogonal', seq_output=False, p_drop=0.75),
        Dense(size=1, activation='sigmoid', init='orthogonal')
    ]
    model = RNN(layers=layers, cost='bce', updater=Adadelta(lr=0.5))
    tokenizer = Tokenizer(min_df=10)
    X = tokenizer.fit_transform(data_train)
    model.fit(X, labels_train, n_epochs=10)
    # Tokenize the test data with the already-fitted tokenizer, then
    # threshold the predicted probabilities at 0.5.
    predi = model.predict(tokenizer.transform(data_test)).flatten()
    labels_predicted = np.ones(len(data_test))
    labels_predicted[predi < 0.5] = 0
    return labels_predicted
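
For context, a hypothetical driver for the function above; the file name and split mirror Examples #6 and #11, and the n_features value is an assumption (it must cover the vocabulary produced by the internal tokenizer):

# Hypothetical driver for RNN() above (not part of the original example).
import pandas as pd
from sklearn.cross_validation import train_test_split  # sklearn.model_selection in newer releases

data = pd.read_csv('labeledTrainData.tsv', delimiter='\t')
d_tr, d_te, l_tr, l_te = train_test_split(
    data['review'].values, data['sentiment'].values, test_size=0.2, random_state=0)

# n_features must be at least tokenizer.n_features inside RNN(); 100000
# matches the max_features cap used by the other examples here.
RNN(d_tr, l_tr, d_te, l_te, n_features=100000)
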
Example #2
def rnn(train_text, train_label):
    # Imports added so the snippet is self-contained.
    from passage.models import RNN
    from passage.layers import Embedding, GatedRecurrent, Dense
    from passage.preprocessing import Tokenizer

    tokenizer = Tokenizer()
    train_tokens = tokenizer.fit_transform(train_text)
    layers = [
        Embedding(size=50, n_features=tokenizer.n_features),
        GatedRecurrent(size=128),
        Dense(size=1, activation='sigmoid')
    ]
    model = RNN(layers=layers, cost='BinaryCrossEntropy')
    model.fit(train_tokens, train_label)
    return model
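
One pitfall in rnn() above: the fitted tokenizer stays local to the function, so unseen text cannot be transformed for prediction later. A variant that returns both objects (a sketch, not from the original project):

def rnn_with_tokenizer(train_text, train_label):
    # Same network as rnn() above, but the fitted tokenizer is returned
    # alongside the model so new text can be tokenized consistently.
    from passage.models import RNN
    from passage.layers import Embedding, GatedRecurrent, Dense
    from passage.preprocessing import Tokenizer

    tokenizer = Tokenizer()
    train_tokens = tokenizer.fit_transform(train_text)
    layers = [
        Embedding(size=50, n_features=tokenizer.n_features),
        GatedRecurrent(size=128),
        Dense(size=1, activation='sigmoid')
    ]
    model = RNN(layers=layers, cost='BinaryCrossEntropy')
    model.fit(train_tokens, train_label)
    return model, tokenizer

# Usage sketch:
#   model, tok = rnn_with_tokenizer(texts, labels)
#   probs = model.predict(tok.transform(new_texts)).flatten()
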
Example #3
File: Slide.py  Project: CS229proj/src
    def __fit_model(self, X, Y, num_features):

        layers = [
            Embedding(size=128, n_features=num_features),
            GatedRecurrent(size=512, p_drop=0.4),
            Dense(size=14, activation='softmax', p_drop=0.2)
        ]

        model = RNN(layers=layers, cost='cce', updater=Adadelta(lr=0.5))
        model.fit(X, Y, n_epochs=10)

        return model
Example #4
# Imports added for self-containment; load_data is an external helper
# from the original project, not shown in this excerpt.
import numpy as np

from passage.models import RNN
from passage.updates import Adadelta
from passage.layers import Embedding, GatedRecurrent, LstmRecurrent, Dense
from passage.preprocessing import Tokenizer


def main(ptrain, ntrain, ptest, ntest, out, modeltype):
    assert modeltype in ["gated_recurrent", "lstm_recurrent"]

    print("Using the %s model ..." % modeltype)
    print("Loading data ...")
    trX, trY = load_data(ptrain, ntrain)
    teX, teY = load_data(ptest, ntest)

    tokenizer = Tokenizer(min_df=10, max_features=100000)
    trX = tokenizer.fit_transform(trX)
    teX = tokenizer.transform(teX)

    print("Training ...")
    if modeltype == "gated_recurrent":
        layers = [
            Embedding(size=256, n_features=tokenizer.n_features),
            GatedRecurrent(size=512, activation='tanh', gate_activation='steeper_sigmoid',
                           init='orthogonal', seq_output=False, p_drop=0.75),
            Dense(size=1, activation='sigmoid', init='orthogonal')
        ]
    else:
        layers = [
            Embedding(size=256, n_features=tokenizer.n_features),
            LstmRecurrent(size=512, activation='tanh', gate_activation='steeper_sigmoid',
                          init='orthogonal', seq_output=False, p_drop=0.75),
            Dense(size=1, activation='sigmoid', init='orthogonal')
        ]

    model = RNN(layers=layers, cost='bce', updater=Adadelta(lr=0.5))
    model.fit(trX, trY, n_epochs=10)

    # Predicting the probabilities of positive labels
    print("Predicting ...")
    pr_teX = model.predict(teX).flatten()

    predY = np.ones(len(teY))
    predY[pr_teX < 0.5] = -1

    with open(out, "w") as f:
        for lab, pos_pr, neg_pr in zip(predY, pr_teX, 1 - pr_teX):
            f.write("%d %f %f\n" % (lab, pos_pr, neg_pr))
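
A quick in-memory sanity check before relying on the written file; it assumes teY uses the same {-1, 1} coding that predY is given above:

# Accuracy check (assumes teY is coded {-1, 1} to match predY).
acc = np.mean(predY == np.asarray(teY))
print("accuracy: %.4f" % acc)
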
Example #5
def train_RNN(tokenizer, tokens, labels):
	"""
	INPUT: Trained tokenizer class, label array
		- The arrays of the tokenized critic reviews and the corresponding labels
	Returns a trained Recurrent Neural Network class object
	"""
	layers = [
		Embedding(size=256, n_features=tokenizer.n_features),
		GatedRecurrent(size=512, activation='tanh', gate_activation='steeper_sigmoid', init='orthogonal', seq_output=False, p_drop=0.75),
		Dense(size=1, activation='sigmoid', init='orthogonal')
	]

	model = RNN(layers=layers, cost='bce', updater=Adadelta(lr=0.5))

	path_snapshots = 'model_snapshots'

	print "Begin fitting RNN"

	model.fit(tokens, labels, n_epochs=12)

	return model
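
A hypothetical driver for train_RNN, mirroring the tokenization pattern used throughout these examples; reviews, labels, and new_reviews are placeholder names:

# Hypothetical driver (placeholder data names; passage imports as above).
tokenizer = Tokenizer(min_df=10, max_features=100000)
tokens = tokenizer.fit_transform(reviews)          # reviews: raw strings
model = train_RNN(tokenizer, tokens, labels)       # labels: 0/1 array
probs = model.predict(tokenizer.transform(new_reviews)).flatten()
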
Example #6
# Imports added so the script is self-contained.
import numpy as np
import pandas as pd
from lxml import html

from passage.models import RNN
from passage.updates import Adadelta
from passage.layers import Embedding, GatedRecurrent, Dense
from passage.preprocessing import Tokenizer


def clean(texts):
	# Strip markup, lowercase, and trim each raw review string.
	return [html.fromstring(text).text_content().lower().strip() for text in texts]

if __name__ == "__main__":
	tr_data = pd.read_csv('labeledTrainData.tsv', delimiter='\t') 
	trX = clean(tr_data['review'].values)
	trY = tr_data['sentiment'].values

	print("Training data loaded and cleaned.")

	tokenizer = Tokenizer(min_df=10, max_features=100000)
	trX = tokenizer.fit_transform(trX)

	print("Training data tokenized.")

	layers = [
		Embedding(size=256, n_features=tokenizer.n_features),
		GatedRecurrent(size=512, activation='tanh', gate_activation='steeper_sigmoid', init='orthogonal', seq_output=False, p_drop=0.75),
		Dense(size=1, activation='sigmoid', init='orthogonal')
	]

	model = RNN(layers=layers, cost='bce', updater=Adadelta(lr=0.5))
	model.fit(trX, trY, n_epochs=10)

	te_data = pd.read_csv('testData.tsv', delimiter='\t')
	ids = te_data['id'].values
	teX = clean(te_data['review'].values)
	teX = tokenizer.transform(teX)
	pr_teX = model.predict(teX).flatten()
	 
	pd.DataFrame(np.asarray([ids, pr_teX]).T).to_csv('submission.csv', index=False, header=["id", "sentiment"])
Example #7
import numpy as np

from passage.models import RNN
from passage.updates import NAG, Regularizer
from passage.layers import Generic, GatedRecurrent, Dense
from passage.utils import load, save

from load import load_mnist

trX, teX, trY, teY = load_mnist()

#Use generic layer - RNN processes a size 28 vector at a time scanning from left to right 
layers = [
	Generic(size=28),
	GatedRecurrent(size=512, p_drop=0.2),
	Dense(size=10, activation='softmax', p_drop=0.5)
]

#A bit of l2 helps with generalization, higher momentum helps convergence
updater = NAG(momentum=0.95, regularizer=Regularizer(l2=1e-4))

#Linear iterator for real valued data, cce cost for softmax
model = RNN(layers=layers, updater=updater, iterator='linear', cost='cce')
model.fit(trX, trY, n_epochs=20)

tr_preds = model.predict(trX[:len(teY)])
te_preds = model.predict(teX)

tr_acc = np.mean(trY[:len(teY)] == np.argmax(tr_preds, axis=1))
te_acc = np.mean(teY == np.argmax(te_preds, axis=1))

# Test accuracy should be between 98.9% and 99.3%
print 'train accuracy', tr_acc, 'test accuracy', te_acc
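
To make the Generic layer's input contract concrete, here is a shape sketch: each digit is a sequence of 28 steps, one length-28 real-valued vector per step (how load_mnist batches them is assumed, not shown here):

import numpy as np

# One MNIST digit as the Generic(size=28) layer sees it: 28 timesteps,
# each a length-28 real-valued vector.
digit = np.zeros((28, 28), dtype='float32')  # (timesteps, features)
batch = [digit, digit]                       # a batch as a list of sequences (assumed)
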
Example #8
import sys

# --- (Dataset, Tokenizer, save, the passage layers, and the settings
# dict come from sections elided in this excerpt) ---

print 'loading dataset'
d = Dataset(settings['FN_DATASET'], settings['FN_VOCABULARY'])
d.load()

print 'generating labeled training set'
train_text, train_labels = d.getNextWordPredTrainset(10)
#for t,l in zip(train_text,train_labels):
#    print t,'->',l

tokenizer = Tokenizer()
train_tokens = tokenizer.fit_transform(train_text)
save(train_tokens, settings['FN_TRAINED_TOKENIZER'])

layers = [
    Embedding(size=128, n_features=tokenizer.n_features),
    GatedRecurrent(size=128),
    Dense(size=1, activation='sigmoid')
]

model = RNN(layers=layers, cost='BinaryCrossEntropy')
model.fit(train_tokens, train_labels)

save(model, settings['FN_MODEL_NEXTWORDPRED'])
Example #9
File: gender.py  Project: ypscut/Passage
tokenizer = Tokenizer(min_df=10, max_features=50000)
print trX[1] # see a blog example
trX = tokenizer.fit_transform(trX)
teX = tokenizer.transform(teX)
print tokenizer.n_features

layers = [
    Embedding(size=128, n_features=tokenizer.n_features),
    GatedRecurrent(size=256, activation='tanh', gate_activation='steeper_sigmoid', init='orthogonal', seq_output=False),
    Dense(size=1, activation='sigmoid', init='orthogonal') # sigmoid for binary classification
]

model = RNN(layers=layers, cost='bce') # bce is classification loss for binary classification and sigmoid output
for i in range(2):
    model.fit(trX, trY, n_epochs=1)
    tr_preds = model.predict(trX[:len(teY)])
    te_preds = model.predict(teX)

    tr_acc = metrics.accuracy_score(trY[:len(teY)], tr_preds > 0.5)
    te_acc = metrics.accuracy_score(teY, te_preds > 0.5)

    print i, tr_acc, te_acc

save(model, 'save_test.pkl') # How to save

model = load('save_test.pkl') # How to load

tr_preds = model.predict(trX[:len(teY)])
te_preds = model.predict(teX)
Example #10
import pickle
from itertools import product

from sklearn import cross_validation

# tokenizer, train_tokens, train_text, train_labels, and save() come
# from earlier, elided code. embedding_sizes was also defined there;
# the values below are placeholders, not the original ones.
embedding_sizes = [50, 100, 200]
gru_sizes = [10, 20, 50, 100, 200, 1000]
epochs = [1, 3, 5, 7, 10]

for embedding_size, gru_size, num_epochs in product(embedding_sizes, gru_sizes, epochs):
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        train_text, train_labels, test_size=0.1, random_state=0
    )

    layers = [
        Embedding(size=embedding_size, n_features=tokenizer.n_features),
        GatedRecurrent(size=gru_size),
        Dense(size=1, activation="sigmoid"),
    ]

    model = RNN(layers=layers, cost="BinaryCrossEntropy")
    model.fit(train_tokens, train_labels, n_epochs=int(num_epochs))

    modelfile_name = "stubborn_model.paramsearch.embedding{}.gru{}.epoch{}".format(embedding_size, gru_size, num_epochs)

    save(model, modelfile_name + ".pkl")
    pickle.dump(tokenizer, open(modelfile_name + "-tokenizer.pkl", "wb"))

    results = model.predict(tokenizer.transform(X_test))

    count = 0
    for r, g in zip(results, y_test):
        if int(r >= 0.5) == int(g):
            count += 1
    results = 1.0 * count / len(y_test)
    print(modelfile_name + "\t" + str(results))
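
To reuse one of the saved grid-search artifacts later, both halves must be reloaded; the file name below is illustrative and must match a configuration the loop actually wrote:

import pickle
from passage.utils import load

# Illustrative name; substitute a configuration actually written above.
name = "stubborn_model.paramsearch.embedding50.gru100.epoch5"
model = load(name + ".pkl")
with open(name + "-tokenizer.pkl", "rb") as f:
    tokenizer = pickle.load(f)

probs = model.predict(tokenizer.transform(["some new document"]))
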
Example #11
# Imports added for self-containment; clean() is the HTML-stripping
# helper shown in Example #6.
import pickle

import pandas as pd
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score

from passage.models import RNN
from passage.updates import Adadelta
from passage.layers import Embedding, GatedRecurrent, LstmRecurrent, Dense
from passage.preprocessing import Tokenizer


def train_model(modeltype, delta):

    assert modeltype in ["gated_recurrent", "lstm_recurrent"]
    print "Begin Training"

    df_imdb_reviews = pd.read_csv('../data/imdb_review_data.tsv', escapechar='\\', delimiter='\t')

    X = clean(df_imdb_reviews['review'].values)
    y = df_imdb_reviews['sentiment'].values

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)
    print "Tokenize"

    tokenizer = Tokenizer(min_df=10, max_features=100000)
    X_train = tokenizer.fit_transform(X_train)
    X_train = [[float(x) for x in row] for row in X_train]
    X_test = tokenizer.transform(X_test)
    X_test = [[float(x) for x in row] for row in X_test]

    print "Number of features: {}".format(tokenizer.n_features)

    print "Training model"

    if modeltype == "gated_recurrent":
        layers = [
            Embedding(size=256, n_features=tokenizer.n_features),
            GatedRecurrent(size=512, activation='tanh', gate_activation='steeper_sigmoid',
                           init='orthogonal', seq_output=True, p_drop=0.5),
            Dense(size=1, activation='sigmoid', init='orthogonal')
        ]
    else:
        layers = [
            Embedding(size=256, n_features=tokenizer.n_features),
            LstmRecurrent(size=512, activation='tanh', gate_activation='steeper_sigmoid',
                          init='orthogonal', seq_output=True, p_drop=0.5),
            Dense(size=1, activation='sigmoid', init='orthogonal')
        ]

    # bce is classification loss for binary classification and sigmoid output
    model = RNN(layers=layers, cost='bce', updater=Adadelta(lr=delta))
    model.fit(X_train, y_train, n_epochs=20)

    with open('../data/{}_tokenizer_delta_{}_pdrop_0.5.pkl'.format(modeltype, delta), 'wb') as f:
        pickle.dump(tokenizer, f)
    with open('../data/{}_model_delta_{}._pdrop_0.5.pkl'.format(modeltype, delta), 'wb') as f:
        # Do not rebind `model` here: pickle.dump returns None, which
        # would break the metrics block below.
        pickle.dump(model, f)

    try:
        y_pred_te = model.predict(X_test).flatten() >= 0.5
        y_pred_tr = model.predict(X_train).flatten() >= 0.5
        print 'Test Accuracy: {}'.format(accuracy_score(y_test,y_pred_te))
        print 'Test Precision: {}'.format(precision_score(y_test,y_pred_te))
        print 'Test Recall: {}'.format(recall_score(y_test,y_pred_te))
        print 'Train Accuracy: {}'.format(accuracy_score(y_train,y_pred_tr))
        print 'Train Precision: {}'.format(precision_score(y_train,y_pred_tr))
        print 'Train Recall: {}'.format(recall_score(y_train,y_pred_tr))

    except Exception:
        print "Unable to perform metrics"

    return tokenizer, model
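
The matching load step for the pickles written by train_model (a sketch; the paths must match the modeltype and delta used at training time, and Passage's own save/load in passage.utils, used in other examples here, is generally the safer way to persist the model itself):

import pickle

# Reload the artifacts written by train_model('gated_recurrent', 0.5).
with open('../data/gated_recurrent_tokenizer_delta_0.5_pdrop_0.5.pkl', 'rb') as f:
    tokenizer = pickle.load(f)
with open('../data/gated_recurrent_model_delta_0.5._pdrop_0.5.pkl', 'rb') as f:
    model = pickle.load(f)

probs = model.predict(tokenizer.transform(["what a great movie"])).flatten()
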
Example #12
# The start of this excerpt is truncated; Example #13 below shows the
# full version of this script.
tokenizer = Tokenizer(min_df=10, max_features=50000)
X_train = tokenizer.fit_transform(newsgroups_train.data)
X_test = tokenizer.transform(newsgroups_test.data)
Y_train = newsgroups_train.target
Y_test = newsgroups_test.target

print tokenizer.n_features

layers = [
    Embedding(size=128, n_features=tokenizer.n_features),
    GatedRecurrent(size=256,
                   activation='tanh',
                   gate_activation='steeper_sigmoid',
                   init='orthogonal',
                   seq_output=False),
    Dense(size=1, activation='sigmoid',
          init='orthogonal')  # sigmoid for binary classification
]

model = RNN(
    layers=layers, cost='bce'
)  # bce is classification loss for binary classification and sigmoid output
for i in range(2):
    model.fit(X_train, Y_train, n_epochs=1)
    tr_preds = model.predict(X_train[:len(Y_test)])
    te_preds = model.predict(X_test)

    tr_acc = metrics.accuracy_score(Y_train[:len(Y_test)], tr_preds > 0.5)
    te_acc = metrics.accuracy_score(Y_test, te_preds > 0.5)

    print i, tr_acc, te_acc  # dataset too small to fully utilize Passage

save(model, 'model.pkl')
Example #13
from sklearn import metrics

from passage.preprocessing import Tokenizer
from passage.layers import Embedding, GatedRecurrent, Dense
from passage.models import RNN
from passage.utils import save

# newsgroups_train / newsgroups_test come from sklearn's 20 newsgroups
# loader; that setup is elided from this excerpt.

tokenizer = Tokenizer(min_df=10, max_features=50000)
X_train = tokenizer.fit_transform(newsgroups_train.data)
X_test  = tokenizer.transform(newsgroups_test.data)
Y_train = newsgroups_train.target
Y_test  = newsgroups_test.target

print tokenizer.n_features

layers = [
    Embedding(size=128, n_features=tokenizer.n_features),
    GatedRecurrent(size=256, activation='tanh', gate_activation='steeper_sigmoid',
                   init='orthogonal', seq_output=False),
    Dense(size=1, activation='sigmoid', init='orthogonal') # sigmoid for binary classification
]

model = RNN(layers=layers, cost='bce') # bce is classification loss for binary classification and sigmoid output
for i in range(2):
    model.fit(X_train, Y_train, n_epochs=1)
    tr_preds = model.predict(X_train[:len(Y_test)])
    te_preds = model.predict(X_test)

    tr_acc = metrics.accuracy_score(Y_train[:len(Y_test)], tr_preds > 0.5)
    te_acc = metrics.accuracy_score(Y_test, te_preds > 0.5)

    print i, tr_acc, te_acc # dataset too small to fully utilize Passage

save(model, 'model.pkl')
Example #14

# flatten() and train_tokens come from setup code elided in this excerpt.
num_feats = len(set(flatten(train_tokens)))


def get_labels(id):
    if id == 3:
        return [1, 0]
    else:
        return [0, 1]


seq_labels = map(lambda l: map(get_labels, l), train_tokens)

layers = [
    Embedding(size=128, n_features=num_feats),
    GatedRecurrent(size=128, seq_output=True),
    Dense(size=num_feats, activation='softmax')
]

#iterator = SortedPadded(y_pad=True, y_dtype=intX)
#iterator = SortedPadded(y_dtype=intX)

#model = RNN(layers=layers, cost='seq_cce', iterator=iterator, Y=T.imatrix())
model = RNN(layers=layers, cost='seq_cce')
#model.fit(train_tokens, [1,0,1])
model.fit(train_tokens, train_tokens)

#model.predict(tokenizer.transform(["Frogs are awesome", "frogs are amphibious"]))
model.predict(train_tokens)
save(model, 'save_test.pkl')
model = load('save_test.pkl')
""" This model, although doing sequential prediction, predicts a tag per document not per word. """
Example #15
def train_and_save_passage_tokenizer_and_rnn_model(x_train,
                                                   y_train,
                                                   x_test,
                                                   character_model=False):
    """Train and save Passage tokenizer and Passage RNN model.

    x_train and x_test should each be a series that's already been pre-preocessed: html->text, lowercase, removed
    punct/#s
    x_train+x_test are used to build the tokenizer.

    Note that character-based RNN is a work-in-progress and not actuallly implemented as of now.
    """

    # Note that we assume we have train/test reviews that had been preprocessed: html->text, lowercased, removed
    # punct/#s

    # Note in https://github.com/IndicoDataSolutions/Passage/blob/master/examples/sentiment.py they only
    # extract text from html, lowercase and strip (no punctuation removal)

    # Tokenization: Assign each word in the reviews an ID to be used in all reviews
    tokenizer = Tokenizer(min_df=10,
                          max_features=100000,
                          character=character_model)

    train_reviews_list = x_train.tolist()
    tokenizer.fit(train_reviews_list + x_test.tolist())

    # Tokenize training reviews (so we can fit the RNN model on them)
    train_reviews_tokenized = tokenizer.transform(train_reviews_list)

    # Based on https://github.com/vinhkhuc/kaggle-sentiment-popcorn/blob/master/scripts/passage_nn.py which is based
    # on https://github.com/IndicoDataSolutions/Passage/blob/master/examples/sentiment.py

    # RNN Network:
    # -Each tokenized review will be converted into a sequence of words, where each word has an embedding representation
    # (256)
    # -RNN layer (GRU) attempts to find pattern in sequence of words
    # -Final dense layer is used as a logistic classifier to turn RNN output into a probability/prediction
    if not character_model:
        layers = [
            Embedding(size=256, n_features=tokenizer.n_features),
            # May replace with LstmRecurrent for LSTM layer
            GatedRecurrent(size=512,
                           activation='tanh',
                           gate_activation='steeper_sigmoid',
                           init='orthogonal',
                           seq_output=False,
                           p_drop=0.75),
            Dense(size=1, activation='sigmoid', init='orthogonal')
        ]
    else:
        # Character-level RNN
        # Idea is to convert character tokenizations into one-hot encodings in which case
        # the embeddings layer is no longer needed
        train_reviews_tokenized = map(
            lambda r_indexes: pd.get_dummies(
                r_indexes, columns=range(tokenizer.n_features + 1)).values,
            train_reviews_tokenized)
        layers = [
            # May replace with LstmRecurrent for LSTM layer
            GatedRecurrent(size=100,
                           activation='tanh',
                           gate_activation='steeper_sigmoid',
                           init='orthogonal',
                           seq_output=False,
                           p_drop=0.75),
            Dense(size=1, activation='sigmoid', init='orthogonal')
        ]

    # The RNN classifier uses binary cross-entropy as the cost function
    classifier = RNN(layers=layers, cost='bce', updater=Adadelta(lr=0.5))
    NUM_EPOCHS = 10
    # 10 epochs may take 10+ hours to run depending on machine
    classifier.fit(train_reviews_tokenized,
                   y_train.tolist(),
                   n_epochs=NUM_EPOCHS)

    # Store model and tokenizer
    if character_model:
        passage.utils.save(classifier, PASSAGE_CHAR_RNN_MODEL)
        _ = joblib.dump(tokenizer, PASSAGE_CHAR_TOKENIZER, compress=9)
    else:
        passage.utils.save(classifier, PASSAGE_RNN_MODEL)
        _ = joblib.dump(tokenizer, PASSAGE_TOKENIZER, compress=9)
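
A sketch of the preprocessing the docstring assumes (html->text, lowercase, punctuation/numbers removed); the lxml step mirrors Example #6's clean(), while the regex is an assumption:

import re
from lxml import html

def preprocess(texts):
    # html -> text, lowercase, then replace anything that is not a
    # letter or whitespace (punctuation, digits) with a space.
    cleaned = []
    for t in texts:
        t = html.fromstring(t).text_content().lower().strip()
        cleaned.append(re.sub(r'[^a-z\s]', ' ', t))
    return cleaned
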
Example #16
layers = [
	Generic(size=data_dim),
	GatedRecurrent(size=32, p_drop=0.1, seq_output=True),
	GatedRecurrent(size=32, p_drop=0.1, seq_output=True),
	Dense(size=n_classes, activation='softmax', p_drop=0.5)
]

#A bit of l2 helps with generalization, higher momentum helps convergence
# updater = NAG(momentum=0.95, regularizer=Regularizer(l2=1e-4))
updater = Adam(lr=0.006)
padder = Padded(size=batch_size, x_dtype=floatX, y_lag=y_lag)

#Linear iterator for real valued data, cce cost for softmax
model = RNN(layers=layers, updater=updater, iterator=padder, cost='cpsce', Y=T.tensor3())
model.fit(trX, trY, n_epochs=100, batch_size=batch_size)

tr_preds = model.predict(trX[:len(teY)])
te_preds = model.predict(teX)

tr_acc = np.mean([np.all(y == np.argmax(yhat[y_lag:],axis=1)) for y, yhat in zip(trY[:len(teY)], tr_preds)])
te_acc = np.mean([np.all(y == np.argmax(yhat[y_lag:],axis=1)) for y, yhat in zip(teY, te_preds)])


print "Examples from Test: "
S = (["\n".join(map(str, (y, np.argmax(yhat[y_lag:],axis=1))))
      for y, yhat in zip(teY, te_preds)])
for s in S[:15]:
    print s, "\n"

# Test accuracy should be between 98.9% and 99.3%
print 'train accuracy', tr_acc, 'test accuracy', te_acc