예제 #1
0
    def run_sent140(self):
        """Fit an RNN on sent140 only, then predict on the emoji splits.

        The classifier is stored on ``self.model_sent140``. Predictions on
        the sent140 splits themselves are not computed, so the first two
        slots of the result are ``None`` placeholders.

        Returns:
            The 5-tuple ``(None, None, emoji_train_preds, emoji_dev_preds,
            emoji_test_preds)``. The last entry is ``None`` unless
            ``self.testing`` is truthy.
        """
        classifier = TorchRNNClassifier(
            self.sent140_train_glove_vocab,
            embedding=self.sent140_train_embedding)
        self.model_sent140 = classifier

        classifier.fit(self.sent140_train_X, self.sent140_train_Y)

        train_preds = classifier.predict(self.emoji_train_X)
        dev_preds = classifier.predict(self.emoji_dev_X)
        # The emoji test split is only scored in testing mode.
        test_preds = (
            classifier.predict(self.emoji_test_X) if self.testing else None)

        return (None, None, train_preds, dev_preds, test_preds)
def test_predict_functions_honor_device(X_sequence, func):
    """Expect the prediction method named by `func` to reject a bogus device.

    A classifier is fitted briefly, then the method is called with
    ``device="FAKE_DEVICE"``; the call must raise ``RuntimeError``.
    """
    train_X, test_X, train_y, _test_y, vocabulary = X_sequence
    classifier = TorchRNNClassifier(vocabulary, max_iter=2)
    classifier.fit(train_X, train_y)
    predict = getattr(classifier, func)
    with pytest.raises(RuntimeError):
        predict(test_X, device="FAKE_DEVICE")
예제 #3
0
 def run_sent140_emojiless(self):
     """Fit an RNN on sent140 plus emojiless data, then predict on emoji.

     The sent140 and emojiless training sets are joined with ``+`` and the
     model is fitted once on the combined data. The classifier is stored on
     ``self.model_sent140_emojiless``. Predictions on the sent140 splits are
     not computed, so the first two slots of the result are ``None``.

     Returns:
         The 5-tuple ``(None, None, emoji_train_preds, emoji_dev_preds,
         emoji_test_preds)``. The last entry is ``None`` unless
         ``self.testing`` is truthy.
     """
     classifier = TorchRNNClassifier(
         self.sent140_emojiless_train_glove_vocab,
         embedding=self.sent140_emojiless_train_embedding)
     self.model_sent140_emojiless = classifier

     # Combine both training sources before the single fit call.
     features = self.sent140_train_X + self.emojiless_train_X
     labels = self.sent140_train_Y + self.emojiless_train_Y
     classifier.fit(features, labels)

     train_preds = classifier.predict(self.emoji_train_X)
     dev_preds = classifier.predict(self.emoji_dev_X)
     # The emoji test split is only scored in testing mode.
     test_preds = (
         classifier.predict(self.emoji_test_X) if self.testing else None)

     return (None, None, train_preds, dev_preds, test_preds)
예제 #4
0
def fit_elmo_rnn(X, y):
    """Fit and return a TorchRNNClassifier built with ``use_embedding=False``.

    The model gets an empty vocab and up to 50 iterations. Judging by the
    function name, `X` presumably holds precomputed ELMo vectors; verify
    against the caller.
    """
    settings = {"vocab": [], "max_iter": 50, "use_embedding": False}
    classifier = TorchRNNClassifier(**settings)
    classifier.fit(X, y)
    return classifier
def test_model_graph_dimensions(X_sequence, mod_attr, graph_attr):
    """Check that a wrapper attribute matches the underlying RNN's attribute.

    After one training iteration, the value of `mod_attr` on the classifier
    must equal the value of `graph_attr` on ``classifier.model.rnn.rnn``.
    """
    train_X, _test_X, train_y, _test_y, vocabulary = X_sequence
    classifier = TorchRNNClassifier(vocabulary, max_iter=1)
    classifier.fit(train_X, train_y)
    wrapper_value = getattr(classifier, mod_attr)
    inner_rnn = classifier.model.rnn.rnn
    assert wrapper_value == getattr(inner_rnn, graph_attr)
예제 #6
0
def fit_hf_rnn(X, y):
    """Fit and return a TorchRNNClassifier built with ``use_embedding=False``.

    The model gets an empty vocab, a hidden dimension of 50 and up to 50
    iterations. With the embedding disabled, `X` is passed to the RNN as
    already-vectorized input (the BERT hidden states).
    """
    settings = {
        "vocab": [],
        "max_iter": 50,
        "hidden_dim": 50,
        "use_embedding": False,  # hidden states are passed in directly
    }
    classifier = TorchRNNClassifier(**settings)
    classifier.fit(X, y)
    return classifier
def test_predict_restores_device(X_sequence, func):
    """Check that predicting on another device leaves ``mod.device`` unchanged.

    The test first asserts that the fitted model is not already on
    ``cpu:0``. It then calls the prediction method named by `func` with
    ``device="cpu:0"`` and checks the device attribute afterwards.
    """
    train_X, test_X, train_y, _test_y, vocabulary = X_sequence
    classifier = TorchRNNClassifier(vocabulary, max_iter=2)
    classifier.fit(train_X, train_y)
    device_before = classifier.device
    cpu_name = "cpu:0"
    assert device_before != torch.device(cpu_name)
    predict = getattr(classifier, func)
    predict(test_X, device=cpu_name)
    assert classifier.device == device_before
def test_parameter_setting(param, expected):
    """Check that ``set_params`` stores `expected` under the attribute `param`.

    The "embedding" parameter is compared with ``np.array_equal``; every
    other parameter is compared with ``==``.
    """
    classifier = TorchRNNClassifier([])
    classifier.set_params(**{param: expected})
    actual = getattr(classifier, param)
    if param != "embedding":
        assert actual == expected
        return
    assert np.array_equal(actual, expected)
def test_build_dataset(cheese_disease_dataset, with_y, expected):
    """Check the length of the first item yielded by ``build_dataset``.

    The dataset is built with or without labels depending on `with_y`, and
    its first item must have length `expected`.
    """
    vocabulary = cheese_disease_dataset['vocab']
    examples = cheese_disease_dataset['X_train']
    labels = cheese_disease_dataset['y_train']
    classifier = TorchRNNClassifier(vocabulary)
    build_args = (examples, labels) if with_y else (examples,)
    dataset = classifier.build_dataset(*build_args)
    first_item = next(iter(dataset))
    assert len(first_item) == expected
예제 #10
0
def test_embedding_update_control(X_sequence, freeze, outcome):
    """Check whether training changes a supplied embedding, given `freeze`.

    An all-ones embedding is passed in and the model is trained for 10
    iterations. Whether the embedding weights still equal the initial
    matrix must match `outcome`.
    """
    train_X, _test_X, train_y, _test_y, vocabulary = X_sequence
    initial = np.ones((len(vocabulary), 5))
    classifier = TorchRNNClassifier(
        vocabulary,
        max_iter=10,
        embedding=initial,
        freeze_embedding=freeze)
    classifier.fit(train_X, train_y)
    weights = classifier.model.rnn.embedding.weight
    learned = weights.detach().cpu().numpy()
    unchanged = np.array_equal(initial, learned)
    assert unchanged == outcome
예제 #11
0
def test_pretrained_embedding(X_sequence):
    """Check that a frozen, user-supplied embedding is unchanged by training.

    An all-ones embedding is passed with ``freeze_embedding=True``. After
    one iteration the embedding weights must still equal it.
    """
    train_X, _test_X, train_y, _test_y, vocabulary = X_sequence
    initial = np.ones((len(vocabulary), 5))
    classifier = TorchRNNClassifier(
        vocabulary,
        max_iter=1,
        embedding=initial,
        freeze_embedding=True)
    classifier.fit(train_X, train_y)
    weights = classifier.model.rnn.embedding.weight
    learned = weights.detach().cpu().numpy()
    assert np.array_equal(initial, learned)
예제 #12
0
def test_cheese_disease(cheese_disease_dataset):
    """Fit on the cheese/disease training split and require accuracy > 0.80.

    Evaluation reuses the training split: predictions and gold labels both
    come from the ``'X_train'`` / ``'y_train'`` entries.
    """
    data = cheese_disease_dataset
    classifier = TorchRNNClassifier(
        vocab=data['vocab'],
        embed_dim=50,
        hidden_dim=50,
        max_iter=200)
    classifier.fit(data['X_train'], data['y_train'])
    predictions = classifier.predict(data['X_train'])
    accuracy = accuracy_score(data['y_train'], predictions)
    assert accuracy > 0.80
def test_torch_rnn_classifier_save_load(X_sequence):
    """Round-trip a fitted classifier through ``to_pickle``/``from_pickle``.

    The restored model must be able to predict and to be fitted again.
    The test passes if none of the calls raises.
    """
    train_X, test_X, train_y, test_y, vocabulary = X_sequence
    original = TorchRNNClassifier(vocab=vocabulary, max_iter=2)
    original.fit(train_X, train_y)
    original.predict(test_X)
    # The temporary file only supplies a path; the model does the writing.
    with tempfile.NamedTemporaryFile(mode='wb') as handle:
        pickle_path = handle.name
        original.to_pickle(pickle_path)
        restored = TorchRNNClassifier.from_pickle(pickle_path)
        restored.predict(test_X)
        restored.fit(test_X, test_y)
def test_simple_example_params(X_sequence, param, expected):
    """Fit with one keyword parameter overridden and check test accuracy.

    When ``use_embedding`` is switched off, the token sequences are first
    replaced by random 60-dimensional vectors, one fixed vector per vocab
    entry. Accuracy must be at least 0.60, except when ``max_iter`` is 0.
    """
    train_X, test_X, train_y, test_y, vocabulary = X_sequence
    classifier = TorchRNNClassifier(vocabulary, **{param: expected})

    if param == "use_embedding" and expected == False:
        table = np.random.uniform(
            low=-1.0, high=1.0, size=(len(vocabulary), 60))

        def to_vectors(examples):
            # Replace each token with its row of the random table.
            return [[table[vocabulary.index(tok)] for tok in seq]
                    for seq in examples]

        train_X = to_vectors(train_X)
        test_X = to_vectors(test_X)

    classifier.fit(train_X, train_y)
    predictions = classifier.predict(test_X)
    accuracy = accuracy_score(test_y, predictions)
    zero_iterations = param == "max_iter" and expected == 0
    if not zero_iterations:
        assert accuracy >= 0.60
예제 #15
0
    def system_0_original():
        """Run the word-disjoint entailment experiment with a bidirectional RNN.

        Loads the word-entailment JSON, prints the label distribution of the
        word-disjoint training split, and builds a vocab and embedding table
        from GLOVE with an extra all-zero ``$UNK`` entry.

        Returns:
            The ``'macro-F1'`` entry of the experiment result.
        """
        # Data
        with open(wordentail_filename) as handle:
            dataset = json.load(handle)
        split = dataset['word_disjoint']

        label_counts = pd.DataFrame(split['train'])[1].value_counts()
        print("Distribution of labels : \n{0}".format(label_counts))

        # Model
        glove_frame = pd.DataFrame(GLOVE)
        glove_frame['$UNK'] = 0
        glove_frame = glove_frame.T

        classifier = TorchRNNClassifier(
            vocab=list(glove_frame.index),
            embedding=glove_frame.values,
            bidirectional=True)

        # Experiment
        outcome = nli.wordentail_experiment(
            train_data=split['train'],
            assess_data=split['dev'],
            model=classifier,
            vector_func=lambda x: np.array([x]),
            vector_combo_func=vec_concatenate)

        return outcome['macro-F1']
예제 #16
0
def fit_rnn_classifier(X, y):
    """Fit and return a small bidirectional TorchRNNClassifier on `X`, `y`.

    The vocabulary comes from ``get_vocab(X, n_words=10)``. No pretrained
    embedding is supplied (``embedding=None``).
    """
    vocabulary = get_vocab(X, n_words=10)
    hyperparams = {
        "eta": 0.05,
        "embedding": None,
        "batch_size": 1000,
        "embed_dim": 50,
        "hidden_dim": 50,
        "max_iter": 5,
        "l2_strength": 0.001,
        "bidirectional": True,
        "hidden_activation": nn.ReLU(),
    }
    classifier = TorchRNNClassifier(vocabulary, **hyperparams)
    classifier.fit(X, y)
    return classifier
def test_model(X_sequence):
    """Smoke test: fit, ``predict`` and ``predict_proba`` must run without error.

    Model quality is not checked.
    """
    train_X, test_X, train_y, _test_y, vocabulary = X_sequence
    classifier = TorchRNNClassifier(vocab=vocabulary, max_iter=100)
    classifier.fit(train_X, train_y)
    for method_name in ("predict", "predict_proba"):
        getattr(classifier, method_name)(test_X)
# Predict dev-set labels with `rnn`, which is defined earlier in the
# notebook (not visible in this excerpt).
rnn_dev_predictions = rnn.predict(X_rnn_dev)

# In[26]:

# Report metrics for the `rnn` predictions against the gold dev labels.
print(classification_report(y_rnn_dev, rnn_dev_predictions))

# ### PyTorch implementation
#
# The included PyTorch implementation is much faster and more configurable.

# In[27]:

# Configure the PyTorch RNN; nothing is trained until `fit` is called below.
torch_rnn = TorchRNNClassifier(sst_train_vocab,
                               embed_dim=50,
                               hidden_dim=50,
                               max_iter=50,
                               eta=0.05)

# In[28]:

# Exported form of the `%time` line magic: runs the fit statement and
# reports how long it took.
get_ipython().run_line_magic('time',
                             '_ = torch_rnn.fit(X_rnn_train, y_rnn_train)')

# In[29]:

# Predict on the same dev inputs as above so the two reports are comparable.
torch_rnn_dev_predictions = torch_rnn.predict(X_rnn_dev)

# In[30]:

print(classification_report(y_rnn_dev, torch_rnn_dev_predictions))
예제 #19
0

def vec_concatenate(u, v):
    """Return a new np.array made by joining `u` and `v` along the first axis."""
    pair = (u, v)
    return np.concatenate(pair, axis=0)


# net = TorchShallowNeuralClassifier(hidden_dim=50, max_iter=100)

# Build a DataFrame from GLOVE, add a '$UNK' column filled with 0, then
# transpose so that the former column labels become the row index.
# NOTE(review): assumes GLOVE maps word -> vector, so each row after the
# transpose is one word's vector; confirm where GLOVE is loaded.
X_glove = pd.DataFrame(GLOVE)
X_glove['$UNK'] = 0
X_glove = X_glove.T

# Row labels become the vocab and the matching rows the embedding matrix,
# so both are in the same order.
vocab = list(X_glove.index)
embedding = X_glove.values
net = TorchRNNClassifier(vocab=vocab, embedding=embedding)

# Train on the word-disjoint train split and assess on its dev split.
word_disjoint_experiment = nli.wordentail_experiment(
    train_data=wordentail_data['word_disjoint']['train'],
    assess_data=wordentail_data['word_disjoint']['dev'],
    # model=GridSearchCV(net, {'hidden_dim': [25, 50, 100]}, cv=2, scoring='f1_macro'),
    model=net,
    # Each input is wrapped in a one-element array rather than being
    # replaced by a vector.
    vector_func=lambda x: np.array([x]),
    vector_combo_func=vec_concatenate)

print("macro-f1: {0}".format(word_disjoint_experiment['macro-F1']))

#
# # The outer keys are the  splits plus a list giving the vocabulary for the entire dataset:
#
# # In[ ]:
def test_cross_validation_sklearn(cheese_disease_dataset, model_class):
    """Smoke test: 2-fold ``cross_validate`` must run on a TorchRNNClassifier.

    The scores are not inspected. `model_class` is accepted but not used
    in the body.
    """
    data = cheese_disease_dataset
    vocabulary, examples, labels = data['vocab'], data['X_train'], data['y_train']
    estimator = TorchRNNClassifier(vocabulary, max_iter=5)
    cross_validate(estimator, examples, labels, cv=2)
def test_np_parameter_setting(param, expected):
    """Check that ``set_params`` stores `expected` under the attribute `param`."""
    classifier = TorchRNNClassifier([])
    classifier.set_params(**{param: expected})
    actual = getattr(classifier, param)
    assert actual == expected
예제 #22
0
    return [ex[-1] for ex in elmo_vecs]


# In[51]:


# Reduce the ELMo training layers with `elmo_layer_reduce_top`. Its
# definition is cut off above this excerpt, so its exact behavior is not
# visible here.
X_elmo_train = elmo_layer_reduce_top(X_elmo_train_layers)


# Now we can fit an RNN as usual:

# In[52]:


# The empty vocab goes together with use_embedding=False: inputs are
# already vectors, so there are no tokens to look up.
elmo_rnn = TorchRNNClassifier(
    vocab=[],
    max_iter=50,
    use_embedding=False) # Pass in the ELMo hidden states directly!


# In[53]:


# Exported form of the `%time` line magic: runs the fit statement and
# reports how long it took.
get_ipython().run_line_magic('time', '_ = elmo_rnn.fit(X_elmo_train, y_elmo_train)')


# Evaluation proceeds in the usual way:

# In[54]:


# Apply the same reduction to the dev layers as was applied to the
# training layers.
X_elmo_dev = elmo_layer_reduce_top(X_elmo_dev_layers)
예제 #23
0
def fit_simple_chained_rnn(X, y):
    """Fit and return a TorchRNNClassifier on `X`, `y`.

    The vocabulary comes from ``utils.get_vocab(X, n_words=10000)``. The
    model uses a hidden dimension of 50 and up to 10 iterations.
    """
    vocabulary = utils.get_vocab(X, n_words=10000)
    classifier = TorchRNNClassifier(
        vocabulary,
        hidden_dim=50,
        max_iter=10)
    classifier.fit(X, y)
    return classifier
        write yourself, as in `torch_rnn_classifier`, or the output of 
        `nn.Sequential`, as in `torch_shallow_neural_classifier`.
        
        """
        ##### YOUR CODE HERE
        return nn.Sequential(
            nn.Linear(self.input_dim, self.hidden_dim),
            nn.Dropout(self.dropout_prob),
            self.hidden_activation,
            nn.Linear(self.hidden_dim, self.n_classes_))



from torch_rnn_classifier import TorchRNNClassifier
# Build the vocab from X with n_words=10000, then configure (but do not
# fit) an RNN classifier over it.
vocab = utils.get_vocab(X, n_words=10000)
mod = TorchRNNClassifier(vocab, hidden_dim=50, max_iter=10)




# Train on the word-disjoint train split and assess on its dev split.
# `net`, `glove_vec` and `vec_concatenate` are defined outside this excerpt.
word_disjoint_experiment = nli.wordentail_experiment(
    train_data=wordentail_data['word_disjoint']['train'],
    assess_data=wordentail_data['word_disjoint']['dev'], 
    model=net, 
    vector_func=glove_vec,
    vector_combo_func=vec_concatenate)

word_experiment = nli.wordentail_experiment(
                train_data=wordentail_data[data]['train'],
                assess_data=wordentail_data[data]['dev'], 
                model=model, 
예제 #25
0
class RNN_Classifier:
    """
    Wrapper around TorchRNNClassifier for the sent140 / emoji experiments.

    Each ``run_*`` method builds a fresh classifier, fits it on sent140 data
    (optionally combined with emoji or emojiless training data) and returns
    predictions for the emoji splits.
    """

    def __init__(
            self,
            sent140_train_X_list,
            sent140_dev_X_list,
            sent140_train_Y,
            sent140_dev_Y,
            sent140_train_embedding,
            sent140_train_glove_vocab,
            emoji_train_X_list,
            emoji_dev_X_list,
            emoji_test_X_list,
            emoji_train_Y,
            emoji_dev_Y,
            emoji_test_Y,
            sent140_emoji_train_embedding,
            sent140_emoji_train_glove_vocab,
            emojiless_train_X_list,
            emojiless_dev_X_list,
            emojiless_test_X_list,
            emojiless_train_Y,
            emojiless_dev_Y,
            emojiless_test_Y,
            sent140_emojiless_train_embedding,
            sent140_emojiless_train_glove_vocab,
            testing):
        """
        Store the datasets, embeddings and vocabularies used by the run_* methods.

        The emoji and emojiless test splits are only stored as attributes
        when `testing` is truthy. Model hyperparameters are not configurable
        through the constructor.
        """
        self.testing = testing

        # sent140 splits
        self.sent140_train_X, self.sent140_train_Y = (
            sent140_train_X_list, sent140_train_Y)
        self.sent140_dev_X, self.sent140_dev_Y = (
            sent140_dev_X_list, sent140_dev_Y)

        # emoji splits
        self.emoji_train_X, self.emoji_train_Y = (
            emoji_train_X_list, emoji_train_Y)
        self.emoji_dev_X, self.emoji_dev_Y = emoji_dev_X_list, emoji_dev_Y

        # emojiless splits
        self.emojiless_train_X, self.emojiless_train_Y = (
            emojiless_train_X_list, emojiless_train_Y)
        self.emojiless_dev_X, self.emojiless_dev_Y = (
            emojiless_dev_X_list, emojiless_dev_Y)

        # Test splits exist as attributes only in testing mode.
        if self.testing:
            self.emoji_test_X, self.emoji_test_Y = (
                emoji_test_X_list, emoji_test_Y)
            self.emojiless_test_X, self.emojiless_test_Y = (
                emojiless_test_X_list, emojiless_test_Y)

        # One embedding/vocab pair per training configuration.
        self.sent140_train_embedding = sent140_train_embedding
        self.sent140_train_glove_vocab = sent140_train_glove_vocab
        self.sent140_emoji_train_embedding = sent140_emoji_train_embedding
        self.sent140_emoji_train_glove_vocab = sent140_emoji_train_glove_vocab
        self.sent140_emojiless_train_embedding = sent140_emojiless_train_embedding
        self.sent140_emojiless_train_glove_vocab = sent140_emojiless_train_glove_vocab

    def _predict_emoji_splits(self, model):
        """
        Predict on the emoji train/dev splits, and on test in testing mode.

        Returns the 5-tuple shared by all run_* methods. Its first two slots
        are None placeholders, because sent140 predictions are not computed.
        """
        train_preds = model.predict(self.emoji_train_X)
        dev_preds = model.predict(self.emoji_dev_X)
        test_preds = model.predict(self.emoji_test_X) if self.testing else None
        return (None, None, train_preds, dev_preds, test_preds)

    def run_sent140(self):
        """
        Fit on sent140 only, then predict on the emoji splits.

        The classifier is stored on self.model_sent140.
        """
        classifier = TorchRNNClassifier(
            self.sent140_train_glove_vocab,
            embedding=self.sent140_train_embedding)
        self.model_sent140 = classifier

        classifier.fit(self.sent140_train_X, self.sent140_train_Y)

        return self._predict_emoji_splits(classifier)

    def run_sent140_emojiless(self):
        """
        Fit on sent140 plus emojiless training data, then predict on emoji.

        The two training sets are joined with ``+`` and fitted in one call.
        The classifier is stored on self.model_sent140_emojiless.
        """
        classifier = TorchRNNClassifier(
            self.sent140_emojiless_train_glove_vocab,
            embedding=self.sent140_emojiless_train_embedding)
        self.model_sent140_emojiless = classifier

        features = self.sent140_train_X + self.emojiless_train_X
        labels = self.sent140_train_Y + self.emojiless_train_Y
        classifier.fit(features, labels)

        return self._predict_emoji_splits(classifier)

    def run_sent140_emoji(self):
        """
        Fit on sent140 plus emoji training data, then predict on emoji.

        The two training sets are joined with ``+`` and fitted in one call.
        The classifier is stored on self.model_sent140_emoji.
        """
        classifier = TorchRNNClassifier(
            self.sent140_emoji_train_glove_vocab,
            embedding=self.sent140_emoji_train_embedding)
        self.model_sent140_emoji = classifier

        features = self.sent140_train_X + self.emoji_train_X
        labels = self.sent140_train_Y + self.emoji_train_Y
        classifier.fit(features, labels)

        return self._predict_emoji_splits(classifier)
예제 #26
0
                                                samp_percentage=1.0).read()]

# Map ANLI's single-letter labels onto the full label names.
anli_map = {'c': 'contradiction', 'e': 'entailment', 'n': 'neutral'}
# Dev examples as ((context, hypothesis), label) pairs, read with rounds=(1, ).
dev_data = [((ex.context, ex.hypothesis), anli_map[ex.label])
            for ex in nli.ANLIDevReader(ANLI_HOME, rounds=(1, )).read()]

# net = RandomClassifier()

# Build a DataFrame from GLOVE, add a '$UNK' column filled with 0, then
# transpose so that the former column labels become the row index.
# NOTE(review): assumes GLOVE maps word -> vector, so each row after the
# transpose is one word's vector; confirm where GLOVE is loaded.
X_glove = pd.DataFrame(GLOVE)
X_glove['$UNK'] = 0
X_glove = X_glove.T
# Row labels and row values are taken from the same frame, so vocab and
# embedding are in the same order.
vocab = list(X_glove.index)
embedding = X_glove.values

net = TorchRNNClassifier(vocab,
                         embedding=embedding,
                         hidden_dim=50,
                         max_iter=10)


def vec_func(w):
    """Split `w` on whitespace and return the resulting list of tokens."""
    tokens = w.split()
    return tokens


def vec_concatenate(u, v):
    """Hypothesis-only baseline: ignore `u` and return `v` unchanged."""
    _ignored, kept = u, v
    return kept


print("---------------------------------------------")

word_disjoint_experiment = nli.wordentail_experiment(