def main(width=64, depth=2, vector_length=64, min_batch_size=1,
         max_batch_size=32, dropout=0.9, dropout_decay=1e-3, nb_epoch=20,
         L2=1e-6):
    cfg = dict(locals())
    print(cfg)
    if cupy is not None:
        print("Using GPU")
        Model.ops = CupyOps()
    train_data, check_data, nr_tag = ancora_pos_tags()
    extracter = FeatureExtracter('es', attrs=[LOWER, SHAPE, PREFIX, SUFFIX])

    with Model.define_operators({'**': clone, '>>': chain, '+': add,
                                 '|': concatenate}):
        lower_case = Embed(width, vector_length, 5000, column=0)
        prefix = Embed(width, vector_length, 5000, column=2)
        suffix = Embed(width, vector_length, 5000, column=3)
        model = (
            layerize(flatten_sequences)
            >> (lower_case + prefix + suffix)
            >> Residual(ExtractWindow(nW=1) >> Maxout(width)) ** depth
            >> Softmax(nr_tag)
        )

    train_X, train_y = preprocess(model.ops, extracter, train_data, nr_tag)
    dev_X, dev_y = preprocess(model.ops, extracter, check_data, nr_tag)
    n_train = float(sum(len(x) for x in train_X))
    global epoch_train_acc
    with model.begin_training(train_X, train_y, **cfg) as (trainer, optimizer):
        trainer.each_epoch.append(track_progress(**locals()))
        trainer.batch_size = min_batch_size
        batch_size = float(min_batch_size)
        for X, y in trainer.iterate(train_X, train_y):
            y = model.ops.flatten(y)
            yh, backprop = model.begin_update(X, drop=trainer.dropout)
            loss = ((yh - y) ** 2).sum() / y.shape[0]
            if loss > 0.:
                optimizer.set_loss(loss)
            backprop(yh - y, optimizer)
            # Grow the batch size gradually, up to max_batch_size.
            trainer.batch_size = min(int(batch_size), max_batch_size)
            batch_size *= 1.001
            epoch_train_acc += (yh.argmax(axis=1) == y.argmax(axis=1)).sum()
            if epoch_train_acc / n_train >= 0.999:
                break
    # Evaluate with the averaged parameters.
    with model.use_params(trainer.optimizer.averages):
        print(model.evaluate(dev_X, model.ops.flatten(dev_y)))
def create_attn_proxy(attn):
    """Return a proxy to the attention layer which will fetch the attention
    weights on each call, appending them to the list 'output'.
    """
    output = []

    def get_weights(Xs_lengths, drop=0.):
        Xs, lengths = Xs_lengths
        output.append(attn._get_attention(attn.Q, Xs, lengths)[0])
        return attn.begin_update(Xs_lengths, drop=drop)

    return output, layerize(get_weights)
def doc2feats(cols=None):
    if cols is None:
        cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]

    def forward(docs, drop=0.0):
        feats = []
        for doc in docs:
            feats.append(doc.to_array(cols))
        return feats, None

    model = layerize(forward)
    model.cols = cols
    return model
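
# A minimal usage sketch for doc2feats() above, not part of the original
# module: build Docs with a blank English pipeline and read off their
# attribute arrays. The imports below are the ones the helper relies on
# (normally they would sit at the top of the module); the variable names are
# illustrative only.
import spacy
from spacy.attrs import ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH
from thinc.api import layerize

nlp_example = spacy.blank("en")
example_docs = [nlp_example("the cat sat"), nlp_example("on the mat")]
feature_layer = doc2feats()
example_feats, _ = feature_layer.begin_update(example_docs)
# One array per Doc, with shape (n_tokens, len(feature_layer.cols)).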
def FeatureExtracter(lang, attrs=[LOWER, SHAPE, PREFIX, SUFFIX], tokenized=True):
    nlp = spacy.blank(lang)
    nlp.vocab.lex_attr_getters[PREFIX] = lambda string: string[:3]
    nlp.vocab.lex_attr_getters[SUFFIX] = lambda string: string[-3:]

    def forward(texts, drop=0.):
        if tokenized:
            docs = [Doc(nlp.vocab, words) for words in texts]
        else:
            docs = [nlp(text) for text in texts]
        features = [doc.to_array(attrs) for doc in docs]

        def backward(d_features, sgd=None):
            return d_features

        return features, backward

    return layerize(forward)
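
# A minimal usage sketch for FeatureExtracter() above, not part of the
# original module: with tokenized=True the layer takes lists of word strings
# and returns one feature array per text. The imports below are the ones the
# helper relies on (normally they would sit at the top of the module); the
# variable names are illustrative only.
import spacy
from spacy.tokens import Doc
from spacy.attrs import LOWER, SHAPE, PREFIX, SUFFIX
from thinc.api import layerize

extracter_example = FeatureExtracter("en")
texts_example = [["I", "like", "sparse", "features"], ["New", "York"]]
features_example, _ = extracter_example.begin_update(texts_example)
# Each array has shape (n_tokens, 4): one column per attribute in attrs.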
def get_col(idx):
    if idx < 0:
        raise IndexError(Errors.E066.format(value=idx))

    def forward(X, drop=0.0):
        if isinstance(X, numpy.ndarray):
            ops = NumpyOps()
        else:
            ops = CupyOps()
        output = ops.xp.ascontiguousarray(X[:, idx], dtype=X.dtype)

        def backward(y, sgd=None):
            dX = ops.allocate(X.shape)
            dX[:, idx] += y
            return dX

        return output, backward

    return layerize(forward)
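
# A minimal usage sketch for get_col() above, not part of the original module:
# pick one column out of a 2D feature matrix and push a gradient back through
# it. It assumes the NumPy path; the imports below are the ones the helper
# relies on, and the variable names are illustrative only.
import numpy
from thinc.api import layerize
from thinc.neural.ops import NumpyOps, CupyOps

X_example = numpy.arange(12, dtype="f").reshape(4, 3)
column_layer = get_col(1)
col_values, bp_col = column_layer.begin_update(X_example)
d_X = bp_col(numpy.ones(4, dtype="f"))
# col_values is X_example[:, 1]; d_X is zero everywhere except ones in column 1.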
def Siamese(layer, similarity):
    def begin_update(inputs, drop=0.):
        ops = layer.ops
        if drop != 0.:
            dropped = []
            for in1, in2 in inputs:
                if in1.size > in2.size:
                    mask = _get_mask(ops, in1.shape, drop)
                else:
                    mask = _get_mask(ops, in2.shape, drop)
                in1 = in1 * mask[:in1.shape[0]]
                in2 = in2 * mask[:in2.shape[0]]
                dropped.append((in1, in2))
            inputs = dropped
        input1, input2 = list(zip(*inputs))
        vec1, bp_vec1 = layer.begin_update(input1, drop=0.)
        vec2, bp_vec2 = layer.begin_update(input2, drop=0.)
        output, bp_output = similarity.begin_update((vec1, vec2), drop=0.)

        def finish_update(d_output, sgd=None):
            d_vec1, d_vec2 = bp_output(d_output, sgd)
            # Remember that this is the same layer --
            # Is this bad? Are we making bp_vec2 stale?
            d_input1 = bp_vec1(d_vec1, lambda *args, **kwargs: None)
            d_input2 = bp_vec2(d_vec2, sgd)
            return (d_input1, d_input2)

        return output, finish_update

    model = layerize(begin_update)
    model._layers.append(layer)
    model._layers.append(similarity)

    def on_data(self, X, y):
        input1, input2 = list(zip(*X))
        for hook in layer.on_data_hooks:
            hook(layer, input1, y)

    model.on_data_hooks.append(on_data)
    return model
def get_col(idx):
    assert idx >= 0, idx

    def forward(X, drop=0.):
        assert idx >= 0, idx
        if isinstance(X, numpy.ndarray):
            ops = NumpyOps()
        else:
            ops = CupyOps()
        output = ops.xp.ascontiguousarray(X[:, idx], dtype=X.dtype)

        def backward(y, sgd=None):
            assert idx >= 0, idx
            dX = ops.allocate(X.shape)
            dX[:, idx] += y
            return dX

        return output, backward

    return layerize(forward)
def Residual(layer):
    def forward(X, drop=0.0):
        y, bp_y = layer.begin_update(X, drop=drop)
        output = X + y

        def backward(d_output, sgd=None):
            return d_output + bp_y(d_output, sgd)

        return output, backward

    model = layerize(forward)
    model._layers.append(layer)

    def on_data(self, X, y=None):
        for layer in self._layers:
            for hook in layer.on_data_hooks:
                hook(layer, X, y)

    model.on_data_hooks.append(on_data)
    return model
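
# A minimal usage sketch for the Residual() wrapper above, not part of the
# original module: wrap a layerized function whose output matches its input
# shape, so the wrapper computes X + layer(X). Assumes thinc is installed;
# the names here are illustrative only.
import numpy
from thinc.api import layerize

def _double(X, drop=0.0):
    return X * 2, lambda dY, sgd=None: dY * 2

residual_example = Residual(layerize(_double))
X_example = numpy.ones((2, 3), dtype="f")
Y_example, bp_residual = residual_example.begin_update(X_example)
# Y_example == 3 * X_example; the backward pass adds the skip-connection
# gradient to the wrapped layer's gradient.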
def Siamese(layer, similarity):
    def begin_update(inputs, drop=0.):
        ops = layer.ops
        if drop not in (None, 0.):
            dropped = []
            for in1, in2 in inputs:
                if in1.size > in2.size:
                    mask = _get_mask(ops, in1.shape, drop)
                else:
                    mask = _get_mask(ops, in2.shape, drop)
                in1 = in1 * mask[:in1.shape[0]]
                in2 = in2 * mask[:in2.shape[0]]
                dropped.append((in1, in2))
            inputs = dropped
        input1, input2 = zip(*inputs)
        vec1, bp_vec1 = layer.begin_update(input1, drop=0.)
        vec2, bp_vec2 = layer.begin_update(input2, drop=0.)
        output, bp_output = similarity.begin_update((vec1, vec2), drop=0.)

        def finish_update(d_output, sgd=None):
            d_vec1, d_vec2 = bp_output(d_output, sgd)
            # Remember that this is the same layer --
            # Is this bad? Are we making bp_vec2 stale?
            d_input1 = bp_vec1(d_vec1, lambda *args, **kwargs: None)
            d_input2 = bp_vec2(d_vec2, sgd)
            return (d_input1, d_input2)

        return output, finish_update

    model = layerize(begin_update)
    model._layers.append(layer)
    model._layers.append(similarity)

    def on_data(self, X, y):
        input1, input2 = zip(*X)
        for hook in layer.on_data_hooks:
            hook(layer, input1, y)

    model.on_data_hooks.append(on_data)
    return model
def getitem(i):
    def getitem_fwd(X, drop=0.0):
        return X[i], None

    return layerize(getitem_fwd)
def print_shape(prefix):
    # NOTE: despite the name, this is currently a pass-through layer; it
    # returns its input unchanged and does not print anything.
    def forward(X, drop=0.0):
        return X, lambda dX, **kwargs: dX

    return layerize(forward)
def asarray(ops, dtype):
    def forward(X, drop=0.0):
        return ops.asarray(X, dtype=dtype), None

    return layerize(forward)
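
# A minimal usage sketch for the small helpers above (getitem() and asarray()),
# not part of the original module: asarray() converts nested lists to an array
# of the given dtype, and getitem(0) picks the first element of a tuple-valued
# input. Assumes thinc is installed; the names here are illustrative only.
from thinc.api import layerize
from thinc.neural.ops import NumpyOps

to_float32 = asarray(NumpyOps(), "float32")
array_example, _ = to_float32.begin_update([[1, 2], [3, 4]])
first_of_pair = getitem(0)
picked_example, _ = first_of_pair.begin_update((array_example, "ignored"))
# array_example is a float32 ndarray; picked_example is that same array.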
def PyTorchBiLSTM(nO, nI, depth, dropout=0.2):
    # With depth == 0 this degrades to a no-op layer. Otherwise the output
    # width nO is split across the two directions, so nO should be even.
    if depth == 0:
        return layerize(noop())
    model = torch.nn.LSTM(nI, nO // 2, depth, bidirectional=True, dropout=dropout)
    return with_square_sequences(PyTorchWrapperRNN(model))
def main(
    width=300,
    depth=4,
    vector_length=64,
    min_batch_size=1,
    max_batch_size=32,
    dropout=0.9,
    dropout_decay=1e-3,
    nb_epoch=20,
    L2=1e-6,
    device="cpu",
):
    cfg = dict(locals())
    print(cfg, file=sys.stderr)
    if cupy is not None and device != "cpu":
        print("Using GPU", file=sys.stderr)
        Model.ops = CupyOps()
        Model.ops.device = device
    train_data, check_data, tag_map = twitter_ner()
    dev_words, dev_tags = zip(*check_data)
    nr_tag = len(tag_map)
    extracter = FeatureExtracter("en", attrs=[ORTH, LOWER, SHAPE, PREFIX, SUFFIX])
    Model.lsuv = True

    with Model.define_operators({"**": clone, ">>": chain, "+": add, "|": concatenate}):
        glove = StaticVectors("en", width // 2, column=0)  # NOTE: not wired into the model below
        lower_case = HashEmbed(width, 500, column=1) + HashEmbed(width, 100, column=1)
        shape = HashEmbed(width // 2, 200, column=2)
        prefix = HashEmbed(width // 2, 100, column=3)
        suffix = HashEmbed(width // 2, 100, column=4)

        model = (
            layerize(flatten_sequences)
            >> (lower_case | shape | prefix | suffix)
            >> BN(Maxout(width, pieces=3), nO=width)
            >> Residual(ExtractWindow(nW=1) >> BN(Maxout(width, pieces=3), nO=width)) ** depth
            >> Softmax(nr_tag)
        )

    train_X, train_y = preprocess(model.ops, extracter, train_data, nr_tag)
    dev_X, dev_y = preprocess(model.ops, extracter, check_data, nr_tag)
    n_train = float(sum(len(x) for x in train_X))
    global epoch_train_acc
    with model.begin_training(train_X, train_y, **cfg) as (trainer, optimizer):
        trainer.each_epoch.append(track_progress(**locals()))
        trainer.batch_size = min_batch_size
        batch_size = float(min_batch_size)
        for X, y in trainer.iterate(train_X, train_y):
            y = model.ops.flatten(y)
            yh, backprop = model.begin_update(X, drop=trainer.dropout)
            backprop(yh - y, optimizer)
            # Grow the batch size gradually, up to max_batch_size.
            trainer.batch_size = min(int(batch_size), max_batch_size)
            batch_size *= 1.001
            epoch_train_acc += (yh.argmax(axis=1) == y.argmax(axis=1)).sum()
            # if epoch_train_acc / n_train >= 0.999:
            #     break
    # Evaluate with the averaged parameters.
    with model.use_params(trainer.optimizer.averages):
        print(model.evaluate(dev_X, model.ops.flatten(dev_y)), file=sys.stderr)
        print_dev_sentences(model, dev_words, dev_tags, dev_X, tag_map)