def main(loc, width=64, depth=2, batch_size=128, dropout=0.5,
         dropout_decay=1e-5, nb_epoch=20):
    print("Load spaCy")
    nlp = spacy.load('en', parser=False, entity=False, matcher=False,
                     tagger=False)
    print("Construct model")
    Model.ops = CupyOps()
    with Model.define_operators({'>>': chain, '**': clone, '|': concatenate}):
        mwe_encode = ExtractWindow(nW=1) >> Maxout(width, width * 3)
        sent2vec = (
            get_word_ids
            >> flatten_add_lengths
            >> with_getitem(0, SpacyVectors(nlp, width) >> mwe_encode ** depth)
            >> Pooling(mean_pool, max_pool))
        model = (
            ((Arg(0) >> sent2vec) | (Arg(1) >> sent2vec))
            >> Maxout(width, width * 4)
            >> Maxout(width, width) ** depth
            >> Softmax(2, width))
    print("Read and parse quora data")
    rows = read_quora_tsv_data(pathlib.Path(loc))
    train, dev = partition(rows, 0.9)
    train_X, train_y = create_data(model.ops, nlp, train)
    dev_X, dev_y = create_data(model.ops, nlp, dev)
    print("Train")
    with model.begin_training(train_X[:20000], train_y[:20000]) as (trainer, optimizer):
        trainer.batch_size = batch_size
        trainer.nb_epoch = nb_epoch
        trainer.dropout = dropout
        trainer.dropout_decay = dropout_decay

        epoch_times = [timer()]
        epoch_loss = [0.]
        n_train_words = sum(len(d0) + len(d1) for d0, d1 in train_X)
        n_dev_words = sum(len(d0) + len(d1) for d0, d1 in dev_X)

        def track_progress():
            stats = get_stats(model, optimizer.averages, dev_X, dev_y,
                              epoch_loss[-1], epoch_times[-1],
                              n_train_words, n_dev_words)
            stats.append(trainer.dropout)
            stats = tuple(stats)
            print(
                len(epoch_loss),
                "%.3f loss, %.3f (%.3f) acc, %d/%d=%d wps train, %d/%.3f=%d wps run. d.o.=%.3f" % stats)
            epoch_times.append(timer())
            epoch_loss.append(0.)

        trainer.each_epoch.append(track_progress)
        for X, y in trainer.iterate(train_X, train_y):
            yh, backprop = model.begin_update(X, drop=trainer.dropout)
            backprop(yh - y, optimizer)

def main(width=100, depth=4, vector_length=64, min_batch_size=1,
         max_batch_size=32, learn_rate=0.001, momentum=0.9, dropout=0.5,
         dropout_decay=1e-4, nb_epoch=20, L2=1e-6):
    cfg = dict(locals())
    print(cfg)
    if cupy is not None:
        print("Using GPU")
        Model.ops = CupyOps()
    train_data, check_data, nr_tag = ancora_pos_tags()

    extracter = FeatureExtracter('es', attrs=[LOWER, SHAPE, PREFIX, SUFFIX])
    Model.lsuv = True
    with Model.define_operators({'**': clone, '>>': chain, '+': add,
                                 '|': concatenate}):
        lower_case = HashEmbed(width, 100, column=0)
        shape = HashEmbed(width // 2, 200, column=1)
        prefix = HashEmbed(width // 2, 100, column=2)
        suffix = HashEmbed(width // 2, 100, column=3)

        model = with_flatten(
            (lower_case | shape | prefix | suffix)
            >> Maxout(width, pieces=3)
            >> Residual(ExtractWindow(nW=1) >> Maxout(width, pieces=3)) ** depth
            >> Softmax(nr_tag),
            pad=depth)

    train_X, train_y = preprocess(model.ops, extracter, train_data, nr_tag)
    dev_X, dev_y = preprocess(model.ops, extracter, check_data, nr_tag)

    n_train = float(sum(len(x) for x in train_X))
    global epoch_train_acc
    with model.begin_training(train_X[:5000], train_y[:5000], **cfg) as (trainer, optimizer):
        trainer.each_epoch.append(track_progress(**locals()))
        trainer.batch_size = min_batch_size
        batch_size = float(min_batch_size)
        for X, y in trainer.iterate(train_X, train_y):
            yh, backprop = model.begin_update(X, drop=trainer.dropout)
            gradient = [yh[i] - y[i] for i in range(len(yh))]
            backprop(gradient, optimizer)
            trainer.batch_size = min(int(batch_size), max_batch_size)
            batch_size *= 1.001
    with model.use_params(trainer.optimizer.averages):
        print(model.evaluate(dev_X, model.ops.flatten(dev_y)))
        with open('/tmp/model.pickle', 'wb') as file_:
            pickle.dump(model, file_)

def main(width=64, depth=2, vector_length=64, min_batch_size=1,
         max_batch_size=32, dropout=0.9, dropout_decay=1e-3, nb_epoch=20,
         L2=1e-6):
    cfg = dict(locals())
    print(cfg)
    if cupy is not None:
        print("Using GPU")
        Model.ops = CupyOps()
    train_data, check_data, nr_tag = ancora_pos_tags()

    extracter = FeatureExtracter('es', attrs=[LOWER, SHAPE, PREFIX, SUFFIX])
    with Model.define_operators({'**': clone, '>>': chain, '+': add,
                                 '|': concatenate}):
        lower_case = Embed(width, vector_length, 5000, column=0)
        prefix = Embed(width, vector_length, 5000, column=2)
        suffix = Embed(width, vector_length, 5000, column=3)

        model = (
            layerize(flatten_sequences)
            >> (lower_case + prefix + suffix)
            >> Residual(ExtractWindow(nW=1) >> Maxout(width)) ** depth
            >> Softmax(nr_tag))

    train_X, train_y = preprocess(model.ops, extracter, train_data, nr_tag)
    dev_X, dev_y = preprocess(model.ops, extracter, check_data, nr_tag)

    n_train = float(sum(len(x) for x in train_X))
    global epoch_train_acc
    with model.begin_training(train_X, train_y, **cfg) as (trainer, optimizer):
        trainer.each_epoch.append(track_progress(**locals()))
        trainer.batch_size = min_batch_size
        batch_size = float(min_batch_size)
        for X, y in trainer.iterate(train_X, train_y):
            y = model.ops.flatten(y)
            yh, backprop = model.begin_update(X, drop=trainer.dropout)

            loss = ((yh - y) ** 2).sum() / y.shape[0]
            if loss > 0.:
                optimizer.set_loss(loss)
            backprop(yh - y, optimizer)
            trainer.batch_size = min(int(batch_size), max_batch_size)
            batch_size *= 1.001
            epoch_train_acc += (yh.argmax(axis=1) == y.argmax(axis=1)).sum()
            if epoch_train_acc / n_train >= 0.999:
                break
    with model.use_params(trainer.optimizer.averages):
        print(model.evaluate(dev_X, model.ops.flatten(dev_y)))

def main(width=300, depth=4, vector_length=64, min_batch_size=1,
         max_batch_size=32, dropout=0.9, dropout_decay=1e-3, nb_epoch=20,
         L2=1e-6, device="cpu"):
    cfg = dict(locals())
    print(cfg, file=sys.stderr)
    if cupy is not None and device != 'cpu':
        print("Using GPU", file=sys.stderr)
        Model.ops = CupyOps()
        Model.ops.device = device
    train_data, check_data, tag_map = twitter_ner()
    dev_words, dev_tags = list(zip(*check_data))
    nr_tag = len(tag_map)

    extracter = FeatureExtracter('en', attrs=[ORTH, LOWER, SHAPE, PREFIX, SUFFIX])
    Model.lsuv = True
    with Model.define_operators({'**': clone, '>>': chain, '+': add,
                                 '|': concatenate}):
        glove = StaticVectors('en', width // 2, column=0)
        lower_case = (HashEmbed(width, 500, column=1)
                      + HashEmbed(width, 100, column=1))
        shape = HashEmbed(width // 2, 200, column=2)
        prefix = HashEmbed(width // 2, 100, column=3)
        suffix = HashEmbed(width // 2, 100, column=4)

        model = (
            layerize(flatten_sequences)
            >> (lower_case | shape | prefix | suffix)
            >> BN(Maxout(width, pieces=3), nO=width)
            >> Residual(ExtractWindow(nW=1) >> BN(Maxout(width, pieces=3), nO=width)) ** depth
            >> Softmax(nr_tag))

    train_X, train_y = preprocess(model.ops, extracter, train_data, nr_tag)
    dev_X, dev_y = preprocess(model.ops, extracter, check_data, nr_tag)

    n_train = float(sum(len(x) for x in train_X))
    global epoch_train_acc
    with model.begin_training(train_X, train_y, **cfg) as (trainer, optimizer):
        trainer.each_epoch.append(track_progress(**locals()))
        trainer.batch_size = min_batch_size
        batch_size = float(min_batch_size)
        for X, y in trainer.iterate(train_X, train_y):
            y = model.ops.flatten(y)
            yh, backprop = model.begin_update(X, drop=trainer.dropout)
            backprop(yh - y, optimizer)
            trainer.batch_size = min(int(batch_size), max_batch_size)
            batch_size *= 1.001
            epoch_train_acc += (yh.argmax(axis=1) == y.argmax(axis=1)).sum()
            #if epoch_train_acc / n_train >= 0.999:
            #    break
    with model.use_params(trainer.optimizer.averages):
        print(model.evaluate(dev_X, model.ops.flatten(dev_y)), file=sys.stderr)
        print_dev_sentences(model, dev_words, dev_tags, dev_X, tag_map)

def main(use_gpu=False, nb_epoch=100):
    if use_gpu:
        Model.ops = CupyOps()
        Model.Ops = CupyOps
    train, test = datasets.imdb(limit=2000)
    print("Load data")
    train_X, train_y = zip(*train)
    test_X, test_y = zip(*test)
    train_y = Model.ops.asarray(to_categorical(train_y, nb_classes=2))
    test_y = Model.ops.asarray(to_categorical(test_y, nb_classes=2))
    nlp = spacy.load('en_vectors_web_lg')
    nlp.add_pipe(nlp.create_pipe('sentencizer'), first=True)
    preprocessor = FeatureExtracter([ORTH, LOWER, PREFIX, SUFFIX, SHAPE, ID])
    train_X = [preprocessor(list(doc.sents))
               for doc in tqdm.tqdm(nlp.pipe(train_X))]
    test_X = [preprocessor(list(doc.sents))
              for doc in tqdm.tqdm(nlp.pipe(test_X))]
    dev_X = train_X[-1000:]
    dev_y = train_y[-1000:]
    train_X = train_X[:-1000]
    train_y = train_y[:-1000]
    print("Parse data")
    n_sent = sum([len(list(sents)) for sents in train_X])
    print("%d sentences" % n_sent)

    model = build_model(2, width=128, conv_depth=2, depth=2,
                        train_X=train_X, train_y=train_y)
    with model.begin_training(train_X[:100], train_y[:100]) as (trainer, optimizer):
        epoch_loss = [0.]

        def report_progress():
            with model.use_params(optimizer.averages):
                print(epoch_loss[-1], epoch_var[-1],
                      model.evaluate(dev_X, dev_y), trainer.dropout)
            epoch_loss.append(0.)
            epoch_var.append(0.)

        trainer.each_epoch.append(report_progress)
        batch_sizes = compounding(64, 64, 1.01)
        trainer.dropout = 0.3
        trainer.batch_size = int(next(batch_sizes))
        trainer.dropout_decay = 0.0
        trainer.nb_epoch = nb_epoch
        #optimizer.alpha = 0.1
        #optimizer.max_grad_norm = 10.0
        #optimizer.b1 = 0.0
        #optimizer.b2 = 0.0
        epoch_var = [0.]
        for X, y in trainer.iterate(train_X, train_y):
            yh, backprop = model.begin_update(X, drop=trainer.dropout)
            losses = ((yh - y) ** 2.).sum(axis=1) / y.shape[0]
            epoch_var[-1] += losses.var()
            loss = losses.mean()
            backprop((yh - y) / yh.shape[0], optimizer)
            epoch_loss[-1] += loss
            trainer.batch_size = int(next(batch_sizes))
    with model.use_params(optimizer.averages):
        print('Avg dev.: %.3f' % model.evaluate(dev_X, dev_y))

def use_gpu(gpu_id):
    try:
        import cupy.cuda.device
    except ImportError:
        return None
    from thinc.neural.ops import CupyOps
    device = cupy.cuda.device.Device(gpu_id)
    device.use()
    Model.ops = CupyOps()
    Model.Ops = CupyOps
    return device

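# Usage sketch (not from the original source): call use_gpu() before any
# layers are constructed, so their parameters are allocated with CupyOps.
# This assumes the thinc v6/v7 `Model` class used throughout these examples;
# the import path below is an assumption and may differ between versions.
from thinc.neural import Model

device = use_gpu(0)        # returns None when cupy is not installed
if device is None:
    print("cupy not available; staying on the NumPy backend")
ops = Model.ops            # CupyOps after a successful use_gpu(0)
X = ops.allocate((4, 8))   # zero-filled array on the selected device
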
def forward(X, drop=0.0):
    if isinstance(X, numpy.ndarray):
        ops = NumpyOps()
    else:
        ops = CupyOps()
    output = ops.xp.ascontiguousarray(X[:, idx], dtype=X.dtype)

    def backward(y, sgd=None):
        dX = ops.allocate(X.shape)
        dX[:, idx] += y
        return dX

    return output, backward

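# Sketch (assumption, not from the original source): the forward/backward
# pair above is the body of a "select column idx" layer.  A hypothetical
# get_col(idx) factory can close over idx and wrap the closure with thinc's
# layerize() helper (old v6/v7 API) so it composes with other layers.  The
# factory name and the layerize import path are assumptions.
import numpy
from thinc.api import layerize
from thinc.neural.ops import NumpyOps, CupyOps

def get_col(idx):
    def forward(X, drop=0.0):
        ops = NumpyOps() if isinstance(X, numpy.ndarray) else CupyOps()
        output = ops.xp.ascontiguousarray(X[:, idx], dtype=X.dtype)

        def backward(y, sgd=None):
            dX = ops.allocate(X.shape)
            dX[:, idx] += y
            return dX

        return output, backward
    return layerize(forward)

X = numpy.arange(12, dtype="f").reshape((3, 4))
col0 = get_col(0)
Y, finish_update = col0.begin_update(X)   # Y is the first column of X
dX = finish_update(numpy.ones_like(Y))    # gradient scattered back into X's shape
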
def main(use_gpu=False, nb_epoch=50):
    if use_gpu:
        Model.ops = CupyOps()
        Model.Ops = CupyOps
    train, test = datasets.imdb()
    print("Load data")
    train_X, train_y = zip(*train)
    test_X, test_y = zip(*test)
    train_y = to_categorical(train_y, nb_classes=2)
    test_y = to_categorical(test_y, nb_classes=2)
    nlp = Language()
    dev_X = train_X[-1000:]
    dev_y = train_y[-1000:]
    train_X = train_X[:-1000]
    train_y = train_y[:-1000]
    print("Parse data")
    train_X = [nlp.make_doc(x) for x in train_X]
    dev_X = [nlp.make_doc(x) for x in dev_X]
    model = build_model(2, 1)
    print("Begin training")
    with model.begin_training(train_X, train_y, L2=1e-6) as (trainer, optimizer):
        epoch_loss = [0.]

        def report_progress():
            with model.use_params(optimizer.averages):
                print(epoch_loss[-1], model.evaluate(dev_X, dev_y),
                      trainer.dropout)
            epoch_loss.append(0.)

        trainer.each_epoch.append(report_progress)
        trainer.nb_epoch = nb_epoch
        trainer.dropout = 0.0
        trainer.batch_size = 128
        trainer.dropout_decay = 0.0
        for X, y in trainer.iterate(train_X[:1000], train_y[:1000]):
            yh, backprop = model.begin_update(X, drop=trainer.dropout)
            loss = ((yh - y) ** 2.).sum() / y.shape[0]
            backprop((yh - y) / y.shape[0], optimizer)
            epoch_loss[-1] += loss
        with model.use_params(optimizer.averages):
            print('Avg dev.: %.3f' % model.evaluate(dev_X, dev_y))
            with open('out.pickle', 'wb') as file_:
                pickle.dump(model, file_, -1)

def main(length=1000, nO=32, nI=32):
    if CupyOps.xp is not None:
        print("Use GPU")
        Model.ops = CupyOps()
        Model.Ops = CupyOps
        torch.set_default_tensor_type("torch.cuda.FloatTensor")
    pt_model = nn.Linear(nI, nO)
    optimizer = torch.optim.Adam(pt_model.parameters())  # noqa: F841

    model = PyTorchWrapper(pt_model)

    X = Model.ops.xp.ones((length, nI), dtype="f")
    y = 1.0 / X
    for i in range(10):
        yh, get_dX = model.begin_update(X)
        dY = (yh - y) / len(y)
        dX = get_dX(dY)  # noqa: F841

def main(length=1000, nO=32, nI=32):
    '''Driver function'''
    if CupyOps.xp is not None:
        print("Use GPU")
        Model.ops = CupyOps()
        Model.Ops = CupyOps
        torch.set_default_tensor_type('torch.cuda.FloatTensor')
    else:
        print("GPU not available. Running on CPU.")
    pt_model = nn.Linear(nI, nO)
    optimizer = torch.optim.Adam(pt_model.parameters())  # noqa: F841
    model = PyTorchWrapper(pt_model)
    X = Model.ops.xp.ones((length, nI), dtype="f")
    y = 1.0 / X
    for i in range(10):
        yh, get_dX = model.begin_update(X)
        dY = (yh - y) / len(y)
        dX = get_dX(dY)  # noqa: F841

def main(depth=2, width=512, nb_epoch=30):
    if CupyOps.xp is not None:
        Model.ops = CupyOps()
        Model.Ops = CupyOps
    # Configuration here isn't especially good. But, for demo..
    with Model.define_operators({'**': clone, '>>': chain}):
        model = ReLu(width) >> ReLu(width) >> Softmax()

    train_data, dev_data, _ = datasets.mnist()
    train_X, train_y = model.ops.unzip(train_data)
    dev_X, dev_y = model.ops.unzip(dev_data)
    dev_y = to_categorical(dev_y)

    with model.begin_training(train_X, train_y, L2=1e-6) as (trainer, optimizer):
        epoch_loss = [0.]

        def report_progress():
            with model.use_params(optimizer.averages):
                print(epoch_loss[-1], model.evaluate(dev_X, dev_y),
                      trainer.dropout)
            epoch_loss.append(0.)

        trainer.each_epoch.append(report_progress)
        trainer.nb_epoch = nb_epoch
        trainer.dropout = 0.3
        trainer.batch_size = 128
        trainer.dropout_decay = 0.0
        train_X = model.ops.asarray(train_X, dtype='float32')
        y_onehot = to_categorical(train_y)
        for X, y in trainer.iterate(train_X, y_onehot):
            yh, backprop = model.begin_update(X, drop=trainer.dropout)
            loss = ((yh - y) ** 2.).sum() / y.shape[0]
            backprop(yh - y, optimizer)
            epoch_loss[-1] += loss
        with model.use_params(optimizer.averages):
            print('Avg dev.: %.3f' % model.evaluate(dev_X, dev_y))
            with open('out.pickle', 'wb') as file_:
                pickle.dump(model, file_, -1)

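# Reference sketch (assumption, not from the original source): the `>>`
# operator bound by Model.define_operators above is sugar for the chain()
# combinator.  Import paths differ across thinc 6.x/7.x; thinc.api and
# thinc.v2v are assumed here.
from thinc.api import chain
from thinc.v2v import ReLu, Softmax

width = 512
mlp = chain(ReLu(width), ReLu(width), Softmax())
# ...builds the same network as:
#     with Model.define_operators({'>>': chain}):
#         mlp = ReLu(width) >> ReLu(width) >> Softmax()
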
def main(width=128, depth=4, vector_length=64, max_batch_size=32,
         dropout=0.9, drop_decay=1e-4, nb_epoch=20, L2=1e-5):
    cfg = dict(locals())
    Model.ops = CupyOps()
    train_data, check_data, nr_tag = ancora_pos_tags()

    with Model.define_operators({'**': clone, '>>': chain}):
        model = (
            layerize(flatten_sequences)
            >> Embed(width, vector_length)
            >> (ExtractWindow(nW=1) >> Maxout(width, pieces=3)) ** depth
            >> Softmax(nr_tag))

    train_X, train_y = preprocess(model.ops, train_data, nr_tag)
    dev_X, dev_y = preprocess(model.ops, check_data, nr_tag)

    n_train = float(sum(len(x) for x in train_X))
    global epoch_train_acc
    with model.begin_training(train_X, train_y, **cfg) as (trainer, optimizer):
        trainer.each_epoch.append(track_progress(**locals()))
        trainer.batch_size = 1
        batch_size = 1.
        for X, y in trainer.iterate(train_X, train_y):
            y = model.ops.flatten(y)
            yh, backprop = model.begin_update(X, drop=trainer.dropout)
            backprop(yh - y, optimizer)
            trainer.batch_size = min(int(batch_size), max_batch_size)
            batch_size *= 1.001
            epoch_train_acc += (yh.argmax(axis=1) == y.argmax(axis=1)).sum()
    with model.use_params(trainer.optimizer.averages):
        print(model.evaluate(dev_X, model.ops.flatten(dev_y)))

def main(dataset='quora', width=50, depth=2, min_batch_size=1,
         max_batch_size=512, dropout=0.2, dropout_decay=0.0,
         pooling="mean+max", nb_epoch=5, pieces=3, L2=0.0, use_gpu=False,
         out_loc=None, quiet=False, job_id=None, ws_api_url=None,
         rest_api_url=None):
    global CTX
    if job_id is not None:
        CTX = neptune.Context()
        width = CTX.params.width
        L2 = CTX.params.L2
        nb_epoch = CTX.params.nb_epoch
        depth = CTX.params.depth
        max_batch_size = CTX.params.max_batch_size
    cfg = dict(locals())

    if out_loc:
        out_loc = Path(out_loc)
        if not out_loc.parent.exists():
            raise IOError("Can't open output location: %s" % out_loc)
    print(cfg)

    if pooling == 'mean+max':
        pool_layer = Pooling(mean_pool, max_pool)
    elif pooling == "mean":
        pool_layer = mean_pool
    elif pooling == "max":
        pool_layer = max_pool
    else:
        raise ValueError("Unrecognised pooling", pooling)

    print("Load spaCy")
    nlp = get_spacy('en')

    if use_gpu:
        Model.ops = CupyOps()

    print("Construct model")
    # Bind operators for the scope of the block:
    # * chain (>>): Compose models in a 'feed forward' style,
    #   i.e. chain(f, g)(x) -> g(f(x))
    # * clone (**): Create n copies of a model, and chain them, i.e.
    #   (f ** 3)(x) -> f''(f'(f(x))), where f, f' and f'' have distinct weights.
    # * concatenate (|): Merge the outputs of two models into a single vector,
    #   i.e. (f|g)(x) -> hstack(f(x), g(x))
    Model.lsuv = True
    #Model.ops = CupyOps()
    with Model.define_operators({'>>': chain, '**': clone, '|': concatenate,
                                 '+': add}):
        mwe_encode = ExtractWindow(nW=1) >> BN(
            Maxout(width, drop_factor=0.0, pieces=pieces))

        sent2vec = (  # List[spacy.token.Doc]{B}
            flatten_add_lengths  # : (ids{T}, lengths{B})
            >> with_getitem(
                0,
                #(StaticVectors('en', width)
                HashEmbed(width, 3000)
                #+ HashEmbed(width, 3000))
                #>> Residual(mwe_encode ** 2)
            )  # : word_ids{T}
            >> Pooling(mean_pool, max_pool)
            #>> Residual(BN(Maxout(width*2, pieces=pieces), nO=width*2)**2)
            >> Maxout(width * 2, pieces=pieces, drop_factor=0.0)
            >> logistic)
        model = Siamese(sent2vec, CauchySimilarity(width * 2))

    print("Read and parse data: %s" % dataset)
    if dataset == 'quora':
        train, dev = datasets.quora_questions()
    elif dataset == 'snli':
        train, dev = datasets.snli()
    elif dataset == 'stackxc':
        train, dev = datasets.stack_exchange()
    elif dataset in ('quora+snli', 'snli+quora'):
        train, dev = datasets.quora_questions()
        train2, dev2 = datasets.snli()
        train.extend(train2)
        dev.extend(dev2)
    else:
        raise ValueError("Unknown dataset: %s" % dataset)

    get_ids = get_word_ids(Model.ops)
    train_X, train_y = preprocess(model.ops, nlp, train, get_ids)
    dev_X, dev_y = preprocess(model.ops, nlp, dev, get_ids)

    with model.begin_training(train_X[:10000], train_y[:10000], **cfg) as (trainer, optimizer):
        # Pass a callback to print progress. Give it all the local scope,
        # because why not?
        trainer.each_epoch.append(track_progress(**locals()))
        trainer.batch_size = min_batch_size
        batch_size = float(min_batch_size)
        print("Accuracy before training", model.evaluate_logloss(dev_X, dev_y))
        print("Train")
        global epoch_train_acc
        n_iter = 0

        for X, y in trainer.iterate(train_X, train_y, progress_bar=not quiet):
            # Slightly useful trick: Decay the dropout as training proceeds.
            yh, backprop = model.begin_update(X, drop=trainer.dropout)
            assert yh.shape == y.shape, (yh.shape, y.shape)
            assert (yh >= 0.).all(), yh
            train_acc = ((yh >= 0.5) == (y >= 0.5)).sum()
            loss = model.ops.xp.abs(yh - y).mean()
            epoch_train_acc += train_acc
            backprop(yh - y, optimizer)
            n_iter += 1
            # Slightly useful trick: start with low batch size, accelerate.
            trainer.batch_size = min(int(batch_size), max_batch_size)
            batch_size *= 1.001
    if out_loc:
        out_loc = Path(out_loc)
        print('Saving to', out_loc)
        with out_loc.open('wb') as file_:
            pickle.dump(model, file_, -1)

def main(use_gpu=False):
    if use_gpu:
        Model.ops = CupyOps()
        Model.Ops = CupyOps
    train, test = datasets.imdb()
    print("Load data")
    train_X, train_y = zip(*train)
    test_X, test_y = zip(*test)
    train_y = to_categorical(train_y, nb_classes=2)
    test_y = to_categorical(test_y, nb_classes=2)
    nlp = spacy.load('en')
    nlp.vocab.lex_attr_getters[PREFIX] = lambda string: string[:3]
    for word in nlp.vocab:
        word.prefix_ = word.orth_[:3]
    dev_X = train_X[-1000:]
    dev_y = train_y[-1000:]
    train_X = train_X[:-1000]
    train_y = train_y[:-1000]
    #train_X = train_X[:1000]
    #train_y = train_y[:1000]
    print("Parse data")
    train_X = list(nlp.pipe(train_X))
    dev_X = list(nlp.pipe(dev_X))
    n_sent = sum([len(list(doc.sents)) for doc in train_X])
    print("%d sentences" % n_sent)

    hpsearch = BestFirstFinder(
        nonlin=[SELU],
        width=[64],
        depth=[2],
        conv_depth=[2],
        batch_size=[128],
        learn_rate=[0.001],
        L2=[1e-6],
        beta1=[0.9],
        beta2=[0.999],
        dropout=[0.2])

    for hp in hpsearch.configs:
        for _ in range(3):
            model = build_model(2, train_X=train_X, train_y=train_y, **hp)
            with model.begin_training(train_X[:100], train_y[:100]) as (_, sgd):
                pass
            _, (model_data, train_acc, dev_acc) = train_epoch(
                model, sgd, hp, train_X, train_y, dev_X, dev_y,
                device_id=-1 if not use_gpu else 0)
            print('0', dev_acc * 100, train_acc * 100, hp)
            hpsearch.enqueue(model_data, train_acc, dev_acc)
            hpsearch.temperature = 0.0

    print("Train")
    total = 0
    temperature = 0.0
    while True:
        for model, sgd, hp in hpsearch:
            _, (new_model, train_acc, dev_acc) = train_epoch(
                model, sgd, hp, train_X, train_y, dev_X, dev_y,
                device_id=-1 if not use_gpu else 0,
                temperature=hpsearch.temperature)
            hp = new_model[-1]
            print(
                '%d,%d,%d:\t%.2f\t%.2f\t%.2f\t%d\t%.2f\t%.3f\t%d\t%d\t%.3f\t%.3f\t%.3f' % (
                    total, hp['epochs'], hp['parent'],
                    hpsearch.best_acc * 100, dev_acc * 100, train_acc * 100,
                    int(hp['batch_size']), hp['dropout'], hp['learn_rate'],
                    hp['width'], hp['depth'], hpsearch.temperature,
                    hpsearch.queue[0][0], hpsearch.queue[-1][0]))
            total += 1
            hpsearch.enqueue(new_model, train_acc, dev_acc)

def main(loc=None, width=128, depth=2, max_batch_size=128, dropout=0.5,
         dropout_decay=1e-5, nb_epoch=30, use_gpu=False):
    cfg = dict(locals())
    print("Load spaCy")
    nlp = spacy.load('en', parser=False, entity=False, matcher=False,
                     tagger=False)
    if use_gpu:
        Model.ops = CupyOps()
    print("Construct model")
    # Bind operators for the scope of the block:
    # * chain (>>): Compose models in a 'feed forward' style,
    #   i.e. chain(f, g)(x) -> g(f(x))
    # * clone (**): Create n copies of a model, and chain them, i.e.
    #   (f ** 3)(x) -> f''(f'(f(x))), where f, f' and f'' have distinct weights.
    # * concatenate (|): Merge the outputs of two models into a single vector,
    #   i.e. (f|g)(x) -> hstack(f(x), g(x))
    with Model.define_operators({'>>': chain, '**': clone, '|': concatenate}):
        # Important trick: text isn't like images, and the best way to use
        # convolution is different. Don't use pooling-over-time. Instead,
        # use the window to compute one vector per word, and do this N deep.
        # In the first layer, we adjust each word vector based on the two
        # surrounding words --- this gives us essentially trigram vectors.
        # In the next layer, we have a trigram of trigrams --- so we're
        # conditioning on information from a five word slice. The third layer
        # gives us 7 words. This is like the BiLSTM insight: we're not trying
        # to learn a vector for the whole sentence in this step. We're just
        # trying to learn better, position-sensitive word features. This simple
        # convolution step is much more efficient than BiLSTM, and can be
        # computed in parallel for every token in the batch.
        mwe_encode = ExtractWindow(nW=1) >> Maxout(width, width * 3)

        # Comments indicate the output type and shape at each step of the pipeline.
        # * B: Number of sentences in the batch
        # * T: Total number of words in the batch
        #      (i.e. sum(len(sent) for sent in batch))
        # * W: Width of the network (input hyper-parameter)
        # * ids: ID for each word (integers).
        # * lengths: Number of words in each sentence in the batch (integers)
        # * floats: Standard dense vector.
        # (Dimensions annotated in curly braces.)
        sent2vec = (  # List[spacy.token.Doc]{B}
            #get_word_ids  # : List[ids]{B}
            flatten_add_lengths  # : (ids{T}, lengths{B})
            >> with_getitem(
                0,  # : word_ids{T}
                # This class integrates a linear projection layer, and loads
                # static embeddings (by default, GloVe common crawl).
                SpacyVectors(nlp, width)  # : floats{T, W}
                >> mwe_encode ** depth  # : floats{T, W}
            )  # : (floats{T, W}, lengths{B})
            # Useful trick: Why choose between max pool and mean pool?
            # We may as well have both representations.
            >> Pooling(mean_pool, max_pool)  # : floats{B, 2*W}
        )
        model = (
            ((Arg(0) >> sent2vec) | (Arg(1) >> sent2vec))  # : floats{B, 4*W}
            >> Maxout(width, width * 4)  # : floats{B, W}
            >> Maxout(width, width) ** depth  # : floats{B, W}
            >> Softmax(3, width)  # : floats{B, 3}
        )

    print("Read and parse SNLI data")
    train, dev = datasets.snli(loc)
    train_X, train_y = preprocess(model.ops, nlp, train)
    dev_X, dev_y = preprocess(model.ops, nlp, dev)
    assert len(dev_y.shape) == 2
    print("Initialize with data (LSUV)")
    with model.begin_training(train_X[:10000], train_y[:10000], **cfg) as (trainer, optimizer):
        # Pass a callback to print progress. Give it all the local scope,
        # because why not?
        trainer.each_epoch.append(track_progress(**locals()))
        trainer.batch_size = 1
        batch_size = 1.
        print("Accuracy before training", model.evaluate(dev_X, dev_y))
        print("Train")
        global epoch_train_acc
        for X, y in trainer.iterate(train_X, train_y):
            # Slightly useful trick: Decay the dropout as training proceeds.
            yh, backprop = model.begin_update(X, drop=trainer.dropout)
            # No auto-diff: Just get a callback and pass the data through.
            # Hardly a hardship, and it means we don't have to create/maintain
            # a computational graph. We just use closures.
            backprop(yh - y, optimizer)
            epoch_train_acc += (yh.argmax(axis=1) == y.argmax(axis=1)).sum()
            # Slightly useful trick: start with low batch size, accelerate.
            trainer.batch_size = min(int(batch_size), max_batch_size)
            batch_size *= 1.001