Exemplo n.º 1
0
def Tok2Vec(width, embed_size, **kwargs):
    pretrained_vectors = kwargs.get("pretrained_vectors", None)
    cnn_maxout_pieces = kwargs.get("cnn_maxout_pieces", 3)
    subword_features = kwargs.get("subword_features", True)
    conv_depth = kwargs.get("conv_depth", 4)
    bilstm_depth = kwargs.get("bilstm_depth", 0)
    cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
    with Model.define_operators(
        {">>": chain, "|": concatenate, "**": clone, "+": add, "*": reapply}
    ):
        norm = HashEmbed(width, embed_size, column=cols.index(NORM), name="embed_norm")
        if subword_features:
            prefix = HashEmbed(
                width, embed_size // 2, column=cols.index(PREFIX), name="embed_prefix"
            )
            suffix = HashEmbed(
                width, embed_size // 2, column=cols.index(SUFFIX), name="embed_suffix"
            )
            shape = HashEmbed(
                width, embed_size // 2, column=cols.index(SHAPE), name="embed_shape"
            )
        else:
            prefix, suffix, shape = (None, None, None)
        if pretrained_vectors is not None:
            glove = StaticVectors(pretrained_vectors, width, column=cols.index(ID))

            if subword_features:
                embed = uniqued(
                    (glove | norm | prefix | suffix | shape)
                    >> LN(Maxout(width, width * 5, pieces=3)),
                    column=cols.index(ORTH),
                )
            else:
                embed = uniqued(
                    (glove | norm) >> LN(Maxout(width, width * 2, pieces=3)),
                    column=cols.index(ORTH),
                )
        elif subword_features:
            embed = uniqued(
                (norm | prefix | suffix | shape)
                >> LN(Maxout(width, width * 4, pieces=3)),
                column=cols.index(ORTH),
            )
        else:
            embed = norm

        convolution = Residual(
            ExtractWindow(nW=1)
            >> LN(Maxout(width, width * 3, pieces=cnn_maxout_pieces))
        )
        tok2vec = FeatureExtracter(cols) >> with_flatten(
            embed >> convolution ** conv_depth, pad=conv_depth
        )
        if bilstm_depth >= 1:
            tok2vec = tok2vec >> PyTorchBiLSTM(width, width, bilstm_depth)
        # Work around thinc API limitations :(. TODO: Revise in Thinc 7
        tok2vec.nO = width
        tok2vec.embed = embed
    return tok2vec
Exemplo n.º 2
0
def Tok2Vec(width, embed_size, **kwargs):
    pretrained_vectors = kwargs.get("pretrained_vectors", None)
    cnn_maxout_pieces = kwargs.get("cnn_maxout_pieces", 3)
    subword_features = kwargs.get("subword_features", True)
    conv_depth = kwargs.get("conv_depth", 4)
    bilstm_depth = kwargs.get("bilstm_depth", 0)
    cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
    with Model.define_operators(
        {">>": chain, "|": concatenate, "**": clone, "+": add, "*": reapply}
    ):
        norm = HashEmbed(width, embed_size, column=cols.index(NORM), name="embed_norm")
        if subword_features:
            prefix = HashEmbed(
                width, embed_size // 2, column=cols.index(PREFIX), name="embed_prefix"
            )
            suffix = HashEmbed(
                width, embed_size // 2, column=cols.index(SUFFIX), name="embed_suffix"
            )
            shape = HashEmbed(
                width, embed_size // 2, column=cols.index(SHAPE), name="embed_shape"
            )
        else:
            prefix, suffix, shape = (None, None, None)
        if pretrained_vectors is not None:
            glove = StaticVectors(pretrained_vectors, width, column=cols.index(ID))

            if subword_features:
                embed = uniqued(
                    (glove | norm | prefix | suffix | shape)
                    >> LN(Maxout(width, width * 5, pieces=3)),
                    column=cols.index(ORTH),
                )
            else:
                embed = uniqued(
                    (glove | norm) >> LN(Maxout(width, width * 2, pieces=3)),
                    column=cols.index(ORTH),
                )
        elif subword_features:
            embed = uniqued(
                (norm | prefix | suffix | shape)
                >> LN(Maxout(width, width * 4, pieces=3)),
                column=cols.index(ORTH),
            )
        else:
            embed = norm

        convolution = Residual(
            ExtractWindow(nW=1)
            >> LN(Maxout(width, width * 3, pieces=cnn_maxout_pieces))
        )
        tok2vec = FeatureExtracter(cols) >> with_flatten(
            embed >> convolution ** conv_depth, pad=conv_depth
        )
        if bilstm_depth >= 1:
            tok2vec = tok2vec >> PyTorchBiLSTM(width, width, bilstm_depth)
        # Work around thinc API limitations :(. TODO: Revise in Thinc 7
        tok2vec.nO = width
        tok2vec.embed = embed
    return tok2vec
Exemplo n.º 3
0
def my_tok_to_vec(width, embed_size, pretrained_vectors, **kwargs):
    # Circular imports :(
    from spacy._ml import PyTorchBiLSTM

    cnn_maxout_pieces = kwargs.get("cnn_maxout_pieces", 3)
    conv_depth = kwargs.get("conv_depth", 4)
    bilstm_depth = kwargs.get("bilstm_depth", 0)
    cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
    storage = []
    with Model.define_operators({">>": chain, "|": concatenate, "**": clone}):
        # norm = HashEmbed(width, embed_size, column=cols.index(NORM), name="embed_norm")
        # prefix = HashEmbed(
        #     width, embed_size // 2, column=cols.index(PREFIX), name="embed_prefix"
        # )
        # suffix = HashEmbed(
        #     width, embed_size // 2, column=cols.index(SUFFIX), name="embed_suffix"
        # )
        shape = HashEmbed(
            width, embed_size // 2, column=cols.index(SHAPE), name="embed_shape"
        )
        glove = Vectors(storage, pretrained_vectors, width, column=cols.index(NORM), )
        vec_width = glove.nV

        embed = uniqued(
            (glove | shape)
            >> LN(Maxout(width, width + vec_width, pieces=3)),
            column=cols.index(ORTH),
        )

        convolution = Residual(
            ExtractWindow(nW=1)
            >> LN(Maxout(width, width * 3, pieces=cnn_maxout_pieces))
        )

        tok2vec = SaveDoc(storage) >> FeatureExtracter(cols) >> with_flatten(
            embed >> convolution ** conv_depth, pad=conv_depth
        )

        if bilstm_depth >= 1:
            tok2vec = tok2vec >> PyTorchBiLSTM(width, width, bilstm_depth)
        # Work around thinc API limitations :(. TODO: Revise in Thinc 7

        tok2vec.nO = width
        tok2vec.embed = embed
    return tok2vec
Exemplo n.º 4
0
def Doc2Feats(config):
    columns = config["columns"]
    return FeatureExtracter(columns)
Exemplo n.º 5
0
def build_text_classifier(nr_class, width=64, **cfg):
    depth = cfg.get("depth", 2)
    nr_vector = cfg.get("nr_vector", 5000)
    pretrained_dims = cfg.get("pretrained_dims", 0)
    with Model.define_operators({
            ">>": chain,
            "+": add,
            "|": concatenate,
            "**": clone
    }):
        if cfg.get("low_data") and pretrained_dims:
            model = (SpacyVectors >> flatten_add_lengths >> with_getitem(
                0, Affine(width, pretrained_dims)) >>
                     ParametricAttention(width) >> Pooling(sum_pool) >>
                     Residual(ReLu(width, width))**2 >> zero_init(
                         Affine(nr_class, width, drop_factor=0.0)) >> logistic)
            return model

        lower = HashEmbed(width, nr_vector, column=1)
        prefix = HashEmbed(width // 2, nr_vector, column=2)
        suffix = HashEmbed(width // 2, nr_vector, column=3)
        shape = HashEmbed(width // 2, nr_vector, column=4)

        trained_vectors = FeatureExtracter(
            [ORTH, LOWER, PREFIX, SUFFIX, SHAPE, ID]) >> with_flatten(
                uniqued(
                    (lower | prefix | suffix | shape) >> LN(
                        Maxout(width, width + (width // 2) * 3)),
                    column=0,
                ))

        if pretrained_dims:
            static_vectors = SpacyVectors >> with_flatten(
                Affine(width, pretrained_dims))
            # TODO Make concatenate support lists
            vectors = concatenate_lists(trained_vectors, static_vectors)
            vectors_width = width * 2
        else:
            vectors = trained_vectors
            vectors_width = width
            static_vectors = None
        tok2vec = vectors >> with_flatten(
            LN(Maxout(width, vectors_width)) >> Residual(
                (ExtractWindow(nW=1) >> LN(Maxout(width, width * 3))))**depth,
            pad=depth,
        )
        cnn_model = (
            tok2vec >> flatten_add_lengths >> ParametricAttention(width) >>
            Pooling(sum_pool) >> Residual(zero_init(Maxout(width, width))) >>
            zero_init(Affine(nr_class, width, drop_factor=0.0)))

        linear_model = build_bow_text_classifier(nr_class,
                                                 ngram_size=cfg.get(
                                                     "ngram_size", 1),
                                                 exclusive_classes=False)
        if cfg.get("exclusive_classes"):
            output_layer = Softmax(nr_class, nr_class * 2)
        else:
            output_layer = (zero_init(
                Affine(nr_class, nr_class * 2, drop_factor=0.0)) >> logistic)
        model = (linear_model | cnn_model) >> output_layer
        model.tok2vec = chain(tok2vec, flatten)
    model.nO = nr_class
    model.lsuv = False
    return model
Exemplo n.º 6
0
def main(nH=6,
         dropout=0.1,
         nS=6,
         nB=64,
         nE=20,
         use_gpu=-1,
         lim=1000000,
         nM=300,
         mL=100,
         save=False,
         nTGT=5000,
         save_name="model.pkl"):
    if use_gpu != -1:
        spacy.require_gpu()
        device = 'cuda'
    else:
        device = 'cpu'
    ''' Read dataset '''
    nlp = spacy.load('en_core_web_sm')
    print('English model loaded')
    for control_token in ("<eos>", "<bos>", "<pad>", "<cls>", "<mask>"):
        nlp.tokenizer.add_special_case(control_token, [{ORTH: control_token}])

    train, dev, test = get_iwslt()
    print('Dataset loaded')

    train, _ = zip(*train)
    dev, _ = zip(*dev)
    test, _ = zip(*test)

    train = train[:lim]
    dev = dev[:lim]
    test = test[:lim]
    ''' Tokenize '''
    train = spacy_tokenize(nlp.tokenizer, train, mL=mL)
    dev = spacy_tokenize(nlp.tokenizer, dev, mL=mL)
    test = spacy_tokenize(nlp.tokenizer, test, mL=mL)
    print('Tokenization finished')
    ''' Set rank based on all the docs '''
    all_docs = train + dev + test
    set_rank(nlp.vocab, all_docs, nTGT=nTGT)

    train = set_numeric_ids(nlp.vocab, train)
    dev = set_numeric_ids(nlp.vocab, dev)
    test = set_numeric_ids(nlp.vocab, test)
    print('Numeric ids set')

    word2indx, indx2word = get_dicts(nlp.vocab)
    print('Vocab dictionaries grabbed')

    with Model.define_operators({">>": chain}):
        embed_cols = [ORTH, SHAPE, PREFIX, SUFFIX]
        extractor = FeatureExtracter(attrs=embed_cols)
        position_encode = PositionEncode(mL, nM)
        model = (FeatureExtracter(attrs=embed_cols) >> with_flatten(
            FancyEmbed(nM, nTGT, cols=embed_cols)) >> Residual(position_encode)
                 >> create_model_input() >> Encoder(
                     nM=nM, nS=nS, nH=nH, device=device) >> with_reshape(
                         Softmax(nO=nTGT, nI=nM)))
        ''' Progress tracking '''
        losses = [0.]
        train_accuracies = [0.]
        train_totals = [0.]
        dev_accuracies = [0.]
        dev_loss = [0.]

        def track_progress():
            correct = 0.
            total = 0.
            ''' Get dev stats '''
            for X0 in minibatch(dev, size=nB):
                X1, loss_mask = random_mask(X0, nlp, indx2word, nlp.vocab, mL)
                Xh = model(X1)
                L, C, t = get_loss(Xh, X0, X1, loss_mask)
                correct += C
                total += t
                dev_loss[-1] += (L**2).sum()
            dev_accuracies[-1] = correct / total
            print(len(losses), losses[-1],
                  train_accuracies[-1] / train_totals[-1], dev_loss[-1],
                  dev_accuracies[-1])
            dev_loss.append(0.)
            losses.append(0.)
            train_accuracies.append(0.)
            dev_accuracies.append(0.)
            train_totals.append(0.)
            if save:
                model.to_disk('.models/' + save_name)

        ''' Model training '''
        with model.begin_training(batch_size=nB,
                                  nb_epoch=nE) as (trainer, optimizer):
            trainer.dropout = dropout
            trainer.dropout_decay = 1e-4
            optimizer.alpha = 0.001
            optimizer.L2 = 1e-6
            optimizer.max_grad_norm = 1.0
            trainer.each_epoch.append(track_progress)
            optimizer.alpha = 0.001
            optimizer.L2 = 1e-6
            optimizer.max_grad_norm = 1.0
            for X0, _ in trainer.iterate(train, train):
                X1, loss_mask = random_mask(X0, nlp, indx2word, nlp.vocab, mL)
                Xh, backprop = model.begin_update(X1, drop=dropout)
                dXh, C, total = get_loss(Xh, X0, X1, loss_mask)
                backprop(dXh, sgd=optimizer)
                losses[-1] += (dXh**2).sum()
                train_accuracies[-1] += C
                train_totals[-1] += total
Exemplo n.º 7
0
def main(nH=6, dropout=0.1, nS=6, nB=15, nE=20, use_gpu=-1, lim=2000):
    if use_gpu != -1:
        # TODO: Make specific to different devices, e.g. 1 vs 0
        spacy.require_gpu()
    train, dev, test = get_iwslt()
    train_X, train_Y = zip(*train)
    dev_X, dev_Y = zip(*dev)
    test_X, test_Y = zip(*test)
    ''' Read dataset '''
    nlp_en = spacy.load('en_core_web_sm')
    nlp_de = spacy.load('de_core_news_sm')
    print('Models loaded')
    for control_token in ("<eos>", "<bos>", "<pad>"):
        nlp_en.tokenizer.add_special_case(control_token, [{
            ORTH: control_token
        }])
        nlp_de.tokenizer.add_special_case(control_token, [{
            ORTH: control_token
        }])
    train_X, train_Y = spacy_tokenize(nlp_en.tokenizer, nlp_de.tokenizer,
                                      train_X[-lim:], train_Y[-lim:],
                                      MAX_LENGTH)
    dev_X, dev_Y = spacy_tokenize(nlp_en.tokenizer, nlp_de.tokenizer,
                                  dev_X[-lim:], dev_Y[-lim:], MAX_LENGTH)
    test_X, test_Y = spacy_tokenize(nlp_en.tokenizer, nlp_de.tokenizer,
                                    test_X[-lim:], test_Y[-lim:], MAX_LENGTH)
    train_X = set_numeric_ids(nlp_en.vocab, train_X, vocab_size=VOCAB_SIZE)
    train_Y = set_numeric_ids(nlp_de.vocab, train_Y, vocab_size=VOCAB_SIZE)
    nTGT = VOCAB_SIZE

    with Model.define_operators({">>": chain}):
        embed_cols = [ORTH, SHAPE, PREFIX, SUFFIX]
        extractor = FeatureExtracter(attrs=embed_cols)
        position_encode = PositionEncode(MAX_LENGTH, MODEL_SIZE)
        model = (apply_layers(extractor, extractor) >> apply_layers(
            with_flatten(FancyEmbed(MODEL_SIZE, 5000, cols=embed_cols)),
            with_flatten(FancyEmbed(MODEL_SIZE, 5000, cols=embed_cols)),
        ) >> apply_layers(Residual(position_encode), Residual(position_encode))
                 >> create_batch() >> EncoderDecoder(nS=nS, nH=nH, nTGT=nTGT))

    losses = [0.]
    train_accuracies = [0.]
    train_totals = [0.]
    dev_accuracies = [0.]
    dev_loss = [0.]

    def track_progress():
        correct = 0.
        total = 0.
        for batch in minibatch(zip(dev_X, dev_Y), size=1024):
            X, Y = zip(*batch)
            Yh, Y_mask = model((X, Y))
            L, C = get_loss(model.ops, Yh, Y, Y_mask)
            correct += C
            dev_loss[-1] += (L**2).sum()
            total += len(Y)
        dev_accuracies[-1] = correct / total
        n_train = train_totals[-1]
        print(len(losses), losses[-1], train_accuracies[-1] / n_train,
              dev_loss[-1], dev_accuracies[-1])
        dev_loss.append(0.)
        losses.append(0.)
        train_accuracies.append(0.)
        dev_accuracies.append(0.)
        train_totals.append(0.)

    with model.begin_training(batch_size=nB,
                              nb_epoch=nE) as (trainer, optimizer):
        trainer.dropout = dropout
        trainer.dropout_decay = 1e-4
        trainer.each_epoch.append(track_progress)
        optimizer.alpha = 0.001
        optimizer.L2 = 1e-6
        optimizer.max_grad_norm = 1.0
        for X, Y in trainer.iterate(train_X, train_Y):
            (Yh, X_mask), backprop = model.begin_update((X, Y), drop=dropout)
            dYh, C = get_loss(model.ops, Yh, Y, X_mask)
            backprop(dYh, sgd=optimizer)
            losses[-1] += (dYh**2).sum()
            train_accuracies[-1] += C
            train_totals[-1] += sum(len(y) for y in Y)
Exemplo n.º 8
0
def main(use_gpu=False, nb_epoch=100):
    if use_gpu:
        Model.ops = CupyOps()
        Model.Ops = CupyOps
    train, test = datasets.imdb(limit=2000)
    print("Load data")
    train_X, train_y = zip(*train)
    test_X, test_y = zip(*test)
    train_y = Model.ops.asarray(to_categorical(train_y, nb_classes=2))
    test_y = Model.ops.asarray(to_categorical(test_y, nb_classes=2))

    nlp = spacy.load("en_vectors_web_lg")
    nlp.add_pipe(nlp.create_pipe("sentencizer"), first=True)

    preprocessor = FeatureExtracter([ORTH, LOWER, PREFIX, SUFFIX, SHAPE, ID])
    train_X = [
        preprocessor(list(doc.sents)) for doc in tqdm.tqdm(nlp.pipe(train_X))
    ]
    test_X = [
        preprocessor(list(doc.sents)) for doc in tqdm.tqdm(nlp.pipe(test_X))
    ]

    dev_X = train_X[-1000:]
    dev_y = train_y[-1000:]
    train_X = train_X[:-1000]
    train_y = train_y[:-1000]
    print("Parse data")
    n_sent = sum([len(list(sents)) for sents in train_X])
    print("%d sentences" % n_sent)

    model = build_model(2,
                        width=128,
                        conv_depth=2,
                        depth=2,
                        train_X=train_X,
                        train_y=train_y)
    with model.begin_training(train_X[:100],
                              train_y[:100]) as (trainer, optimizer):
        epoch_loss = [0.0]

        def report_progress():
            with model.use_params(optimizer.averages):
                print(
                    epoch_loss[-1],
                    epoch_var[-1],
                    model.evaluate(dev_X, dev_y),
                    trainer.dropout,
                )
            epoch_loss.append(0.0)
            epoch_var.append(0.0)

        trainer.each_epoch.append(report_progress)
        batch_sizes = compounding(64, 64, 1.01)
        trainer.dropout = 0.3
        trainer.batch_size = int(next(batch_sizes))
        trainer.dropout_decay = 0.0
        trainer.nb_epoch = nb_epoch
        # optimizer.alpha = 0.1
        # optimizer.max_grad_norm = 10.0
        # optimizer.b1 = 0.0
        # optimizer.b2 = 0.0
        epoch_var = [0.0]
        for X, y in trainer.iterate(train_X, train_y):
            yh, backprop = model.begin_update(X, drop=trainer.dropout)
            losses = ((yh - y)**2.0).sum(axis=1) / y.shape[0]
            epoch_var[-1] += losses.var()
            loss = losses.mean()
            backprop((yh - y) / yh.shape[0], optimizer)
            epoch_loss[-1] += loss
            trainer.batch_size = int(next(batch_sizes))
        with model.use_params(optimizer.averages):
            print("Avg dev.: %.3f" % model.evaluate(dev_X, dev_y))
Exemplo n.º 9
0
def main(nH=6,
         dropout=0.0,
         nS=6,
         nB=32,
         nE=20,
         use_gpu=-1,
         lim=2000,
         nM=300,
         mL=100,
         save=False,
         save_name="model.pkl"):
    if use_gpu != -1:
        # TODO: Make specific to different devices, e.g. 1 vs 0
        spacy.require_gpu()
        device = 'cuda'
    else:
        device = 'cpu'
    ''' Read dataset '''
    nlp = spacy.load('en_core_web_sm')
    for control_token in ("<eos>", "<bos>", "<pad>", "<cls>"):
        nlp.tokenizer.add_special_case(control_token, [{ORTH: control_token}])
    train, dev = imdb(limit=lim)
    print('Loaded imdb dataset')
    train = train[:lim]
    dev = dev[:lim]
    train_X, train_Y = zip(*train)
    dev_X, dev_Y = zip(*dev)
    train_X = spacy_tokenize(nlp.tokenizer, train_X, mL=mL)
    dev_X = spacy_tokenize(nlp.tokenizer, dev_X, mL=mL)
    print('Tokenized dataset')
    train_X = set_numeric_ids(nlp.vocab, train_X)
    dev_X = set_numeric_ids(nlp.vocab, dev_X)
    print('Numeric ids ready')
    with Model.define_operators({">>": chain}):
        embed_cols = [ORTH, SHAPE, PREFIX, SUFFIX]
        extractor = FeatureExtracter(attrs=embed_cols)
        position_encode = PositionEncode(mL, nM)
        model = (FeatureExtracter(attrs=embed_cols) >> with_flatten(
            FancyEmbed(nM, 5000, cols=embed_cols)) >> Residual(position_encode)
                 >> create_model_input() >> Categorizer(
                     nM=nM, nS=nS, nH=nH, device=device))

    losses = [0.]
    train_accuracies = [0.]
    train_totals = [0.]
    dev_accuracies = [0.]
    dev_loss = [0.]

    def track_progress():
        correct = 0.
        total = 0.
        for batch in minibatch(zip(dev_X, dev_Y), size=1024):
            X, Y = zip(*batch)
            Yh = model(X)
            L, C = get_loss(Yh, Y)
            correct += C
            dev_loss[-1] += (L**2).sum()
            total += len(X)
        dev_accuracies[-1] = correct / total
        n_train = train_totals[-1]
        print(len(losses), losses[-1], train_accuracies[-1] / n_train,
              dev_loss[-1], dev_accuracies[-1])
        dev_loss.append(0.)
        losses.append(0.)
        train_accuracies.append(0.)
        dev_accuracies.append(0.)
        train_totals.append(0.)

    with model.begin_training(batch_size=nB,
                              nb_epoch=nE) as (trainer, optimizer):
        trainer.dropout = dropout
        trainer.dropout_decay = 1e-4
        optimizer.alpha = 0.001
        optimizer.L2 = 1e-6
        optimizer.max_grad_norm = 1.0
        trainer.each_epoch.append(track_progress)
        optimizer.alpha = 0.001
        optimizer.L2 = 1e-6
        optimizer.max_grad_norm = 1.0
        other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "textcat"]
        for X, Y in trainer.iterate(train_X, train_Y):
            Yh, backprop = model.begin_update(X)
            dYh, C = get_loss(Yh, Y)
            backprop(dYh, sgd=optimizer)
            losses[-1] += (dYh**2).sum()
            train_accuracies[-1] += C
            train_totals[-1] += len(Y)
    if save:
        model.to_disk(save_name)
Exemplo n.º 10
0
def main(nH=6,
         dropout=0.0,
         nS=6,
         nB=32,
         nE=20,
         use_gpu=-1,
         lim=2000,
         nM=300,
         mL=20,
         nTGT=3500,
         save=False,
         load=False,
         save_name="model.pkl",
         load_name="model.pkl"):
    if use_gpu != -1:
        # TODO: Make specific to different devices, e.g. 1 vs 0
        spacy.require_gpu()
        device = 'cuda'
    else:
        device = 'cpu'
    train, dev, test = get_iwslt()
    train_X, train_Y = zip(*train)
    dev_X, dev_Y = zip(*dev)
    test_X, test_Y = zip(*test)
    ''' Read dataset '''
    nlp_en = spacy.load('en_core_web_sm')
    nlp_de = spacy.load('de_core_news_sm')
    print('Models loaded')
    for control_token in ("<eos>", "<bos>", "<pad>"):
        nlp_en.tokenizer.add_special_case(control_token, [{
            ORTH: control_token
        }])
        nlp_de.tokenizer.add_special_case(control_token, [{
            ORTH: control_token
        }])
    train_lim = min(lim, len(train_X))
    dev_lim = min(lim, len(dev_X))
    test_lim = min(lim, len(test_X))
    train_X, train_Y = spacy_tokenize(nlp_en.tokenizer, nlp_de.tokenizer,
                                      train_X[:train_lim], train_Y[:train_lim],
                                      mL)
    dev_X, dev_Y = spacy_tokenize(nlp_en.tokenizer, nlp_de.tokenizer,
                                  dev_X[:dev_lim], dev_Y[:dev_lim], mL)
    test_X, test_Y = spacy_tokenize(nlp_en.tokenizer, nlp_de.tokenizer,
                                    test_X[:test_lim], test_Y[:test_lim], mL)
    all_X_docs = train_X + dev_X + test_X
    all_y_docs = train_Y + dev_Y + test_Y
    set_rank(nlp_en.vocab, all_X_docs, nTGT=nTGT)
    set_rank(nlp_de.vocab, all_y_docs, nTGT=nTGT)
    train_X = set_numeric_ids(nlp_en.vocab, train_X)
    dev_X = set_numeric_ids(nlp_en.vocab, dev_X)
    test_X = set_numeric_ids(nlp_en.vocab, test_X)
    train_Y = set_numeric_ids(nlp_de.vocab, train_Y)
    dev_Y = set_numeric_ids(nlp_de.vocab, dev_Y)
    test_Y = set_numeric_ids(nlp_de.vocab, test_Y)

    en_word2indx, en_indx2word = get_dicts(nlp_en.vocab)
    de_word2indx, de_indx2word = get_dicts(nlp_de.vocab)
    nTGT += 1

    if not load:
        with Model.define_operators({">>": chain}):
            embed_cols = [ORTH, SHAPE, PREFIX, SUFFIX]
            extractor = FeatureExtracter(attrs=embed_cols)
            position_encode = PositionEncode(mL, nM)
            model = (apply_layers(extractor, extractor) >> apply_layers(
                with_flatten(FancyEmbed(nM, 5000, cols=embed_cols)),
                with_flatten(FancyEmbed(nM, 5000, cols=embed_cols)),
            ) >> apply_layers(Residual(position_encode),
                              Residual(position_encode)) >> create_batch() >>
                     EncoderDecoder(
                         nS=nS, nH=nH, nTGT=nTGT, nM=nM, device=device))
    else:
        model = Model.from_disk(load_name)

    losses = [0.]
    train_accuracies = [0.]
    train_totals = [0.]
    dev_accuracies = [0.]
    dev_loss = [0.]

    def track_progress():
        correct = 0.
        total = 0.
        for batch in minibatch(zip(dev_X, dev_Y), size=1024):
            X, Y = zip(*batch)
            Yh, Y_mask = model((X, Y))
            L, C, total = get_loss(model.ops, Yh, Y, Y_mask)
            correct += C
            dev_loss[-1] += (L**2).sum()
        dev_accuracies[-1] = correct / total
        n_train = train_totals[-1]
        print(len(losses), losses[-1], train_accuracies[-1] / n_train,
              dev_loss[-1], dev_accuracies[-1])
        dev_loss.append(0.)
        losses.append(0.)
        train_accuracies.append(0.)
        dev_accuracies.append(0.)
        train_totals.append(0.)

    with model.begin_training(batch_size=nB,
                              nb_epoch=nE) as (trainer, optimizer):
        trainer.dropout = dropout
        trainer.dropout_decay = 1e-4
        optimizer.alpha = 0.001
        optimizer.L2 = 1e-6
        optimizer.max_grad_norm = 1.0
        trainer.each_epoch.append(track_progress)
        optimizer.alpha = 0.001
        optimizer.L2 = 1e-6
        optimizer.max_grad_norm = 1.0
        for X, Y in trainer.iterate(train_X, train_Y):
            (Yh, X_mask), backprop = model.begin_update((X, Y))
            dYh, C, total = get_loss(model.ops, Yh, Y, X_mask)
            backprop(dYh, sgd=optimizer)
            losses[-1] += (dYh**2).sum()
            train_accuracies[-1] += C
            train_totals[-1] += total
    if save:
        model.to_disk(save_name)
Exemplo n.º 11
0
def Tok2Vec(width, embed_size, **kwargs):
    # Circular imports :(
    from .._ml import CharacterEmbed
    from .._ml import PyTorchBiLSTM

    pretrained_vectors = kwargs.get("pretrained_vectors", None)
    cnn_maxout_pieces = kwargs.get("cnn_maxout_pieces", 3)
    subword_features = kwargs.get("subword_features", True)
    char_embed = kwargs.get("char_embed", False)
    if char_embed:
        subword_features = False
    conv_depth = kwargs.get("conv_depth", 4)
    bilstm_depth = kwargs.get("bilstm_depth", 0)
    cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
    with Model.define_operators({">>": chain, "|": concatenate, "**": clone}):
        norm = HashEmbed(width,
                         embed_size,
                         column=cols.index(NORM),
                         name="embed_norm",
                         seed=6)
        if subword_features:
            prefix = HashEmbed(width,
                               embed_size // 2,
                               column=cols.index(PREFIX),
                               name="embed_prefix",
                               seed=7)
            suffix = HashEmbed(width,
                               embed_size // 2,
                               column=cols.index(SUFFIX),
                               name="embed_suffix",
                               seed=8)
            shape = HashEmbed(width,
                              embed_size // 2,
                              column=cols.index(SHAPE),
                              name="embed_shape",
                              seed=9)
        else:
            prefix, suffix, shape = (None, None, None)
        if pretrained_vectors is not None:
            glove = StaticVectors(pretrained_vectors,
                                  width,
                                  column=cols.index(ID))

            if subword_features:
                embed = uniqued(
                    (glove | norm | prefix | suffix | shape) >> LN(
                        Maxout(width, width * 5, pieces=3)),
                    column=cols.index(ORTH),
                )
            elif char_embed:
                embed = concatenate_lists(
                    CharacterEmbed(nM=64, nC=8),
                    FeatureExtracter(cols) >> with_flatten(glove),
                )
                reduce_dimensions = LN(
                    Maxout(width, 64 * 8 + width, pieces=cnn_maxout_pieces))
            else:
                embed = uniqued(
                    (glove | norm) >> LN(Maxout(width, width * 2, pieces=3)),
                    column=cols.index(ORTH),
                )
        elif subword_features:
            embed = uniqued(
                (norm | prefix | suffix | shape) >> LN(
                    Maxout(width, width * 4, pieces=3)),
                column=cols.index(ORTH),
            )
        elif char_embed:
            embed = concatenate_lists(
                CharacterEmbed(nM=64, nC=8),
                FeatureExtracter(cols) >> with_flatten(norm),
            )
            reduce_dimensions = LN(
                Maxout(width, 64 * 8 + width, pieces=cnn_maxout_pieces))
        else:
            embed = norm

        convolution = Residual(
            ExtractWindow(
                nW=1) >> LN(Maxout(width, width *
                                   3, pieces=cnn_maxout_pieces)))
        if char_embed:
            tok2vec = embed >> with_flatten(
                reduce_dimensions >> convolution**conv_depth, pad=conv_depth)
        else:
            tok2vec = FeatureExtracter(cols) >> with_flatten(
                embed >> convolution**conv_depth, pad=conv_depth)

        if bilstm_depth >= 1:
            tok2vec = tok2vec >> PyTorchBiLSTM(width, width, bilstm_depth)
        # Work around thinc API limitations :(. TODO: Revise in Thinc 7
        tok2vec.nO = width
        tok2vec.embed = embed
    return tok2vec