def create_pretraining_model(nlp, tok2vec, objective="basic"): """Define a network for the pretraining.""" output_size = nlp.vocab.vectors.data.shape[1] # This is annoying, but the parser etc have the flatten step after # the tok2vec. To load the weights in cleanly, we need to match # the shape of the models' components exactly. So what we cann # "tok2vec" has to be the same set of processes as what the components do. with Model.define_operators({">>": chain, "|": concatenate}): l2r_model = ( tok2vec.l2r >> flatten >> LN(Maxout(output_size, tok2vec.l2r.nO, pieces=3)) >> zero_init(Affine(output_size, drop_factor=0.0)) ) r2l_model = ( tok2vec.r2l >> flatten >> LN(Maxout(output_size, tok2vec.r2l.nO, pieces=3)) >> zero_init(Affine(output_size, drop_factor=0.0)) ) model = tok2vec.embed >> (l2r_model | r2l_model) model.tok2vec = tok2vec model.begin_training([nlp.make_doc("Give it a doc to infer shapes")]) tok2vec.begin_training([nlp.make_doc("Give it a doc to infer shapes")]) tokvecs = tok2vec([nlp.make_doc('hello there'), nlp.make_doc(u'and hello')]) print(tokvecs.shape) return model
def build_text_classifier(nr_class, width=64, **cfg):
    nr_vector = cfg.get('nr_vector', 5000)
    pretrained_dims = cfg.get('pretrained_dims', 0)
    with Model.define_operators({'>>': chain, '+': add, '|': concatenate, '**': clone}):
        if cfg.get('low_data') and pretrained_dims:
            model = (
                SpacyVectors
                >> flatten_add_lengths
                >> with_getitem(0, Affine(width, pretrained_dims))
                >> ParametricAttention(width)
                >> Pooling(sum_pool)
                >> Residual(ReLu(width, width)) ** 2
                >> zero_init(Affine(nr_class, width, drop_factor=0.0))
                >> logistic
            )
            return model

        lower = HashEmbed(width, nr_vector, column=1)
        prefix = HashEmbed(width // 2, nr_vector, column=2)
        suffix = HashEmbed(width // 2, nr_vector, column=3)
        shape = HashEmbed(width // 2, nr_vector, column=4)

        trained_vectors = (
            FeatureExtracter([ORTH, LOWER, PREFIX, SUFFIX, SHAPE, ID])
            >> with_flatten(
                uniqued(
                    (lower | prefix | suffix | shape)
                    >> LN(Maxout(width, width + (width // 2) * 3)),
                    column=0,
                )
            )
        )

        if pretrained_dims:
            static_vectors = SpacyVectors >> with_flatten(
                Affine(width, pretrained_dims)
            )
            # TODO Make concatenate support lists
            vectors = concatenate_lists(trained_vectors, static_vectors)
            vectors_width = width * 2
        else:
            vectors = trained_vectors
            vectors_width = width
            static_vectors = None

        cnn_model = (
            vectors
            >> with_flatten(
                LN(Maxout(width, vectors_width))
                >> Residual(ExtractWindow(nW=1) >> LN(Maxout(width, width * 3))) ** 2,
                pad=2,
            )
            >> flatten_add_lengths
            >> ParametricAttention(width)
            >> Pooling(sum_pool)
            >> Residual(zero_init(Maxout(width, width)))
            >> zero_init(Affine(nr_class, width, drop_factor=0.0))
        )

        linear_model = _preprocess_doc >> LinearModel(nr_class, drop_factor=0.0)

        model = (
            (linear_model | cnn_model)
            >> zero_init(Affine(nr_class, nr_class * 2, drop_factor=0.0))
            >> logistic
        )

    model.nO = nr_class
    model.lsuv = False
    return model

def fine_tune_class_vector(nr_class, *, exclusive_classes=True, **cfg):
    """Select features from the class-vectors from the last hidden state,
    softmax them, and then mean-pool them to produce one feature per vector.
    The gradients of the class vectors are incremented in the backward pass,
    to allow fine-tuning.
    """
    return chain(
        get_pytt_class_tokens,
        flatten_add_lengths,
        with_getitem(
            0,
            chain(Affine(cfg["token_vector_width"], cfg["token_vector_width"]), tanh),
        ),
        Pooling(mean_pool),
        Affine(2, cfg["token_vector_width"], drop_factor=0),
        softmax,
    )

def create_pretraining_model(nlp, tok2vec, objective="cosine", nr_char=10): """Define a network for the pretraining. We simply add an output layer onto the tok2vec input model. The tok2vec input model needs to be a model that takes a batch of Doc objects (as a list), and returns a list of arrays. Each array in the output needs to have one row per token in the doc. """ if objective == "characters": out_sizes = [256] * nr_char output_layer = chain(LN(Maxout(300, pieces=3)), MultiSoftmax(out_sizes, 300)) else: output_size = nlp.vocab.vectors.data.shape[1] output_layer = chain(LN(Maxout(300, pieces=3)), Affine(output_size, drop_factor=0.0)) # This is annoying, but the parser etc have the flatten step after # the tok2vec. To load the weights in cleanly, we need to match # the shape of the models' components exactly. So what we cann # "tok2vec" has to be the same set of processes as what the components do. tok2vec = chain(tok2vec, flatten) model = chain(tok2vec, output_layer) model = masked_language_model(nlp.vocab, model) model.tok2vec = tok2vec model.output_layer = output_layer model.begin_training([nlp.make_doc("Give it a doc to infer shapes")]) return model
def build_nel_encoder(embed_width, hidden_width, ner_types, **cfg):
    if "entity_width" not in cfg:
        raise ValueError(Errors.E144.format(param="entity_width"))

    conv_depth = cfg.get("conv_depth", 2)
    cnn_maxout_pieces = cfg.get("cnn_maxout_pieces", 3)
    pretrained_vectors = cfg.get("pretrained_vectors", None)
    context_width = cfg.get("entity_width")

    with Model.define_operators({">>": chain, "**": clone}):
        # context encoder
        tok2vec = Tok2Vec(
            width=hidden_width,
            embed_size=embed_width,
            pretrained_vectors=pretrained_vectors,
            cnn_maxout_pieces=cnn_maxout_pieces,
            subword_features=True,
            conv_depth=conv_depth,
            bilstm_depth=0,
        )

        model = (
            tok2vec
            >> flatten_add_lengths
            >> Pooling(mean_pool)
            >> Residual(zero_init(Maxout(hidden_width, hidden_width)))
            >> zero_init(Affine(context_width, hidden_width, drop_factor=0.0))
        )

    model.tok2vec = tok2vec
    model.nO = context_width
    return model

def test_unwrapped(nN=2, nI=3, nO=4):
    if PyTorchWrapper is None:
        return
    model = Affine(nO, nI)
    X = numpy.zeros((nN, nI), dtype="f")
    X += numpy.random.uniform(size=X.size).reshape(X.shape)
    sgd = SGD(model.ops, 0.001)
    Y = numpy.zeros((nN, nO), dtype="f")
    check_learns_zero_output(model, sgd, X, Y)

def sigmoid_last_hidden(nr_class, *, exclusive_classes=False, **cfg):
    width = cfg["token_vector_width"]
    return chain(
        get_last_hidden,
        flatten_add_lengths,
        Pooling(mean_pool),
        zero_init(Affine(nr_class, width, drop_factor=0.0)),
        logistic,
    )

def softmax_tanh_class_vector(nr_class, *, exclusive_classes=True, **cfg):
    """Select features from the class-vectors from the last hidden state,
    mean-pool them, and softmax to produce one vector per document.
    The gradients of the class vectors are incremented in the backward pass,
    to allow fine-tuning.
    """
    width = cfg["token_vector_width"]
    return chain(
        get_pytt_class_tokens,
        flatten_add_lengths,
        with_getitem(0, chain(Affine(width, width), tanh)),
        Pooling(mean_pool),
        Softmax(2, width),
    )

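def _example_class_vector_head():
    # Illustrative only, not part of the original source: builds the head above with
    # an assumed transformer hidden size of 768 and two output classes.
    return softmax_tanh_class_vector(2, exclusive_classes=True, token_vector_width=768)
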
def Model(cls, nr_class, **cfg):
    depth = util.env_opt('parser_hidden_depth', cfg.get('hidden_depth', 1))
    subword_features = util.env_opt('subword_features', cfg.get('subword_features', True))
    conv_depth = util.env_opt('conv_depth', cfg.get('conv_depth', 4))
    conv_window = util.env_opt('conv_window', cfg.get('conv_window', 1))
    t2v_pieces = util.env_opt('cnn_maxout_pieces', cfg.get('cnn_maxout_pieces', 3))
    bilstm_depth = util.env_opt('bilstm_depth', cfg.get('bilstm_depth', 0))
    self_attn_depth = util.env_opt('self_attn_depth', cfg.get('self_attn_depth', 0))
    assert depth == 1
    parser_maxout_pieces = util.env_opt('parser_maxout_pieces', cfg.get('maxout_pieces', 2))
    token_vector_width = util.env_opt('token_vector_width', cfg.get('token_vector_width', 96))
    hidden_width = util.env_opt('hidden_width', cfg.get('hidden_width', 64))
    embed_size = util.env_opt('embed_size', cfg.get('embed_size', 2000))
    tok2vec = get_t2v(
        token_vector_width,
        embed_size,
        conv_depth=conv_depth,
        conv_window=conv_window,
        cnn_maxout_pieces=t2v_pieces,
        subword_features=subword_features,
        bilstm_depth=bilstm_depth,
    )
    tok2vec = chain(tok2vec, flatten)
    tok2vec.nO = token_vector_width
    lower = PrecomputableAffine(
        hidden_width,
        nF=cls.nr_feature,
        nI=token_vector_width,
        nP=parser_maxout_pieces,
    )
    lower.nP = parser_maxout_pieces
    with Model.use_device('cpu'):
        upper = Affine(nr_class, hidden_width, drop_factor=0.0)
    upper.W *= 0
    cfg = {
        'nr_class': nr_class,
        'hidden_depth': depth,
        'token_vector_width': token_vector_width,
        'hidden_width': hidden_width,
        'maxout_pieces': parser_maxout_pieces,
        'pretrained_vectors': None,
        'bilstm_depth': bilstm_depth,
        'self_attn_depth': self_attn_depth,
        'conv_depth': conv_depth,
        'conv_window': conv_window,
        'embed_size': embed_size,
        'cnn_maxout_pieces': t2v_pieces,
    }
    return ParserModel(tok2vec, lower, upper), cfg

def build_nel_encoder(embed_width, hidden_width, ner_types, **cfg):
    # TODO proper error
    if "entity_width" not in cfg:
        raise ValueError("entity_width not found")
    if "context_width" not in cfg:
        raise ValueError("context_width not found")

    conv_depth = cfg.get("conv_depth", 2)
    cnn_maxout_pieces = cfg.get("cnn_maxout_pieces", 3)
    pretrained_vectors = cfg.get("pretrained_vectors")  # self.nlp.vocab.vectors.name
    context_width = cfg.get("context_width")
    entity_width = cfg.get("entity_width")

    with Model.define_operators({">>": chain, "**": clone}):
        model = (
            Affine(entity_width, entity_width + context_width + 1 + ner_types)
            >> Affine(1, entity_width, drop_factor=0.0)
            >> logistic
        )

        # context encoder
        tok2vec = (
            Tok2Vec(
                width=hidden_width,
                embed_size=embed_width,
                pretrained_vectors=pretrained_vectors,
                cnn_maxout_pieces=cnn_maxout_pieces,
                subword_features=True,
                conv_depth=conv_depth,
                bilstm_depth=0,
            )
            >> flatten_add_lengths
            >> Pooling(mean_pool)
            >> Residual(zero_init(Maxout(hidden_width, hidden_width)))
            >> zero_init(Affine(context_width, hidden_width))
        )

    model.tok2vec = tok2vec
    model.tok2vec.nO = context_width
    model.nO = 1
    return model

def build_model(nr_class, width, depth, conv_depth, vectors_name, **kwargs):
    with Model.define_operators({"|": concatenate, ">>": chain, "**": clone}):
        embed = (
            HashEmbed(width, 5000, column=1)
            | StaticVectors(vectors_name, width, column=5)
            | HashEmbed(width // 2, 750, column=2)
            | HashEmbed(width // 2, 750, column=3)
            | HashEmbed(width // 2, 750, column=4)
        ) >> LN(Maxout(width))

        sent2vec = (
            with_flatten(embed)
            >> Residual(
                prepare_self_attention(Affine(width * 3, width), nM=width, nH=4)
                >> MultiHeadedAttention()
                >> with_flatten(Maxout(width, width, pieces=3))
            )
            >> flatten_add_lengths
            >> ParametricAttention(width, hard=False)
            >> Pooling(mean_pool)
            >> Residual(LN(Maxout(width)))
        )

        model = (
            foreach(sent2vec, drop_factor=2.0)
            >> Residual(
                prepare_self_attention(Affine(width * 3, width), nM=width, nH=4)
                >> MultiHeadedAttention()
                >> with_flatten(LN(Affine(width, width)))
            )
            >> flatten_add_lengths
            >> ParametricAttention(width, hard=False)
            >> Pooling(mean_pool)
            >> Residual(LN(Maxout(width))) ** 2
            >> Softmax(nr_class)
        )

    model.lsuv = False
    return model

def build_simple_cnn_text_classifier(tok2vec, nr_class, exclusive_classes=False, **cfg):
    """
    Build a simple CNN text classifier, given a token-to-vector model as inputs.
    If exclusive_classes=True, a softmax non-linearity is applied, so that the
    outputs sum to 1. If exclusive_classes=False, a logistic non-linearity
    is applied instead, so that outputs are in the range [0, 1].
    """
    with Model.define_operators({">>": chain}):
        if exclusive_classes:
            output_layer = Softmax(nr_class, tok2vec.nO)
        else:
            output_layer = (
                zero_init(Affine(nr_class, tok2vec.nO, drop_factor=0.0)) >> logistic
            )
        model = tok2vec >> flatten_add_lengths >> Pooling(mean_pool) >> output_layer
    model.tok2vec = chain(tok2vec, flatten)
    model.nO = nr_class
    return model

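def _example_simple_cnn_textcat():
    # Illustrative sketch only, not part of the original source. Assumes spaCy 2.x,
    # where the CNN encoder is spacy._ml.Tok2Vec; the width, embed_size and
    # nr_class values are arbitrary choices for the example.
    from spacy._ml import Tok2Vec

    tok2vec = Tok2Vec(width=96, embed_size=2000)
    return build_simple_cnn_text_classifier(tok2vec, 3, exclusive_classes=True)
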
def build_text_classifier(nr_class, width=64, **cfg):
    depth = cfg.get("depth", 2)
    nr_vector = cfg.get("nr_vector", 5000)
    pretrained_dims = cfg.get("pretrained_dims", 0)
    with Model.define_operators({">>": chain, "+": add, "|": concatenate, "**": clone}):
        if cfg.get("low_data") and pretrained_dims:
            model = (
                SpacyVectors
                >> flatten_add_lengths
                >> with_getitem(0, Affine(width, pretrained_dims))
                >> ParametricAttention(width)
                >> Pooling(sum_pool)
                >> Residual(ReLu(width, width)) ** 2
                >> zero_init(Affine(nr_class, width, drop_factor=0.0))
                >> logistic
            )
            return model

        lower = HashEmbed(width, nr_vector, column=1)
        prefix = HashEmbed(width // 2, nr_vector, column=2)
        suffix = HashEmbed(width // 2, nr_vector, column=3)
        shape = HashEmbed(width // 2, nr_vector, column=4)

        trained_vectors = (
            FeatureExtracter([ORTH, LOWER, PREFIX, SUFFIX, SHAPE, ID])
            >> with_flatten(
                uniqued(
                    (lower | prefix | suffix | shape)
                    >> LN(Maxout(width, width + (width // 2) * 3)),
                    column=0,
                )
            )
        )

        if pretrained_dims:
            static_vectors = SpacyVectors >> with_flatten(
                Affine(width, pretrained_dims)
            )
            # TODO Make concatenate support lists
            vectors = concatenate_lists(trained_vectors, static_vectors)
            vectors_width = width * 2
        else:
            vectors = trained_vectors
            vectors_width = width
            static_vectors = None

        tok2vec = vectors >> with_flatten(
            LN(Maxout(width, vectors_width))
            >> Residual(ExtractWindow(nW=1) >> LN(Maxout(width, width * 3))) ** depth,
            pad=depth,
        )

        cnn_model = (
            tok2vec
            >> flatten_add_lengths
            >> ParametricAttention(width)
            >> Pooling(sum_pool)
            >> Residual(zero_init(Maxout(width, width)))
            >> zero_init(Affine(nr_class, width, drop_factor=0.0))
        )

        linear_model = build_bow_text_classifier(
            nr_class, ngram_size=cfg.get("ngram_size", 1), exclusive_classes=False
        )

        if cfg.get("exclusive_classes"):
            output_layer = Softmax(nr_class, nr_class * 2)
        else:
            output_layer = (
                zero_init(Affine(nr_class, nr_class * 2, drop_factor=0.0)) >> logistic
            )

        model = (linear_model | cnn_model) >> output_layer
        model.tok2vec = chain(tok2vec, flatten)

    model.nO = nr_class
    model.lsuv = False
    return model

def create_model(nr_in, nr_out):
    return Affine(nr_in, nr_out)

def affine():
    return Affine(5, 3)

def get_model(W_values, b_values):
    model = Affine(W_values.shape[0], W_values.shape[1], ops=NumpyOps())
    model.initialize_params()
    model.W[:] = W_values
    model.b[:] = b_values
    return model

def affine_output(nO, nI, drop_factor, **cfg):
    return Affine(nO, nI, drop_factor=drop_factor)

def tensor_affine_tok2vec(output_size, tensor_size, **cfg):
    return chain(get_tensors, flatten, Affine(output_size, tensor_size))