def LangIdentifierModelV2(
    ns: Sequence[int] = (1, 2, 3),
    embed_dim: int = 100,
    hidden_width: int = 512,
    dropout: Optional[float] = 0.1,
) -> Model[List[str], thinc.types.Floats2d]:
    """
    Build a language identification model inspired by Google's CLD3.

    The network is a chain of three stages: a :func:`MultiCharNgramsEmbedding`
    over the requested n-gram sizes, a dense Relu layer, and a Softmax layer.

    Args:
        ns: Set of "n" for which character "n"-grams are extracted from input
            texts. ``(1,)`` uses unigrams only, ``(1, 2)`` adds bigrams, etc.
            The embedding for size ``n`` is given ``2000 * n`` vectors.
        embed_dim: Size of the vectors into which each set of ngrams is
            embedded; the Relu layer's input width is ``embed_dim * len(ns)``.
        hidden_width: Output width of the Relu layer, which feeds the final
            Softmax prediction layer.
        dropout: Dropout rate passed to the embedding and to the Relu layer.

    Returns:
        Thinc :class:`Model`.
    """
    ngram_sizes = list(ns)
    vectors_per_size = [2000 * n for n in ns]
    with Model.define_operators({">>": chain}):
        # Stage 1: one embedding per n-gram size, concatenated.
        pipeline = MultiCharNgramsEmbedding(
            ns=ngram_sizes,
            max_chars=1000,
            lower=True,
            num_vectors=vectors_per_size,
            embed_dims=embed_dim,
            dropout=dropout,
        )
        # Stage 2: dense hidden layer over the concatenated embeddings.
        pipeline = pipeline >> thinc.layers.Relu(
            nI=embed_dim * len(ns),
            nO=hidden_width,
            dropout=dropout,
        )
        # Stage 3: prediction layer; its output width is left unset here.
        pipeline = pipeline >> thinc.layers.Softmax(nI=hidden_width)
    return pipeline
def TextCatCNN_v1(
    tok2vec: Model, exclusive_classes: bool, nO: Optional[int] = None
) -> Model[List[Doc], Floats2d]:
    """
    Build a simple CNN text classifier on top of a token-to-vector model.

    With ``exclusive_classes=True`` the output layer is a Softmax, so the
    outputs sum to 1. With ``exclusive_classes=False`` a Linear layer followed
    by a Logistic non-linearity is used instead, so outputs are in the range
    [0, 1]. All layers are resolved from the registry at their ``.v1`` names.

    The returned model carries the refs ``"output_layer"`` (the Softmax or the
    Linear layer) and ``"tok2vec"``, and the attr ``"multi_label"``.
    """
    # Look up the pinned v1 implementations rather than module-level names.
    chain = registry.get("layers", "chain.v1")
    reduce_mean = registry.get("layers", "reduce_mean.v1")
    Logistic = registry.get("layers", "Logistic.v1")
    Softmax = registry.get("layers", "Softmax.v1")
    Linear = registry.get("layers", "Linear.v1")
    list2ragged = registry.get("layers", "list2ragged.v1")

    with Model.define_operators({">>": chain}):
        cnn = tok2vec >> list2ragged() >> reduce_mean()
        token_width = tok2vec.maybe_get_dim("nO")
        if exclusive_classes:
            output_layer = Softmax(nO=nO, nI=token_width)
            model = cnn >> output_layer
        else:
            output_layer = Linear(nO=nO, nI=token_width)
            model = cnn >> output_layer >> Logistic()
        # In both branches the ref points at the layer that owns the weights.
        model.set_ref("output_layer", output_layer)
    model.set_ref("tok2vec", tok2vec)
    model.set_dim("nO", nO)
    model.attrs["multi_label"] = not exclusive_classes
    return model
def MultiCharNgramsEmbedding(
    ns: List[int],
    max_chars: int,
    lower: bool,
    num_vectors: int | List[int],
    embed_dims: int | List[int],
    dropout: Optional[float],
) -> Model[List[str], thinc.types.Floats1d]:
    """
    Concatenate one :func:`CharNgramsEmbedding` per n-gram size in ``ns``.

    Args:
        ns: n-gram sizes; one embedding sub-model is built per entry, in order.
        max_chars: Forwarded unchanged to every sub-model.
        lower: Forwarded unchanged to every sub-model.
        num_vectors: ``num_vectors`` for the sub-models. A single int is
            reused for every entry of ``ns``; a list is paired positionally
            with ``ns`` and must have the same length.
        embed_dims: ``embed_dim`` for the sub-models, with the same
            int-or-list convention as ``num_vectors``.
        dropout: Forwarded unchanged to every sub-model.

    Returns:
        The ``concatenate`` of the per-size sub-models.

    Raises:
        ValueError: If ``num_vectors`` or ``embed_dims`` is given as a list
            whose length differs from ``len(ns)``.
    """
    numn = len(ns)
    num_vectors = [num_vectors] * numn if isinstance(num_vectors, int) else num_vectors
    embed_dims = [embed_dims] * numn if isinstance(embed_dims, int) else embed_dims
    # zip() would silently drop trailing n-gram sizes on a length mismatch,
    # producing a narrower model than the caller asked for; fail loudly instead.
    for arg_name, per_n in (("num_vectors", num_vectors), ("embed_dims", embed_dims)):
        if len(per_n) != numn:
            raise ValueError(
                f"`{arg_name}` has {len(per_n)} entries but `ns` has {numn}; "
                "pass a single int or one value per n-gram size"
            )
    # No operator overloads are needed here: the sub-models are combined with
    # an explicit concatenate() call.
    return concatenate(
        *[
            CharNgramsEmbedding(
                n=n,
                max_chars=max_chars,
                lower=lower,
                num_vectors=nvec,
                embed_dim=edim,
                dropout=dropout,
            )
            for n, nvec, edim in zip(ns, num_vectors, embed_dims)
        ]
    )
def build_bow_text_classifier(
    exclusive_classes: bool,
    ngram_size: int,
    no_output_layer: bool,
    nO: Optional[int] = None,
) -> Model[List[Doc], Floats2d]:
    """
    Build a bag-of-ngrams text classifier whose output layer can be resized.

    ``extract_ngrams`` (on the ORTH attribute) feeds a ``SparseLinear`` layer
    wrapped in ``resizable``; that part is pinned to the CPU via ``with_cpu``.
    Unless ``no_output_layer`` is set, an activation is appended:
    ``softmax_activation`` when ``exclusive_classes`` is true, otherwise
    ``Logistic``.

    The returned model has the ref ``"output_layer"`` (the SparseLinear layer)
    and the attrs ``"multi_label"`` and ``"resize_output"``.
    """
    weight_fill = {"b": 0, "W": 0}
    with Model.define_operators({">>": chain}):
        sparse_linear = SparseLinear(nO=nO)
        if no_output_layer:
            output_layer = None
        else:
            # With an activation on top, the bias fill value for resizing
            # becomes NEG_VALUE. This must happen before the partial below
            # captures the dict.
            weight_fill["b"] = NEG_VALUE
            if exclusive_classes:
                output_layer = softmax_activation()
            else:
                output_layer = Logistic()
        resize_fn = partial(resize_linear_weighted, fill_defaults=weight_fill)
        resizable_layer = resizable(sparse_linear, resize_layer=resize_fn)
        features = extract_ngrams(ngram_size, attr=ORTH) >> resizable_layer
        model = with_cpu(features, features.ops)
        if output_layer:
            model = model >> with_cpu(output_layer, output_layer.ops)
    model.set_dim("nO", nO)
    model.set_ref("output_layer", sparse_linear)
    model.attrs["multi_label"] = not exclusive_classes
    model.attrs["resize_output"] = partial(
        resize_and_set_ref, resizable_layer=resizable_layer
    )
    return model
def cnn_tagger(width: int, vector_width: int, nr_classes: int = 17):
    """
    Build a small tagger: hash embedding, a one-token window, two Relu layers
    and a Softmax over ``nr_classes`` outputs.

    ``strings2arrays`` converts the input first; everything after it runs
    inside ``with_array``. ``vector_width`` is used as the number of rows
    (``nV``) of the hash embedding and ``width`` as every hidden width.
    """
    with Model.define_operators({">>": chain}):
        to_arrays = strings2arrays()
        per_token = HashEmbed(nO=width, nV=vector_width, column=0)
        per_token = per_token >> expand_window(window_size=1)
        # A window of 1 on each side triples the feature width.
        per_token = per_token >> Relu(nO=width, nI=width * 3)
        per_token = per_token >> Relu(nO=width, nI=width)
        per_token = per_token >> Softmax(nO=nr_classes, nI=width)
        model = to_arrays >> with_array(per_token)
    return model
def build_text_classifier_v2(
    tok2vec: Model[List[Doc], List[Floats2d]],
    linear_model: Model[List[Doc], Floats2d],
    nO: Optional[int] = None,
) -> Model[List[Doc], Floats2d]:
    """
    Build an ensemble text classifier from a linear model and a CNN branch.

    The CNN branch runs ``tok2vec`` through parametric attention, a sum
    reduction and a residual Maxout/LayerNorm/Dropout block. Its output is
    concatenated with ``linear_model``'s output and passed to the output
    layer: Softmax when the classes are exclusive, Linear followed by Logistic
    otherwise. Exclusivity is read from ``linear_model.attrs["multi_label"]``.

    Refs set on the result: ``"tok2vec"``, ``"output_layer"`` (taken from
    ``linear_model``), ``"attention_layer"``, ``"maxout_layer"`` and
    ``"norm_layer"``. The model's ``init`` is replaced with
    ``init_ensemble_textcat``.
    """
    exclusive_classes = not linear_model.attrs["multi_label"]
    with Model.define_operators({">>": chain, "|": concatenate}):
        width = tok2vec.maybe_get_dim("nO")
        # TODO: benchmark performance difference of this layer
        attention_layer = ParametricAttention(width)
        maxout_layer = Maxout(nO=width, nI=width)
        norm_layer = LayerNorm(nI=width)
        cnn_model = tok2vec >> list2ragged()
        cnn_model = cnn_model >> attention_layer
        cnn_model = cnn_model >> reduce_sum()
        cnn_model = cnn_model >> residual(maxout_layer >> norm_layer >> Dropout(0.0))

        # The output layer sees both branches side by side, hence 2 * nO.
        if nO:
            nO_double = nO * 2
        else:
            nO_double = None
        if exclusive_classes:
            output_layer = Softmax(nO=nO, nI=nO_double)
        else:
            output_layer = Linear(nO=nO, nI=nO_double) >> Logistic()
        model = (linear_model | cnn_model) >> output_layer
        model.set_ref("tok2vec", tok2vec)
    if model.has_dim("nO") is not False:
        model.set_dim("nO", nO)
    model.set_ref("output_layer", linear_model.get_ref("output_layer"))
    model.set_ref("attention_layer", attention_layer)
    model.set_ref("maxout_layer", maxout_layer)
    model.set_ref("norm_layer", norm_layer)
    model.attrs["multi_label"] = not exclusive_classes

    model.init = init_ensemble_textcat
    return model
def test_tuplify_operator_three(model1, model2, model3):
    """Chaining ``&`` three times yields a nested, two-layer tuplify model."""
    # Previously we 'flattened' these nested calls. We might opt to do so
    # again, especially for the operators.
    with Model.define_operators({"&": tuplify}):
        combined = model1 & model2 & model3
        outer_layers = combined.layers
        assert len(outer_layers) == 2
        # The first outer layer is itself the result of ``model1 & model2``.
        assert len(outer_layers[0].layers) == 2
def CharNgramsEmbedding(
    n: int,
    max_chars: int,
    lower: bool,
    num_vectors: int,
    embed_dim: int,
    dropout: Optional[float],
) -> Model[List[str], thinc.types.Floats1d]:
    """
    Build a model that embeds the character n-grams of each input text and
    mean-pools them.

    Args:
        n: Passed to ``text_to_char_ngrams`` as its first argument
            (presumably the n-gram size -- confirm against that helper).
        max_chars: Passed to ``text_to_char_ngrams`` as its second argument.
        lower: Passed to ``text_to_char_ngrams`` as its third argument.
        num_vectors: Number of rows (``nV``) of the hash embedding table.
        embed_dim: Output width (``nO``) of the hash embedding.
        dropout: Dropout rate of the hash embedding.

    Returns:
        Chain of: n-gram extraction, ``strings2arrays``, ``HashEmbed`` applied
        via ``with_array``, ``list2ragged`` and ``reduce_mean``.
    """
    with Model.define_operators({">>": chain}):
        pipeline = text_to_char_ngrams(n, max_chars, lower)
        pipeline = pipeline >> thinc.layers.strings2arrays()
        pipeline = pipeline >> thinc.layers.with_array(
            thinc.layers.HashEmbed(
                nO=embed_dim,
                nV=num_vectors,
                dropout=dropout,
                column=0,
            )
        )
        pipeline = pipeline >> thinc.layers.list2ragged()
        # Pool the per-ngram vectors of each text into a single vector.
        pipeline = pipeline >> thinc.layers.reduce_mean()
    return pipeline
def create_relation_model(
    create_instance_tensor: Model[List[Doc], Floats2d],
    classification_layer: Model[Floats2d, Floats2d],
) -> Model[List[Doc], Floats2d]:
    """
    Chain the instance-tensor model into the classification layer.

    The ``"get_instances"`` attr of ``create_instance_tensor`` is copied onto
    the combined model so it stays reachable from the top level.
    """
    with Model.define_operators({">>": chain}):
        relation_model = create_instance_tensor >> classification_layer
        instance_getter = create_instance_tensor.attrs["get_instances"]
        relation_model.attrs["get_instances"] = instance_getter
    return relation_model
def test_plus_chain():
    """A chained ``+`` applies the bound operator left to right."""

    def keep_left(a, b):
        return a

    with Model.define_operators({"+": keep_left}):
        m = create_model(name="a")
        for label in ("b", "c", "d"):
            m = m + create_model(name=label)
        # Every application returns its left operand, so "a" survives.
        assert m.name == "a"
def build_nel_encoder(tok2vec: Model, nO: Optional[int] = None) -> Model:
    """
    Build an encoder that mean-pools ``tok2vec`` output and projects it to
    ``nO`` dimensions.

    Pipeline: ``tok2vec``, ``list2ragged``, ``reduce_mean``, a residual Maxout
    block at the token width, then a Linear output layer. The result exposes
    the refs ``"output_layer"`` and ``"tok2vec"``.
    """
    with Model.define_operators({">>": chain, "**": clone}):
        token_width = tok2vec.get_dim("nO")
        output_layer = Linear(nO=nO, nI=token_width)
        encoder = tok2vec >> list2ragged()
        encoder = encoder >> reduce_mean()
        encoder = encoder >> residual(
            Maxout(nO=token_width, nI=token_width, nP=2, dropout=0.0)
        )
        encoder = encoder >> output_layer
        encoder.set_ref("output_layer", output_layer)
        encoder.set_ref("tok2vec", tok2vec)
    return encoder
def _overload_plus(operator, sleep):
    """
    Bind ``operator`` to a name-concatenating function, wait ``sleep``
    seconds inside the context, then apply it to two models.

    ``"+"`` is exercised via ``m1 + m2``; any other value via ``m1 * m2``.
    After the context exits, the operator mapping must be empty again.
    """
    left = create_model(name="a")
    right = create_model(name="b")
    with Model.define_operators({operator: lambda a, b: a.name + b.name}):
        # Hold the context open before applying the operator.
        time.sleep(sleep)
        value = left + right if operator == "+" else left * right
    assert value == "ab"
    assert Model._context_operators.get() == {}
def build_nel_encoder(tok2vec: Model, nO: Optional[int] = None) -> Model[List[Doc], Floats2d]:
    """
    Build an encoder that pools ``tok2vec`` output over spans and projects it
    to ``nO`` dimensions.

    The ragged ``tok2vec`` output is paired (``&``) with ``build_span_maker()``
    and fed to ``extract_spans``; the result is mean-pooled, passed through a
    residual Maxout block and a Linear output layer. Refs ``"output_layer"``
    and ``"tok2vec"`` are set, as is the attr ``"include_span_maker"``.
    """
    with Model.define_operators({">>": chain, "&": tuplify}):
        token_width = tok2vec.maybe_get_dim("nO")
        output_layer = Linear(nO=nO, nI=token_width)
        ragged_tokens = tok2vec >> list2ragged()
        encoder = ragged_tokens & build_span_maker()
        encoder = encoder >> extract_spans()
        encoder = encoder >> reduce_mean()
        encoder = encoder >> residual(
            Maxout(nO=token_width, nI=token_width, nP=2, dropout=0.0)
        )
        encoder = encoder >> output_layer
        encoder.set_ref("output_layer", output_layer)
        encoder.set_ref("tok2vec", tok2vec)
    # flag to show this isn't legacy
    encoder.attrs["include_span_maker"] = True
    return encoder
def build_text_classifier_lowdata(
    width: int, dropout: Optional[float], nO: Optional[int] = None
) -> Model[List[Doc], Floats2d]:
    """
    Build a text classifier from static vectors, parametric attention, sum
    pooling, two residual Relu blocks and a Linear layer, ending in Logistic.

    A Dropout layer is inserted before the Logistic only when ``dropout`` is
    truthy.
    """
    # Don't document this yet, I'm not sure it's right.
    # Note, before v.3, this was the default if setting "low_data" and "pretrained_dims"
    with Model.define_operators({">>": chain, "**": clone}):
        classifier = StaticVectors(width) >> list2ragged()
        classifier = classifier >> ParametricAttention(width)
        classifier = classifier >> reduce_sum()
        # ``**`` binds tighter than ``>>``: the residual block is cloned twice.
        classifier = classifier >> residual(Relu(width, width)) ** 2
        classifier = classifier >> Linear(nO, width)
        if dropout:
            classifier = classifier >> Dropout(dropout)
        classifier = classifier >> Logistic()
    return classifier
def test_nested_operator_contexts():
    """
    Nested ``define_operators`` contexts: only the innermost binding is
    expected to be active, and leaving a context restores the enclosing one.
    """

    def join_names(left, right):
        return left.name + right.name

    model_a = create_model(name="a")
    model_b = create_model(name="b")
    assert Model._context_operators.get() == {}
    with Model.define_operators({"+": join_names}):
        # Outer context: "+" works, "*" does not.
        value = model_a + model_b
        with pytest.raises(TypeError):
            value = model_a * model_b
        with Model.define_operators({"*": join_names}):
            # Middle context: "*" works, the outer "+" is no longer bound.
            with pytest.raises(TypeError):
                value = model_a + model_b
            value = model_a * model_b
            with Model.define_operators({"-": join_names}):
                # Innermost context: only "-" works.
                with pytest.raises(TypeError):
                    value = model_a + model_b
                value = model_a - model_b
            # Back in the middle context.
            with pytest.raises(TypeError):
                value = model_a + model_b
            value = model_a * model_b
        # Back in the outer context.
        value = model_a + model_b
        with pytest.raises(TypeError):
            value = model_a * model_b
    assert value == "ab"
    assert Model._context_operators.get() == {}
def build_bow_text_classifier(
    exclusive_classes: bool,
    ngram_size: int,
    no_output_layer: bool,
    nO: Optional[int] = None,
) -> Model[List[Doc], Floats2d]:
    """
    Build a bag-of-ngrams text classifier.

    ``extract_ngrams`` (on the ORTH attribute) feeds a ``SparseLinear`` layer,
    pinned to the CPU via ``with_cpu``. Unless ``no_output_layer`` is set, an
    activation is appended: ``softmax_activation`` for exclusive classes,
    ``Logistic`` otherwise.

    The returned model has the ref ``"output_layer"`` (the SparseLinear layer)
    and the attr ``"multi_label"``.
    """
    with Model.define_operators({">>": chain}):
        sparse_linear = SparseLinear(nO)
        features = extract_ngrams(ngram_size, attr=ORTH) >> sparse_linear
        model = with_cpu(features, features.ops)
        if not no_output_layer:
            if exclusive_classes:
                output_layer = softmax_activation()
            else:
                output_layer = Logistic()
            model = model >> with_cpu(output_layer, output_layer.ops)
    model.set_ref("output_layer", sparse_linear)
    model.attrs["multi_label"] = not exclusive_classes
    return model
def build_simple_cnn_text_classifier(
    tok2vec: Model, exclusive_classes: bool, nO: Optional[int] = None
) -> Model[List[Doc], Floats2d]:
    """
    Build a simple CNN text classifier, given a token-to-vector model as inputs.

    If exclusive_classes=True, a softmax non-linearity is applied, so that the
    outputs sum to 1. If exclusive_classes=False, a logistic non-linearity
    is applied instead, so that outputs are in the range [0, 1].

    The weight-bearing output layer is wrapped in ``resizable``; a matching
    resize callback is stored in ``model.attrs["resize_output"]``.
    """
    fill_defaults = {"b": 0, "W": 0}
    with Model.define_operators({">>": chain}):
        cnn = tok2vec >> list2ragged() >> reduce_mean()
        nI = tok2vec.maybe_get_dim("nO")
        if exclusive_classes:
            output_layer = Softmax(nO=nO, nI=nI)
            # Softmax output: the bias fill value for resizing is NEG_VALUE.
            fill_defaults["b"] = NEG_VALUE
        else:
            output_layer = Linear(nO=nO, nI=nI)
        resizable_layer: Model = resizable(
            output_layer,
            resize_layer=partial(resize_linear_weighted, fill_defaults=fill_defaults),
        )
        model = cnn >> resizable_layer
        if not exclusive_classes:
            # The Linear layer emits raw scores; squash them to [0, 1].
            model = model >> Logistic()
        model.set_ref("output_layer", output_layer)
        model.attrs["resize_output"] = partial(
            resize_and_set_ref,
            resizable_layer=resizable_layer,
        )
    model.set_ref("tok2vec", tok2vec)
    model.set_dim("nO", nO)  # type: ignore  # TODO: remove type ignore once Thinc has been updated
    model.attrs["multi_label"] = not exclusive_classes
    return model
def TextCatBOW_v1(
    exclusive_classes: bool,
    ngram_size: int,
    no_output_layer: bool,
    nO: Optional[int] = None,
) -> Model[List[Doc], Floats2d]:
    """
    Build a bag-of-ngrams text classifier from registry-pinned ``.v1`` layers.

    ``extract_ngrams`` (on the ORTH attribute) feeds a ``SparseLinear`` layer,
    pinned to the CPU via ``with_cpu``. Unless ``no_output_layer`` is set, an
    activation is appended: ``softmax_activation`` for exclusive classes,
    ``Logistic`` otherwise.

    The returned model has the ref ``"output_layer"`` (the SparseLinear layer)
    and the attr ``"multi_label"``.
    """
    # Resolve the pinned v1 implementations from the registry.
    chain = registry.get("layers", "chain.v1")
    Logistic = registry.get("layers", "Logistic.v1")
    SparseLinear = registry.get("layers", "SparseLinear.v1")
    softmax_activation = registry.get("layers", "softmax_activation.v1")

    with Model.define_operators({">>": chain}):
        sparse_linear = SparseLinear(nO)
        features = extract_ngrams(ngram_size, attr=ORTH) >> sparse_linear
        model = with_cpu(features, features.ops)
        if not no_output_layer:
            if exclusive_classes:
                output_layer = softmax_activation()
            else:
                output_layer = Logistic()
            model = model >> with_cpu(output_layer, output_layer.ops)
    model.set_ref("output_layer", sparse_linear)
    model.attrs["multi_label"] = not exclusive_classes
    return model
def create_embed_relu_relu_softmax(depth, width, vector_length):
    """
    Build a tagger: hash embedding, a one-token window, two ReLu layers and a
    17-way Softmax, all applied per array after ``strings2arrays``.

    ``depth`` is accepted but not referenced anywhere in the body.
    """
    with Model.define_operators({">>": chain}):
        to_arrays = strings2arrays()
        per_token = HashEmbed(width, vector_length)
        per_token = per_token >> expand_window(window_size=1)
        # A window of 1 on each side triples the feature width.
        per_token = per_token >> ReLu(width, width * 3)
        per_token = per_token >> ReLu(width, width)
        per_token = per_token >> Softmax(17, width)
        model = to_arrays >> with_array(per_token)
    return model
def create_classification_layer(nO: int = None, nI: int = None) -> Model[Floats2d, Floats2d]:
    """Return a Linear layer (``nI`` -> ``nO``) chained into a Logistic activation."""
    with Model.define_operators({">>": chain}):
        scores = Linear(nO=nO, nI=nI)
        classifier = scores >> Logistic()
        return classifier
def TextCatEnsemble_v1(
    width: int,
    embed_size: int,
    pretrained_vectors: Optional[bool],
    exclusive_classes: bool,
    ngram_size: int,
    window_size: int,
    conv_depth: int,
    dropout: Optional[float],
    nO: Optional[int] = None,
) -> Model:
    """
    Build an ensemble of a bag-of-words model and a CNN model over hashed
    token features, using registry-pinned ``.v1`` layers throughout.

    The two branches are concatenated and fed to a Softmax (exclusive classes)
    or to Linear >> Dropout >> Logistic (otherwise). The result exposes the
    refs ``"tok2vec"`` and ``"output_layer"`` (the latter taken from the
    bag-of-words branch) and the attr ``"multi_label"``.
    """
    # Don't document this yet, I'm not sure it's right.
    HashEmbed = registry.get("layers", "HashEmbed.v1")
    FeatureExtractor = registry.get("layers", "spacy.FeatureExtractor.v1")
    Maxout = registry.get("layers", "Maxout.v1")
    StaticVectors = registry.get("layers", "spacy.StaticVectors.v1")
    Softmax = registry.get("layers", "Softmax.v1")
    Linear = registry.get("layers", "Linear.v1")
    ParametricAttention = registry.get("layers", "ParametricAttention.v1")
    Dropout = registry.get("layers", "Dropout.v1")
    Logistic = registry.get("layers", "Logistic.v1")
    build_bow_text_classifier = registry.get("architectures", "spacy.TextCatBOW.v1")
    list2ragged = registry.get("layers", "list2ragged.v1")
    chain = registry.get("layers", "chain.v1")
    concatenate = registry.get("layers", "concatenate.v1")
    clone = registry.get("layers", "clone.v1")
    reduce_sum = registry.get("layers", "reduce_sum.v1")
    with_array = registry.get("layers", "with_array.v1")
    uniqued = registry.get("layers", "uniqued.v1")
    residual = registry.get("layers", "residual.v1")
    expand_window = registry.get("layers", "expand_window.v1")

    cols = [ORTH, LOWER, PREFIX, SUFFIX, SHAPE, ID]
    with Model.define_operators({">>": chain, "|": concatenate, "**": clone}):
        # --- Hashed embeddings, one per lexical attribute -------------------
        lower_embed = HashEmbed(
            nO=width,
            nV=embed_size,
            column=cols.index(LOWER),
            dropout=dropout,
            seed=10,
        )
        prefix_embed = HashEmbed(
            nO=width // 2,
            nV=embed_size,
            column=cols.index(PREFIX),
            dropout=dropout,
            seed=11,
        )
        suffix_embed = HashEmbed(
            nO=width // 2,
            nV=embed_size,
            column=cols.index(SUFFIX),
            dropout=dropout,
            seed=12,
        )
        shape_embed = HashEmbed(
            nO=width // 2,
            nV=embed_size,
            column=cols.index(SHAPE),
            dropout=dropout,
            seed=13,
        )
        embed_layers = [lower_embed, prefix_embed, suffix_embed, shape_embed]
        concat_width = sum(layer.get_dim("nO") for layer in embed_layers)

        # --- Trained vectors: extract features, embed, mix with Maxout ------
        extractor = FeatureExtractor(cols)
        mixed_embeddings = (
            lower_embed | prefix_embed | suffix_embed | shape_embed
        ) >> Maxout(nO=width, nI=concat_width, normalize=True)
        trained_vectors = extractor >> with_array(
            uniqued(mixed_embeddings, column=cols.index(ORTH))
        )

        # --- Optionally concatenate static vectors --------------------------
        if pretrained_vectors:
            static_vectors = StaticVectors(width)
            vector_layer = trained_vectors | static_vectors
            vectors_width = width * 2
        else:
            vector_layer = trained_vectors
            vectors_width = width

        # --- Token-to-vector encoder: Maxout + conv_depth residual blocks ---
        input_mixer = Maxout(width, vectors_width, normalize=True)
        conv_block = residual(
            expand_window(window_size=window_size)
            >> Maxout(
                nO=width,
                nI=width * ((window_size * 2) + 1),
                normalize=True,
            )
        )
        # ``**`` binds tighter than ``>>``: the block is cloned, then chained.
        encoder = input_mixer >> conv_block**conv_depth
        tok2vec = vector_layer >> with_array(encoder, pad=conv_depth)

        # --- CNN branch ------------------------------------------------------
        cnn_model = tok2vec >> list2ragged()
        cnn_model = cnn_model >> ParametricAttention(width)
        cnn_model = cnn_model >> reduce_sum()
        cnn_model = cnn_model >> residual(Maxout(nO=width, nI=width))
        cnn_model = cnn_model >> Linear(nO=nO, nI=width)
        cnn_model = cnn_model >> Dropout(0.0)

        # --- Bag-of-words branch --------------------------------------------
        linear_model = build_bow_text_classifier(
            nO=nO,
            ngram_size=ngram_size,
            exclusive_classes=exclusive_classes,
            no_output_layer=False,
        )

        # --- Output layer over both branches side by side -------------------
        if nO:
            nO_double = nO * 2
        else:
            nO_double = None
        if exclusive_classes:
            output_layer = Softmax(nO=nO, nI=nO_double)
        else:
            output_layer = Linear(nO=nO, nI=nO_double) >> Dropout(0.0) >> Logistic()
        model = (linear_model | cnn_model) >> output_layer
        model.set_ref("tok2vec", tok2vec)
    if model.has_dim("nO") is not False:
        model.set_dim("nO", nO)
    model.set_ref("output_layer", linear_model.get_ref("output_layer"))
    model.attrs["multi_label"] = not exclusive_classes
    return model
def test_chain_operator_two(model1, model2):
    """``>>`` bound to ``chain`` combines two models into a two-layer model."""
    with Model.define_operators({">>": chain}):
        chained = model1 >> model2
        layer_count = len(chained.layers)
        assert layer_count == 2
def test_all_operators(op):
    """
    With only ``op`` bound via ``define_operators``, applying ``op`` to two
    models must succeed and every other binary operator must raise TypeError.

    Each operator is listed exactly once in a table; the original spelled out
    one if/else per operator and checked ``^`` twice.

    Args:
        op: The operator symbol to bind, e.g. ``"+"`` or ``">>"``.
    """
    m1 = Linear()
    m2 = Linear()
    binary_ops = (
        ("+", lambda a, b: a + b),
        ("-", lambda a, b: a - b),
        ("*", lambda a, b: a * b),
        # Called through the dunder directly, as the original test did.
        ("@", lambda a, b: a.__matmul__(b)),
        ("/", lambda a, b: a / b),
        ("//", lambda a, b: a // b),
        ("^", lambda a, b: a ^ b),
        ("%", lambda a, b: a % b),
        ("**", lambda a, b: a**b),
        ("<<", lambda a, b: a << b),
        (">>", lambda a, b: a >> b),
        ("&", lambda a, b: a & b),
        ("|", lambda a, b: a | b),
    )
    with Model.define_operators({op: lambda a, b: a.name + b.name}):
        for symbol, apply_op in binary_ops:
            if symbol == op:
                # The bound operator must not raise.
                apply_op(m1, m2)
            else:
                with pytest.raises(TypeError):
                    apply_op(m1, m2)
    # Leaving the context must clear the operator mapping again.
    assert Model._context_operators.get() == {}
def test_concatenate_operator_two(model1, model2):
    """``|`` bound to ``concatenate`` combines two models into two layers."""
    with Model.define_operators({"|": concatenate}):
        joined = model1 | model2
        layer_count = len(joined.layers)
        assert layer_count == 2
def test_bind_plus():
    """A function bound to ``+`` receives both operands and its result is returned."""

    def name_pair(a, b):
        return (a.name, b.name)

    with Model.define_operators({"+": name_pair}):
        left = create_model(name="a")
        right = create_model(name="b")
        m = left + right
        assert m == ("a", "b")
def test_concatenate_operator_three(model1, model2, model3):
    """Chaining ``|`` over three models is expected to give three layers."""
    with Model.define_operators({"|": concatenate}):
        joined = model1 | model2 | model3
        layer_count = len(joined.layers)
        assert layer_count == 3
def test_tuplify_operator_two(model1, model2):
    """``&`` bound to ``tuplify`` combines two models into two layers."""
    with Model.define_operators({"&": tuplify}):
        paired = model1 & model2
        layer_count = len(paired.layers)
        assert layer_count == 2