def output_spec(self) -> Spec:
  ret = {"tokens": lit_types.Tokens()}
  ret["tokens_" + self.config.text_a_name] = lit_types.Tokens()
  if self.config.text_b_name:
    ret["tokens_" + self.config.text_b_name] = lit_types.Tokens()
  if self.is_regression:
    ret["score"] = lit_types.RegressionScore(parent=self.config.label_name)
  else:
    ret["probas"] = lit_types.MulticlassPreds(
        parent=self.config.label_name,
        vocab=self.config.labels,
        null_idx=self.config.null_label_idx)
  ret["cls_emb"] = lit_types.Embeddings()
  # Gradients, if requested.
  if self.config.compute_grads:
    ret["token_grad_" + self.config.text_a_name] = lit_types.TokenGradients(
        align="tokens_" + self.config.text_a_name)
    if self.config.text_b_name:
      ret["token_grad_" + self.config.text_b_name] = lit_types.TokenGradients(
          align="tokens_" + self.config.text_b_name)
  # Attention heads, one field for each layer.
  for i in range(self.model.config.num_hidden_layers):
    ret[f"layer_{i}/attention"] = lit_types.AttentionHeads(
        align=("tokens", "tokens"))
  return ret

def output_spec(self):
  spec = super().output_spec()  # has 'output_text'
  spec.update({
      "input_tokens": lit_types.Tokens(parent="input_text"),
      "encoder_final_embedding": lit_types.Embeddings(),
      # If target text is given, the following will also be populated.
      "target_tokens": lit_types.Tokens(parent="target_text"),
      "pred_tokens": lit_types.TokenTopKPreds(align="target_tokens"),
  })
  if self.config.num_to_generate > 1:
    spec["output_text"] = lit_types.GeneratedTextCandidates(
        parent="target_text")
  if self.config.output_attention:
    # Add attention for each layer.
    for i in range(self.num_layers):
      spec[f"encoder_layer_{i:d}_attention"] = lit_types.AttentionHeads(
          align_in="input_tokens", align_out="input_tokens")
      spec[f"decoder_layer_{i:d}_attention"] = lit_types.AttentionHeads(
          align_in="target_tokens", align_out="target_tokens")
  return spec

def output_spec(self) -> Spec:
  ret = {"tokens": lit_types.Tokens()}
  ret["tokens_" + self.config.text_a_name] = lit_types.Tokens(
      parent=self.config.text_a_name)
  if self.config.text_b_name:
    ret["tokens_" + self.config.text_b_name] = lit_types.Tokens(
        parent=self.config.text_b_name)
  if self.is_regression:
    ret["score"] = lit_types.RegressionScore(parent=self.config.label_name)
  else:
    ret["probas"] = lit_types.MulticlassPreds(
        parent=self.config.label_name,
        vocab=self.config.labels,
        null_idx=self.config.null_label_idx)
  ret["cls_emb"] = lit_types.Embeddings()
  # Average embeddings, one per layer including embeddings.
  for i in range(1 + self.model.config.num_hidden_layers):
    ret[f"layer_{i}/avg_emb"] = lit_types.Embeddings()
  ret["cls_grad"] = lit_types.Gradients(
      grad_for="cls_emb", grad_target_field_key="grad_class")
  # The input_embs_ and grad_class fields are used for Integrated Gradients.
  ret["input_embs_" + self.config.text_a_name] = lit_types.TokenEmbeddings(
      align="tokens_" + self.config.text_a_name)
  if self.config.text_b_name:
    ret["input_embs_" + self.config.text_b_name] = lit_types.TokenEmbeddings(
        align="tokens_" + self.config.text_b_name)
  # Gradients, if requested.
  if self.config.compute_grads:
    ret["grad_class"] = lit_types.CategoryLabel(
        required=False, vocab=self.config.labels)
    ret["token_grad_" + self.config.text_a_name] = lit_types.TokenGradients(
        align="tokens_" + self.config.text_a_name,
        grad_for="input_embs_" + self.config.text_a_name,
        grad_target_field_key="grad_class")
    if self.config.text_b_name:
      ret["token_grad_" + self.config.text_b_name] = lit_types.TokenGradients(
          align="tokens_" + self.config.text_b_name,
          grad_for="input_embs_" + self.config.text_b_name,
          grad_target_field_key="grad_class")
  # Attention heads, one field for each layer.
  for i in range(self.model.config.num_hidden_layers):
    ret[f"layer_{i+1}/attention"] = lit_types.AttentionHeads(
        align_in="tokens", align_out="tokens")
  return ret

def spec(self):
  return {
      "text": lit_types.TextSegment(),
      "tokens": lit_types.Tokens(parent="text"),
      "coref": lit_types.EdgeLabels(align="tokens"),
      # Metadata fields for filtering and analysis.
      "occupation": lit_types.CategoryLabel(),
      "participant": lit_types.CategoryLabel(),
      "answer": lit_types.CategoryLabel(vocab=ANSWER_VOCAB),
      "someone": lit_types.CategoryLabel(vocab=["True", "False"]),
      "pronouns": lit_types.CategoryLabel(
          vocab=list(PRONOUNS_BY_GENDER.values())),
      "pronoun_type": lit_types.CategoryLabel(vocab=["NOM", "POSS", "ACC"]),
      "gender": lit_types.CategoryLabel(vocab=[g.name for g in Gender]),
      "pf_bls": lit_types.Scalar(),
  }

def test_find_spec_keys(self):
  spec = {
      "score": types.RegressionScore(),
      "scalar_foo": types.Scalar(),
      "text": types.TextSegment(),
      "emb_0": types.Embeddings(),
      "emb_1": types.Embeddings(),
      "tokens": types.Tokens(),
      "generated_text": types.GeneratedText(),
  }
  self.assertEqual(["score"],
                   utils.find_spec_keys(spec, types.RegressionScore))
  self.assertEqual(["text", "tokens", "generated_text"],
                   utils.find_spec_keys(spec,
                                        (types.TextSegment, types.Tokens)))
  self.assertEqual(["emb_0", "emb_1"],
                   utils.find_spec_keys(spec, types.Embeddings))
  self.assertEqual([], utils.find_spec_keys(spec, types.AttentionHeads))
  # Check subclasses
  self.assertEqual(
      list(spec.keys()), utils.find_spec_keys(spec, types.LitType))
  self.assertEqual(["text", "generated_text"],
                   utils.find_spec_keys(spec, types.TextSegment))
  self.assertEqual(["score", "scalar_foo"],
                   utils.find_spec_keys(spec, types.Scalar))

def output_spec(self) -> lit_types.Spec:
  return {
      "tokens": lit_types.Tokens(),
      "probas": lit_types.MulticlassPreds(parent="label", vocab=self.LABELS),
      "cls_emb": lit_types.Embeddings(),
  }

def __init__(self, model, tasks):
  """Initialize with Stanza model and a dictionary of tasks.

  Args:
    model: A Stanza model
    tasks: A dictionary of tasks, grouped by task type. Keys are the
      grouping, which should be one of: ('sequence', 'span', 'edge').
      Values are a list of stanza task names as strings.
  """
  self.model = model

  # Store lists of task name strings by grouping
  self.sequence_tasks = tasks["sequence"]
  self.span_tasks = tasks["span"]
  self.edge_tasks = tasks["edge"]

  self._input_spec = {
      "sentence": lit_types.TextSegment(),
  }
  self._output_spec = {
      "tokens": lit_types.Tokens(),
  }

  # Output spec based on specified tasks
  for task in self.sequence_tasks:
    self._output_spec[task] = lit_types.SequenceTags(align="tokens")
  for task in self.span_tasks:
    self._output_spec[task] = lit_types.SpanLabels(align="tokens")
  for task in self.edge_tasks:
    self._output_spec[task] = lit_types.EdgeLabels(align="tokens")

def output_spec(self) -> lit_types.Spec:
  return {
      "tokens": lit_types.Tokens(),
      "probas": lit_types.MulticlassPreds(parent="label", vocab=self._labels),
      "cls_emb": lit_types.Embeddings(),
      "token_grad_sentence": lit_types.TokenGradients(align="tokens"),
  }

def output_spec(self) -> lit_types.Spec:
  return {
      "tokens": lit_types.Tokens(),
      "logits": lit_types.RegressionScore(),
      "cls_emb": lit_types.Embeddings(),
      "token_grad_sentence": lit_types.TokenGradients(align="tokens"),
  }

def output_spec(self):
  return {
      'top_layer_embs': lit_types.TokenEmbeddings(),
      'wpm_tokens': lit_types.Tokens(),
      'offsets': lit_types.SubwordOffsets(
          align_in='tokens', align_out='wpm_tokens'),
  }

def output_spec(self):
  spec = {
      "input_tokens": lit_types.Tokens(parent="input_text"),
      "generation": lit_types.GeneratedText(parent="target_text"),
      "encoder_final_embedding": lit_types.Embeddings(),
      # If target text is given, the following will also be populated.
      "target_tokens": lit_types.Tokens(parent="target_text"),
      "pred_tokens": lit_types.TokenTopKPreds(align="target_tokens"),
      "rougeL": lit_types.Scalar(),
  }
  if self.config.output_attention:
    # Add attention for each layer.
    for i in range(self.num_layers):
      spec[f"encoder_layer_{i:d}_attention"] = lit_types.AttentionHeads(
          align=("input_tokens", "input_tokens"))
      spec[f"decoder_layer_{i:d}_attention"] = lit_types.AttentionHeads(
          align=("target_tokens", "target_tokens"))
  return spec

def output_spec(self) -> lit_types.Spec:
  return {
      "src_tokens": lit_types.Tokens(parent="src_text"),
      "trg_text": lit_types.GeneratedText(parent="ref_text"),
      "trg_tokens": lit_types.Tokens(parent="trg_text"),
      "attention": lit_types.AttentionHeads(
          align_in="src_tokens", align_out="trg_tokens"),
      "pred_tokens": lit_types.TokenTopKPreds(
          align="trg_tokens", parent="trg_text"),
      "encoder_final_embedding": lit_types.Embeddings(),
      "ter": lit_types.Scalar(),
      "chrf3": lit_types.Scalar(),
  }

def spec(self) -> lit_types.Spec:
  """Should match MLM's input_spec()."""
  return {
      'input_text': lit_types.TextSegment(),
      'target_text': lit_types.TextSegment(),
      'input_tokens': lit_types.Tokens(required=False),
      'gece_tags': lit_types.SequenceTags(
          align='input_tokens', required=False),
  }

def output_spec(self):
  spec = {
      # The "parent" keyword tells LIT which field in the input spec we
      # should compare this to when computing metrics.
      "pred_tokens": lit_types.TokenTopKPreds(align="tokens"),
      "tokens": lit_types.Tokens(parent="text"),  # all tokens
  }
  # Add attention and embeddings from each layer.
  for i in range(self.num_layers):
    spec[f"layer_{i:d}_attention"] = lit_types.AttentionHeads(
        align_in="tokens", align_out="tokens")
    spec[f"layer_{i:d}_avg_embedding"] = lit_types.Embeddings()
  return spec

def output_spec(self):
  return {
      'probas': lit_types.MulticlassPreds(
          parent='label', vocab=['0', '1'], null_idx=0),
      'input_embs': lit_types.TokenEmbeddings(align='tokens'),
      'input_embs_grad': lit_types.TokenGradients(
          align='tokens', grad_for='input_embs',
          grad_target='grad_class'),
      'tokens': lit_types.Tokens(),
      'grad_class': lit_types.CategoryLabel(vocab=['0', '1']),
  }

def output_spec(self):
  # TODO(lit-dev): also return the embeddings for each span on datasets
  # with a fixed number of targets; for Winogender this would be
  # {occupation, other participant, pronoun}
  return {
      'tokens': lit_types.Tokens(parent='text'),
      'coref': lit_types.EdgeLabels(align='tokens'),
      'pred_answer': lit_types.MulticlassPreds(
          vocab=winogender.ANSWER_VOCAB, parent='answer'),
  }

def input_spec(self) -> Spec:
  ret = {}
  ret[self.config.text_a_name] = lit_types.TextSegment()
  ret["tokens_" + self.config.text_a_name] = lit_types.Tokens(
      parent=self.config.text_a_name, required=False)
  if self.config.text_b_name:
    ret[self.config.text_b_name] = lit_types.TextSegment()
    ret["tokens_" + self.config.text_b_name] = lit_types.Tokens(
        parent=self.config.text_b_name, required=False)
  if self.is_regression:
    ret[self.config.label_name] = lit_types.RegressionScore(required=False)
  else:
    ret[self.config.label_name] = lit_types.CategoryLabel(
        required=False, vocab=self.config.labels)
  # The input_embs_ and grad_class fields are used for Integrated Gradients.
  ret["input_embs_" + self.config.text_a_name] = lit_types.TokenEmbeddings(
      align="tokens", required=False)
  if self.config.text_b_name:
    ret["input_embs_" + self.config.text_b_name] = lit_types.TokenEmbeddings(
        align="tokens", required=False)
  ret["grad_class"] = lit_types.CategoryLabel(
      required=False, vocab=self.config.labels)
  return ret

def output_spec(self) -> lit_types.Spec:
  output = {
      "input_tokens": lit_types.Tokens(parent="input_text"),
      "predicted": lit_types.GeneratedText(parent="target_text"),
      "layer_average": lit_types.AttentionHeads(
          align=("input_tokens", "input_tokens")),
  }
  for layer in range(self.ATTENTION_LAYERS):
    output["layer{}".format(layer)] = lit_types.AttentionHeads(
        align=("input_tokens", "input_tokens"))
  return output

def test_compatibility_optionals(self):
  """Test with optionals in the model spec."""
  mspec = model.ModelSpec(
      input={
          "text": types.TextSegment(),
          "tokens": types.Tokens(parent="text", required=False),
          "label": types.CategoryLabel(vocab=["0", "1"], required=False),
      },
      output={})
  dspec = {
      "text": types.TextSegment(),
      "label": types.CategoryLabel(vocab=["0", "1"]),
  }
  self.assertTrue(mspec.is_compatible_with_dataset(dspec))

def output_spec(self) -> lit_types.Spec:
  """Give the output specifications."""
  ret = {
      "tokens": lit_types.Tokens(),
      "probas": lit_types.MulticlassPreds(parent="label", vocab=self.LABELS),
      "cls_emb": lit_types.Embeddings(),
  }
  # Gradients, if requested.
  if self.compute_grads:
    ret["token_grad_sentence"] = lit_types.TokenGradients(align="tokens")
  # Attention heads, one field for each layer.
  for i in range(self.model.config.num_hidden_layers):
    ret[f"layer_{i}/attention"] = lit_types.AttentionHeads(
        align=("tokens", "tokens"))
  return ret

def test_compatibility_optionals_mismatch(self):
  """Test with optionals that don't match metadata."""
  mspec = model.ModelSpec(
      input={
          "text": types.TextSegment(),
          "tokens": types.Tokens(parent="text", required=False),
          "label": types.CategoryLabel(vocab=["0", "1"], required=False),
      },
      output={})
  dspec = {
      "text": types.TextSegment(),
      # This label field doesn't match the one the model expects.
      "label": types.CategoryLabel(vocab=["foo", "bar"]),
  }
  self.assertFalse(mspec.is_compatible_with_dataset(dspec))

def output_spec(self):
  # TODO(lit-dev): also return the embeddings for each span on datasets
  # with a fixed number of targets; for Winogender this would be
  # {occupation, other participant, pronoun}
  return {
      'tokens': lit_types.Tokens(parent='text'),
      'coref': lit_types.EdgeLabels(align='tokens'),
      'pred_answer': lit_types.MulticlassPreds(
          vocab=winogender.ANSWER_VOCAB, parent='answer'),
      # TODO(b/172975096): allow plotting of scalars from input data,
      # so we don't need to add this to the predictions.
      'pf_bls': lit_types.Scalar(),
  }

def input_spec(self):
  return {
      'text': lit_types.TextSegment(),
      'tokens': lit_types.Tokens(parent='text'),
      'coref': lit_types.EdgeLabels(align='tokens'),
      # Index of predicted (single) edge for Winogender
      'answer': lit_types.CategoryLabel(
          vocab=winogender.ANSWER_VOCAB, required=False),
      # TODO(b/172975096): allow plotting of scalars from input data,
      # so we don't need to add this to the predictions.
      'pf_bls': lit_types.Scalar(required=False),
  }

def output_spec(self) -> lit_types.Spec:
  spec = {
      "tokens": lit_types.Tokens(),
      "bio_tags": lit_types.SequenceTags(align="tokens"),
      "token_ids": lit_types.SequenceTags(align="tokens"),
      "grads": lit_types.TokenGradients(align="tokens"),
      "probas": lit_types.MulticlassPreds(
          parent="bio_tags", vocab=self.LABELS),
  }
  for i in range(self.model.config.num_hidden_layers):
    spec[f"layer_{i}/attention"] = lit_types.AttentionHeads(
        align=("tokens", "tokens"))
  return spec

def config_spec(self) -> types.Spec:
  return {
      NUM_EXAMPLES_KEY: types.TextSegment(default=str(NUM_EXAMPLES_DEFAULT)),
      MAX_FLIPS_KEY: types.TextSegment(default=str(MAX_FLIPS_DEFAULT)),
      TOKENS_TO_IGNORE_KEY: types.Tokens(default=TOKENS_TO_IGNORE_DEFAULT),
      PREDICTION_KEY: types.FieldMatcher(
          spec="output", types=["MulticlassPreds", "RegressionScore"]),
      REGRESSION_THRESH_KEY: types.TextSegment(
          default=str(REGRESSION_THRESH_DEFAULT)),
      FIELDS_TO_HOTFLIP_KEY: types.MultiFieldMatcher(
          spec="input", types=["Tokens"], select_all=True),
  }

def input_spec(self): return { "text": lit_types.TextSegment(), "tokens": lit_types.Tokens(required=False), }
def input_spec(self):
  return {'tokens': lit_types.Tokens()}

def input_spec(self): return { "text": lit_types.TextSegment(), "tokens": lit_types.Tokens(mask_token="[MASK]", required=False), }
def output_spec(self): return { "tokens": lit_types.Tokens(parent="text"), "pred_tokens": lit_types.TokenTopKPreds(align="tokens"), "cls_emb": lit_types.Embeddings(), }
def spec(self):
  return {
      'text': lit_types.TextSegment(),
      'tokens': lit_types.Tokens(parent='text'),
      'coref': lit_types.EdgeLabels(align='tokens'),
  }