def test_create_label_tensors(self):
    """Numberize maps known labels to vocab indices; unknown labels raise."""
    tensorizer = LabelTensorizer(column="label")
    initializer = tensorizer.initialize()
    initializer.send(None)  # prime the generator
    for train_row in self.data.train:
        initializer.send(train_row)
    initializer.close()
    known_rows = [
        {"label": types.Label("weather/find")},
        {"label": types.Label("alarm/set_alarm")},
    ]
    for row, expected_index in zip(known_rows, [6, 1]):
        self.assertEqual(expected_index, tensorizer.numberize(row))
    # A label never seen during initialization must fail to numberize.
    with self.assertRaises(Exception):
        tensorizer.numberize({"label": types.Label("non/existent")})
def test_initialize_label_tensorizer(self):
    """Driving initialize() over the training data should collect 7 labels."""
    label_tensorizer = LabelTensorizer(column="label")
    initializer = label_tensorizer.initialize()
    initializer.send(None)  # prime the generator
    for train_row in self.data.train:
        initializer.send(train_row)
    initializer.close()
    self.assertEqual(7, len(label_tensorizer.labels))
def test_create_word_tagging_output_layer(self):
    """Config label_weights map onto the loss weight tensor by vocab index."""
    tensorizer = LabelTensorizer()
    tensorizer.vocab = Vocabulary(["foo", "bar"])
    tensorizer.pad_idx = 0
    config = WordTaggingOutputLayer.Config(label_weights={"foo": 2.2})
    layer = WordTaggingOutputLayer.from_config(
        config=config, labels=tensorizer.vocab
    )
    # "foo" is weighted 2.2; "bar" keeps the implicit default of 1.0.
    actual_weights = layer.loss_fn.weight.detach().numpy()
    np.testing.assert_array_almost_equal(np.array([2.2, 1]), actual_weights)
def test_create_label_tensors_fails_with_unknown_label(self):
    """A batch containing an out-of-vocabulary label must raise."""
    tensorizer = LabelTensorizer(column="label")
    initializer = tensorizer.initialize()
    initializer.send(None)  # prime the generator
    for train_row in self.data.train:
        initializer.send(train_row)
    initializer.close()
    bad_batch = [
        {"label": types.Label("non/existent")},
        {"label": types.Label("alarm/set_alarm")},
    ]
    with self.assertRaises(Exception):
        tensorizer.create_training_tensors(bad_batch)
def test_create_label_tensors(self):
    """Known labels numberize into a 1-D tensor of vocab indices."""
    tensorizer = LabelTensorizer(column="label")
    initializer = tensorizer.initialize()
    initializer.send(None)  # prime the generator
    for train_row in self.data.train:
        initializer.send(train_row)
    initializer.close()
    label_names = ["weather/find", "alarm/set_alarm"]
    batch = [{"label": types.Label(name)} for name in label_names]
    tensor = tensorizer.create_training_tensors(batch)
    self.assertEqual((2,), tensor.size())
    self.assertEqual([6, 1], tensor.tolist())
class InputConfig(ConfigBase):
    """Tensorizer configuration for a two-sided (left/right) model input.

    NOTE(review): presumably each side carries its own RoBERTa token stream
    plus an optional dense-feature vector — confirm against the consuming model.
    """

    right_tokens: RoBERTaTensorizer.Config = RoBERTaTensorizer.Config()
    left_tokens: RoBERTaTensorizer.Config = RoBERTaTensorizer.Config()
    # Dense features are optional on both sides; None disables them.
    right_dense: Optional[FloatListTensorizer.Config] = None
    left_dense: Optional[FloatListTensorizer.Config] = None
    labels: LabelTensorizer.Config = LabelTensorizer.Config()
class EncoderModelInput(BaseModel.Config.ModelInput):
    """Model inputs for an encoder model: tokens, optional dense features, labels."""

    tokens: Tensorizer.Config = Tensorizer.Config()
    # Optional dense-feature vector per example; None disables it.
    dense: Optional[FloatListTensorizer.Config] = None
    labels: LabelTensorizer.Config = LabelTensorizer.Config()
    # for metric reporter
    num_tokens: NtokensTensorizer.Config = NtokensTensorizer.Config(
        names=["tokens"], indexes=[2]
    )
class EncoderPairwiseModelInput(ModelInputBase):
    """Model inputs for a pairwise encoder: two token streams plus labels."""

    tokens1: Tensorizer.Config = Tensorizer.Config()
    tokens2: Tensorizer.Config = Tensorizer.Config()
    labels: LabelTensorizer.Config = LabelTensorizer.Config()
    # for metric reporter
    num_tokens: NtokensTensorizer.Config = NtokensTensorizer.Config(
        names=["tokens1", "tokens2"], indexes=[2, 2]
    )
def test_doc_classification_output_layer(self):
    """ignore_index follows the vocab: PAD's index when present, else -1."""
    tensorizer = LabelTensorizer()

    # Vocab with an explicit PAD token: the loss ignores its index (0).
    tensorizer.vocab = Vocabulary([SpecialTokens.PAD, "foo", "bar"])
    layer = ClassificationOutputLayer.from_config(
        config=ClassificationOutputLayer.Config(loss=CrossEntropyLoss.Config()),
        labels=tensorizer.vocab,
    )
    self.assertEqual(layer.loss_fn.ignore_index, 0)

    # Vocab without PAD: fall back to the default ignore_index of -1.
    tensorizer.vocab = Vocabulary(["foo", "bar"])
    layer = ClassificationOutputLayer.from_config(
        config=ClassificationOutputLayer.Config(loss=CrossEntropyLoss.Config()),
        labels=tensorizer.vocab,
    )
    self.assertEqual(layer.loss_fn.ignore_index, -1)
def test_batch_predict_caffe2_model(self):
    """End-to-end: train a tiny doc-classification task, export to Caffe2,
    and verify batch_predict_caffe2_model matches PyTorch predictions.

    NOTE(review): exercises three cache_size settings (default, 2, -1) —
    presumably default/bounded/unbounded caching; confirm against the
    batch_predict_caffe2_model implementation.
    """
    with tempfile.NamedTemporaryFile() as snapshot_file, tempfile.NamedTemporaryFile() as caffe2_model_file:
        train_data = tests_module.test_file("train_data_tiny.tsv")
        eval_data = tests_module.test_file("test_data_tiny.tsv")
        config = PyTextConfig(
            task=DocumentClassificationTask.Config(
                model=DocModel.Config(
                    inputs=DocModel.Config.ModelInput(
                        tokens=TokenTensorizer.Config(),
                        # Single dense feature column with error checking on.
                        dense=FloatListTensorizer.Config(
                            column="dense", dim=1, error_check=True
                        ),
                        labels=LabelTensorizer.Config(),
                    )
                ),
                data=Data.Config(
                    source=TSVDataSource.Config(
                        train_filename=train_data,
                        eval_filename=eval_data,
                        test_filename=eval_data,
                        field_names=["label", "slots", "text", "dense"],
                    )
                ),
            ),
            version=21,
            save_snapshot_path=snapshot_file.name,
            export_caffe2_path=caffe2_model_file.name,
        )
        task = create_task(config.task)
        # Export the Caffe2 model, then snapshot the PyTorch model + config
        # so batch_predict_caffe2_model can load both artifacts.
        task.export(task.model, caffe2_model_file.name)
        model = task.model
        save(config, model, meta=None, tensorizers=task.data.tensorizers)
        # Reference predictions from the in-process PyTorch model.
        pt_results = task.predict(task.data.data_source.test)

        def assert_caffe2_results_correct(caffe2_results):
            # Compare each example's PyTorch scores against Caffe2 output values.
            for pt_res, res in zip(pt_results, caffe2_results):
                np.testing.assert_array_almost_equal(
                    pt_res["score"].tolist()[0],
                    [score[0] for score in res.values()],
                )

        # Default cache size.
        results = batch_predict_caffe2_model(
            snapshot_file.name, caffe2_model_file.name
        )
        self.assertEqual(4, len(results))
        assert_caffe2_results_correct(results)
        # Small bounded cache.
        results = batch_predict_caffe2_model(
            snapshot_file.name, caffe2_model_file.name, cache_size=2
        )
        self.assertEqual(4, len(results))
        assert_caffe2_results_correct(results)
        # cache_size=-1.
        results = batch_predict_caffe2_model(
            snapshot_file.name, caffe2_model_file.name, cache_size=-1
        )
        self.assertEqual(4, len(results))
        assert_caffe2_results_correct(results)
class ModelInput(Model.Config.ModelInput):
    """Inputs for a joint document-classification + word-tagging model."""

    tokens: TokenTensorizer.Config = TokenTensorizer.Config()
    # allow_unknown: labels unseen at initialization are tolerated
    # instead of raising during numberization.
    word_labels: SlotLabelTensorizer.Config = SlotLabelTensorizer.Config(
        allow_unknown=True
    )
    doc_labels: LabelTensorizer.Config = LabelTensorizer.Config(
        allow_unknown=True
    )
    # Optional weight inputs; presumably per-example loss weights — confirm
    # against the model's loss computation. None disables them.
    doc_weight: Optional[FloatTensorizer.Config] = None
    word_weight: Optional[FloatTensorizer.Config] = None
class BertModelInput(BaseModel.Config.ModelInput):
    """Inputs for a BERT-based model: capped token stream, optional dense, labels."""

    tokens: BERTTensorizer.Config = BERTTensorizer.Config(max_seq_len=128)
    # Optional dense-feature vector per example; None disables it.
    dense: Optional[FloatListTensorizer.Config] = None
    labels: LabelTensorizer.Config = LabelTensorizer.Config()
    # for metric reporter
    num_tokens: NtokensTensorizer.Config = NtokensTensorizer.Config(
        names=["tokens"], indexes=[2]
    )
class ModelInput(BasePairwiseModel.Config.ModelInput):
    """Inputs for a pairwise model over two text columns (text1/text2)."""

    tokens1: TokenTensorizer.Config = TokenTensorizer.Config(column="text1")
    tokens2: TokenTensorizer.Config = TokenTensorizer.Config(column="text2")
    labels: LabelTensorizer.Config = LabelTensorizer.Config()
    # for metric reporter
    raw_text: JoinStringTensorizer.Config = JoinStringTensorizer.Config(
        columns=["text1", "text2"]
    )
def test_create_label_tensors(self):
    """numberize maps known labels to indices and raises on unknown ones."""
    tensorizer = LabelTensorizer(label_column="label")
    self._initialize_tensorizer(tensorizer)
    expected_indices = {"weather/find": 6, "alarm/set_alarm": 1}
    for label_name, index in expected_indices.items():
        self.assertEqual(index, tensorizer.numberize({"label": label_name}))
    # A label absent from the initialized vocab must raise.
    with self.assertRaises(Exception):
        tensorizer.numberize({"label": "non/existent"})
class ModelInput(BaseModel.Config.ModelInput): squad_input: Union[ SquadForBERTTensorizer.Config, SquadForRoBERTaTensorizer.Config ] = SquadForBERTTensorizer.Config(max_seq_len=256) # is_impossible label has_answer: LabelTensorizer.Config = LabelTensorizer.Config( column="has_answer" )
def test_initialize_tensorizers(self):
    """initialize_tensorizers builds vocabs/labels from the training data."""
    token_tensorizer = WordTensorizer(column="text")
    label_tensorizer = LabelTensorizer(column="label")
    char_tensorizer = CharacterTensorizer(column="text")
    tensorizers = {
        "tokens": token_tensorizer,
        "labels": label_tensorizer,
        "chars": char_tensorizer,
    }
    initialize_tensorizers(tensorizers, self.data.train)
    # The tiny training set yields 49 distinct tokens and 7 labels.
    self.assertEqual(49, len(token_tensorizer.vocab))
    self.assertEqual(7, len(label_tensorizer.labels))
def test_initialize_tensorizers(self):
    """initialize_tensorizers builds each tensorizer's vocab from training data."""
    token_tensorizer = TokenTensorizer(text_column="text")
    label_tensorizer = LabelTensorizer(label_column="label")
    byte_tensorizer = ByteTensorizer(text_column="text")
    tensorizers = {
        "tokens": token_tensorizer,
        "labels": label_tensorizer,
        "chars": byte_tensorizer,
    }
    initialize_tensorizers(tensorizers, self.data.train)
    # The tiny training set yields 49 distinct tokens and 7 labels.
    self.assertEqual(49, len(token_tensorizer.vocab))
    self.assertEqual(7, len(label_tensorizer.vocab))
class ModelInput(ModelInputBase):
    """Inputs for a pairwise BERT model over two text columns."""

    tokens1: BERTTensorizerBase.Config = BERTTensorizer.Config(
        columns=["text1"], max_seq_len=128
    )
    tokens2: BERTTensorizerBase.Config = BERTTensorizer.Config(
        columns=["text2"], max_seq_len=128
    )
    labels: LabelTensorizer.Config = LabelTensorizer.Config()
    # for metric reporter
    num_tokens: NtokensTensorizer.Config = NtokensTensorizer.Config(
        names=["tokens1", "tokens2"], indexes=[2, 2]
    )
class ModelInput(Model.Config.ModelInput):
    """Inputs for a joint document + word tagging model with weight columns."""

    tokens: TokenTensorizer.Config = TokenTensorizer.Config()
    # allow_unknown: labels unseen at initialization are tolerated
    # instead of raising during numberization.
    word_labels: SlotLabelTensorizer.Config = SlotLabelTensorizer.Config(
        allow_unknown=True
    )
    doc_labels: LabelTensorizer.Config = LabelTensorizer.Config(
        allow_unknown=True
    )
    # Weight inputs read from dedicated columns; presumably per-example
    # loss weights — confirm against the model's loss computation.
    doc_weight: FloatTensorizer.Config = FloatTensorizer.Config(
        column="doc_weight"
    )
    word_weight: FloatTensorizer.Config = FloatTensorizer.Config(
        column="word_weight"
    )
def test_data_initializes_tensorsizers(self):
    """Constructing Data should initialize every tensorizer it is given."""
    token_tensorizer = TokenTensorizer(text_column="text")
    label_tensorizer = LabelTensorizer(label_column="label")
    tensorizers = {"tokens": token_tensorizer, "labels": label_tensorizer}
    # Not initialized yet: the vocab is built only once Data drives
    # initialization over the data source.
    assert token_tensorizer.vocab is None
    Data(self.data_source, tensorizers)
    # Tensorizers should have been initialized.
    self.assertEqual(49, len(token_tensorizer.vocab))
    self.assertEqual(7, len(label_tensorizer.vocab))
def test_data_initializes_tensorsizers(self):
    """Constructing Data should initialize every tensorizer it is given."""
    token_tensorizer = WordTensorizer(column="text")
    label_tensorizer = LabelTensorizer(column="label")
    tensorizers = {"tokens": token_tensorizer, "labels": label_tensorizer}
    # Before Data runs initialization, the vocab attribute does not exist yet.
    with self.assertRaises(AttributeError):
        token_tensorizer.vocab
    Data(self.data_source, tensorizers)
    # Tensorizers should have been initialized.
    self.assertEqual(49, len(token_tensorizer.vocab))
    self.assertEqual(7, len(label_tensorizer.labels))
def setUp(self):
    """Build a TSV data source over the tiny dense-features fixtures."""
    train_file = SafeFileWrapper(
        tests_module.test_file("train_dense_features_tiny.tsv")
    )
    test_file = SafeFileWrapper(
        tests_module.test_file("test_dense_features_tiny.tsv")
    )
    self.data_source = TSVDataSource(
        train_file,
        test_file,
        eval_file=None,
        field_names=["label", "slots", "text", "dense"],
        schema={"text": types.Text, "label": types.Label},
    )
    self.tensorizers = {
        "tokens": WordTensorizer(column="text"),
        "labels": LabelTensorizer(column="label", allow_unknown=True),
    }
class ModelInput(BasePairwiseModel.Config.ModelInput):
    """Inputs for a pairwise model over two text columns (text1/text2)."""

    tokens1: TokenTensorizer.Config = TokenTensorizer.Config(
        column="text1"
    )
    tokens2: TokenTensorizer.Config = TokenTensorizer.Config(
        column="text2"
    )
    labels: LabelTensorizer.Config = LabelTensorizer.Config()
class InputConfig(ConfigBase):
    """Tensorizer configuration: RoBERTa tokens, optional dense features, labels."""

    tokens: RoBERTaTensorizer.Config = RoBERTaTensorizer.Config()
    # Dense feature inputs are optional; annotate as Optional so the
    # annotation matches the None default (a bare `FloatListTensorizer.Config
    # = None` misleads type checkers and config validation). This mirrors
    # the sibling InputConfig/ModelInput classes that already use Optional.
    right_dense: Optional[FloatListTensorizer.Config] = None
    left_dense: Optional[FloatListTensorizer.Config] = None
    labels: LabelTensorizer.Config = LabelTensorizer.Config()
class ModelInput(Model.Config.ModelInput):
    """Inputs for a document classification model with raw-text passthrough."""

    tokens: WordTensorizer.Config = WordTensorizer.Config()
    # allow_unknown: labels unseen at initialization are tolerated
    # instead of raising during numberization.
    labels: LabelTensorizer.Config = LabelTensorizer.Config(
        allow_unknown=True
    )
    # for metric reporter
    raw_text: MetaInput.Config = MetaInput.Config(column="text")
class ModelInput(Model.Config.ModelInput):
    """Minimal model inputs: a token stream and document labels."""

    tokens: TokenTensorizer.Config = TokenTensorizer.Config()
    labels: LabelTensorizer.Config = LabelTensorizer.Config()
class ModelInput(Model.Config.ModelInput):
    """Model inputs: tokens, optional dense features, and document labels."""

    tokens: TokenTensorizer.Config = TokenTensorizer.Config()
    # Optional dense-feature vector per example; None disables it.
    dense: Optional[FloatListTensorizer.Config] = None
    labels: LabelTensorizer.Config = LabelTensorizer.Config()
def test_initialize_label_tensorizer(self):
    """After initialization the label vocab should hold all 7 labels."""
    label_tensorizer = LabelTensorizer(label_column="label")
    self._initialize_tensorizer(label_tensorizer)
    self.assertEqual(7, len(label_tensorizer.vocab))
class ModelInput(Model.Config.ModelInput):
    """Model inputs: tokens, unknown-tolerant labels, raw text for reporting."""

    tokens: TokenTensorizer.Config = TokenTensorizer.Config()
    # allow_unknown: labels unseen at initialization are tolerated
    # instead of raising during numberization.
    labels: LabelTensorizer.Config = LabelTensorizer.Config(allow_unknown=True)
    # for metric reporter
    raw_text: RawString.Config = RawString.Config(column="text")
class InputConfig(ConfigBase):
    """Tensorizer configuration: a RoBERTa token stream plus document labels."""

    tokens: RoBERTaTensorizer.Config = RoBERTaTensorizer.Config()
    labels: LabelTensorizer.Config = LabelTensorizer.Config()