def test_fit_predict_multi_model(self):
    """
    Smoke-test the multi-label sequence labeler end to end.

    Fits on a 90/10 split, checks the nesting of the ``predict_proba``
    output, then saves the model and predicts again with the reloaded copy.
    """
    self.model = SequenceLabeler(
        batch_size=2,
        max_length=256,
        lm_loss_coef=0.0,
        multi_label_sequences=True,
    )

    pad_token = self.model.config.pad_token
    joined_docs = ["".join(chunks) for chunks in self.texts]
    texts, annotations = finetune_to_indico_sequence(
        joined_docs, self.texts, self.labels, none_value=pad_token
    )
    train_texts, test_texts, train_annotations, _ = train_test_split(
        texts, annotations, test_size=0.1
    )

    self.model.fit(train_texts, train_annotations)
    self.model.predict(test_texts)

    # predict_proba is expected to be list -> list -> dict -> dict.
    probas = self.model.predict_proba(test_texts)
    self.assertIsInstance(probas, list)
    first_doc = probas[0]
    self.assertIsInstance(first_doc, list)
    first_span = first_doc[0]
    self.assertIsInstance(first_span, dict)
    self.assertIsInstance(first_span['confidence'], dict)

    # Round-trip through disk and make sure the reloaded model still predicts.
    self.model.save(self.save_file)
    reloaded = SequenceLabeler.load(self.save_file)
    reloaded.predict(test_texts)
def __init__(self, *args, **kwargs):
    """
    Configure and build a SequenceLabeler that uses auxiliary context.

    The defaults below are merged into ``self.model_config`` (presumably
    created by the parent constructor -- confirm), then the caller's keyword
    arguments are merged on top, so explicit kwargs win over these defaults.
    """
    super().__init__(*args, **kwargs)

    # general params that differ from finetune
    general_overrides = {
        "base_model": RoBERTa,
        "batch_size": 4,
        "predict_batch_size": 10,
        "val_size": 0.0,
        "crf_sequence_labeling": False,
        "low_memory_mode": True,
        "class_weights": "log",
    }
    # auxiliary-specific params
    auxiliary_overrides = {
        "use_auxiliary_info": True,
        "context_dim": 4,
        "default_context": {
            'left': 0,
            'right': 0,
            'top': 0,
            'bottom': 0,
        },
        "n_context_embed_per_channel": 48,
        "context_in_base_model": True,
        "n_layers_with_aux": -1,
    }

    self.model_config.update(general_overrides)
    self.model_config.update(auxiliary_overrides)
    # Caller-supplied settings are applied last and therefore take precedence.
    self.model_config.update(kwargs)

    self.model = SequenceLabeler(**self.model_config)
def setUp(self):
    """Seed both RNGs, load the processed corpus and build a default-config model."""
    # Fixed seeds for the stdlib and NumPy generators.
    random.seed(42)
    np.random.seed(42)

    self.save_file = 'tests/saved-models/test-save-load'

    with open(self.processed_path, 'rt') as fp:
        corpus = json.load(fp)
    self.texts, self.labels = corpus

    config = self.default_config()
    self.model = SequenceLabeler(**config)
def setUp(self):
    """Load the processed corpus, reset the TensorFlow graph and build a quiet model."""
    self.save_file = 'tests/saved-models/test-save-load'

    with open(self.processed_path, 'rt') as fp:
        corpus = json.load(fp)
    self.texts, self.labels = corpus

    # Reset the default graph before constructing this test's model.
    tf.reset_default_graph()

    model_settings = {"batch_size": 2, "max_length": 256, "verbose": False}
    self.model = SequenceLabeler(**model_settings)
def test_sequence_labeler_auxiliary(self):
    """
    Fit and predict with auxiliary context, then validate the predictions.

    Training and prediction both use the training set, and the evaluation
    helper is told that context was supplied.
    """
    # here we want to make sure we're actually using context
    config = self.default_config(n_epochs=1500)
    model = SequenceLabeler(**config)

    context = self.train_context
    model.fit(self.trainX, self.trainY_seq, context=context)
    preds = model.predict(self.trainX, context=context)

    self._evaluate_sequence_preds(preds, includes_context=True)
def test_sequence_labeler_no_auxiliary(self):
    """
    Fit and predict with auxiliary info disabled, then validate the predictions.

    Training and prediction both use the training set, and the evaluation
    helper is told that no context was supplied.
    """
    # NOTE(review): the validation set uses self.trainY while fit() uses
    # self.trainY_seq -- confirm that the two are meant to differ.
    validation_set = (self.trainX, self.trainY)
    config = self.default_config(
        use_auxiliary_info=False,
        val_set=validation_set,
    )
    model = SequenceLabeler(**config)

    model.fit(self.trainX, self.trainY_seq)
    preds = model.predict(self.trainX)

    self._evaluate_sequence_preds(preds, includes_context=False)
class FinetuneSequenceLabel(ClassificationExperiment):
    """Experiment wrapper that delegates fitting and prediction to a SequenceLabeler."""

    def __init__(self, *args, **kwargs):
        """Forward all arguments to the parent and create the wrapped model with ``val_size=0``."""
        super().__init__(*args, **kwargs)
        labeler = SequenceLabeler(val_size=0)
        self.model = labeler

    def fit(self, X, y):
        """Train the wrapped model on inputs ``X`` with targets ``y``."""
        labeler = self.model
        labeler.fit(X, y)

    def predict(self, X, **kwargs):
        """Return the wrapped model's predictions for ``X``; extra keyword arguments are ignored."""
        predictions = self.model.predict(X)
        return predictions
def __init__(self, *args, **kwargs):
    """
    Build a SequenceLabeler with every auxiliary-context option switched off.

    ``self.model_config`` is replaced outright by the defaults below and the
    caller's keyword arguments are then merged on top of them.
    """
    super().__init__(*args, **kwargs)

    config = {
        "use_auxiliary_info": False,
        "n_layers_with_aux": 0,
        "context_in_base_model": False,
        "context_dim": 0,
    }
    # Explicit kwargs override the no-auxiliary defaults.
    config.update(kwargs)

    self.model_config = config
    self.model = SequenceLabeler(**self.model_config)
def setUpClass(cls):
    """
    Prepare the shared datasets and trained models for the whole test class.

    Each model is loaded from its saved path when one exists; on
    ``FileNotFoundError`` it is built, fitted and saved there instead.
    The comparison regressor and its train/test split are only prepared
    when ``cls.do_comparison`` is truthy.
    """
    cls._download_data()

    # Dataset preparation.
    cls.classifier_dataset = pd.read_csv(cls.classifier_dataset_path, nrows=cls.n_sample * 10)

    path = os.path.join(os.path.dirname(__file__), "data", "testdata.json")
    with open(path, 'rt') as fp:
        cls.texts, cls.labels = json.load(fp)

    cls.animals = ["dog", "cat", "horse", "cow", "pig", "sheep", "goat", "chicken", "guinea pig",
                   "donkey", "turkey", "duck", "camel", "goose", "llama", "rabbit", "fox"]
    cls.numbers = ["one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten",
                   "eleven", "twelve", "thirteen", "fourteen", "fifteen", "sixteen"]

    # Train and save the sequence labeler for later use.
    try:
        cls.s = SequenceLabeler.load(cls.sequence_labeler_path, **cls.default_seq_config(cls))
    except FileNotFoundError:
        cls.s = SequenceLabeler(**cls.default_seq_config(cls))
        cls.s.fit(cls.texts * 10, cls.labels * 10)
        cls.s.save(cls.sequence_labeler_path)

    # Train and save the classifier for later use. The sample is drawn
    # before the load attempt, exactly as in the original ordering.
    train_sample = cls.classifier_dataset.sample(n=cls.n_sample * 10)
    try:
        cls.cl = Classifier.load(cls.classifier_path)
    except FileNotFoundError:
        cls.cl = Classifier(**cls.default_config(cls))
        cls.cl.fit(train_sample.Text, train_sample.Target)
        cls.cl.save(cls.classifier_path)

    if cls.do_comparison:
        # Train and save the comparison regressor for later use.
        # No throwaway ComparisonRegressor() is created up front: both
        # branches of the try/except below assign cls.cr.
        n_per = 150
        similar = []
        different = []
        # "Similar" pairs come from one vocabulary, n_per // 2 per vocabulary.
        for dataset in [cls.animals, cls.numbers]:
            for _ in range(n_per // 2):
                similar.append([random.choice(dataset), random.choice(dataset)])
        # "Different" pairs take one word from each vocabulary.
        for _ in range(n_per):
            different.append([random.choice(cls.animals), random.choice(cls.numbers)])

        targets = np.asarray([1] * len(similar) + [0] * len(different))
        data = similar + different

        cls.x_tr, cls.x_te, cls.t_tr, cls.t_te = train_test_split(
            data, targets, test_size=0.3, random_state=42)

        try:
            cls.cr = ComparisonRegressor.load(cls.comparison_regressor_path, **cls.default_config(cls))
        except FileNotFoundError:
            cls.cr = ComparisonRegressor(**cls.default_config(cls))
            cls.cr.fit(cls.x_tr, cls.t_tr)
            cls.cr.save(cls.comparison_regressor_path)
def test_fit_predict(self):
    """
    Ensure model training does not error out
    Ensure model returns predictions
    Ensure class reweighting behaves as intended
    """
    pad_token = self.model.config.pad_token
    joined_docs = ["".join(chunks) for chunks in self.texts]
    texts, annotations = finetune_to_indico_sequence(
        joined_docs, self.texts, self.labels, none_value=pad_token
    )
    train_texts, test_texts, train_annotations, test_annotations = train_test_split(
        texts, annotations, test_size=0.1
    )

    # First train a model with the "Named Entity" class weighted 100x.
    boosted_config = self.default_config(class_weights={"Named Entity": 100.0})
    reweighted_model = SequenceLabeler(**boosted_config)
    reweighted_model.fit(train_texts, train_annotations)
    reweighted_predictions = reweighted_model.predict(test_texts)
    reweighted_token_recall = sequence_labeling_token_recall(
        test_annotations, reweighted_predictions
    )

    # Then train the default model from setUp on the same split.
    self.model.fit(train_texts, train_annotations)
    predictions = self.model.predict(test_texts)

    # predict_proba is expected to be list -> list -> dict -> dict.
    probas = self.model.predict_proba(test_texts)
    self.assertIsInstance(probas, list)
    first_doc = probas[0]
    self.assertIsInstance(first_doc, list)
    first_span = first_doc[0]
    self.assertIsInstance(first_span, dict)
    self.assertIsInstance(first_span["confidence"], dict)

    token_precision = sequence_labeling_token_precision(test_annotations, predictions)
    token_recall = sequence_labeling_token_recall(test_annotations, predictions)
    overlap_precision = sequence_labeling_overlap_precision(test_annotations, predictions)
    overlap_recall = sequence_labeling_overlap_recall(test_annotations, predictions)

    for per_class_metric in (token_precision, token_recall, overlap_precision, overlap_recall):
        self.assertIn("Named Entity", per_class_metric)

    self.model.save(self.save_file)

    # The reweighted model must recall more "Named Entity" tokens.
    self.assertGreater(
        reweighted_token_recall["Named Entity"], token_recall["Named Entity"]
    )
def test_train_test_tokenization_consistency(self):
    """
    Check that encoding text with and without labels yields matching tokens.

    For every base model and both multi-label settings, the first element of
    each token-id entry must be identical between the labelled ("train")
    and unlabelled ("test") encodings of the same documents.
    """
    csv_path = os.path.abspath(
        os.path.join(os.path.dirname(__file__), 'data', 'testdata.csv'))
    frame = pd.read_csv(csv_path)

    X = []
    Y = []
    for _, row in frame.iterrows():
        doc_text = row["text"]
        X.append(doc_text)
        spans = json.loads(row["question_843"])
        # Add the start/end/text keys derived from startOffset/endOffset.
        for span in spans:
            span['start'] = span['startOffset']
            span['end'] = span['endOffset']
            span['text'] = doc_text[span['start']:span['end']]
        Y.append(spans)

    for multilabel_setting in (True, False):
        for base_model in (GPT, GPT2, BERT):
            model = SequenceLabeler(
                chunk_long_sequences=True,
                base_model=base_model,
                multi_label_sequences=multilabel_setting,
            )
            train_encoded = list(
                model.input_pipeline._text_to_ids(
                    X, Y=Y, pad_token=model.config.pad_token))
            test_encoded = list(model.input_pipeline._text_to_ids(X))

            for chunk_id, train_chunk in enumerate(train_encoded):
                test_chunk = test_encoded[chunk_id]
                token_pairs = zip(train_chunk.token_ids, test_chunk.token_ids)
                for train_token_ids, test_token_ids in token_pairs:
                    self.assertEqual(train_token_ids[0], test_token_ids[0])
def test_fit_lm_only(self):
    """
    Fit on raw text alone, then on text plus annotations, and verify the
    model still predicts, reports metrics and survives a save/load cycle.
    """
    joined_docs = ["".join(chunks) for chunks in self.texts]
    texts, annotations = finetune_to_indico_sequence(
        joined_docs, self.texts, self.labels)
    train_texts, test_texts, train_annotations, test_annotations = train_test_split(
        texts, annotations, test_size=0.1)

    # First pass has no targets; the second pass is supervised.
    self.model.fit(train_texts)
    self.model.fit(train_texts, train_annotations)

    predictions = self.model.predict(test_texts)

    # predict_proba is expected to be list -> list -> dict -> dict.
    probas = self.model.predict_proba(test_texts)
    self.assertIsInstance(probas, list)
    first_doc = probas[0]
    self.assertIsInstance(first_doc, list)
    first_span = first_doc[0]
    self.assertIsInstance(first_span, dict)
    self.assertIsInstance(first_span['confidence'], dict)

    token_precision = sequence_labeling_token_precision(test_annotations, predictions)
    token_recall = sequence_labeling_token_recall(test_annotations, predictions)
    overlap_precision = sequence_labeling_overlap_precision(test_annotations, predictions)
    overlap_recall = sequence_labeling_overlap_recall(test_annotations, predictions)

    for per_class_metric in (token_precision, token_recall, overlap_precision, overlap_recall):
        self.assertIn('Named Entity', per_class_metric)

    # Round-trip through disk and predict with the reloaded model.
    self.model.save(self.save_file)
    reloaded = SequenceLabeler.load(self.save_file)
    predictions = reloaded.predict(test_texts)
def test_auxiliary_sequence_labeler(self):
    """
    Ensure model training does not error out
    Ensure model returns reasonable predictions

    "Reasonable" here means that the per-class token precision and recall,
    averaged over classes, both exceed 0.6 on the held-out split.
    """
    (trainX, testX, trainY, testY) = self.dataset

    model = SequenceLabeler(**self.default_config())
    model.fit(trainX, trainY)
    preds = model.predict(testX)

    # The metric helpers take the ground truth first and the predictions
    # second, as at the other call sites in this test suite. Passing them
    # the other way round swaps the meaning of precision and recall.
    token_precision = sequence_labeling_token_precision(testY, preds)
    token_recall = sequence_labeling_token_recall(testY, preds)

    self.assertIn("Named Entity", token_precision)
    self.assertIn("Named Entity", token_recall)

    # Collapse the per-class dictionaries into a single averaged score each.
    mean_precision = np.mean(list(token_precision.values()))
    mean_recall = np.mean(list(token_recall.values()))

    self.assertGreater(mean_precision, 0.6)
    self.assertGreater(mean_recall, 0.6)
class RoBERTaSeqLab(SidekickSeqLab):
    """Sequence-labeling experiment that drops the context half of each input pair."""

    def __init__(self, *args, **kwargs):
        """
        Build a SequenceLabeler with every auxiliary-context option switched off.

        ``self.model_config`` is replaced by the defaults below, with the
        caller's keyword arguments merged on top.
        """
        super().__init__(*args, **kwargs)
        config = {
            "use_auxiliary_info": False,
            "n_layers_with_aux": 0,
            "context_in_base_model": False,
            "context_dim": 0,
        }
        config.update(kwargs)
        self.model_config = config
        self.model = SequenceLabeler(**self.model_config)

    def fit(self, X, y):
        """Train on the text half of each ``(text, context)`` pair in ``X``."""
        # The context half is unpacked but never passed to the model.
        text, _context = zip(*X)
        self.model.fit(text, y)

    def predict(self, X, **kwargs):
        """Predict from the text half of each ``(text, context)`` pair; kwargs are ignored."""
        text, _context = zip(*X)
        predictions = self.model.predict(text)
        return predictions
class FinetuneSeqBaselineRationalized(ClassificationExperiment):
    """Classification baseline implemented with a SequenceLabeler over rationale spans."""

    param_grid = {}

    def __init__(self, *args, **kwargs):
        """Initialize internal classifier."""
        super().__init__(*args, auto_resample=False, **kwargs)
        self.model = SequenceLabeler(val_size=0)

    def fit(self, X, y):
        """
        Convert each target into labelled spans, resample, and train.

        Each element of ``y`` is indexed as ``(spans, label)``. When spans are
        present, every span is copied with its ``"label"`` set to the
        document label; otherwise one span covering the whole text is used.
        """
        def spans_for(text, target):
            rationales, doc_label = target[0], target[1]
            if rationales:
                return [{**span, "label": doc_label} for span in rationales]
            # No rationale given: label the whole document.
            return [{
                "start": 0,
                "end": len(text),
                "label": doc_label,
                "text": text
            }]

        targets = [spans_for(text, target) for text, target in zip(X, y)]

        # Resample by position so texts and span targets stay aligned.
        positions = list(range(len(X)))
        doc_labels = [target[1] for target in y]
        idxs, _ = self.resample(positions, doc_labels)

        selected = [(X[i], targets[i]) for i in idxs]
        train_x = [pair[0] for pair in selected]
        train_y = [pair[1] for pair in selected]
        self.model.fit(train_x, train_y)

    def predict(self, X, **kwargs):
        """
        Return a DataFrame with one row per sample and one column per class.

        Each cell is ``safe_mean`` of that class's confidence over the
        sample's predicted spans, plus ``1e-10``.
        """
        preds = self.model.predict_proba(X)

        # Work on a copy of the encoder's classes, minus the padding class.
        classes = self.model.input_pipeline.label_encoder.classes_[:]
        classes.remove("<PAD>")

        output = [
            {
                k: safe_mean([s["confidence"][k] for s in sample]) + 1e-10
                for k in classes
            }
            for sample in preds
        ]
        return pd.DataFrame.from_records(output)

    def cleanup(self):
        """Drop the reference to the wrapped model."""
        del self.model
def test_fit_predict(self):
    """
    Fit on a default train/test split, predict, then confirm that a model
    reloaded from disk can predict as well.
    """
    joined_docs = ["".join(chunks) for chunks in self.texts]
    texts, annotations = finetune_to_indico_sequence(joined_docs, self.texts, self.labels)
    train_texts, test_texts, train_annotations, test_annotations = train_test_split(
        texts, annotations)

    self.model.fit(train_texts, train_annotations)
    predictions = self.model.predict(test_texts)

    # Round-trip through disk and predict with the reloaded model.
    self.model.save(self.save_file)
    reloaded = SequenceLabeler.load(self.save_file)
    predictions = reloaded.predict(test_texts)
def __init__(self, *args, **kwargs):
    """
    Build a SequenceLabeler with ``pos_injection`` enabled.

    The overrides below are merged into ``self.model_config`` (presumably
    created by the parent constructor -- confirm), followed by the caller's
    keyword arguments, which therefore take precedence.
    """
    super().__init__(*args, **kwargs)

    overrides = {
        "pos_injection": True,
        "n_layers_with_aux": 0,
        "context_in_base_model": False,
    }
    self.model_config.update(overrides)
    self.model_config.update(kwargs)

    self.model = SequenceLabeler(**self.model_config)
class TestSequenceLabeler(unittest.TestCase):
    """End-to-end tests for ``SequenceLabeler`` on the Reuters named-entity corpus."""

    n_sample = 100
    n_hidden = 768
    dataset_path = os.path.join('Data', 'Sequence', 'reuters.xml')
    processed_path = os.path.join('Data', 'Sequence', 'reuters.json')

    @classmethod
    def _download_reuters(cls):
        """
        Download the Reuters XML corpus if absent and write a processed JSON copy.

        The JSON file at ``cls.processed_path`` holds ``(docs, docs_labels)``:
        per document, the list of text chunks and the parallel list of labels
        (``"Named Entity"`` or ``"<PAD>"``).

        Raises:
            requests.HTTPError: if the download responds with an error status.
        """
        path = Path(cls.dataset_path)
        if not path.exists():
            path.parent.mkdir(parents=True, exist_ok=True)
            url = "https://raw.githubusercontent.com/dice-group/n3-collection/master/reuters.xml"
            r = requests.get(url)
            # Fail loudly instead of caching an HTTP error page as the dataset.
            r.raise_for_status()
            with open(cls.dataset_path, "wb") as fp:
                fp.write(r.content)

        with codecs.open(cls.dataset_path, "r", "utf-8") as infile:
            soup = bs(infile, "html5lib")

        docs = []
        docs_labels = []
        for elem in soup.find_all("document"):
            texts = []
            labels = []

            # Loop through each child of the element under "textwithnamedentities"
            for c in elem.find("textwithnamedentities").children:
                if isinstance(c, Tag):
                    if c.name == "namedentityintext":
                        label = "Named Entity"  # part of a named entity
                    else:
                        label = "<PAD>"  # irrelevant word
                    texts.append(c.text)
                    labels.append(label)

            docs.append(texts)
            docs_labels.append(labels)

        with open(cls.processed_path, 'wt') as fp:
            json.dump((docs, docs_labels), fp)

    @classmethod
    def setUpClass(cls):
        """Make sure the processed Reuters data exists before any test runs."""
        cls._download_reuters()

    def setUp(self):
        """Load the processed corpus, reset the TF graph and build a fresh model."""
        self.save_file = 'tests/saved-models/test-save-load'
        with open(self.processed_path, 'rt') as fp:
            self.texts, self.labels = json.load(fp)

        tf.reset_default_graph()

        self.model = SequenceLabeler(batch_size=2, max_length=256, lm_loss_coef=0.0, verbose=False)

    def _split_reuters(self):
        """Convert the corpus to annotation format and return a 90/10 train/test split."""
        raw_docs = ["".join(text) for text in self.texts]
        texts, annotations = finetune_to_indico_sequence(
            raw_docs, self.texts, self.labels)
        return train_test_split(texts, annotations, test_size=0.1)

    def _check_predictions_and_reload(self, test_texts, test_annotations):
        """Validate predictions, probabilities and metrics, then save/load and predict."""
        predictions = self.model.predict(test_texts)

        # predict_proba is expected to be list -> list -> dict -> dict.
        probas = self.model.predict_proba(test_texts)
        self.assertIsInstance(probas, list)
        self.assertIsInstance(probas[0], list)
        self.assertIsInstance(probas[0][0], dict)
        self.assertIsInstance(probas[0][0]['confidence'], dict)

        token_precision = sequence_labeling_token_precision(
            test_annotations, predictions)
        token_recall = sequence_labeling_token_recall(test_annotations,
                                                      predictions)
        overlap_precision = sequence_labeling_overlap_precision(
            test_annotations, predictions)
        overlap_recall = sequence_labeling_overlap_recall(
            test_annotations, predictions)

        self.assertIn('Named Entity', token_precision)
        self.assertIn('Named Entity', token_recall)
        self.assertIn('Named Entity', overlap_precision)
        self.assertIn('Named Entity', overlap_recall)

        self.model.save(self.save_file)
        model = SequenceLabeler.load(self.save_file)
        model.predict(test_texts)

    def _assert_mismatched_annotation_rejected(self):
        """fit() must raise ValueError when an annotation's text does not match its offsets."""
        with self.assertRaises(ValueError):
            self.model.fit(["Text about a dog."], [[{
                "start": 0,
                "end": 5,
                "text": "cat",
                "label": "dog"
            }]])

    def _finetune_on_testdata(self):
        """Finetune ``self.model`` on ten repetitions of the bundled ``testdata.json``."""
        path = os.path.join(os.path.dirname(__file__), "testdata.json")
        with open(path, "rt") as fp:
            text, labels = json.load(fp)
        self.model.finetune(text * 10, labels * 10)

    def _assert_dog_found(self, predictions):
        """At least one predicted span of the first document must be exactly "dog"."""
        self.assertTrue(any(pred["text"] == "dog" for pred in predictions[0]))

    def test_fit_lm_only(self):
        """
        Ensure model training does not error out
        Ensure model returns predictions
        """
        train_texts, test_texts, train_annotations, test_annotations = self._split_reuters()
        # First pass has no targets; the second pass is supervised.
        self.model.fit(train_texts)
        self.model.fit(train_texts, train_annotations)
        self._check_predictions_and_reload(test_texts, test_annotations)

    def test_fit_predict(self):
        """
        Ensure model training does not error out
        Ensure model returns predictions
        """
        train_texts, test_texts, train_annotations, test_annotations = self._split_reuters()
        self.model.fit(train_texts, train_annotations)
        self._check_predictions_and_reload(test_texts, test_annotations)

    def test_reasonable_predictions(self):
        """
        Ensure the finetuned model finds "dog" in a short sentence, both with
        the default configuration and with ``subtoken_predictions`` enabled.
        """
        test_sequence = [
            "I am a dog. A dog that's incredibly bright. I can talk, read, and write!"
        ]

        # test ValueError raised when raw text is passed along with character idxs and doesn't match
        self._assert_mismatched_annotation_rejected()

        self._finetune_on_testdata()

        predictions = self.model.predict(test_sequence)
        self.assertTrue(1 <= len(predictions[0]) <= 3)
        self._assert_dog_found(predictions)

        self.model.config.subtoken_predictions = True
        predictions = self.model.predict(test_sequence)
        self.assertTrue(1 <= len(predictions[0]) <= 3)
        self._assert_dog_found(predictions)

    def test_chunk_long_sequences(self):
        """
        Ensure a document longer than ``max_length`` is chunked and still
        yields the expected 20 predictions, including "dog".
        """
        test_sequence = [
            "I am a dog. A dog that's incredibly bright. I can talk, read, and write!" * 10
        ]

        self.model.config.chunk_long_sequences = True
        self.model.config.max_length = 18

        # test ValueError raised when raw text is passed along with character idxs and doesn't match
        self._assert_mismatched_annotation_rejected()

        self._finetune_on_testdata()

        predictions = self.model.predict(test_sequence)
        self.assertEqual(len(predictions[0]), 20)
        self._assert_dog_found(predictions)
def __init__(self, *args, **kwargs):
    """Initialize internal classifier."""
    # The parent is always told not to resample automatically.
    super().__init__(*args, auto_resample=False, **kwargs)
    labeler = SequenceLabeler(val_size=0)
    self.model = labeler
os.remove(XML_PATH) raw_texts = ["".join(doc) for doc in docs] texts, annotations = finetune_to_indico_sequence( raw_texts, docs, docs_labels) df = pd.DataFrame({ 'texts': texts, 'annotations': [json.dumps(annotation) for annotation in annotations] }) df.to_csv(DATA_PATH) if __name__ == "__main__": dataset = Reuters(nrows=1000).dataframe dataset['annotations'] = [ json.loads(annotation) for annotation in dataset['annotations'] ] trainX, testX, trainY, testY = train_test_split(dataset.texts.values, dataset.annotations.values, test_size=0.3, random_state=42) model = SequenceLabeler(verbose=False, max_length=64, chunk_long_sequences=True) model.fit(trainX, trainY) predictions = model.predict(testX) n_sample = 10 for i in range(n_sample): print(testX[i], predictions[i])
class TestSequenceLabeler(unittest.TestCase):
    """End-to-end tests for ``SequenceLabeler`` on the Reuters named-entity corpus."""

    n_sample = 100
    dataset_path = os.path.join('Data', 'Sequence', 'reuters.xml')
    processed_path = os.path.join('Data', 'Sequence', 'reuters.json')

    @classmethod
    def _download_reuters(cls):
        """
        Download the Reuters XML corpus if absent and write a processed JSON copy.

        The JSON file at ``cls.processed_path`` holds ``(docs, docs_labels)``:
        per document, the list of text chunks and the parallel list of labels
        (``"Named Entity"`` or ``"<PAD>"``).

        Raises:
            requests.HTTPError: if the download responds with an error status.
        """
        path = Path(cls.dataset_path)
        if not path.exists():
            path.parent.mkdir(parents=True, exist_ok=True)
            url = "https://raw.githubusercontent.com/dice-group/n3-collection/master/reuters.xml"
            r = requests.get(url)
            # Fail loudly instead of caching an HTTP error page as the dataset.
            r.raise_for_status()
            with open(cls.dataset_path, "wb") as fp:
                fp.write(r.content)

        with codecs.open(cls.dataset_path, "r", "utf-8") as infile:
            soup = bs(infile, "html.parser")

        docs = []
        docs_labels = []
        for elem in soup.find_all("document"):
            texts = []
            labels = []

            # Loop through each child of the element under "textwithnamedentities"
            for c in elem.find("textwithnamedentities").children:
                if isinstance(c, Tag):
                    if c.name == "namedentityintext":
                        label = "Named Entity"  # part of a named entity
                    else:
                        label = "<PAD>"  # irrelevant word
                    texts.append(c.text)
                    labels.append(label)

            docs.append(texts)
            docs_labels.append(labels)

        with open(cls.processed_path, 'wt') as fp:
            json.dump((docs, docs_labels), fp)

    @classmethod
    def setUpClass(cls):
        """Make sure the processed Reuters data exists before any test runs."""
        cls._download_reuters()

    def default_config(self, **kwargs):
        """Return the baseline model settings, with ``kwargs`` overriding individual keys."""
        d = dict(
            batch_size=2,
            max_length=256,
            lm_loss_coef=0.0,
            val_size=0,
            interpolate_pos_embed=False,
        )
        d.update(**kwargs)
        return d

    def setUp(self):
        """Seed both RNGs, load the processed corpus and build a default-config model."""
        self.save_file = 'tests/saved-models/test-save-load'
        random.seed(42)
        np.random.seed(42)
        with open(self.processed_path, 'rt') as fp:
            self.texts, self.labels = json.load(fp)

        self.model = SequenceLabeler(**self.default_config())

    def _split_reuters(self, **split_kwargs):
        """
        Convert the corpus to annotation format and return a 90/10 split.

        Uses the pad token of the current ``self.model`` as the none value;
        ``split_kwargs`` are forwarded to ``train_test_split``.
        """
        raw_docs = ["".join(text) for text in self.texts]
        texts, annotations = finetune_to_indico_sequence(
            raw_docs,
            self.texts,
            self.labels,
            none_value=self.model.config.pad_token)
        return train_test_split(texts, annotations, test_size=0.1, **split_kwargs)

    def _assert_proba_structure(self, probas):
        """predict_proba output is expected to be list -> list -> dict -> dict."""
        self.assertIsInstance(probas, list)
        self.assertIsInstance(probas[0], list)
        self.assertIsInstance(probas[0][0], dict)
        self.assertIsInstance(probas[0][0]['confidence'], dict)

    def _assert_named_entity_metrics(self, test_annotations, predictions):
        """Check all four metrics report 'Named Entity'; return the token recall dict."""
        token_precision = sequence_labeling_token_precision(
            test_annotations, predictions)
        token_recall = sequence_labeling_token_recall(test_annotations,
                                                      predictions)
        overlap_precision = sequence_labeling_overlap_precision(
            test_annotations, predictions)
        overlap_recall = sequence_labeling_overlap_recall(
            test_annotations, predictions)

        self.assertIn('Named Entity', token_precision)
        self.assertIn('Named Entity', token_recall)
        self.assertIn('Named Entity', overlap_precision)
        self.assertIn('Named Entity', overlap_recall)
        return token_recall

    def _assert_mismatched_annotation_rejected(self):
        """fit() must raise ValueError when an annotation's text does not match its offsets."""
        with self.assertRaises(ValueError):
            self.model.fit(["Text about a dog."], [[{
                "start": 0,
                "end": 5,
                "text": "cat",
                "label": "dog"
            }]])

    def _load_testdata(self):
        """Return ``(text, labels)`` from the bundled ``testdata.json``."""
        path = os.path.join(os.path.dirname(__file__), "testdata.json")
        with open(path, "rt") as fp:
            text, labels = json.load(fp)
        return text, labels

    def _assert_dog_found(self, predictions):
        """At least one predicted span of the first document must strip to "dog"."""
        self.assertTrue(
            any(pred["text"].strip() == "dog" for pred in predictions[0]))

    @pytest.mark.skipif(
        SKIP_LM_TESTS,
        reason="Bidirectional models do not yet support LM functions")
    def test_fit_lm_only(self):
        """
        Ensure model training does not error out
        Ensure model returns predictions
        """
        train_texts, test_texts, train_annotations, test_annotations = self._split_reuters()
        # First pass has no targets; the second pass is supervised.
        self.model.fit(train_texts)
        self.model.fit(train_texts, train_annotations)

        predictions = self.model.predict(test_texts)
        probas = self.model.predict_proba(test_texts)
        self._assert_proba_structure(probas)
        self._assert_named_entity_metrics(test_annotations, predictions)

        self.model.save(self.save_file)
        model = SequenceLabeler.load(self.save_file)
        model.predict(test_texts)

    def test_fit_predict(self):
        """
        Ensure model training does not error out
        Ensure model returns predictions
        Ensure class reweighting behaves as intended
        """
        train_texts, test_texts, train_annotations, test_annotations = self._split_reuters(
            random_state=42)

        # First train a model with the "Named Entity" class weighted 10x.
        reweighted_model = SequenceLabeler(**self.default_config(
            class_weights={'Named Entity': 10.}))
        reweighted_model.fit(train_texts, train_annotations)
        reweighted_predictions = reweighted_model.predict(test_texts)
        reweighted_token_recall = sequence_labeling_token_recall(
            test_annotations, reweighted_predictions)

        # Then train the default model from setUp on the same split.
        self.model.fit(train_texts, train_annotations)
        predictions = self.model.predict(test_texts)
        probas = self.model.predict_proba(test_texts)
        self._assert_proba_structure(probas)
        token_recall = self._assert_named_entity_metrics(test_annotations, predictions)

        self.model.save(self.save_file)

        # The reweighted model must recall more "Named Entity" tokens.
        self.assertGreater(reweighted_token_recall['Named Entity'],
                           token_recall['Named Entity'])

    def test_cached_predict(self):
        """
        Ensure predictions inside ``cached_predict`` match uncached ones and
        that a repeated cached call takes less than half the time of the first.
        """
        train_texts, test_texts, train_annotations, _ = self._split_reuters()
        self.model.fit(train_texts, train_annotations)

        self.model.config.chunk_long_sequences = True
        self.model.config.max_length = 128

        uncached_preds = self.model.predict(test_texts[:1])

        with self.model.cached_predict():
            start = time.time()
            self.model.predict(test_texts[:1])
            first = time.time()
            self.model.predict(test_texts[:1])
            second = time.time()
            preds = self.model.predict(test_texts[:1])
            # unittest assertions instead of bare asserts, which -O strips.
            self.assertEqual(len(preds), 1)
            preds = self.model.predict(test_texts[:2])
            self.assertEqual(len(preds), 2)

        # zip stops at the shorter sequence, so only the first document is compared.
        for uncached_pred, cached_pred in zip(uncached_preds, preds):
            self.assertEqual(str(uncached_pred), str(cached_pred))

        first_prediction_time = (first - start)
        second_prediction_time = (second - first)
        self.assertLess(second_prediction_time, first_prediction_time / 2.)

    def test_reasonable_predictions(self):
        """
        Ensure the fitted model finds "dog" in a short sentence, and that a
        second prediction call gives an equally valid result.
        """
        test_sequence = [
            "I am a dog. A dog that's incredibly bright. I can talk, read, and write!"
        ]

        # test ValueError raised when raw text is passed along with character idxs and doesn't match
        self._assert_mismatched_annotation_rejected()

        text, labels = self._load_testdata()
        self.model.fit(text * 10, labels * 10)

        predictions = self.model.predict(test_sequence)
        self.assertTrue(1 <= len(predictions[0]) <= 3)
        self._assert_dog_found(predictions)

        predictions = self.model.predict(test_sequence)
        self.assertTrue(1 <= len(predictions[0]) <= 3)
        self._assert_dog_found(predictions)

    def test_chunk_long_sequences(self):
        """
        Ensure a document longer than ``max_length`` is chunked and still
        yields the expected 20 predictions, including "dog".
        """
        test_sequence = [
            "I am a dog. A dog that's incredibly bright. I can talk, read, and write! " * 10
        ]

        self.model.config.chunk_long_sequences = True
        self.model.config.max_length = 18

        # test ValueError raised when raw text is passed along with character idxs and doesn't match
        self._assert_mismatched_annotation_rejected()

        text, labels = self._load_testdata()
        self.model.finetune(text * 10, labels * 10)

        predictions = self.model.predict(test_sequence)
        self.assertEqual(len(predictions[0]), 20)
        self._assert_dog_found(predictions)

    def test_fit_predict_multi_model(self):
        """
        Ensure model training does not error out
        Ensure model returns predictions
        """
        # NOTE(review): this model is built without default_config(), so
        # val_size and interpolate_pos_embed keep the library defaults --
        # confirm that this is intended.
        self.model = SequenceLabeler(batch_size=2,
                                     max_length=256,
                                     lm_loss_coef=0.0,
                                     multi_label_sequences=True)
        train_texts, test_texts, train_annotations, _ = self._split_reuters()

        self.model.fit(train_texts, train_annotations)
        self.model.predict(test_texts)
        probas = self.model.predict_proba(test_texts)
        self._assert_proba_structure(probas)

        self.model.save(self.save_file)
        model = SequenceLabeler.load(self.save_file)
        model.predict(test_texts)
for c in elem.find("textwithnamedentities").children: if type(c) == Tag: if c.name == "namedentityintext": label = "Named Entity" # part of a named entity else: label = "<PAD>" # irrelevant word texts.append(c.text) labels.append(label) docs.append(texts) docs_labels.append(labels) fd.close() os.remove(XML_PATH) raw_texts = ["".join(doc) for doc in docs] texts, annotations = finetune_to_indico_sequence(raw_texts, docs, docs_labels) df = pd.DataFrame({'texts': texts, 'annotations': [json.dumps(annotation) for annotation in annotations]}) df.to_csv(DATA_PATH) if __name__ == "__main__": dataset = Reuters(nrows=1000).dataframe dataset['annotations'] = [json.loads(annotation) for annotation in dataset['annotations']] trainX, testX, trainY, testY = train_test_split(dataset.texts.values, dataset.annotations.values, test_size=0.3, random_state=42) model = SequenceLabeler(verbose=False, max_length=64, chunk_long_sequences=True) model.fit(trainX, trainY) predictions = model.predict(testX) n_sample = 10 for i in range(n_sample): print(testX[i], predictions[i])
def __init__(self, *args, **kwargs):
    """Forward all arguments to the parent and create the wrapped model with ``val_size=0``."""
    super().__init__(*args, **kwargs)
    labeler = SequenceLabeler(val_size=0)
    self.model = labeler
none_value="<PAD>", subtoken_predictions=True) df = pd.DataFrame({ 'texts': texts, 'annotations': [json.dumps(annotation) for annotation in annotations] }) df.to_csv(DATA_PATH) if __name__ == "__main__": dataset = Reuters().dataframe dataset['annotations'] = [ json.loads(annotation) for annotation in dataset['annotations'] ] trainX, testX, trainY, testY = train_test_split(dataset.texts.values, dataset.annotations.values, test_size=0.7, random_state=42) model = SequenceLabeler(base_model=RoBERTa, batch_size=1, val_size=0., max_length=16, chunk_long_sequences=True, subtoken_predictions=True) model.fit(trainX, trainY) predictions = model.predict(testX) print(predictions) print(annotation_report(testY, predictions))
class TestSequenceLabelerTextCNN(TestModelBase):
    """
    Fit / predict / save smoke tests for ``SequenceLabeler`` on the Reuters
    named-entity corpus.

    ``base_model`` is set to ``TextCNN``; it is presumably picked up by
    ``TestModelBase.default_config`` -- verify against the base class.
    """

    n_sample = 100
    dataset_path = os.path.join("Data", "Sequence", "reuters.xml")
    processed_path = os.path.join("Data", "Sequence", "reuters.json")
    base_model = TextCNN

    @classmethod
    def _download_reuters(cls):
        """
        Download Reuters to test directory (skipped when the XML file already
        exists) and write a tokens/labels JSON cache to ``cls.processed_path``.

        Raises ``requests.HTTPError`` if the download returns an error status,
        so that an error page is never stored as the dataset.
        """
        path = Path(cls.dataset_path)
        if not path.exists():
            path.parent.mkdir(parents=True, exist_ok=True)
            url = "https://raw.githubusercontent.com/dice-group/n3-collection/master/reuters.xml"
            r = requests.get(url)
            # Fail loudly rather than caching an HTTP error body as reuters.xml.
            r.raise_for_status()
            with open(cls.dataset_path, "wb") as fp:
                fp.write(r.content)

        with codecs.open(cls.dataset_path, "r", "utf-8") as infile:
            soup = bs(infile, "html.parser")

        docs = []
        docs_labels = []
        for elem in soup.find_all("document"):
            texts = []
            labels = []

            # Loop through each child of the element under "textwithnamedentities";
            # children that are not Tag instances are skipped.
            for c in elem.find("textwithnamedentities").children:
                if not isinstance(c, Tag):
                    continue
                if c.name == "namedentityintext":
                    label = "Named Entity"  # part of a named entity
                else:
                    label = "<PAD>"  # irrelevant word
                texts.append(c.text)
                labels.append(label)

            docs.append(texts)
            docs_labels.append(labels)

        with open(cls.processed_path, "wt") as fp:
            json.dump((docs, docs_labels), fp)

    @classmethod
    def setUpClass(cls):
        """Make sure the corpus and its JSON cache exist before any test runs."""
        cls._download_reuters()

    def setUp(self):
        """Seed the RNGs, load the cached corpus and build a fresh default model."""
        self.save_file = "tests/saved-models/test-save-load"
        random.seed(42)
        np.random.seed(42)
        with open(self.processed_path, "rt") as fp:
            self.texts, self.labels = json.load(fp)

        self.model = SequenceLabeler(**self.default_config())

    def _reuters_train_test_split(self):
        """
        Run the cached corpus through ``finetune_to_indico_sequence`` using the
        pad token of the current ``self.model`` and hold out 10% for testing.

        Returns the 4-tuple produced by ``train_test_split``:
        ``(train_texts, test_texts, train_annotations, test_annotations)``.
        """
        raw_docs = ["".join(text) for text in self.texts]
        texts, annotations = finetune_to_indico_sequence(
            raw_docs, self.texts, self.labels,
            none_value=self.model.config.pad_token)
        return train_test_split(texts, annotations, test_size=0.1)

    def test_fit_predict(self):
        """
        Ensure model training does not error out
        Ensure model returns predictions and well-formed probabilities
        Ensure the token / overlap metrics report the "Named Entity" class
        """
        (train_texts, test_texts,
         train_annotations, test_annotations) = self._reuters_train_test_split()
        self.model.fit(train_texts, train_annotations)
        predictions = self.model.predict(test_texts)
        probas = self.model.predict_proba(test_texts)
        self.assertIsInstance(probas, list)
        self.assertIsInstance(probas[0], list)
        self.assertIsInstance(probas[0][0], dict)
        self.assertIsInstance(probas[0][0]["confidence"], dict)

        token_precision = sequence_labeling_token_precision(
            test_annotations, predictions)
        token_recall = sequence_labeling_token_recall(
            test_annotations, predictions)
        overlap_precision = sequence_labeling_overlap_precision(
            test_annotations, predictions)
        overlap_recall = sequence_labeling_overlap_recall(
            test_annotations, predictions)

        self.assertIn("Named Entity", token_precision)
        self.assertIn("Named Entity", token_recall)
        self.assertIn("Named Entity", overlap_precision)
        self.assertIn("Named Entity", overlap_recall)
        self.model.save(self.save_file)

    def test_cached_predict(self):
        """
        Ensure model training does not error out
        Ensure repeated predictions work inside a cached_predict() context
        """
        train_texts, test_texts, train_annotations, _ = self._reuters_train_test_split()
        self.model.fit(train_texts, train_annotations)
        with self.model.cached_predict():
            self.model.predict(test_texts)
            self.model.predict(test_texts)

    def test_fit_predict_multi_model(self):
        """
        Ensure model training does not error out
        Ensure model returns predictions
        Ensure a saved multi-label model can be loaded and used to predict
        """
        self.model = SequenceLabeler(**self.default_config(
            batch_size=2,
            max_length=256,
            lm_loss_coef=0.0,
            multi_label_sequences=True,
        ))
        # Split after swapping the model so the new model's pad token is used.
        train_texts, test_texts, train_annotations, _ = self._reuters_train_test_split()
        self.model.fit(train_texts, train_annotations)
        self.model.predict(test_texts)
        probas = self.model.predict_proba(test_texts)
        self.assertIsInstance(probas, list)
        self.assertIsInstance(probas[0], list)
        self.assertIsInstance(probas[0][0], dict)
        self.assertIsInstance(probas[0][0]["confidence"], dict)
        self.model.save(self.save_file)
        model = SequenceLabeler.load(self.save_file)
        model.predict(test_texts)
fd.close() os.remove(XML_PATH) raw_texts = ["".join(doc) for doc in docs] texts, annotations = finetune_to_indico_sequence( raw_texts, docs, docs_labels) df = pd.DataFrame({ 'texts': texts, 'annotations': [json.dumps(annotation) for annotation in annotations] }) df.to_csv(DATA_PATH) if __name__ == "__main__": dataset = Reuters().dataframe dataset['annotations'] = [ json.loads(annotation) for annotation in dataset['annotations'] ] trainX, testX, trainY, testY = train_test_split(dataset.texts.values, dataset.annotations.values, test_size=0.3, random_state=42) model = SequenceLabeler(batch_size=2, val_size=0., chunk_long_sequences=True) model.fit(trainX, trainY) predictions = model.predict(testX) print(annotation_report(testY, predictions))
fd.close() os.remove(XML_PATH) raw_texts = ["".join(doc) for doc in docs] texts, annotations = finetune_to_indico_sequence( raw_texts, docs, docs_labels) df = pd.DataFrame({ 'texts': texts, 'annotations': [json.dumps(annotation) for annotation in annotations] }) df.to_csv(DATA_PATH) if __name__ == "__main__": dataset = Reuters(nrows=1000).dataframe dataset['annotations'] = [ json.loads(annotation) for annotation in dataset['annotations'] ] trainX, testX, trainY, testY = train_test_split(dataset.texts, dataset.annotations, test_size=0.3, random_state=42) model = SequenceLabeler(verbose=False) model.fit(trainX, trainY) predictions = model.predict(testX) n_sample = 10 for i in range(n_sample): print(testX.values[i], predictions[i])
def __init__(self, *args, **kwargs):
    """
    Forward every argument to the parent initialiser, then build the
    ``SequenceLabeler`` stored on ``self.model``.

    The keyword arguments are also merged into ``self.model_config`` on top of
    the ``val_size=0`` default and passed to ``SequenceLabeler``.
    """
    super().__init__(*args, **kwargs)
    # Caller-supplied keyword arguments take precedence over the default.
    self.model_config = {"val_size": 0, **kwargs}
    self.model = SequenceLabeler(**self.model_config)
class TestSequenceLabeler(unittest.TestCase):
    """
    Fit / predict / save tests for ``SequenceLabeler`` on the Reuters
    named-entity corpus.
    """

    n_sample = 100
    dataset_path = os.path.join(
        'Data', 'Sequence', 'reuters.xml'
    )
    processed_path = os.path.join('Data', 'Sequence', 'reuters.json')

    @classmethod
    def _download_reuters(cls):
        """
        Download the Reuters XML corpus to ``cls.dataset_path`` (skipped when
        the file already exists) and write a tokens/labels JSON cache to
        ``cls.processed_path``.

        Raises ``requests.HTTPError`` if the download returns an error status,
        so that an error page is never stored as the dataset.
        """
        path = Path(cls.dataset_path)
        if not path.exists():
            path.parent.mkdir(parents=True, exist_ok=True)
            url = "https://raw.githubusercontent.com/dice-group/n3-collection/master/reuters.xml"
            r = requests.get(url)
            # Fail loudly rather than caching an HTTP error body as reuters.xml.
            r.raise_for_status()
            with open(cls.dataset_path, "wb") as fp:
                fp.write(r.content)

        with codecs.open(cls.dataset_path, "r", "utf-8") as infile:
            soup = bs(infile, "html.parser")

        docs = []
        docs_labels = []
        for elem in soup.find_all("document"):
            texts = []
            labels = []

            # Loop through each child of the element under "textwithnamedentities";
            # children that are not Tag instances are skipped.
            for c in elem.find("textwithnamedentities").children:
                if not isinstance(c, Tag):
                    continue
                if c.name == "namedentityintext":
                    label = "Named Entity"  # part of a named entity
                else:
                    label = "<PAD>"  # irrelevant word
                texts.append(c.text)
                labels.append(label)

            docs.append(texts)
            docs_labels.append(labels)

        with open(cls.processed_path, 'wt') as fp:
            json.dump((docs, docs_labels), fp)

    @classmethod
    def setUpClass(cls):
        """Make sure the corpus and its JSON cache exist before any test runs."""
        cls._download_reuters()

    def default_config(self, **kwargs):
        """
        Return the baseline ``SequenceLabeler`` keyword arguments used by these
        tests, with any ``kwargs`` overriding or extending the defaults.
        """
        d = dict(
            batch_size=2,
            max_length=256,
            lm_loss_coef=0.0,
            val_size=0,
            interpolate_pos_embed=False,
        )
        d.update(**kwargs)
        return d

    def setUp(self):
        """Seed the RNGs, load the cached corpus and build a fresh default model."""
        self.save_file = 'tests/saved-models/test-save-load'
        random.seed(42)
        np.random.seed(42)
        with open(self.processed_path, 'rt') as fp:
            self.texts, self.labels = json.load(fp)

        # default_config is an instance method; calling it as a bare name
        # raises NameError before any test can run.
        self.model = SequenceLabeler(
            **self.default_config()
        )

    def _reuters_train_test_split(self):
        """
        Run the cached corpus through ``finetune_to_indico_sequence`` using the
        text encoder and pad token of the current ``self.model`` and hold out
        10% for testing.

        Returns the 4-tuple produced by ``train_test_split``:
        ``(train_texts, test_texts, train_annotations, test_annotations)``.
        """
        raw_docs = ["".join(text) for text in self.texts]
        texts, annotations = finetune_to_indico_sequence(
            raw_docs, self.texts, self.labels,
            encoder=self.model.input_pipeline.text_encoder,
            none_value=self.model.config.pad_token
        )
        return train_test_split(texts, annotations, test_size=0.1)

    def test_fit_predict(self):
        """
        Ensure model training does not error out
        Ensure model returns predictions
        Ensure class reweighting behaves as intended
        """
        (train_texts, test_texts,
         train_annotations, test_annotations) = self._reuters_train_test_split()

        # Up-weighting "Named Entity" is expected to raise its token recall
        # relative to the default model trained on the same split.
        reweighted_model = SequenceLabeler(
            **self.default_config(class_weights={'Named Entity': 10.})
        )
        reweighted_model.fit(train_texts, train_annotations)
        reweighted_predictions = reweighted_model.predict(test_texts)
        reweighted_token_recall = sequence_labeling_token_recall(
            test_annotations, reweighted_predictions)

        self.model.fit(train_texts, train_annotations)
        predictions = self.model.predict(test_texts)
        probas = self.model.predict_proba(test_texts)
        self.assertIsInstance(probas, list)
        self.assertIsInstance(probas[0], list)
        self.assertIsInstance(probas[0][0], dict)
        self.assertIsInstance(probas[0][0]['confidence'], dict)

        token_precision = sequence_labeling_token_precision(test_annotations, predictions)
        token_recall = sequence_labeling_token_recall(test_annotations, predictions)
        overlap_precision = sequence_labeling_overlap_precision(test_annotations, predictions)
        overlap_recall = sequence_labeling_overlap_recall(test_annotations, predictions)

        self.assertIn('Named Entity', token_precision)
        self.assertIn('Named Entity', token_recall)
        self.assertIn('Named Entity', overlap_precision)
        self.assertIn('Named Entity', overlap_recall)
        self.model.save(self.save_file)

        self.assertGreater(reweighted_token_recall['Named Entity'], token_recall['Named Entity'])

    def test_cached_predict(self):
        """
        Ensure model training does not error out
        Ensure repeated predictions work inside a cached_predict() context
        """
        train_texts, test_texts, train_annotations, _ = self._reuters_train_test_split()
        self.model.fit(train_texts, train_annotations)
        with self.model.cached_predict():
            self.model.predict(test_texts)
            self.model.predict(test_texts)

    def test_fit_predict_multi_model(self):
        """
        Ensure model training does not error out
        Ensure model returns predictions
        Ensure a saved multi-label model can be loaded and used to predict
        """
        self.model = SequenceLabeler(batch_size=2, max_length=256, lm_loss_coef=0.0, multi_label_sequences=True)
        # Split after swapping the model so the new model's encoder and pad
        # token are the ones used for the conversion.
        train_texts, test_texts, train_annotations, _ = self._reuters_train_test_split()
        self.model.fit(train_texts, train_annotations)
        self.model.predict(test_texts)
        probas = self.model.predict_proba(test_texts)
        self.assertIsInstance(probas, list)
        self.assertIsInstance(probas[0], list)
        self.assertIsInstance(probas[0][0], dict)
        self.assertIsInstance(probas[0][0]['confidence'], dict)
        self.model.save(self.save_file)
        model = SequenceLabeler.load(self.save_file)
        model.predict(test_texts)
subtoken_predictions=True) df = pd.DataFrame({ 'texts': texts, 'annotations': [json.dumps(annotation) for annotation in annotations] }) df.to_csv(DATA_PATH) if __name__ == "__main__": dataset = Reuters().dataframe dataset['annotations'] = [ json.loads(annotation) for annotation in dataset['annotations'] ] trainX, testX, trainY, testY = train_test_split(dataset.texts.values, dataset.annotations.values, test_size=0.3, random_state=42) model = SequenceLabeler(base_model=GPT2, batch_size=2, val_size=0., max_length=16, chunk_long_sequences=True, subtoken_predictions=True, filter_empty_examples=True) model.fit(trainX, trainY) predictions = model.predict(testX) print(predictions) print(annotation_report(testY, predictions))
class TestSequenceLabeler(unittest.TestCase):
    """
    Fit / predict / save-load smoke test for ``SequenceLabeler`` on the
    Reuters named-entity corpus.
    """

    n_sample = 100
    n_hidden = 768
    dataset_path = os.path.join(
        'Data', 'Sequence', 'reuters.xml'
    )
    processed_path = os.path.join('Data', 'Sequence', 'reuters.json')

    @classmethod
    def _download_reuters(cls):
        """
        Download the Reuters XML corpus to ``cls.dataset_path`` (skipped when
        the file already exists) and write a tokens/labels JSON cache to
        ``cls.processed_path``.

        Raises ``requests.HTTPError`` if the download returns an error status,
        so that an error page is never stored as the dataset.
        """
        path = Path(cls.dataset_path)
        if not path.exists():
            path.parent.mkdir(parents=True, exist_ok=True)
            url = "https://raw.githubusercontent.com/dice-group/n3-collection/master/reuters.xml"
            r = requests.get(url)
            # Fail loudly rather than caching an HTTP error body as reuters.xml.
            r.raise_for_status()
            with open(cls.dataset_path, "wb") as fp:
                fp.write(r.content)

        with codecs.open(cls.dataset_path, "r", "utf-8") as infile:
            soup = bs(infile, "html5lib")

        docs = []
        docs_labels = []
        for elem in soup.find_all("document"):
            texts = []
            labels = []

            # Loop through each child of the element under "textwithnamedentities";
            # children that are not Tag instances are skipped.
            for c in elem.find("textwithnamedentities").children:
                if not isinstance(c, Tag):
                    continue
                if c.name == "namedentityintext":
                    label = "Named Entity"  # part of a named entity
                else:
                    label = "<PAD>"  # irrelevant word
                texts.append(c.text)
                labels.append(label)

            docs.append(texts)
            docs_labels.append(labels)

        with open(cls.processed_path, 'wt') as fp:
            json.dump((docs, docs_labels), fp)

    @classmethod
    def setUpClass(cls):
        """Make sure the corpus and its JSON cache exist before any test runs."""
        cls._download_reuters()

    def setUp(self):
        """Load the cached corpus, reset the TensorFlow graph and build a fresh model."""
        self.save_file = 'tests/saved-models/test-save-load'
        with open(self.processed_path, 'rt') as fp:
            self.texts, self.labels = json.load(fp)

        tf.reset_default_graph()

        self.model = SequenceLabeler(batch_size=2, max_length=256, verbose=False)

    def test_fit_predict(self):
        """
        Ensure model training does not error out
        Ensure model returns predictions
        Ensure a saved model can be loaded and used to predict
        """
        raw_docs = ["".join(text) for text in self.texts]
        texts, annotations = finetune_to_indico_sequence(raw_docs, self.texts, self.labels)
        train_texts, test_texts, train_annotations, _ = train_test_split(texts, annotations)
        self.model.fit(train_texts, train_annotations)
        self.model.predict(test_texts)
        self.model.save(self.save_file)
        model = SequenceLabeler.load(self.save_file)
        model.predict(test_texts)