class DSLSharedTaskDataset(DatasetReader):
    """Dataset reader for files holding one ``text<TAB>label`` example per line.

    Text is tokenized with a ``SpacyTokenizer`` and indexed with a single
    ``SingleIdTokenIndexer`` registered under the ``'tokens'`` key.
    """

    def __init__(self):
        super(DSLSharedTaskDataset, self).__init__(lazy=False)
        self.tokenizer = SpacyTokenizer()
        self.token_indexers = {'tokens': SingleIdTokenIndexer()}

    def _read(self, text_path: str) -> Iterable[Instance]:
        """Yield one ``Instance`` per well-formed line of ``text_path``.

        A line is well-formed when, after stripping surrounding whitespace,
        it splits on tabs into exactly two parts (text, then label).  Any
        other line is printed and skipped.
        """
        with open(text_path, "r") as text_data:
            lines = text_data.read().splitlines()

        for line in lines:
            try:
                text, label = line.strip().split('\t')
            except ValueError:
                # Malformed line: report it and move on.  Without the
                # ``continue`` the code below would either raise
                # UnboundLocalError (malformed first line) or silently
                # re-emit the previous example under this line's position.
                print(line)
                continue

            text_field = TextField(self.tokenizer.tokenize(text), self.token_indexers)
            label_field = LabelField(label)
            yield Instance({'text': text_field, 'label': label_field})

    def text_to_instance(self, text: str, label: str = None) -> Instance:
        """Build an ``Instance`` from raw ``text`` and an optional ``label``.

        The ``'label'`` field is only added when ``label`` is truthy, so the
        same method serves both training and prediction.
        """
        tokens = self.tokenizer.tokenize(text)
        fields = {'text': TextField(tokens, self.token_indexers)}
        if label:
            fields['label'] = LabelField(label)
        return Instance(fields)
def read_dataset(file_path):
    """Load a JSON dialogue dataset and flatten each dialogue into token texts.

    The file must contain a JSON list of objects with a ``"dialogue_idx"``
    entry and a ``"dialogue"`` list of turns; each turn provides
    ``"system_transcript"`` and ``"transcript"`` strings.

    For every dialogue the system and user utterances of all turns are
    tokenized and concatenated in order.  A ``<S>`` token is put in front of
    the system utterance of every turn except the first, and a ``</S>`` token
    is appended to the user utterance of every turn except the last.

    Returns:
        A list of ``(dialogue_idx, [token_texts])`` tuples, where
        ``token_texts`` is the flat list of token strings for that dialogue.
        A dialogue with no turns yields an empty token list (previously this
        case crashed while iterating over ``None``).
    """
    with open(file_path) as dataset_file:
        tokenizer = SpacyTokenizer()
        dataset_json = json.load(dataset_file)

    dialogs = []
    for dialog in dataset_json:
        dialog_idx = dialog["dialogue_idx"]
        turns = dialog['dialogue']
        last_turn_i = len(turns) - 1

        # Start from an empty list rather than ``None`` so that a dialogue
        # without turns is handled like any other.
        context_tokens = []
        for turn_i, turn in enumerate(turns):
            sys_utt = turn['system_transcript']
            user_utt = turn['transcript']

            sys_tokens = tokenizer.tokenize(sys_utt)
            if turn_i != 0:
                sys_tokens = [Token(text="<S>", lemma_="<S>")] + sys_tokens

            user_tokens = tokenizer.tokenize(user_utt)
            if turn_i != last_turn_i:
                user_tokens = user_tokens + [Token(text="</S>", lemma_="</S>")]

            context_tokens.extend(sys_tokens)
            context_tokens.extend(user_tokens)

        dialogs.append((dialog_idx, [[t.text for t in context_tokens]]))
    return dialogs
def test_squad_with_unwordpieceable_passage(self):
    """Smoke test: embed a SQuAD-style batch through ``BertEmbedder`` using offsets.

    Two question/passage pairs are turned into reading-comprehension
    instances, batched, tensorized and passed through a ``BertEmbedder``
    built from a fresh ``BertModel``.  The test passes if nothing raises.
    """
    spacy_tokenizer = SpacyTokenizer()
    bert_indexer = PretrainedBertIndexer("bert-base-uncased")

    passage1 = (
        "There were four major HDTV systems tested by SMPTE in the late 1970s, "
        "and in 1979 an SMPTE study group released A Study of High Definition Television Systems:"
    )
    question1 = "Who released A Study of High Definition Television Systems?"

    passage2 = (
        "Broca, being what today would be called a neurosurgeon, "
        "had taken an interest in the pathology of speech. He wanted "
        "to localize the difference between man and the other animals, "
        "which appeared to reside in speech. He discovered the speech "
        "center of the human brain, today called Broca's area after him. "
        "His interest was mainly in Biological anthropology, but a German "
        "philosopher specializing in psychology, Theodor Waitz, took up the "
        "theme of general and social anthropology in his six-volume work, "
        "entitled Die Anthropologie der Naturvölker, 1859–1864. The title was "
        """soon translated as "The Anthropology of Primitive Peoples". """
        "The last two volumes were published posthumously."
    )
    question2 = "What did Broca discover in the human brain?"

    from allennlp.data.dataset_readers.reading_comprehension.util import (
        make_reading_comprehension_instance,
    )

    def build_instance(question, passage):
        # Question tokens first, then passage tokens, as the helper expects.
        return make_reading_comprehension_instance(
            spacy_tokenizer.tokenize(question),
            spacy_tokenizer.tokenize(passage),
            {"bert": bert_indexer},
            passage,
        )

    instances = [
        build_instance(question1, passage1),
        build_instance(question2, passage2),
    ]

    vocabulary = Vocabulary()
    batch = Batch(instances)
    batch.index_instances(vocabulary)
    tensors = batch.as_tensor_dict(batch.get_padding_lengths())

    question_tensors = tensors["question"]
    passage_tensors = tensors["passage"]

    bert_model = BertModel(BertConfig(len(bert_indexer.vocab)))
    embedder = BertEmbedder(bert_model)

    # Passage first, then question; only the absence of errors is checked.
    for field_tensors in (passage_tensors, question_tensors):
        _ = embedder(field_tensors["bert"], offsets=field_tensors["bert-offsets"])
def test_keep_spacy_tokens(self):
    """``keep_spacy_tokens`` decides whether allennlp or spacy tokens are returned."""
    # Default: tokens are allennlp ``Token`` objects.
    allennlp_tokens = SpacyTokenizer().tokenize("This should be an allennlp Token")
    assert allennlp_tokens
    assert all(isinstance(tok, Token) for tok in allennlp_tokens)

    # With the flag set: the raw spacy tokens are passed through.
    spacy_tokens = SpacyTokenizer(keep_spacy_tokens=True).tokenize(
        "This should be a spacy Token"
    )
    assert spacy_tokens
    assert all(isinstance(tok, spacy.tokens.Token) for tok in spacy_tokens)
def test_empty_list_can_be_tensorized(self):
    """An instance holding a ``ListField`` of an empty ``TextField`` can be tensorized."""
    spacy_tokenizer = SpacyTokenizer()

    foo_field = TextField(spacy_tokenizer.tokenize("Foo"), self.word_indexer)
    empty_list = ListField([foo_field.empty_field()])
    bar_field = TextField(spacy_tokenizer.tokenize("BAR"), self.word_indexer)

    instance = Instance({"list": empty_list, "bar": bar_field})
    instance.index_fields(self.vocab)
    # Passing means this call does not raise.
    instance.as_tensor_dict()
def test_never_lowercase(self):
    """``never_lowercase`` shields ``[PAD]`` from lowercasing unless it is overridden."""
    # The default tokenizer does not lowercase anything itself.
    spacy_tokenizer = SpacyTokenizer()

    # Expected ids for this sentence: 2 15 10 11 6
    tokens = spacy_tokenizer.tokenize("the laziest fox")
    # Appended by hand because the tokenizer would split "[PAD]" in three.
    tokens.append(Token("[PAD]"))

    vocab = Vocabulary()
    vocab_file = str(self.FIXTURES_ROOT / "bert" / "vocab.txt")

    # Default never_lowercase: [PAD] is recognized and not lowercased (id 0).
    protecting_indexer = PretrainedBertIndexer(vocab_file, do_lowercase=True)
    protected_ids = protecting_indexer.tokens_to_indices(tokens, vocab)["input_ids"]
    assert protected_ids == [16, 2, 15, 10, 11, 6, 0, 17]

    # Empty never_lowercase: [PAD] gets lowercased and ends up as UNK (id 1).
    lowercasing_indexer = PretrainedBertIndexer(
        vocab_file, do_lowercase=True, never_lowercase=()
    )
    lowercased_ids = lowercasing_indexer.tokens_to_indices(tokens, vocab)["input_ids"]
    assert lowercased_ids == [16, 2, 15, 10, 11, 6, 1, 17]
def test_char_span_to_token_span_handles_hard_cases(self):
    """Regression test: the answer is the very last token of the passage.

    An earlier version of the code struggled with this case; the passage
    below is the instance that used to fail.
    """
    passage = (
        "Beyonc\u00e9 is believed to have first started a relationship with Jay Z "
        'after a collaboration on "\'03 Bonnie & Clyde", which appeared on his seventh '
        "album The Blueprint 2: The Gift & The Curse (2002). Beyonc\u00e9 appeared as Jay "
        "Z's girlfriend in the music video for the song, which would further fuel "
        "speculation of their relationship. On April 4, 2008, Beyonc\u00e9 and Jay Z were "
        "married without publicity. As of April 2014, the couple have sold a combined 300 "
        "million records together. The couple are known for their private relationship, "
        "although they have appeared to become more relaxed in recent years. Beyonc\u00e9 "
        'suffered a miscarriage in 2010 or 2011, describing it as "the saddest thing" '
        "she had ever endured. She returned to the studio and wrote music in order to cope "
        "with the loss. In April 2011, Beyonc\u00e9 and Jay Z traveled to Paris in order "
        "to shoot the album cover for her 4, and unexpectedly became pregnant in Paris."
    )
    answer_start = 912
    char_span = (answer_start, answer_start + len("Paris."))

    spacy_tokens = SpacyTokenizer().tokenize(passage)
    token_offsets = [(tok.idx, tok.idx + len(tok.text)) for tok in spacy_tokens]

    # Only the span is checked here; the error flag is ignored.
    result = util.char_span_to_token_span(token_offsets, char_span)
    assert result[0] == (184, 185)
def read(fn: str) -> Iterable[List[Extraction]]:
    """Read tab-separated extractions from ``fn``, grouped by sentence.

    Each line is split on tabs: column 0 is the confidence, columns 2-4 are
    the subject, relation and object elements (each run through
    ``parse_element``), and column 5 is the sentence.  Lines where any of
    columns 2-4 is empty are skipped, as are lines that do not have exactly
    one subject, exactly one relation and at least one object.

    Consecutive extractions that share the same sentence are yielded
    together as one list.
    """
    tokenizer = SpacyTokenizer(pos_tags=True)
    group: List[Extraction] = []

    with open(fn) as fin:
        for line in tqdm(fin):
            columns = line.strip().split("\t")
            confidence = columns[0]
            elements = columns[2:5]

            # All required elements must be present.
            if not all(elements):
                continue

            arg1, rel, args2 = map(parse_element, elements)

            # Require exactly one relation, exactly one subject and at
            # least one object.
            if len(rel) != 1 or len(arg1) != 1 or len(args2) < 1:
                continue

            sent = columns[5]
            extraction = Extraction(
                sent=sent,
                toks=tokenizer.tokenize(sent),
                arg1=arg1[0],
                rel=rel[0],
                args2=args2,
                confidence=confidence,
            )

            # A new sentence closes the current group; otherwise keep growing it.
            if group and group[0].sent != sent:
                yield group
                group = [extraction]
            else:
                group.append(extraction)

    # Flush whatever is left after the last line.
    if group:
        yield group
def search(
    tables_directory: str,
    data: JsonDict,
    output_path: str,
    max_path_length: int,
    max_num_logical_forms: int,
    use_agenda: bool,
    output_separate_files: bool,
    conservative_agenda: bool,
) -> None:
    """Search for logical forms that evaluate to the target values of each question.

    Args:
        tables_directory: Directory that the (tagged) table files are resolved against.
        data: Iterable of instance dicts with the keys ``"question"``, ``"id"``,
            ``"table_filename"`` and ``"target_values"``.
        output_path: A directory when ``output_separate_files`` is true (one
            ``<question_id>.gz`` per question that has correct logical forms),
            otherwise the path of a single text file that is overwritten.
        max_path_length: Passed to ``ActionSpaceWalker`` as ``max_path_length``.
        max_num_logical_forms: Maximum number of correct logical forms written
            per question in single-file mode.
        use_agenda: Whether to restrict the search with the world's agenda.
        output_separate_files: Selects between the two output modes above.
        conservative_agenda: Passed to ``world.get_agenda``; partial agenda
            matches are allowed only when this is false.
    """
    print(f"Starting search with {len(data)} instances", file=sys.stderr)
    language_logger = logging.getLogger("allennlp.semparse.domain_languages.wikitables_language")
    language_logger.setLevel(logging.ERROR)
    tokenizer = SpacyTokenizer()

    if output_separate_files and not os.path.exists(output_path):
        os.makedirs(output_path)

    # Only single-file mode keeps one handle open across the whole search.
    output_file_pointer = None
    if not output_separate_files:
        output_file_pointer = open(output_path, "w")

    try:
        for instance_data in data:
            utterance = instance_data["question"]
            question_id = instance_data["id"]
            if utterance.startswith('"') and utterance.endswith('"'):
                utterance = utterance[1:-1]
            # For example: csv/200-csv/47.csv -> tagged/200-tagged/47.tagged
            table_file = instance_data["table_filename"].replace("csv", "tagged")
            target_list = instance_data["target_values"]
            tokenized_question = tokenizer.tokenize(utterance)
            table_file = f"{tables_directory}/{table_file}"
            context = TableQuestionContext.read_from_file(table_file, tokenized_question)
            world = WikiTablesLanguage(context)
            walker = ActionSpaceWalker(world, max_path_length=max_path_length)

            # The walker is always asked for up to 10000 candidates;
            # ``max_num_logical_forms`` only limits what is written below.
            if use_agenda:
                agenda = world.get_agenda(conservative=conservative_agenda)
                allow_partial_match = not conservative_agenda
                all_logical_forms = walker.get_logical_forms_with_agenda(
                    agenda=agenda, max_num_logical_forms=10000, allow_partial_match=allow_partial_match
                )
            else:
                all_logical_forms = walker.get_all_logical_forms(max_num_logical_forms=10000)

            correct_logical_forms = [
                logical_form
                for logical_form in all_logical_forms
                if world.evaluate_logical_form(logical_form, target_list)
            ]

            if output_separate_files and correct_logical_forms:
                # A distinct name keeps this per-question handle from
                # shadowing the single-file handle.
                with gzip.open(f"{output_path}/{question_id}.gz", "wt") as gzip_file:
                    for logical_form in correct_logical_forms:
                        print(logical_form, file=gzip_file)
            elif not output_separate_files:
                print(f"{question_id} {utterance}", file=output_file_pointer)
                if use_agenda:
                    print(f"Agenda: {agenda}", file=output_file_pointer)
                if not correct_logical_forms:
                    print("NO LOGICAL FORMS FOUND!", file=output_file_pointer)
                for logical_form in correct_logical_forms[:max_num_logical_forms]:
                    print(logical_form, file=output_file_pointer)
                print(file=output_file_pointer)
    finally:
        # Close the single-file handle even if the search raised midway, so
        # the results written so far are flushed and the descriptor is freed.
        if output_file_pointer is not None:
            output_file_pointer.close()
def test_passes_through_correctly(self):
    """Brackets and quotes around words come out as separate tokens."""
    text = "this (sentence) has 'crazy' \"punctuation\"."
    expected = [
        "this",
        "(",
        "sentence",
        ")",
        "has",
        "'",
        "crazy",
        "'",
        "\"",
        "punctuation",
        "\"",
        ".",
    ]
    actual = [tok.text for tok in SpacyTokenizer().tokenize(text)]
    self.assertSequenceEqual(actual, expected)
def test_crashes_with_empty_feature_value_and_no_default(self):
    """Indexing a token that lacks the feature must raise when no default is configured."""
    parsed = SpacyTokenizer(parse=True).tokenize("This is a sentence.")
    # The hand-made end token is the one being indexed below.
    tokens = list(parsed) + [Token("</S>")]

    vocab = Vocabulary()
    for dep_label in ("ROOT", "NONE"):
        vocab.add_token_to_namespace(dep_label, namespace="dep_labels")

    indexer = SingleIdTokenIndexer(namespace="dep_labels", feature_name="dep_")
    end_token = tokens[-1]
    with pytest.raises(ValueError):
        indexer.tokens_to_indices([end_token], vocab)
def test_no_namespace_means_no_counting(self):
    """With ``namespace=None`` the indexer must never touch the counter."""
    parsed = SpacyTokenizer(parse=True).tokenize("This is a sentence.")
    tokens = [Token("<S>"), *parsed, Token("</S>")]

    indexer = SingleIdTokenIndexer(namespace=None, feature_name="text_id")

    def fail():
        assert False

    # Any lookup of a missing key calls ``fail`` and trips its assertion.
    tripwire_counter = defaultdict(fail)
    for tok in tokens:
        indexer.count_vocab_items(tok, tripwire_counter)
class SentenceClassifierPredictor(Predictor):
    """Predictor that tokenizes a raw sentence before handing it to the dataset reader."""

    def __init__(self, model: Model, dataset_reader: DatasetReader) -> None:
        super().__init__(model, dataset_reader)
        self._tokenizer = SpacyTokenizer()

    def predict(self, sentence: str) -> JsonDict:
        """Wrap ``sentence`` in the expected JSON shape and run ``predict_json``."""
        payload = {"sentence": sentence}
        return self.predict_json(payload)

    @overrides
    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        """Tokenize ``json_dict["sentence"]`` and build an instance from the tokens."""
        tokenized = self._tokenizer.tokenize(json_dict["sentence"])
        return self._dataset_reader.text_to_instance(tokenized)
class CitationDataSetReader(DatasetReader):
    """
    Dataset reader for the citation datasets (train|dev|test).

    Subclasses ``DatasetReader`` and overrides ``_read`` and
    ``text_to_instance`` to convert each data point into an ``Instance``.
    Text is tokenized with the default ``SpacyTokenizer``.

    This reader also needs to be registered so that config files can refer
    to it.
    """

    def __init__(self):
        super().__init__()
        # default Spacy Tokenizer
        self.tokenizer = SpacyTokenizer()
        # Built once and shared by every TextField, instead of creating two
        # new indexer objects for each instance.
        self._token_indexers = {
            "elmo": ELMoTokenCharactersIndexer(),
            "tokens": SingleIdTokenIndexer()
        }

    @overrides
    def _read(self, file_path: str) -> Iterable[Instance]:
        """
        Read the JSON Lines file and yield one Instance per citation, each
        carrying its tokens and label.

        :param file_path: path of the JSON Lines file to read
        :return: an iterable of Instances
        """
        ds_reader = DataReaderJsonLines(file_path)
        for citation in ds_reader.read():
            yield self.text_to_instance(citation_text=citation.text, intent=citation.intent)

    @overrides
    def text_to_instance(self, citation_text: str, intent: str) -> Instance:
        """
        :param citation_text: text from the data point
        :param intent: true label of the data instance
        :return: Instance with a ``'tokens'`` TextField (ELMo character and
                 single-id indexers) and a ``'label'`` LabelField.
        """
        citation_tokens = self.tokenizer.tokenize(citation_text)
        fields = {
            'tokens': TextField(citation_tokens, self._token_indexers),
            'label': LabelField(intent)
        }
        return Instance(fields)
def setUp(self):
    """Build the vocabulary, indexers, fields and instances shared by the tests."""
    self.vocab = Vocabulary()
    namespace_contents = (
        ("words", ["this", "is", "a", "sentence"]),
        ("characters", ["s", "e", "n", "t", "c"]),
        ("labels", ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"]),
    )
    # Insertion order is kept as-is: words, then characters, then labels.
    for namespace, entries in namespace_contents:
        for entry in entries:
            self.vocab.add_token_to_namespace(entry, namespace)

    self.word_indexer = {"words": SingleIdTokenIndexer("words")}
    self.words_and_characters_indexers = {
        "words": SingleIdTokenIndexer("words"),
        "characters": TokenCharactersIndexer("characters", min_padding_length=1),
    }

    def word_field(words):
        # TextField over plain Tokens, indexed by words only.
        return TextField([Token(word) for word in words], self.word_indexer)

    self.field1 = word_field(["this", "is", "a", "sentence"])
    self.field2 = word_field(["this", "is", "a", "different", "sentence"])
    self.field3 = word_field(["this", "is", "another", "sentence"])

    self.empty_text_field = self.field1.empty_field()
    self.index_field = IndexField(1, self.field1)
    self.empty_index_field = self.index_field.empty_field()
    self.sequence_label_field = SequenceLabelField([1, 1, 0, 1], self.field1)
    self.empty_sequence_label_field = self.sequence_label_field.empty_field()

    # One instance wrapping an empty list field and one wrapping a real one.
    foo_field = TextField(SpacyTokenizer().tokenize("Foo"), self.word_indexer)
    self.empty_instance = Instance(
        {"list_tensor": ListField([foo_field.empty_field()])}
    )
    self.non_empty_instance = Instance({"list_tensor": ListField([foo_field])})

    super().setUp()
class SentenceClassifierPredictor(Predictor):
    """Predictor that tokenizes with ``en_core_web_sm`` and passes token strings on."""

    def __init__(self, model: Model, dataset_reader: DatasetReader) -> None:
        super().__init__(model, dataset_reader)
        self._tokenizer = SpacyTokenizer(language='en_core_web_sm', pos_tags=True)

    def predict(self, sentence: str) -> JsonDict:
        """Wrap ``sentence`` in the expected JSON shape and run ``predict_json``."""
        request = {"sentence": sentence}
        return self.predict_json(request)

    @overrides
    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        """Tokenize the sentence and give the reader the tokens as plain strings."""
        tokenized = self._tokenizer.tokenize(json_dict["sentence"])
        token_strings = list(map(str, tokenized))
        return self._dataset_reader.text_to_instance(token_strings)
def test_enumerate_spans_enumerates_all_spans(self):
    """``enumerate_spans`` honours width limits, offset and a filter function."""
    tokens = SpacyTokenizer(pos_tags=True).tokenize("This is a sentence.")

    # No restrictions: every (start, end) pair over the five tokens.
    every_span = span_utils.enumerate_spans(tokens)
    assert every_span == [
        (0, 0), (0, 1), (0, 2), (0, 3), (0, 4),
        (1, 1), (1, 2), (1, 3), (1, 4),
        (2, 2), (2, 3), (2, 4),
        (3, 3), (3, 4),
        (4, 4),
    ]

    # Width restricted to 2..3 tokens.
    bounded = span_utils.enumerate_spans(tokens, max_span_width=3, min_span_width=2)
    assert bounded == [(0, 1), (0, 2), (1, 2), (1, 3), (2, 3), (2, 4), (3, 4)]

    # Same restriction, with every index shifted by the offset.
    shifted = span_utils.enumerate_spans(
        tokens, max_span_width=3, min_span_width=2, offset=20
    )
    assert shifted == [
        (20, 21), (20, 22), (21, 22), (21, 23), (22, 23), (22, 24), (23, 24),
    ]

    def no_prefixed_punctuation(tokens: List[Token]):
        # Keep only spans that neither start nor end with punctuation.
        return "PUNCT" not in (tokens[0].pos_, tokens[-1].pos_)

    filtered = span_utils.enumerate_spans(
        tokens,
        max_span_width=3,
        min_span_width=2,
        filter_function=no_prefixed_punctuation,
    )
    # (2, 4) and (3, 4) are gone because their last element is punctuation.
    assert filtered == [(0, 1), (0, 2), (1, 2), (1, 3), (2, 3)]
class PlainTextReader(DatasetReader):
    """Reads a text file and turns every line into an instance with one ``"line"`` field."""

    def __init__(self):
        super().__init__()
        self._token_indexers = {"tokens": SingleIdTokenIndexer()}
        self._tokenizer = SpacyTokenizer()

    def _read(self, file_path: str) -> Iterable[Instance]:
        """Lazily yield one instance per line of ``file_path``."""
        with open(file_path) as input_file:
            yield from map(self.text_to_instance, input_file)

    def text_to_instance(self, line: str) -> Instance:  # type: ignore
        """Tokenize ``line`` and wrap the tokens in a ``TextField`` named ``"line"``."""
        line_field = TextField(self._tokenizer.tokenize(line), self._token_indexers)
        return Instance({"line": line_field})
def test_tokens_to_indices_with_non_default_feature_name(self):
    """Indexing by ``dep_`` uses the parsed label, or the default when it is missing."""
    parsed = SpacyTokenizer(parse=True).tokenize("This is a sentence.")
    tokens = list(parsed) + [Token("</S>")]

    vocab = Vocabulary()
    root_index = vocab.add_token_to_namespace("ROOT", namespace="dep_labels")
    none_index = vocab.add_token_to_namespace("NONE", namespace="dep_labels")

    indexer = SingleIdTokenIndexer(
        namespace="dep_labels", feature_name="dep_", default_value="NONE"
    )

    # tokens[1] is expected to be labelled ROOT; the hand-made end token has
    # no label and therefore falls back to the default.
    cases = (
        (tokens[1], root_index),
        (tokens[-1], none_index),
    )
    for tok, expected_index in cases:
        assert indexer.tokens_to_indices([tok], vocab) == {"tokens": [expected_index]}
def test_count_vocab_items_with_non_default_feature_name(self):
    """Counting by ``dep_`` tallies dependency labels, with the default for unlabelled tokens."""
    parsed = SpacyTokenizer(parse=True).tokenize("This is a sentence.")
    tokens = [Token("<S>"), *parsed, Token("</S>")]

    indexer = SingleIdTokenIndexer(
        namespace="dep_labels", feature_name="dep_", default_value="NONE"
    )

    counts = defaultdict(lambda: defaultdict(int))
    for tok in tokens:
        indexer.count_vocab_items(tok, counts)

    # The two hand-made boundary tokens account for the two "NONE" entries.
    expected_counts = {
        "ROOT": 1,
        "nsubj": 1,
        "det": 1,
        "NONE": 2,
        "attr": 1,
        "punct": 1,
    }
    assert counts["dep_labels"] == expected_counts
def test_char_span_to_token_span_handles_easy_cases(self):
    """Character spans that line up with token boundaries map to the expected tokens.

    The returned token spans are inclusive on both sides.
    """
    passage = (
        "On January 7, 2012, Beyoncé gave birth to her first child, a daughter, Blue Ivy "
        "Carter, at Lenox Hill Hospital in New York. Five months later, she performed for four "
        "nights at Revel Atlantic City's Ovation Hall to celebrate the resort's opening, her "
        "first performances since giving birth to Blue Ivy."
    )
    spacy_tokens = SpacyTokenizer().tokenize(passage)
    token_offsets = [(tok.idx, tok.idx + len(tok.text)) for tok in spacy_tokens]

    cases = (
        # "January 7, 2012"
        ((3, 18), (1, 4)),
        # "Lenox Hill Hospital"
        ((91, 110), (22, 24)),
        # "Lenox Hill Hospital in New York."
        ((91, 123), (22, 28)),
    )
    for char_span, expected_token_span in cases:
        found_span = util.char_span_to_token_span(token_offsets, char_span)[0]
        assert found_span == expected_token_span
def test_token_type_ids(self):
    """Tokens after the inner ``[SEP]`` get token type id 1, everything before gets 0."""
    sentence_tokens = SpacyTokenizer().tokenize("the laziest fox")

    # Expected ids:  2  15 10 11 6   17    2  15 10 11 6
    #                the laziest fox [SEP] the laziest fox
    # The separator is added by hand because the tokenizer would split
    # `[SEP]` in three.
    pair_tokens = sentence_tokens + [Token("[SEP]")] + sentence_tokens

    vocab = Vocabulary()
    vocab_file = str(self.FIXTURES_ROOT / "bert" / "vocab.txt")
    bert_indexer = PretrainedBertIndexer(vocab_file)

    indexed = bert_indexer.tokens_to_indices(pair_tokens, vocab)

    #                   [CLS] 2, 15, 10, 11, 6, 17, 2  15, 10, 11, 6, [SEP]
    expected_type_ids = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]
    assert indexed["token_type_ids"] == expected_type_ids
def test_char_span_to_token_span_handles_out_of_bounds_start_end(self):
    """Out-of-range character spans snap to the nearest token and set the error flag."""
    passage = "This sentence is just for testing purposes"
    spacy_tokens = SpacyTokenizer().tokenize(passage)
    token_offsets = [(tok.idx, tok.idx + len(tok.text)) for tok in spacy_tokens]

    scenarios = (
        # 1: negative start character (this should really never happen)
        (-1, len("This"), (0, 0)),
        # 2: end character exceeds the sentence length, for whichever reason
        (34, len("purposes") + 1, (6, 6)),
    )
    for span_start, span_length, expected_span in scenarios:
        char_span = (span_start, span_start + span_length)
        found_span, had_error = util.char_span_to_token_span(token_offsets, char_span)
        assert found_span == expected_span
        assert had_error
def test_char_span_to_token_span_handles_undertokenization(self):
    """Spans covering only part of an under-tokenized token map to the whole token.

    The passage contains two tokens the tokenizer is expected to leave
    unsplit, "one#here" in the middle and "the#end" at the end.  Looking up
    either half of such a token must return the index of the whole token and
    set the error flag.

    Every scenario runs through the same loop, so each one checks its own
    error flag.  The hand-unrolled version unpacked the last result into a
    misspelled name (``errory``) and so re-checked the previous scenario's
    flag.
    """
    tokenizer = SpacyTokenizer()
    passage = "This sentence will have two under tokenized tokens, one#here and one at the#end"
    tokens = tokenizer.tokenize(passage)
    offsets = [(t.idx, t.idx + len(t.text)) for t in tokens]

    scenarios = [
        # (start character, looked-up fragment, expected token span)
        # 1: under tokenized mid-sentence, first part of "one#here"
        (52, "one", (9, 9)),
        # 2: under tokenized mid-sentence, second part of "one#here"
        (56, "here", (9, 9)),
        # 3: under tokenized at the end, first part of "the#end"
        (72, "the", (13, 13)),
        # 4: under tokenized at the end, second part of "the#end"
        #    (this used to cause an IndexError)
        (76, "end", (13, 13)),
    ]
    for start, fragment, expected_span in scenarios:
        end = start + len(fragment)
        token_span, error = util.char_span_to_token_span(offsets, (start, end))
        assert token_span == expected_span
        assert error
class MyReader(DatasetReader):
    """
    Reads a text file line by line and places each line in a `TextField`
    stored under the configured field name.
    """

    def __init__(self, field_name: str) -> None:
        super().__init__()
        self.field_name = field_name
        self.tokenizer = SpacyTokenizer()
        self.token_indexers: Dict[str, TokenIndexer] = {
            "tokens": SingleIdTokenIndexer()
        }

    def text_to_instance(self, sentence: str) -> Instance:  # type: ignore
        """Tokenize ``sentence`` and wrap it in an instance keyed by ``field_name``."""
        sentence_field = TextField(
            self.tokenizer.tokenize(sentence), self.token_indexers
        )
        return Instance({self.field_name: sentence_field})

    def _read(self, file_path: str):
        """Yield one instance for every line of ``file_path``."""
        with open(file_path) as data_file:
            for raw_line in data_file:
                instance = self.text_to_instance(raw_line)
                yield instance
def test_do_lowercase(self):
    """``do_lowercase`` controls whether capitalized words can match the vocabulary."""
    # The default tokenizer does not lowercase anything itself.
    spacy_tokenizer = SpacyTokenizer()

    # Expected ids (Quick is UNK because of capitalization):
    #   2   1     5     6   8      9    2   15 10 11 14   1
    tokens = spacy_tokenizer.tokenize(
        "the Quick brown fox jumped over the laziest lazy elmo"
    )

    vocab = Vocabulary()
    vocab_file = str(self.FIXTURES_ROOT / "bert" / "vocab.txt")

    # Lowercasing off: "Quick" is out of vocabulary and gets id 1.
    cased_indexer = PretrainedBertIndexer(vocab_file, do_lowercase=False)
    cased_ids = cased_indexer.tokens_to_indices(tokens, vocab)["input_ids"]
    assert cased_ids == [16, 2, 1, 5, 6, 8, 9, 2, 15, 10, 11, 14, 1, 17]

    # Lowercasing is the default: "Quick" is now indexed as 3 ( == "quick").
    default_indexer = PretrainedBertIndexer(vocab_file)
    lowercased_ids = default_indexer.tokens_to_indices(tokens, vocab)["input_ids"]
    assert lowercased_ids == [16, 2, 3, 5, 6, 8, 9, 2, 15, 10, 11, 14, 1, 17]
class OpenIePredictor(Predictor):
    """
    Predictor for the [`SemanticRolelabeler`](../models/semantic_role_labeler.md) model
    (in its Open Information variant).
    Used by online demo and for prediction on an input file using command line.
    """

    def __init__(self, model: Model, dataset_reader: DatasetReader) -> None:
        super().__init__(model, dataset_reader)
        self._tokenizer = SpacyTokenizer(pos_tags=True)

    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        """
        Build one instance for a single predicate of an already tokenized sentence.

        Expects JSON that looks like `{"sentence": "...", "predicate_index": "..."}`,
        where `sentence` holds the tokens and `predicate_index` is the word
        index of the predicate to produce Open IE extractions for.
        """
        tokens = json_dict["sentence"]
        predicate_index = int(json_dict["predicate_index"])
        # Indicator vector: 1 at the predicate position, 0 everywhere else.
        verb_labels = [0] * len(tokens)
        verb_labels[predicate_index] = 1
        return self._dataset_reader.text_to_instance(tokens, verb_labels)

    @overrides
    def predict_json(self, inputs: JsonDict) -> JsonDict:
        """
        Tokenize the sentence, run the model once per verb, and merge the results.

        A sentence with several verbs leads to several instances, one per
        token whose POS tag is `VERB`.

        Expects JSON that looks like `{"sentence": "..."}`

        Returns the sanitized form of a dictionary that looks like

        .. code-block:: js

            {"verbs": [{"verb": "...", "description": "...", "tags": [...]}, ...],
             "words": [...]}
        """
        sent_tokens = self._tokenizer.tokenize(inputs["sentence"])

        # Positions of all verbs in the input sentence.
        verb_positions = [
            position
            for position, token in enumerate(sent_tokens)
            if token.pos_ == "VERB"
        ]

        # One instance per verb; all are created before the model is run.
        instances = [
            self._json_to_instance(
                {"sentence": sent_tokens, "predicate_index": position}
            )
            for position in verb_positions
        ]

        # Run the model and clean up every predicted label.
        outputs = []
        for instance in instances:
            predicted_tags = self._model.forward_on_instance(instance)["tags"]
            outputs.append([sanitize_label(label) for label in predicted_tags])

        # Consolidate predictions
        pred_dict = consolidate_predictions(outputs, sent_tokens)

        verb_entries = []
        for raw_tags in pred_dict.values():
            # Join multi-word predicates
            tags = join_mwp(raw_tags)
            description = make_oie_string(sent_tokens, tags)
            verb_entries.append(
                {
                    "verb": get_predicate_text(sent_tokens, tags),
                    "description": description,
                    "tags": tags,
                }
            )

        return sanitize({"verbs": verb_entries, "words": sent_tokens})
class TestSpacyTokenizer(AllenNlpTestCase):
    """Tests for ``SpacyTokenizer``: token boundaries, batching, token types and params."""

    def setup_method(self):
        super().setup_method()
        self.word_tokenizer = SpacyTokenizer()

    def test_tokenize_handles_complex_punctuation(self):
        """Brackets and quotes are split off, and ``idx`` points back into the sentence."""
        sentence = "this (sentence) has 'crazy' \"punctuation\"."
        expected_tokens = [
            "this",
            "(",
            "sentence",
            ")",
            "has",
            "'",
            "crazy",
            "'",
            '"',
            "punctuation",
            '"',
            ".",
        ]
        tokens = self.word_tokenizer.tokenize(sentence)
        token_text = [t.text for t in tokens]
        assert token_text == expected_tokens
        for token in tokens:
            start = token.idx
            end = start + len(token.text)
            assert sentence[start:end] == token.text

    def test_tokenize_handles_contraction(self):
        """Contractions and possessives are split into separate tokens."""
        # "ain't" becomes "ai" + "n't" and "joe's" becomes "joe" + "'s".
        sentence = "it ain't joe's problem; would been yesterday"
        expected_tokens = [
            "it",
            "ai",
            "n't",
            "joe",
            "'s",
            "problem",
            ";",
            "would",
            "been",
            "yesterday",
        ]
        tokens = [t.text for t in self.word_tokenizer.tokenize(sentence)]
        assert tokens == expected_tokens

    def test_tokenize_handles_multiple_contraction(self):
        """Stacked contractions are split into all of their parts."""
        sentence = "wouldn't've"
        expected_tokens = ["would", "n't", "'ve"]
        tokens = [t.text for t in self.word_tokenizer.tokenize(sentence)]
        assert tokens == expected_tokens

    def test_tokenize_handles_final_apostrophe(self):
        """A trailing possessive apostrophe becomes its own token."""
        sentence = "the jones' house"
        expected_tokens = ["the", "jones", "'", "house"]
        tokens = [t.text for t in self.word_tokenizer.tokenize(sentence)]
        assert tokens == expected_tokens

    def test_tokenize_removes_whitespace_tokens(self):
        """Newlines, vertical tabs and spaces never show up as tokens."""
        sentence = "the\n jones' house \x0b 55"
        expected_tokens = ["the", "jones", "'", "house", "55"]
        tokens = [t.text for t in self.word_tokenizer.tokenize(sentence)]
        assert tokens == expected_tokens

    def test_tokenize_handles_special_cases(self):
        """Abbreviations such as "Mr." and "e.g." stay in one piece."""
        # note that the etc. doesn't quite work --- we can special case this if we want.
        sentence = "Mr. and Mrs. Jones, etc., went to, e.g., the store"
        expected_tokens = [
            "Mr.",
            "and",
            "Mrs.",
            "Jones",
            ",",
            "etc",
            ".",
            ",",
            "went",
            "to",
            ",",
            "e.g.",
            ",",
            "the",
            "store",
        ]
        tokens = [t.text for t in self.word_tokenizer.tokenize(sentence)]
        assert tokens == expected_tokens

    def test_batch_tokenization(self):
        """``batch_tokenize`` gives the same token texts as tokenizing one by one."""
        # Four separate sentences.  A comma was missing after the third one,
        # which made Python concatenate it with the fourth into one string.
        sentences = [
            "This is a sentence",
            "This isn't a sentence.",
            "This is the 3rd sentence.",
            "Here's the 'fourth' sentence.",
        ]
        batch_split = self.word_tokenizer.batch_tokenize(sentences)
        separately_split = [self.word_tokenizer.tokenize(sentence) for sentence in sentences]
        assert len(batch_split) == len(separately_split)
        for batch_sentence, separate_sentence in zip(batch_split, separately_split):
            assert len(batch_sentence) == len(separate_sentence)
            for batch_word, separate_word in zip(batch_sentence, separate_sentence):
                assert batch_word.text == separate_word.text

    def test_keep_spacy_tokens(self):
        """``keep_spacy_tokens`` decides whether allennlp or spacy tokens are returned."""
        word_tokenizer = SpacyTokenizer()
        sentence = "This should be an allennlp Token"
        tokens = word_tokenizer.tokenize(sentence)
        assert tokens
        assert all(isinstance(token, Token) for token in tokens)

        word_tokenizer = SpacyTokenizer(keep_spacy_tokens=True)
        sentence = "This should be a spacy Token"
        tokens = word_tokenizer.tokenize(sentence)
        assert tokens
        assert all(isinstance(token, spacy.tokens.Token) for token in tokens)

    def test_to_params(self):
        """``to_params`` reports the registered type and every constructor setting."""
        tokenizer = SpacyTokenizer()
        params = tokenizer.to_params()
        assert isinstance(params, Params)
        assert params.params == {
            "type": "spacy",
            "language": tokenizer._language,
            "pos_tags": tokenizer._pos_tags,
            "parse": tokenizer._parse,
            "ner": tokenizer._ner,
            "keep_spacy_tokens": tokenizer._keep_spacy_tokens,
            "split_on_spaces": tokenizer._split_on_spaces,
            "start_tokens": tokenizer._start_tokens,
            "end_tokens": tokenizer._end_tokens,
        }
class TestTableQuestionContext(SemparseTestCase):
    """Tests for ``TableQuestionContext``.

    Each test tokenizes a question, builds a context from a fixture table and
    checks the extracted table data, question entities, column types or
    knowledge graph.
    """

    def setup_method(self):
        """Create the POS-tagging tokenizer used to tokenize every question."""
        super().setup_method()
        self.tokenizer = SpacyTokenizer(pos_tags=True)

    def test_table_data(self):
        """A tagged table file yields typed (date/number/string) cell values per row."""
        question = "what was the attendance when usl a league played?"
        question_tokens = self.tokenizer.tokenize(question)
        test_file = f"{self.FIXTURES_ROOT}/data/wikitables/sample_table.tagged"
        table_question_context = TableQuestionContext.read_from_file(test_file, question_tokens)
        assert table_question_context.table_data == [
            {
                "date_column:year": Date(2001, -1, -1),
                "number_column:year": 2001.0,
                "string_column:year": "2001",
                "number_column:division": 2.0,
                "string_column:division": "2",
                "string_column:league": "usl_a_league",
                "string_column:regular_season": "4th_western",
                "number_column:regular_season": 4.0,
                "string_column:playoffs": "quarterfinals",
                "string_column:open_cup": "did_not_qualify",
                "number_column:open_cup": None,
                "number_column:avg_attendance": 7169.0,
                "string_column:avg_attendance": "7_169",
            },
            {
                "date_column:year": Date(2005, -1, -1),
                "number_column:year": 2005.0,
                "string_column:year": "2005",
                "number_column:division": 2.0,
                "string_column:division": "2",
                "string_column:league": "usl_first_division",
                "string_column:regular_season": "5th",
                "number_column:regular_season": 5.0,
                "string_column:playoffs": "quarterfinals",
                "string_column:open_cup": "4th_round",
                "number_column:open_cup": 4.0,
                "number_column:avg_attendance": 6028.0,
                "string_column:avg_attendance": "6_028",
            },
        ]

    def test_table_data_from_untagged_file(self):
        """An untagged TSV read via ``read_from_lines`` yields the reduced set of typed values."""
        question = "what was the attendance when usl a league played?"
        question_tokens = self.tokenizer.tokenize(question)
        test_file = f"{self.FIXTURES_ROOT}/data/wikitables/sample_table.tsv"
        # Use a context manager so the fixture file handle is closed
        # deterministically instead of being left to the garbage collector.
        with open(test_file) as table_file:
            table_lines = [line.strip() for line in table_file]
        table_question_context = TableQuestionContext.read_from_lines(table_lines, question_tokens)
        # The content in the table represented by the untagged file we are reading here is the same as the one we
        # had in the tagged file above, except that we have a "Score" column instead of "Avg. Attendance" column,
        # which is changed to test the num2 extraction logic. I've shown the values not being extracted here as
        # well and commented them out.
        assert table_question_context.table_data == [
            {
                "number_column:year": 2001.0,
                # The value extraction logic we have for untagged lines does
                # not extract this value as a date.
                # 'date_column:year': Date(2001, -1, -1),
                "string_column:year": "2001",
                "number_column:division": 2.0,
                "string_column:division": "2",
                "string_column:league": "usl_a_league",
                "string_column:regular_season": "4th_western",
                # We only check for strings that are entirely numbers. So 4.0
                # will not be extracted.
                # 'number_column:regular_season': 4.0,
                "string_column:playoffs": "quarterfinals",
                "string_column:open_cup": "did_not_qualify",
                # 'number_column:open_cup': None,
                "number_column:score": 20.0,
                "num2_column:score": 30.0,
                "string_column:score": "20_30",
            },
            {
                "number_column:year": 2005.0,
                # 'date_column:year': Date(2005, -1, -1),
                "string_column:year": "2005",
                "number_column:division": 2.0,
                "string_column:division": "2",
                "string_column:league": "usl_first_division",
                "string_column:regular_season": "5th",
                # Same here as in the "regular_season" column for the first row.
                # 5.0 will not be extracted from "5th".
                # 'number_column:regular_season': 5.0,
                "string_column:playoffs": "quarterfinals",
                "string_column:open_cup": "4th_round",
                # 'number_column:open_cup': 4.0,
                "number_column:score": 50.0,
                "num2_column:score": 40.0,
                "string_column:score": "50_40",
            },
        ]

    def test_number_extraction(self):
        """Numbers in the question are returned with their token indices."""
        question = """how many players on the 191617 illinois fighting illini men's basketball team had more than 100 points scored?"""
        question_tokens = self.tokenizer.tokenize(question)
        test_file = f"{self.FIXTURES_ROOT}/data/corenlp_processed_tables/TEST-7.table"
        table_question_context = TableQuestionContext.read_from_file(test_file, question_tokens)
        _, number_entities = table_question_context.get_entities_from_question()
        assert number_entities == [("191617", 5), ("100", 16)]

    def test_date_extraction(self):
        """A month name, day and year in the question each produce a number entity."""
        question = "how many laps did matt kenset complete on february 26, 2006."
        question_tokens = self.tokenizer.tokenize(question)
        test_file = f"{self.FIXTURES_ROOT}/data/corenlp_processed_tables/TEST-8.table"
        table_question_context = TableQuestionContext.read_from_file(test_file, question_tokens)
        _, number_entities = table_question_context.get_entities_from_question()
        assert number_entities == [("2", 8), ("26", 9), ("2006", 11)]

    def test_date_extraction_2(self):
        """A bare year in the question is extracted as a number entity."""
        question = """how many different players scored for the san jose earthquakes during their 1979 home opener against the timbers?"""
        question_tokens = self.tokenizer.tokenize(question)
        test_file = f"{self.FIXTURES_ROOT}/data/corenlp_processed_tables/TEST-6.table"
        table_question_context = TableQuestionContext.read_from_file(test_file, question_tokens)
        _, number_entities = table_question_context.get_entities_from_question()
        assert number_entities == [("1979", 12)]

    def test_multiword_entity_extraction(self):
        """Multi-word cell values ("south korea") are matched as a single string entity."""
        question = "was the positioning better the year of the france venue or the year of the south korea venue?"
        question_tokens = self.tokenizer.tokenize(question)
        test_file = f"{self.FIXTURES_ROOT}/data/corenlp_processed_tables/TEST-3.table"
        table_question_context = TableQuestionContext.read_from_file(test_file, question_tokens)
        entities, _ = table_question_context.get_entities_from_question()
        assert entities == [
            ("string:france", ["string_column:venue"]),
            ("string:south_korea", ["string_column:venue"]),
        ]

    def test_rank_number_extraction(self):
        """The ordinal word "first" is extracted as the number "1"."""
        question = "what was the first tamil-language film in 1943?"
        question_tokens = self.tokenizer.tokenize(question)
        test_file = f"{self.FIXTURES_ROOT}/data/corenlp_processed_tables/TEST-1.table"
        table_question_context = TableQuestionContext.read_from_file(test_file, question_tokens)
        _, numbers = table_question_context.get_entities_from_question()
        assert numbers == [("1", 3), ("1943", 9)]

    def test_null_extraction(self):
        """A question with no table matches and no numbers yields two empty lists."""
        question = "on what date did the eagles score the least points?"
        question_tokens = self.tokenizer.tokenize(question)
        test_file = f"{self.FIXTURES_ROOT}/data/corenlp_processed_tables/TEST-2.table"
        table_question_context = TableQuestionContext.read_from_file(test_file, question_tokens)
        entities, numbers = table_question_context.get_entities_from_question()
        # "Eagles" does not appear in the table.
        assert entities == []
        assert numbers == []

    def test_numerical_column_type_extraction(self):
        """Numeric columns are exposed under ``number_column:`` names."""
        question = """how many players on the 191617 illinois fighting illini men's basketball team had more than 100 points scored?"""
        question_tokens = self.tokenizer.tokenize(question)
        test_file = f"{self.FIXTURES_ROOT}/data/corenlp_processed_tables/TEST-7.table"
        table_question_context = TableQuestionContext.read_from_file(test_file, question_tokens)
        column_names = table_question_context.column_names
        assert "number_column:games_played" in column_names
        assert "number_column:field_goals" in column_names
        assert "number_column:free_throws" in column_names
        assert "number_column:points" in column_names

    def test_date_column_type_extraction_1(self):
        """A date-valued column is exposed under a ``date_column:`` name."""
        question = "how many were elected?"
        question_tokens = self.tokenizer.tokenize(question)
        test_file = f"{self.FIXTURES_ROOT}/data/corenlp_processed_tables/TEST-5.table"
        table_question_context = TableQuestionContext.read_from_file(test_file, question_tokens)
        column_names = table_question_context.column_names
        assert "date_column:first_elected" in column_names

    def test_date_column_type_extraction_2(self):
        """Several date-valued columns in one table are all exposed as ``date_column:``."""
        question = "how many were elected?"
        question_tokens = self.tokenizer.tokenize(question)
        test_file = f"{self.FIXTURES_ROOT}/data/corenlp_processed_tables/TEST-9.table"
        table_question_context = TableQuestionContext.read_from_file(test_file, question_tokens)
        column_names = table_question_context.column_names
        assert "date_column:date_of_appointment" in column_names
        assert "date_column:date_of_election" in column_names

    def test_string_column_types_extraction(self):
        """Text columns are exposed under ``string_column:`` names."""
        question = "how many were elected?"
        question_tokens = self.tokenizer.tokenize(question)
        test_file = f"{self.FIXTURES_ROOT}/data/corenlp_processed_tables/TEST-10.table"
        table_question_context = TableQuestionContext.read_from_file(test_file, question_tokens)
        column_names = table_question_context.column_names
        assert "string_column:birthplace" in column_names
        assert "string_column:advocate" in column_names
        assert "string_column:notability" in column_names
        assert "string_column:name" in column_names

    def test_number_and_entity_extraction(self):
        """A question can yield both string entities and number entities at once."""
        question = "other than m1 how many notations have 1 in them?"
        question_tokens = self.tokenizer.tokenize(question)
        test_file = f"{self.FIXTURES_ROOT}/data/corenlp_processed_tables/TEST-11.table"
        table_question_context = TableQuestionContext.read_from_file(test_file, question_tokens)
        string_entities, number_entities = table_question_context.get_entities_from_question()
        assert string_entities == [
            ("string:m1", ["string_column:notation"]),
            ("string:1", ["string_column:position"]),
        ]
        assert number_entities == [("1", 2), ("1", 7)]

    def test_get_knowledge_graph(self):
        """The knowledge graph exposes the expected entities, neighbors and entity text."""
        question = "other than m1 how many notations have 1 in them?"
        question_tokens = self.tokenizer.tokenize(question)
        test_file = f"{self.FIXTURES_ROOT}/data/corenlp_processed_tables/TEST-11.table"
        table_question_context = TableQuestionContext.read_from_file(test_file, question_tokens)
        knowledge_graph = table_question_context.get_table_knowledge_graph()
        entities = knowledge_graph.entities
        # -1 is not in entities because there are no date columns in the table.
        assert sorted(entities) == [
            "1",
            "number_column:notation",
            "number_column:position",
            "string:1",
            "string:m1",
            "string_column:mnemonic",
            "string_column:notation",
            "string_column:position",
            "string_column:short_name",
            "string_column:swara",
        ]
        neighbors = knowledge_graph.neighbors
        # Each number extracted from the question will have all number and date columns as
        # neighbors. Each string entity extracted from the question will only have the corresponding
        # column as the neighbor.
        # Compare as sets so the assertion does not depend on neighbor order.
        neighbors_with_sets = {key: set(value) for key, value in neighbors.items()}
        assert neighbors_with_sets == {
            "1": {"number_column:position", "number_column:notation"},
            "string_column:mnemonic": set(),
            "string_column:short_name": set(),
            "string_column:swara": set(),
            "number_column:position": {"1"},
            "number_column:notation": {"1"},
            "string:m1": {"string_column:notation"},
            "string:1": {"string_column:position"},
            "string_column:notation": {"string:m1"},
            "string_column:position": {"string:1"},
        }
        entity_text = knowledge_graph.entity_text
        assert entity_text == {
            "1": "1",
            "string:m1": "m1",
            "string:1": "1",
            "string_column:notation": "notation",
            "number_column:notation": "notation",
            "string_column:mnemonic": "mnemonic",
            "string_column:short_name": "short name",
            "string_column:swara": "swara",
            "number_column:position": "position",
            "string_column:position": "position",
        }

    def test_knowledge_graph_has_correct_neighbors(self):
        """Question numbers neighbor number/date columns; "-1" neighbors only the date column."""
        question = "when was the attendance greater than 5000?"
        question_tokens = self.tokenizer.tokenize(question)
        test_file = f"{self.FIXTURES_ROOT}/data/wikitables/sample_table.tagged"
        table_question_context = TableQuestionContext.read_from_file(test_file, question_tokens)
        knowledge_graph = table_question_context.get_table_knowledge_graph()
        neighbors = knowledge_graph.neighbors
        # '5000' is neighbors with number and date columns. '-1' is in entities because there is a
        # date column, which is its only neighbor.
        assert set(neighbors.keys()) == {
            "date_column:year",
            "number_column:year",
            "string_column:year",
            "number_column:division",
            "string_column:division",
            "string_column:league",
            "string_column:regular_season",
            "number_column:regular_season",
            "string_column:playoffs",
            "string_column:open_cup",
            "number_column:open_cup",
            "number_column:avg_attendance",
            "string_column:avg_attendance",
            "5000",
            "-1",
        }
        assert set(neighbors["date_column:year"]) == {"5000", "-1"}
        assert neighbors["number_column:year"] == ["5000"]
        assert neighbors["string_column:year"] == []
        assert neighbors["number_column:division"] == ["5000"]
        assert neighbors["string_column:division"] == []
        assert neighbors["string_column:league"] == []
        assert neighbors["string_column:regular_season"] == []
        assert neighbors["number_column:regular_season"] == ["5000"]
        assert neighbors["string_column:playoffs"] == []
        assert neighbors["string_column:open_cup"] == []
        assert neighbors["number_column:open_cup"] == ["5000"]
        assert neighbors["number_column:avg_attendance"] == ["5000"]
        assert neighbors["string_column:avg_attendance"] == []
        assert set(neighbors["5000"]) == {
            "date_column:year",
            "number_column:year",
            "number_column:division",
            "number_column:avg_attendance",
            "number_column:regular_season",
            "number_column:open_cup",
        }
        assert neighbors["-1"] == ["date_column:year"]
class IOBDatasetReader(DatasetReader):
    """Dataset reader for tab-separated IOB tagging data.

    Each non-blank input line must hold exactly four tab-separated fields:
    ``sentence``, ``s1_label``, ``description`` and ``tags``. The sentence and
    description are tokenized with the spacy tokenizer; ``s1_label`` and
    ``tags`` are whitespace-separated label sequences for the sentence.
    """

    def __init__(self,
                 token_indexers: Dict[str, TokenIndexer] = None) -> None:
        """Set up tokenizers and indexers.

        Args:
            token_indexers: Indexers for the sentence field. When ``None``
                (the default) the built-in ELMo / character / POS / NER
                indexers are used.
        """
        super().__init__(lazy=False)
        self.tokenizer_space = WhitespaceTokenizer()
        self.tokenizer_spacy = SpacyTokenizer(language="en_core_web_md",
                                              pos_tags=True,
                                              split_on_spaces=True)
        # Honor caller-supplied indexers; previously the argument was accepted
        # but silently discarded in favor of the defaults below.
        if token_indexers is not None:
            self.token_indexers = token_indexers
        else:
            # NOTE(review): 'ner_tags' reads `ent_type_`, but the tokenizer
            # above is not constructed with ner=True -- confirm that the NER
            # feature is actually populated.
            self.token_indexers = {
                'elmo_tokens':
                ELMoTokenCharactersIndexer(),
                'token_characters':
                TokenCharactersIndexer(namespace='character_vocab',
                                       min_padding_length=6),
                'pos_tags':
                SingleIdTokenIndexer(namespace='pos_tag_vocab',
                                     feature_name='tag_'),
                'ner_tags':
                SingleIdTokenIndexer(namespace='ner_tag_vocab',
                                     feature_name='ent_type_')
            }
        self.slot_indexers = {
            'elmo_tokens':
            ELMoTokenCharactersIndexer(),
            'token_characters':
            TokenCharactersIndexer(namespace='character_vocab',
                                   min_padding_length=6)
        }

    def text_to_instance(self,
                         tokens: List[Token],
                         slot: List[Token],
                         s1_tags: List[str] = None,
                         tags: List[str] = None) -> Instance:
        """Build an ``Instance`` from a tokenized sentence and slot description.

        Args:
            tokens: Sentence tokens, indexed with ``self.token_indexers``.
            slot: Slot-description tokens, indexed with ``self.slot_indexers``.
            s1_tags: Optional per-token labels stored under ``"s1_labels"``
                (label namespace ``"s1_labels"``).
            tags: Optional per-token labels stored under ``"labels"``.

        Returns:
            An ``Instance`` with ``"sentence"`` and ``"slot"`` fields, plus the
            label fields for whichever label lists are non-empty.
        """
        sentence_field = TextField(tokens, self.token_indexers)
        slot_field = TextField(slot, self.slot_indexers)
        fields = {"sentence": sentence_field, "slot": slot_field}

        # Truthiness on purpose: both None and an empty list mean "no labels".
        if s1_tags:
            fields["s1_labels"] = SequenceLabelField(
                labels=s1_tags,
                sequence_field=sentence_field,
                label_namespace="s1_labels")

        if tags:
            fields["labels"] = SequenceLabelField(
                labels=tags, sequence_field=sentence_field)

        return Instance(fields)

    def _read(self, file_path: str) -> Iterable[Instance]:
        """Yield one ``Instance`` per non-blank line of ``file_path``.

        Raises:
            ValueError: If a non-blank line does not contain exactly four
                tab-separated fields.
        """
        with open(file_path) as f:
            for line in f:
                line = line.strip()
                # Skip blank lines (e.g. stray trailing newlines) rather than
                # failing on the four-way unpack below.
                if not line:
                    continue
                sentence, s1_label, description, tags = line.split('\t')
                yield self.text_to_instance(
                    self.tokenizer_spacy.tokenize(sentence),
                    self.tokenizer_spacy.tokenize(description),
                    s1_label.split(),
                    tags.split())