def __init__(self, token_indexers: Dict[str, TokenIndexer] = None) -> None:
    super().__init__(lazy=False)
    self.tokenizer_space = WhitespaceTokenizer()
    self.tokenizer_spacy = SpacyTokenizer(
        language="en_core_web_md", pos_tags=True, split_on_spaces=True
    )
    self.token_indexers = {
        'elmo_tokens': ELMoTokenCharactersIndexer(),
        'token_characters': TokenCharactersIndexer(
            namespace='character_vocab', min_padding_length=6
        ),
        'pos_tags': SingleIdTokenIndexer(namespace='pos_tag_vocab', feature_name='tag_'),
        'ner_tags': SingleIdTokenIndexer(namespace='ner_tag_vocab', feature_name='ent_type_'),
    }
    self.slot_indexers = {
        'elmo_tokens': ELMoTokenCharactersIndexer(),
        'token_characters': TokenCharactersIndexer(
            namespace='character_vocab', min_padding_length=6
        ),
    }
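# Hedged usage sketch (not part of the class above): several indexers can view
# the same token sequence through one TextField, so a single utterance yields
# ELMo character ids and POS-tag ids side by side. The sentence is illustrative,
# and the en_core_web_md model is assumed to be installed.
tokenizer = SpacyTokenizer(language="en_core_web_md", pos_tags=True, split_on_spaces=True)
tokens = tokenizer.tokenize("book a flight to Boston")
field = TextField(tokens, {
    'elmo_tokens': ELMoTokenCharactersIndexer(),
    'pos_tags': SingleIdTokenIndexer(namespace='pos_tag_vocab', feature_name='tag_'),
})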
def __init__(self, field_name: str) -> None:
    super().__init__()
    self.field_name = field_name
    self.tokenizer = SpacyTokenizer()
    self.token_indexers: Dict[str, TokenIndexer] = {"tokens": SingleIdTokenIndexer()}
def test_never_lowercase(self):
    # Our default tokenizer doesn't handle lowercasing.
    tokenizer = SpacyTokenizer()

    #            2  15 10 11  6
    sentence = "the laziest fox"
    tokens = tokenizer.tokenize(sentence)
    # Have to append this manually b/c the tokenizer splits "[PAD]" in three.
    tokens.append(Token("[PAD]"))

    vocab = Vocabulary()
    vocab_path = self.FIXTURES_ROOT / "bert" / "vocab.txt"
    token_indexer = PretrainedBertIndexer(str(vocab_path), do_lowercase=True)
    indexed_tokens = token_indexer.tokens_to_indices(tokens, vocab)

    # [PAD] should get recognized and not lowercased: the 0 below is [PAD].
    assert indexed_tokens["input_ids"] == [16, 2, 15, 10, 11, 6, 0, 17]

    # Unless we manually override the set of tokens that are never lowercased.
    token_indexer = PretrainedBertIndexer(
        str(vocab_path), do_lowercase=True, never_lowercase=()
    )
    indexed_tokens = token_indexer.tokens_to_indices(tokens, vocab)

    # Now [PAD] gets lowercased, falls out of the vocab, and becomes [UNK] (the 1 below).
    assert indexed_tokens["input_ids"] == [16, 2, 15, 10, 11, 6, 1, 17]
class DSLSharedTaskDataset(DatasetReader):
    def __init__(self):
        super().__init__(lazy=False)
        self.tokenizer = SpacyTokenizer()
        self.token_indexers = {'tokens': SingleIdTokenIndexer()}

    def _read(self, text_path: str) -> Iterable[Instance]:
        with open(text_path, "r") as text_data:
            lines = text_data.read().splitlines()
        for line in lines:
            try:
                text, label = line.strip().split('\t')
            except ValueError:
                # Skip malformed lines instead of silently reusing the previous
                # iteration's text and label.
                print(line)
                continue
            text_field = TextField(self.tokenizer.tokenize(text), self.token_indexers)
            label_field = LabelField(label)
            fields = {'text': text_field, 'label': label_field}
            yield Instance(fields)

    def text_to_instance(self, text: str, label: str = None) -> Instance:
        tokens = self.tokenizer.tokenize(text)
        text_field = TextField(tokens, self.token_indexers)
        fields = {'text': text_field}
        if label:
            fields['label'] = LabelField(label)
        return Instance(fields)
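# Minimal usage sketch for the reader above; the path is illustrative, and each
# line of the file is expected to hold "text<TAB>label".
reader = DSLSharedTaskDataset()
instances = list(reader.read("data/dsl_train.tsv"))
print(instances[0].fields["label"].label)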
def setup_method(self):
    self.tokenizer = SpacyTokenizer(pos_tags=True)
    self.utterance = self.tokenizer.tokenize("where is mersin?")
    self.token_indexers = {"tokens": SingleIdTokenIndexer("tokens")}
    table_file = self.FIXTURES_ROOT / "data" / "wikitables" / "tables" / "341.tagged"
    self.graph = TableQuestionContext.read_from_file(
        table_file, self.utterance
    ).get_table_knowledge_graph()
    self.vocab = Vocabulary()
    self.name_index = self.vocab.add_token_to_namespace("name", namespace="tokens")
    self.in_index = self.vocab.add_token_to_namespace("in", namespace="tokens")
    self.english_index = self.vocab.add_token_to_namespace("english", namespace="tokens")
    self.location_index = self.vocab.add_token_to_namespace("location", namespace="tokens")
    self.mersin_index = self.vocab.add_token_to_namespace("mersin", namespace="tokens")
    self.oov_index = self.vocab.get_token_index("random OOV string", namespace="tokens")
    self.edirne_index = self.oov_index
    self.field = KnowledgeGraphField(
        self.graph, self.utterance, self.token_indexers, self.tokenizer
    )
    super().setup_method()
def test_char_span_to_token_span_handles_hard_cases(self):
    # An earlier version of the code had a hard time when the answer was the last token in the
    # passage. This tests that case, on the instance that used to fail.
    tokenizer = SpacyTokenizer()
    passage = (
        "Beyonc\u00e9 is believed to have first started a relationship with Jay Z "
        + 'after a collaboration on "\'03 Bonnie & Clyde", which appeared on his seventh '
        + "album The Blueprint 2: The Gift & The Curse (2002). Beyonc\u00e9 appeared as Jay "
        + "Z's girlfriend in the music video for the song, which would further fuel "
        + "speculation of their relationship. On April 4, 2008, Beyonc\u00e9 and Jay Z were "
        + "married without publicity. As of April 2014, the couple have sold a combined 300 "
        + "million records together. The couple are known for their private relationship, "
        + "although they have appeared to become more relaxed in recent years. Beyonc\u00e9 "
        + 'suffered a miscarriage in 2010 or 2011, describing it as "the saddest thing" '
        + "she had ever endured. She returned to the studio and wrote music in order to cope "
        + "with the loss. In April 2011, Beyonc\u00e9 and Jay Z traveled to Paris in order "
        + "to shoot the album cover for her 4, and unexpectedly became pregnant in Paris."
    )
    start = 912
    end = 912 + len("Paris.")
    tokens = tokenizer.tokenize(passage)
    offsets = [(t.idx, t.idx + len(t.text)) for t in tokens]
    token_span = util.char_span_to_token_span(offsets, (start, end))[0]
    assert token_span == (184, 185)
def __init__(
    self, model: Model, dataset_reader: DatasetReader, language: str = "en_core_web_sm"
) -> None:
    super().__init__(model, dataset_reader)
    self._language = language
    self._tokenizer = SpacyTokenizer(language=language, pos_tags=True)
def read(fn: str) -> Iterable[List[Extraction]]:
    tokenizer = SpacyTokenizer(pos_tags=True)
    prev_sent: List[Extraction] = []

    with open(fn) as fin:
        for line in tqdm(fin):
            data = line.strip().split("\t")
            confidence = data[0]
            if not all(data[2:5]):
                # Make sure that all required elements are present.
                continue
            arg1, rel, args2 = (parse_element(e) for e in data[2:5])

            # Exactly one subject and one relation, and at least one object.
            if len(rel) == 1 and len(arg1) == 1 and len(args2) >= 1:
                sent = data[5]
                cur_ex = Extraction(
                    sent=sent,
                    toks=tokenizer.tokenize(sent),
                    arg1=arg1[0],
                    rel=rel[0],
                    args2=args2,
                    confidence=confidence,
                )

                # Decide whether to append or yield.
                if not prev_sent or prev_sent[0].sent == sent:
                    prev_sent.append(cur_ex)
                else:
                    yield prev_sent
                    prev_sent = [cur_ex]

    if prev_sent:
        # Yield the last element.
        yield prev_sent
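# Hypothetical driver for read() above: extractions are yielded grouped by
# sentence, so each yielded list shares one source sentence. The path is illustrative.
for sentence_extractions in read("data/oie_corpus.tsv"):
    print(sentence_extractions[0].sent, "->", len(sentence_extractions), "extractions")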
def read_dataset(file_path):
    with open(file_path) as dataset_file:
        tokenizer = SpacyTokenizer()
        dataset_json = json.load(dataset_file)
        dialogs = []
        for dialog in dataset_json:
            dialog_idx = dialog["dialogue_idx"]
            dialog = dialog['dialogue']
            dialog_context = None
            for turn_i, turn in enumerate(dialog):
                sys_utt = turn['system_transcript']
                user_utt = turn['transcript']
                tokenized_sys_utt = tokenizer.tokenize(sys_utt)
                if turn_i != 0:
                    tokenized_sys_utt = [Token(text="<S>", lemma_="<S>")] + tokenized_sys_utt
                tokenized_user_utt = tokenizer.tokenize(user_utt)
                if turn_i != len(dialog) - 1:
                    tokenized_user_utt = tokenized_user_utt + [Token(text="</S>", lemma_="</S>")]
                if dialog_context is None:
                    dialog_context = tokenized_sys_utt + tokenized_user_utt
                else:
                    dialog_context += tokenized_sys_utt + tokenized_user_utt
            dialog_context = [t.text for t in dialog_context]
            dialogs.append((dialog_idx, [dialog_context]))
        return dialogs
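# Illustrative call (the file name is hypothetical): read_dataset() returns
# (dialogue_idx, [token list]) pairs, with <S>/</S> markers inserted between
# all but the outermost turns.
dialogs = read_dataset("woz_dev.json")
dialog_idx, (context,) = dialogs[0]
print(dialog_idx, context[:10])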
def search(
    tables_directory: str,
    data: JsonDict,
    output_path: str,
    max_path_length: int,
    max_num_logical_forms: int,
    use_agenda: bool,
    output_separate_files: bool,
    conservative_agenda: bool,
) -> None:
    print(f"Starting search with {len(data)} instances", file=sys.stderr)
    language_logger = logging.getLogger("allennlp.semparse.domain_languages.wikitables_language")
    language_logger.setLevel(logging.ERROR)
    tokenizer = SpacyTokenizer()
    if output_separate_files and not os.path.exists(output_path):
        os.makedirs(output_path)
    if not output_separate_files:
        output_file_pointer = open(output_path, "w")
    for instance_data in data:
        utterance = instance_data["question"]
        question_id = instance_data["id"]
        if utterance.startswith('"') and utterance.endswith('"'):
            utterance = utterance[1:-1]
        # For example: csv/200-csv/47.csv -> tagged/200-tagged/47.tagged
        table_file = instance_data["table_filename"].replace("csv", "tagged")
        target_list = instance_data["target_values"]
        tokenized_question = tokenizer.tokenize(utterance)
        table_file = f"{tables_directory}/{table_file}"
        context = TableQuestionContext.read_from_file(table_file, tokenized_question)
        world = WikiTablesLanguage(context)
        walker = ActionSpaceWalker(world, max_path_length=max_path_length)
        correct_logical_forms = []
        if use_agenda:
            agenda = world.get_agenda(conservative=conservative_agenda)
            allow_partial_match = not conservative_agenda
            all_logical_forms = walker.get_logical_forms_with_agenda(
                agenda=agenda, max_num_logical_forms=10000, allow_partial_match=allow_partial_match
            )
        else:
            all_logical_forms = walker.get_all_logical_forms(max_num_logical_forms=10000)
        for logical_form in all_logical_forms:
            if world.evaluate_logical_form(logical_form, target_list):
                correct_logical_forms.append(logical_form)
        if output_separate_files and correct_logical_forms:
            with gzip.open(f"{output_path}/{question_id}.gz", "wt") as output_file_pointer:
                for logical_form in correct_logical_forms:
                    print(logical_form, file=output_file_pointer)
        elif not output_separate_files:
            print(f"{question_id} {utterance}", file=output_file_pointer)
            if use_agenda:
                print(f"Agenda: {agenda}", file=output_file_pointer)
            if not correct_logical_forms:
                print("NO LOGICAL FORMS FOUND!", file=output_file_pointer)
            for logical_form in correct_logical_forms[:max_num_logical_forms]:
                print(logical_form, file=output_file_pointer)
            print(file=output_file_pointer)
    if not output_separate_files:
        output_file_pointer.close()
def test_passes_through_correctly(self):
    tokenizer = SpacyTokenizer()
    sentence = "this (sentence) has 'crazy' \"punctuation\"."
    tokens = [t.text for t in tokenizer.tokenize(sentence)]
    expected_tokens = [
        "this",
        "(",
        "sentence",
        ")",
        "has",
        "'",
        "crazy",
        "'",
        '"',
        "punctuation",
        '"',
        ".",
    ]
    self.assertSequenceEqual(tokens, expected_tokens)
def test_crashes_with_empty_feature_value_and_no_default(self):
    tokenizer = SpacyTokenizer(parse=True)
    tokens = tokenizer.tokenize("This is a sentence.")
    tokens = list(tokens) + [Token("</S>")]
    vocab = Vocabulary()
    vocab.add_token_to_namespace("ROOT", namespace="dep_labels")
    vocab.add_token_to_namespace("NONE", namespace="dep_labels")
    indexer = SingleIdTokenIndexer(namespace="dep_labels", feature_name="dep_")
    with pytest.raises(ValueError):
        indexer.tokens_to_indices([tokens[-1]], vocab)
def test_profile():
    data_path = "https://storage.googleapis.com/tyoyo/jwtd/v1.0/dev.tsv"
    dataset_reader = Seq2SeqDatasetReader(
        source_tokenizer=SpacyTokenizer(language="ja_core_news_sm"),
        target_tokenizer=SpacyTokenizer(language="ja_core_news_sm"),
        source_max_tokens=64,
        target_max_tokens=64,
        start_symbol="STARTSYMBOL",
        end_symbol="ENDSYMBOL",
    )
    dataset = dataset_reader.read(data_path)
def test_empty_list_can_be_tensorized(self):
    tokenizer = SpacyTokenizer()
    tokens = tokenizer.tokenize("Foo")
    text_field = TextField(tokens, self.word_indexer)
    list_field = ListField([text_field.empty_field()])
    fields = {
        "list": list_field,
        "bar": TextField(tokenizer.tokenize("BAR"), self.word_indexer),
    }
    instance = Instance(fields)
    instance.index_fields(self.vocab)
    instance.as_tensor_dict()
def test_no_namespace_means_no_counting(self):
    tokenizer = SpacyTokenizer(parse=True)
    tokens = tokenizer.tokenize("This is a sentence.")
    tokens = [Token("<S>")] + list(tokens) + [Token("</S>")]
    indexer = SingleIdTokenIndexer(namespace=None, feature_name="text_id")

    def fail():
        assert False

    counter = defaultdict(fail)
    for token in tokens:
        indexer.count_vocab_items(token, counter)
def setUp(self):
    self.vocab = Vocabulary()
    self.vocab.add_token_to_namespace("this", "words")
    self.vocab.add_token_to_namespace("is", "words")
    self.vocab.add_token_to_namespace("a", "words")
    self.vocab.add_token_to_namespace("sentence", "words")
    self.vocab.add_token_to_namespace("s", "characters")
    self.vocab.add_token_to_namespace("e", "characters")
    self.vocab.add_token_to_namespace("n", "characters")
    self.vocab.add_token_to_namespace("t", "characters")
    self.vocab.add_token_to_namespace("c", "characters")
    for label in ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"]:
        self.vocab.add_token_to_namespace(label, "labels")

    self.word_indexer = {"words": SingleIdTokenIndexer("words")}
    self.words_and_characters_indexers = {
        "words": SingleIdTokenIndexer("words"),
        "characters": TokenCharactersIndexer("characters", min_padding_length=1),
    }
    self.field1 = TextField(
        [Token(t) for t in ["this", "is", "a", "sentence"]], self.word_indexer
    )
    self.field2 = TextField(
        [Token(t) for t in ["this", "is", "a", "different", "sentence"]], self.word_indexer
    )
    self.field3 = TextField(
        [Token(t) for t in ["this", "is", "another", "sentence"]], self.word_indexer
    )
    self.empty_text_field = self.field1.empty_field()
    self.index_field = IndexField(1, self.field1)
    self.empty_index_field = self.index_field.empty_field()
    self.sequence_label_field = SequenceLabelField([1, 1, 0, 1], self.field1)
    self.empty_sequence_label_field = self.sequence_label_field.empty_field()

    tokenizer = SpacyTokenizer()
    tokens = tokenizer.tokenize("Foo")
    text_field = TextField(tokens, self.word_indexer)
    empty_list_field = ListField([text_field.empty_field()])
    empty_fields = {"list_tensor": empty_list_field}
    self.empty_instance = Instance(empty_fields)

    non_empty_list_field = ListField([text_field])
    non_empty_fields = {"list_tensor": non_empty_list_field}
    self.non_empty_instance = Instance(non_empty_fields)

    super().setUp()
def __init__(
    self,
    token_indexers: Dict[str, TokenIndexer],
    max_sequence_length: int = None,
    keep_prob: float = 1.0,
    lazy: bool = False,
) -> None:
    super().__init__(lazy=lazy)
    self._max_sequence_length = max_sequence_length
    self._token_indexers = token_indexers
    self._tokenizer = SpacyTokenizer()
    self._keep_prob = keep_prob
    self._bert = "bert" in token_indexers
def test_enumerate_spans_enumerates_all_spans(self):
    tokenizer = SpacyTokenizer(pos_tags=True)
    sentence = tokenizer.tokenize("This is a sentence.")
    spans = span_utils.enumerate_spans(sentence)
    assert spans == [
        (0, 0), (0, 1), (0, 2), (0, 3), (0, 4),
        (1, 1), (1, 2), (1, 3), (1, 4),
        (2, 2), (2, 3), (2, 4),
        (3, 3), (3, 4),
        (4, 4),
    ]

    spans = span_utils.enumerate_spans(sentence, max_span_width=3, min_span_width=2)
    assert spans == [(0, 1), (0, 2), (1, 2), (1, 3), (2, 3), (2, 4), (3, 4)]

    spans = span_utils.enumerate_spans(sentence, max_span_width=3, min_span_width=2, offset=20)
    assert spans == [(20, 21), (20, 22), (21, 22), (21, 23), (22, 23), (22, 24), (23, 24)]

    def no_prefixed_punctuation(tokens: List[Token]):
        # Only include spans which don't start or end with punctuation.
        return tokens[0].pos_ != "PUNCT" and tokens[-1].pos_ != "PUNCT"

    spans = span_utils.enumerate_spans(
        sentence, max_span_width=3, min_span_width=2, filter_function=no_prefixed_punctuation
    )
    # No longer includes (2, 4) or (3, 4), as these have punctuation
    # as their last element.
    assert spans == [(0, 1), (0, 2), (1, 2), (1, 3), (2, 3)]
def test_to_params(self):
    tokenizer = SpacyTokenizer()
    params = tokenizer.to_params()
    assert isinstance(params, Params)
    assert params.params == {
        "type": "spacy",
        "language": tokenizer._language,
        "pos_tags": tokenizer._pos_tags,
        "parse": tokenizer._parse,
        "ner": tokenizer._ner,
        "keep_spacy_tokens": tokenizer._keep_spacy_tokens,
        "split_on_spaces": tokenizer._split_on_spaces,
        "start_tokens": tokenizer._start_tokens,
        "end_tokens": tokenizer._end_tokens,
    }
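# Hedged follow-up sketch: since to_params() captures the full constructor
# state (including the registered "type"), the resulting Params should
# round-trip through Tokenizer.from_params into an equivalent tokenizer.
rebuilt = Tokenizer.from_params(SpacyTokenizer().to_params())
assert isinstance(rebuilt, SpacyTokenizer)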
def __init__(
    self,
    tokenizer: Tokenizer = None,
    token_indexers: Dict[str, TokenIndexer] = None,
    passage_length_limit: int = None,
    question_length_limit: int = None,
    skip_when_all_empty: List[str] = None,
    instance_format: str = "drop",
    relaxed_span_match_for_finding_labels: bool = True,
    **kwargs,
) -> None:
    super().__init__(**kwargs)
    self._tokenizer = tokenizer or SpacyTokenizer()
    self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
    self.passage_length_limit = passage_length_limit
    self.question_length_limit = question_length_limit
    self.skip_when_all_empty = skip_when_all_empty if skip_when_all_empty is not None else []
    for item in self.skip_when_all_empty:
        assert item in [
            "passage_span",
            "question_span",
            "addition_subtraction",
            "counting",
        ], f"Unsupported skip type: {item}"
    self.instance_format = instance_format
    self.relaxed_span_match_for_finding_labels = relaxed_span_match_for_finding_labels
def __init__(
    self,
    target_namespace: str,
    source_tokenizer: Tokenizer = None,
    target_tokenizer: Tokenizer = None,
    source_token_indexers: Dict[str, TokenIndexer] = None,
    **kwargs,
) -> None:
    super().__init__(**kwargs)
    self._target_namespace = target_namespace
    self._source_tokenizer = source_tokenizer or SpacyTokenizer()
    self._target_tokenizer = target_tokenizer or self._source_tokenizer
    self._source_token_indexers = source_token_indexers or {"tokens": SingleIdTokenIndexer()}
    self._target_token_indexers: Dict[str, TokenIndexer] = {
        "tokens": SingleIdTokenIndexer(namespace=self._target_namespace)
    }
    if (
        isinstance(self._target_tokenizer, PretrainedTransformerTokenizer)
        and self._target_tokenizer._add_special_tokens
    ):
        warnings.warn(
            "'add_special_tokens' is True for target_tokenizer, which is a "
            "PretrainedTransformerTokenizer. This means special tokens, such as "
            "'[CLS]' and '[SEP]', will probably end up in your model's predicted "
            "target sequences. If this is not what you intended, make sure to "
            "specify 'add_special_tokens: False' for your target_tokenizer.",
            UserWarning,
        )
def __init__(
    self,
    source_tokenizer: Tokenizer = None,
    target_tokenizer: Tokenizer = None,
    source_token_indexers: Dict[str, TokenIndexer] = None,
    target_token_indexers: Dict[str, TokenIndexer] = None,
    source_add_start_token: bool = True,
    source_add_end_token: bool = True,
    delimiter: str = "\t",
    source_max_tokens: Optional[int] = None,
    target_max_tokens: Optional[int] = None,
    **kwargs,
) -> None:
    super().__init__(**kwargs)
    self._source_tokenizer = source_tokenizer or SpacyTokenizer()
    self._target_tokenizer = target_tokenizer or self._source_tokenizer
    self._source_token_indexers = source_token_indexers or {"tokens": SingleIdTokenIndexer()}
    self._target_token_indexers = target_token_indexers or self._source_token_indexers
    self._source_add_start_token = source_add_start_token
    self._source_add_end_token = source_add_end_token
    self._delimiter = delimiter
    self._source_max_tokens = source_max_tokens
    self._target_max_tokens = target_max_tokens
    self._source_max_exceeded = 0
    self._target_max_exceeded = 0
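# Usage sketch for the seq2seq reader above (the path is illustrative): with
# the default delimiter, every input line is "source sequence<TAB>target sequence".
# The "source_tokens"/"target_tokens" field names are assumed here.
reader = Seq2SeqDatasetReader(source_max_tokens=64, target_max_tokens=64)
for instance in reader.read("data/train.tsv"):
    print(instance.fields["source_tokens"], instance.fields["target_tokens"])
    break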
def __init__(
    self,
    token_indexers: Dict[str, TokenIndexer] = None,
    tokenizer: Tokenizer = None,
    segment_sentences: bool = False,
    max_sequence_length: int = None,
    skip_label_indexing: bool = False,
    text_key: str = "text",
    label_key: str = "label",
    **kwargs,
) -> None:
    super().__init__(
        manual_distributed_sharding=True, manual_multiprocess_sharding=True, **kwargs
    )
    self._tokenizer = tokenizer or SpacyTokenizer()
    self._segment_sentences = segment_sentences
    self._max_sequence_length = max_sequence_length
    self._skip_label_indexing = skip_label_indexing
    self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
    self._text_key = text_key
    self._label_key = label_key
    if self._segment_sentences:
        self._sentence_segmenter = SpacySentenceSplitter()
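# Hedged sketch: this __init__ matches AllenNLP's TextClassificationJsonReader
# (the class name is an assumption here), which consumes one JSON object per
# line; text_key/label_key select the fields, e.g. {"text": "...", "label": "..."}.
reader = TextClassificationJsonReader(max_sequence_length=256)
instances = list(reader.read("reviews.jsonl"))  # illustrative path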
def __init__(
    self,
    tokenizer: Tokenizer = None,
    token_indexers: Dict[str, TokenIndexer] = None,
    hyperbolic_phrase_indexers: Dict[str, TokenIndexer] = None,
    max_sequence_length: int = None,
    start_tokens: List[str] = None,
    end_tokens: List[str] = None,
    rare_frequency: int = 10,
) -> None:
    super().__init__()
    self._tokenizer = tokenizer or SpacyTokenizer()
    self._token_indexers = token_indexers or {
        'tokens': SingleIdTokenIndexer(namespace='euclidean')
    }
    self._hyperbolic_phrase_indexers = hyperbolic_phrase_indexers or {
        'tokens': SingleIdTokenIndexer(namespace='hyperbolic')
    }
    if max_sequence_length is not None:
        self._max_sequence_length: Union[float, Optional[int]] = max_sequence_length
    else:
        self._max_sequence_length = math.inf
    self._start_tokens = [Token(st) for st in (start_tokens or [])]
    self._end_tokens = [Token(et) for et in (end_tokens or [])]
    self._rare_frequency = rare_frequency
    logger.info("Creating SimpleLanguageModelingDatasetReader")
    logger.info("max_sequence_length=%s", max_sequence_length)
def __init__(
    self,
    pretrained_model: str = None,
    tokenizer: Optional[Tokenizer] = None,
    token_indexers: Dict[str, TokenIndexer] = None,
    max_pieces: int = 512,
    add_prefix: bool = False,
    combine_input_fields: bool = True,
    sample: int = -1,
) -> None:
    super().__init__()
    if pretrained_model is not None:
        self._tokenizer = PretrainedTransformerTokenizer(pretrained_model, max_length=max_pieces)
        token_indexer = PretrainedTransformerIndexer(pretrained_model)
        self._token_indexers = {'tokens': token_indexer}
    else:
        self._tokenizer = tokenizer or SpacyTokenizer()
        self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
    self._sample = sample
    self._add_prefix = add_prefix
    self._combine_input_fields = combine_input_fields
    self._debug_prints = -1
def __init__(
    self,
    tokenizer: Tokenizer = None,
    token_indexers: Dict[str, TokenIndexer] = None,
    passage_length_limit: int = None,
    question_length_limit: int = None,
    skip_impossible_questions: bool = False,
    no_answer_token: Optional[str] = None,
    **kwargs,
) -> None:
    if "skip_invalid_examples" in kwargs:
        import warnings

        warnings.warn(
            "'skip_invalid_examples' is deprecated, please use 'skip_impossible_questions' instead",
            DeprecationWarning,
        )
        skip_impossible_questions = kwargs.pop("skip_invalid_examples")

    super().__init__(
        manual_distributed_sharding=True, manual_multiprocess_sharding=True, **kwargs
    )
    self._tokenizer = tokenizer or SpacyTokenizer()
    self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
    self.passage_length_limit = passage_length_limit
    self.question_length_limit = question_length_limit
    self.skip_impossible_questions = skip_impossible_questions
    self.no_answer_token = no_answer_token
def __init__(
    self,
    lazy: bool = False,
    tables_directory: str = None,
    offline_logical_forms_directory: str = None,
    max_offline_logical_forms: int = 10,
    keep_if_no_logical_forms: bool = False,
    tokenizer: Tokenizer = None,
    question_token_indexers: Dict[str, TokenIndexer] = None,
    table_token_indexers: Dict[str, TokenIndexer] = None,
    use_table_for_vocab: bool = False,
    max_table_tokens: int = None,
    output_agendas: bool = False,
) -> None:
    super().__init__(lazy=lazy)
    self._tables_directory = tables_directory
    self._offline_logical_forms_directory = offline_logical_forms_directory
    self._max_offline_logical_forms = max_offline_logical_forms
    self._keep_if_no_logical_forms = keep_if_no_logical_forms
    self._tokenizer = tokenizer or SpacyTokenizer(pos_tags=True)
    self._question_token_indexers = question_token_indexers or {
        "tokens": SingleIdTokenIndexer()
    }
    self._table_token_indexers = table_token_indexers or self._question_token_indexers
    self._use_table_for_vocab = use_table_for_vocab
    self._max_table_tokens = max_table_tokens
    self._output_agendas = output_agendas
def __init__(
    self,
    tokenizer: Tokenizer = None,
    token_indexers: Dict[str, TokenIndexer] = None,
    frontend_reader: str = None,
    frontend_args: Dict[str, Any] = None,
    lazy: bool = False,
    concatenate_instances: str = None,
    concatenate_frontend_reader: str = None,
    concatenate_frontend_args: Dict[str, Any] = None,
    sentence1_name: str = "hypothesis",
    sentence2_name: str = "premise",
    **kwargs,
) -> None:
    super().__init__(lazy, **kwargs)
    self._tokenizer = tokenizer or SpacyTokenizer()
    self._token_indexers = token_indexers or {
        'tokens': SingleIdTokenIndexer(lowercase_tokens=True)
    }
    # Avoid a mutable default argument for frontend_args: default to None and
    # fall back to an empty dict at the call site.
    self._frontend = FrontEndReader.by_name(frontend_reader)(self, **(frontend_args or {}))
    self._concatenate_instances = concatenate_instances
    if self._concatenate_instances is not None and concatenate_frontend_reader is not None:
        self._concatenate_frontend = FrontEndReader.by_name(concatenate_frontend_reader)(
            self, **(concatenate_frontend_args or {})
        )
    self._sentence1_name = sentence1_name
    self._sentence2_name = sentence2_name
def __init__(
    self,
    tokens_per_instance: int = None,
    tokenizer: Tokenizer = None,
    token_indexers: Dict[str, TokenIndexer] = None,
    lazy: bool = False,
) -> None:
    # Warn here so imports of unrelated models don't fail our tests.
    warnings.warn(
        "LanguageModelingReader is deprecated and not used by any core AllenNLP "
        "models. You almost certainly want to use "
        "SimpleLanguageModelingDatasetReader. It will be removed after 2020/01/04 "
        "in the version 1.0.0 release or later.",
        DeprecationWarning,
    )
    super().__init__(lazy)
    self._tokenizer = tokenizer or SpacyTokenizer()
    self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
    self._tokens_per_instance = tokens_per_instance

    # No matter how you want to represent the input, we'll always represent the output as a
    # single token id. This code lets you learn a language model that concatenates word
    # embeddings with character-level encoders, in order to predict the word token that comes
    # next.
    self._output_indexer: Dict[str, TokenIndexer] = None
    for name, indexer in self._token_indexers.items():
        if isinstance(indexer, SingleIdTokenIndexer):
            self._output_indexer = {name: indexer}
            break
    else:
        self._output_indexer = {"tokens": SingleIdTokenIndexer()}
def __init__(
    self,
    tokenizer: Tokenizer = None,
    token_indexers: Dict[str, TokenIndexer] = None,
    max_sequence_length: int = None,
    start_tokens: List[str] = None,
    end_tokens: List[str] = None,
    **kwargs,
) -> None:
    super().__init__(**kwargs)
    self._tokenizer = tokenizer or SpacyTokenizer()
    self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
    if max_sequence_length is not None:
        self._max_sequence_length: Union[float, Optional[int]] = max_sequence_length
    else:
        self._max_sequence_length = math.inf
    self._start_tokens = [Token(st) for st in (start_tokens or [])]
    self._end_tokens = [Token(et) for et in (end_tokens or [])]
    logger.info("Creating SimpleLanguageModelingDatasetReader")
    logger.info("max_sequence_length=%s", max_sequence_length)
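# Usage sketch (the log message above names the class): the reader consumes
# one sentence per line of plain text; the corpus path is illustrative.
reader = SimpleLanguageModelingDatasetReader(
    max_sequence_length=128, start_tokens=["<S>"], end_tokens=["</S>"]
)
instances = list(reader.read("corpus.txt"))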