def __init__(self, filename: str, tokenizers: Dict[str, BaseTokenizer] = None):
    # default to plain word tokenization when no tokenizers are supplied; the
    # annotation expects a mapping from namespace to tokenizer
    if tokenizers is None:
        tokenizers = {"tokens": WordTokenizer()}
    super().__init__(filename, tokenizers)
    self.filename = filename
    self.tokenizers = tokenizers
    self.lines, self.labels = self.get_lines_labels()

def __init__(self, text: str, context: List[str], tokenizers: Dict[str, BaseTokenizer] = None):
    if tokenizers is None:
        tokenizers = {"tokens": WordTokenizer()}
    self.text = text
    self.context = context
    self.tokenizers = tokenizers
    self.tokens: Dict[str, List[Any]] = defaultdict(list)
    self.namespaces = list(tokenizers.keys())
    for namespace in tokenizers.keys():
        self.namespaces.append(f"contextual_{namespace}")

    # add tokens for the word tokens
    for namespace, tokenizer in self.tokenizers.items():
        tokens = tokenizer.tokenize(text)
        for token in tokens:
            self.add_token(token=token, namespace=namespace)

    # add tokens for the contextual lines
    for namespace, tokenizer in self.tokenizers.items():
        for contextual_line in self.context:
            tokens = tokenizer.tokenize(contextual_line)
            tokens = [Token(tok) for tok in tokens]
            self.tokens[f"contextual_{namespace}"].append(tokens)

    self.line = Line(text=text, tokenizers=self.tokenizers)
    self.context_lines = []
    for context_text in self.context:  # renamed to avoid shadowing the ``text`` argument
        context_line = Line(text=context_text, tokenizers=self.tokenizers)
        self.context_lines.append(context_line)

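# Usage sketch for the context-aware line above. The class name LineWithContext is an
# assumption read off the constructor signature; WordTokenizer comes from this codebase.
# The constructor eagerly fills a "contextual_tokens" namespace alongside "tokens".
line_with_context = LineWithContext(
    text="The model is trained on the training split",
    context=["It uses a BiLSTM encoder", "Scores are reported on dev"],
    tokenizers={"tokens": WordTokenizer()},
)
print(line_with_context.namespaces)          # ["tokens", "contextual_tokens"]
print(len(line_with_context.context_lines))  # one Line per context sentence
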
def test_sents_word_tokenizers(self):
    sents = ["Nice people", "Great weather"]
    sent = SeqSentence(sents=sents, tokenizers={"tokens": WordTokenizer()})
    tokens = sent.tokens
    assert [
        [token.text for token in sent_tokens] for sent_tokens in tokens["tokens"]
    ] == [["Nice", "people"], ["Great", "weather"]]

def test_get_lines_labels_len(self, test_file):
    dataset = CoNLLDataset(filename=test_file, tokenizers={"tokens": WordTokenizer()})
    lines, labels = dataset.get_lines_labels()
    assert len(lines) == 1
    assert len(labels) == 1

def test_spacy_whitespace_tokenizer(self):
    tokenizer = WordTokenizer(tokenizer="spacy-whitespace")
    tokenized = tokenizer.tokenize(
        "(1999). & P., W. The Control of Discrete Event Systems."
    )
    assert tokenized == [
        "(1999).",
        "&",
        "P.,",
        "W.",
        "The",
        "Control",
        "of",
        "Discrete",
        "Event",
        "Systems.",
    ]

def get_tokenized_data(get_parsect_data):
    parsect_json = get_parsect_data
    parsect_lines = parsect_json["parse_sect"]
    parsect_lines = parsect_lines[:100]
    tokenizer = WordTokenizer()
    lines = []
    labels = []
    for line_json in parsect_lines:
        text = line_json["text"]
        label = line_json["label"]
        lines.append(text)
        labels.append(label)
    instances = tokenizer.tokenize_batch(lines)
    return instances, labels

def conll_yago_dataset(request):
    train_filename = DATA_DIR.joinpath(request.param)
    dataset = ConllYagoDataset(
        filename=str(train_filename),
        tokenizers={"tokens": WordTokenizer(tokenizer="vanilla")},
        column_names=["NER"],
    )
    return dataset

def setup_lines():
    texts = ["First sentence", "Second Sentence"]
    lines = []
    for text in texts:
        line = Line(
            text=text,
            tokenizers={"tokens": WordTokenizer(), "char_tokens": CharacterTokenizer()},
        )
        lines.append(line)
    return lines

def lines():
    texts = ["First line", "Second Line which is longer"]
    lines = []
    for text in texts:
        line = Line(
            text=text, tokenizers={"tokens": WordTokenizer(tokenizer="vanilla")}
        )
        lines.append(line)
    return lines

def __init__( self, train_filename: str, dev_filename: str, test_filename: str, tokenizers: Dict[str, BaseTokenizer] = None, namespace_vocab_options: Dict[str, Dict[str, Any]] = None, namespace_numericalizer_map: Dict[str, BaseNumericalizer] = None, batch_size: int = 10, ): self.train_filename = train_filename self.dev_filename = dev_filename self.test_filename = test_filename self.tokenizers = tokenizers or { "tokens": WordTokenizer(tokenizer="vanilla"), "char_tokens": CharacterTokenizer(), } self.namespace_vocab_options = namespace_vocab_options or { "char_tokens": { "start_token": " ", "end_token": " ", "pad_token": " ", "unk_token": " ", } } self.namespace_numericalizer_map = namespace_numericalizer_map or { "tokens": Numericalizer(), "char_tokens": Numericalizer(), } self.namespace_numericalizer_map["seq_label"] = Numericalizer() self.batch_size = batch_size self.train_dataset = SeqLabellingDataset( filename=self.train_filename, tokenizers=self.tokenizers ) self.dev_dataset = SeqLabellingDataset( filename=self.dev_filename, tokenizers=self.tokenizers ) self.test_dataset = SeqLabellingDataset( filename=self.test_filename, tokenizers=self.tokenizers ) super(SeqLabellingDatasetManager, self).__init__( train_dataset=self.train_dataset, dev_dataset=self.dev_dataset, test_dataset=self.test_dataset, namespace_vocab_options=self.namespace_vocab_options, namespace_numericalizer_map=self.namespace_numericalizer_map, batch_size=batch_size, )
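# Minimal sketch of constructing the sequence labelling manager above. The file paths
# are hypothetical placeholders; omitting the optional arguments picks up the defaults
# defined in __init__ (vanilla word tokens, character tokens, and a Numericalizer for
# the "tokens", "char_tokens" and "seq_label" namespaces).
seq_label_manager = SeqLabellingDatasetManager(
    train_filename="data/seq_label.train",  # hypothetical path
    dev_filename="data/seq_label.dev",      # hypothetical path
    test_filename="data/seq_label.test",    # hypothetical path
    batch_size=32,
)
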
def test_line_word_tokenizers(self):
    text = "This is a single line"
    line = Line(text=text, tokenizers={"tokens": WordTokenizer()})
    tokens = line.tokens
    assert [token.text for token in tokens["tokens"]] == [
        "This",
        "is",
        "a",
        "single",
        "line",
    ]

def test_get_item(self, test_file):
    dataset = SeqLabellingDataset(
        filename=str(test_file), tokenizers={"tokens": WordTokenizer()}
    )
    num_instances = len(dataset)
    for idx in range(num_instances):
        line, label = dataset[idx]
        word_tokens = line.tokens["tokens"]
        label_tokens = label.tokens["seq_label"]
        print(f"label tokens {label.tokens}")
        assert len(word_tokens) == len(label_tokens)

def __init__(self, sents: List[str], tokenizers: Dict[str, BaseTokenizer] = None):
    if tokenizers is None:
        tokenizers = {"tokens": WordTokenizer()}
    self.sents = sents
    self.tokenizers = tokenizers
    self.tokens: Dict[str, List[List[Any]]] = defaultdict(list)
    self.namespaces = list(tokenizers.keys())

    for namespace, tokenizer in tokenizers.items():
        for sent in sents:
            sent_tokens = tokenizer.tokenize(sent)
            self.add_sent_tokens(tokens=sent_tokens, namespace=namespace)

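# Minimal sketch: a SeqSentence built with the default word tokenizer. Each entry of
# tokens["tokens"] is the token list for one input sentence.
sentence = SeqSentence(sents=["Nice people", "Great weather"])
for sent_tokens in sentence.tokens["tokens"]:
    print([tok.text for tok in sent_tokens])  # ["Nice", "people"], then ["Great", "weather"]
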
def __init__(self, text: str, tokenizers: Dict[str, BaseTokenizer] = None):
    if tokenizers is None:
        tokenizers = {"tokens": WordTokenizer()}
    self.text = text
    self.tokenizers = tokenizers
    self.tokens: Dict[str, List[Any]] = defaultdict(list)
    self.namespaces = list(tokenizers.keys())

    for namespace, tokenizer in tokenizers.items():
        tokens = tokenizer.tokenize(text)
        for token in tokens:
            self.add_token(token=token, namespace=namespace)

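# Minimal sketch: a Line tokenized under two namespaces at once; both namespaces are
# filled eagerly by the constructor above.
line = Line(
    text="Deep learning",
    tokenizers={"tokens": WordTokenizer(), "char_tokens": CharacterTokenizer()},
)
print([tok.text for tok in line.tokens["tokens"]])       # ["Deep", "learning"]
print([tok.text for tok in line.tokens["char_tokens"]])  # the individual characters
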
def test_get_item(self, test_file):
    classification_dataset = TextClassificationDataset(
        filename=str(test_file), tokenizers={"tokens": WordTokenizer()}
    )
    num_instances = len(classification_dataset)
    tokens = ["line1", "line2"]
    line_tokens = []
    for idx in range(num_instances):
        line, label = classification_dataset[idx]
        line_tokens.extend(line.tokens["tokens"])
    line_tokens = list(map(lambda token: token.text, line_tokens))
    assert set(tokens) == set(line_tokens)

def test_restricted_namespaces(self, test_file, train_only):
    dataset = CoNLLDataset(
        filename=test_file,
        tokenizers={"tokens": WordTokenizer()},
        column_names=["POS", "DEP", "NER"],
        train_only=train_only,
    )
    lines, labels = dataset.get_lines_labels()
    for label in labels:
        namespaces = label.namespace
        assert len(namespaces) == 1
        assert train_only.upper() in namespaces

def test_labels_namespaces(self, test_file):
    dataset = CoNLLDataset(
        filename=test_file,
        tokenizers={"tokens": WordTokenizer()},
        column_names=["NER", "POS", "DEP"],
    )
    lines, labels = dataset.get_lines_labels()
    for label in labels:
        namespaces = label.namespace
        assert len(namespaces) == 3
        assert "NER" in namespaces
        assert "POS" in namespaces
        assert "DEP" in namespaces

def test_len_lines_labels_equal(self, test_file):
    dataset = CoNLLDataset(
        filename=test_file,
        tokenizers={"tokens": WordTokenizer()},
        column_names=["NER", "POS", "DEP"],
    )
    lines, labels = dataset.get_lines_labels()
    for line, label in zip(lines, labels):
        line_tokens = line.tokens["tokens"]
        labels_ner = label.tokens["NER"]
        labels_pos = label.tokens["POS"]
        labels_dep = label.tokens["DEP"]
        assert len(line_tokens) == len(labels_ner) == len(labels_pos) == len(labels_dep)

def __init__( self, train_filename: str, dev_filename: str, test_filename: str, tokenizers: Dict[str, BaseTokenizer] = None, namespace_vocab_options: Dict[str, Dict[str, Any]] = None, namespace_numericalizer_map: Dict[str, BaseNumericalizer] = None, batch_size: int = 10, ): self.train_filename = train_filename self.dev_filename = dev_filename self.test_filename = test_filename self.tokenizers = tokenizers or { "tokens": WordTokenizer(), "char_tokens": CharacterTokenizer(), } self.namespace_vocab_options = namespace_vocab_options or { "char_tokens": { "start_token": " ", "end_token": " ", "pad_token": " ", "unk_token": " ", }, "label": { "include_special_vocab": False }, } self.namespace_numericalizer_map = namespace_numericalizer_map or { "tokens": Numericalizer(), "char_tokens": Numericalizer(), } self.namespace_numericalizer_map["label"] = Numericalizer() self.batch_size = batch_size self.train_dataset = TextClassificationDataset( filename=self.train_filename, tokenizers=self.tokenizers) self.dev_dataset = TextClassificationDataset( filename=self.dev_filename, tokenizers=self.tokenizers) self.test_dataset = TextClassificationDataset( filename=self.test_filename, tokenizers=self.tokenizers) super(TextClassificationDatasetManager, self).__init__( train_dataset=self.train_dataset, dev_dataset=self.dev_dataset, test_dataset=self.test_dataset, namespace_vocab_options=self.namespace_vocab_options, namespace_numericalizer_map=self.namespace_numericalizer_map, batch_size=batch_size, )
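# Minimal sketch of the classification manager above with hypothetical file paths.
# Omitting the optional arguments uses the defaults from __init__: word plus character
# tokenizers, whitespace special tokens for "char_tokens", no special vocab for
# "label", and one Numericalizer per namespace.
clf_manager = TextClassificationDatasetManager(
    train_filename="data/classification.train",  # hypothetical path
    dev_filename="data/classification.dev",      # hypothetical path
    test_filename="data/classification.test",    # hypothetical path
    batch_size=64,
)
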
def test_line_char_tokenizer(self):
    text = "Word"
    line = Line(
        text=text,
        tokenizers={"tokens": WordTokenizer(), "chars": CharacterTokenizer()},
    )
    tokens = line.tokens
    word_tokens = tokens["tokens"]
    char_tokens = tokens["chars"]
    word_tokens = [tok.text for tok in word_tokens]
    char_tokens = [tok.text for tok in char_tokens]
    assert word_tokens == ["Word"]
    assert char_tokens == ["W", "o", "r", "d"]

def setup_char_embedder(request, clf_dataset_manager):
    char_embedding_dim, hidden_dim = request.param
    dataset_manager = clf_dataset_manager
    embedder = CharEmbedder(
        char_embedding_dimension=char_embedding_dim,
        hidden_dimension=hidden_dim,
        datasets_manager=dataset_manager,
    )
    texts = ["This is sentence", "This is another sentence"]
    lines = []
    for text in texts:
        line = Line(
            text=text,
            tokenizers={"tokens": WordTokenizer(), "char_tokens": CharacterTokenizer()},
        )
        lines.append(line)
    return embedder, lines

def test_get_item(self, test_file):
    summarization_dataset = AbstractiveSummarizationDataset(
        filename=str(test_file), tokenizers={"tokens": WordTokenizer()}
    )
    num_instances = len(summarization_dataset)
    defined_line_tokens = [
        "word11_train",
        "word21_train",
        "word12_train",
        "word22_train",
        "word32_train",
    ]
    defined_label_tokens = ["word11_label", "word12_label", "word21_label"]
    line_tokens = []
    label_tokens = []
    for idx in range(num_instances):
        line, label = summarization_dataset[idx]
        line_tokens.extend(line.tokens["tokens"])
        label_tokens.extend(label.tokens["tokens"])
    line_tokens = list(map(lambda token: token.text, line_tokens))
    label_tokens = list(map(lambda token: token.text, label_tokens))
    assert set(defined_line_tokens) == set(line_tokens)
    assert set(defined_label_tokens) == set(label_tokens)

def test_sents_char_tokenizer(self):
    sents = ["Hello", "World"]
    sent = SeqSentence(
        sents=sents,
        tokenizers={"tokens": WordTokenizer(), "chars": CharacterTokenizer()},
    )
    tokens = sent.tokens
    word_tokens = tokens["tokens"]
    char_tokens = tokens["chars"]
    word_tokens = [
        [tok.text for tok in sent_word_tokens] for sent_word_tokens in word_tokens
    ]
    char_tokens = [
        [tok.text for tok in sent_char_tokens] for sent_char_tokens in char_tokens
    ]
    assert word_tokens == [["Hello"], ["World"]]
    assert char_tokens == [["H", "e", "l", "l", "o"], ["W", "o", "r", "l", "d"]]

def __init__(
    self,
    filename: str,
    dataset_type: str,
    max_num_words: int,
    max_instance_length: int,
    word_vocab_store_location: str,
    debug: bool = False,
    debug_dataset_proportion: float = 0.1,
    word_embedding_type: Union[str, None] = None,
    word_embedding_dimension: Union[int, None] = None,
    word_start_token: str = "<SOS>",
    word_end_token: str = "<EOS>",
    word_pad_token: str = "<PAD>",
    word_unk_token: str = "<UNK>",
    train_size: float = 0.8,
    test_size: float = 0.2,
    validation_size: float = 0.5,
    word_tokenizer=WordTokenizer(),
    word_tokenization_type="vanilla",
):
    """Base text classification dataset to be inherited by all text classification datasets.

    Parameters
    ----------
    filename : str
        Path of the file where the text classification dataset is stored. Ideally this
        should have an example text and a label separated by a space, but it is left to
        the specific dataset to handle the different ways in which the file could be
        structured.
    dataset_type : str
        One of ``[train, valid, test]``.
    max_num_words : int
        The top ``max_num_words`` will be considered for building the vocab.
    max_instance_length : int
        Every instance in the dataset will be padded or truncated to
        ``max_instance_length`` tokens.
    word_vocab_store_location : str
        The vocabulary, once built, will be stored in this location. If the vocabulary
        already exists, it will be loaded from this filepath.
    debug : bool
        Useful for building a small dataset for debugging purposes. If ``True``, a
        smaller random version of the dataset is returned, with
        ``debug_dataset_proportion`` controlling its size.
    debug_dataset_proportion : float
        Proportion of the dataset returned for debugging purposes. Should be between
        0 and 1.
    word_embedding_type : str
        The kind of word embedding associated with the words in the dataset. Any of
        the ``allowed_types`` in vocab.EmbeddingLoader is allowed here.
    word_embedding_dimension : int
        Dimension of the word embedding.
    word_start_token : str
        Start token appended at the beginning of every instance.
    word_end_token : str
        End token appended at the end of every instance.
    word_pad_token : str
        Pad token used for padding.
    word_unk_token : str
        All OOV words (words less frequent than ``max_num_words``, or words that appear
        in test but not in train) are mapped to ``word_unk_token``.
    train_size : float
        Proportion of the instances used for training.
    test_size : float
        Remaining proportion used for testing.
    validation_size : float
        Proportion of the test data used for validation.
    word_tokenizer : WordTokenizer
        Word tokenizer used for the dataset. See ``tokenizers.WordTokenizer`` for more
        information.
    word_tokenization_type : str
        The type of word tokenization that the word tokenizer represents.
    """
    pass

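# Hedged sketch: how a concrete subclass of the base class above might be
# instantiated. The class name ParsectDataset and the paths are hypothetical; only the
# parameter names come from the documented signature.
train_dataset = ParsectDataset(
    filename="data/sectLabel.train",                    # hypothetical path
    dataset_type="train",
    max_num_words=10000,
    max_instance_length=40,
    word_vocab_store_location="vocab/word_vocab.json",  # hypothetical path
    debug=False,
)
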
def test_len(self, test_file):
    dataset = SeqLabellingDataset(
        filename=str(test_file), tokenizers={"tokens": WordTokenizer()}
    )
    assert len(dataset) == 2

def test_get_lines_labels(self, test_file):
    dataset = SeqLabellingDataset(
        filename=str(test_file), tokenizers={"tokens": WordTokenizer()}
    )
    lines, labels = dataset.get_lines_labels()
    assert len(lines) == 2

def __init__(
    self,
    train_filename: str,
    dev_filename: str,
    test_filename: str,
    tokenizers: Dict[str, BaseTokenizer] = None,
    namespace_vocab_options: Dict[str, Dict[str, Any]] = None,
    namespace_numericalizer_map: Dict[str, BaseNumericalizer] = None,
    batch_size=10,
    column_names: List[str] = None,
    train_only: Optional[str] = None,
):
    self.train_filename = train_filename
    self.dev_filename = dev_filename
    self.test_filename = test_filename
    self.tokenizers = tokenizers or {
        "tokens": WordTokenizer(tokenizer="vanilla"),
        "char_tokens": CharacterTokenizer(),
    }

    if namespace_vocab_options is None:
        namespace_vocab_options = {}
    namespace_vocab_options_defaults = {
        "char_tokens": {
            "start_token": " ",
            "end_token": " ",
            "pad_token": " ",
            "unk_token": " ",
        }
    }
    self.namespace_vocab_options = {}
    vocab_namespaces = set(namespace_vocab_options.keys()).union(
        namespace_vocab_options_defaults.keys()
    )
    for namespace in vocab_namespaces:
        user_passed = namespace_vocab_options.get(namespace, {})
        defaults = namespace_vocab_options_defaults.get(namespace, {})
        self.namespace_vocab_options[namespace] = {**defaults, **user_passed}

    self.namespace_numericalizer_map = namespace_numericalizer_map or {
        "tokens": Numericalizer(),
        "char_tokens": Numericalizer(),
    }
    self.batch_size = batch_size

    if column_names is None:
        column_names = ["label_1"]
    valid_column_names = [column_names[0]]
    for column_name in valid_column_names:
        self.namespace_numericalizer_map[column_name] = Numericalizer()

    self.train_dataset = BioNerDataset(
        filename=self.train_filename,
        tokenizers=self.tokenizers,
        column_names=column_names,
        train_only=train_only,
    )
    self.dev_dataset = BioNerDataset(
        filename=self.dev_filename,
        tokenizers=self.tokenizers,
        column_names=column_names,
        train_only=train_only,
    )
    self.test_dataset = BioNerDataset(
        filename=self.test_filename,
        tokenizers=self.tokenizers,
        column_names=column_names,
        train_only=train_only,
    )

    super(BioNERDatasetManager, self).__init__(
        train_dataset=self.train_dataset,
        dev_dataset=self.dev_dataset,
        test_dataset=self.test_dataset,
        namespace_vocab_options=self.namespace_vocab_options,
        namespace_numericalizer_map=self.namespace_numericalizer_map,
        batch_size=batch_size,
    )

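# Minimal sketch for the BioNER manager above with hypothetical paths. Note that only
# the first entry of column_names receives a label Numericalizer, mirroring
# valid_column_names in __init__.
bio_manager = BioNERDatasetManager(
    train_filename="data/bio_ner.train",  # hypothetical path
    dev_filename="data/bio_ner.dev",      # hypothetical path
    test_filename="data/bio_ner.test",    # hypothetical path
    column_names=["NER"],
)
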
def __call__(self, wrapped, instance, args, kwargs):
    self.wrapped_cls = wrapped
    self.init_signature = inspect.signature(wrapped.__init__)
    instance = wrapped(*args, **kwargs)

    for idx, (name, param) in enumerate(self.init_signature.parameters.items()):
        if name == "self":
            continue

        # These are values that must be passed
        if name in [
            "filename",
            "dataset_type",
            "max_num_words",
            "max_instance_length",
            "word_vocab_store_location",
        ]:
            try:
                value = args[idx]
            except IndexError:
                try:
                    value = kwargs[name]
                except KeyError:
                    raise ValueError(
                        f"Dataset {self.wrapped_cls.__name__} should be instantiated with {name}"
                    )
            if self.autoset_attrs:
                setattr(instance, name, value)
            setattr(self, name, value)

        # These can be passed but have default values
        else:
            try:
                value = args[idx]
            except IndexError:
                try:
                    value = kwargs[name]
                except KeyError:
                    value = param.default
            if self.autoset_attrs:
                setattr(instance, name, value)
            setattr(self, name, value)

    # set the lines and labels
    self.lines, self.labels = instance.get_lines_labels(self.filename)
    self.word_instances = None
    self.word_vocab = None

    if "word_vocab" in self.vocab_pipe:
        self.word_tokenizer = WordTokenizer(self.word_tokenization_type)
        self.set_word_vocab()
        instance.word_tokenizer = self.word_tokenizer
        instance.word_numericalizer = self.word_numericalizer
        instance.word_vocab = copy.deepcopy(self.word_vocab)
        instance.word_instances = copy.deepcopy(self.word_instances)
        instance.num_instances = len(self.word_instances)
        instance.instance_max_len = max(
            [len(word_instance) for word_instance in self.word_instances]
        )

    if "char_vocab" in self.vocab_pipe:
        self.char_tokenizer = CharacterTokenizer()
        self.set_char_vocab()
        instance.char_vocab = copy.deepcopy(self.char_vocab)
        instance.char_instances = copy.deepcopy(self.char_instances)
        instance.char_tokenizer = self.char_tokenizer
        instance.char_numericalizer = self.char_numericalizer

    if self.is_get_label_stats_table:
        label_stats_table = self._get_label_stats_table()
        instance.label_stats_table = label_stats_table

    return instance

def __init__(
    self,
    train_filename: str,
    dev_filename: str,
    test_filename: str,
    tokenizers: Dict[str, BaseTokenizer] = None,
    namespace_vocab_options: Dict[str, Dict[str, Any]] = None,
    namespace_numericalizer_map: Dict[str, BaseNumericalizer] = None,
    batch_size=10,
    column_names: List[str] = None,
):
    self.train_filename = train_filename
    self.dev_filename = dev_filename
    self.test_filename = test_filename
    self.tokenizers = tokenizers or {
        "tokens": WordTokenizer(tokenizer="vanilla"),
        "char_tokens": CharacterTokenizer(),
    }

    namespace_vocab_options_defaults = {
        "char_tokens": {
            "start_token": " ",
            "end_token": " ",
            "pad_token": " ",
            "unk_token": " ",
        }
    }
    if namespace_vocab_options is None:
        namespace_vocab_options = {}

    # merge user-passed options over the defaults, keeping namespaces that appear
    # in either mapping
    self.namespace_vocab_options = {}
    vocab_namespaces = set(namespace_vocab_options.keys()).union(
        namespace_vocab_options_defaults.keys()
    )
    for namespace in vocab_namespaces:
        defaults = namespace_vocab_options_defaults.get(namespace, {})
        user_passed = namespace_vocab_options.get(namespace, {})
        self.namespace_vocab_options[namespace] = {**defaults, **user_passed}

    self.namespace_numericalizer_map = namespace_numericalizer_map or {
        "tokens": Numericalizer(),
        "char_tokens": Numericalizer(),
    }
    self.batch_size = batch_size

    if column_names is None:
        column_names = ["NER"]
    for column_name in column_names:
        self.namespace_numericalizer_map[column_name] = Numericalizer()

    self.train_dataset = ConllYagoDataset(
        filename=self.train_filename,
        tokenizers=self.tokenizers,
        column_names=column_names,
    )
    self.dev_dataset = ConllYagoDataset(
        filename=self.dev_filename,
        tokenizers=self.tokenizers,
        column_names=column_names,
    )
    self.test_dataset = ConllYagoDataset(
        filename=self.test_filename,
        tokenizers=self.tokenizers,
        column_names=column_names,
    )

    super(ConllYagoDatasetsManager, self).__init__(
        train_dataset=self.train_dataset,
        dev_dataset=self.dev_dataset,
        test_dataset=self.test_dataset,
        namespace_vocab_options=self.namespace_vocab_options,
        namespace_numericalizer_map=self.namespace_numericalizer_map,
        batch_size=batch_size,
    )

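# Minimal sketch for the CoNLL-YAGO manager above, showing user-passed vocab options
# being merged over the character-token defaults. Paths are hypothetical, and the
# vocab option shown is illustrative (the key mirrors the one used by the
# classification manager earlier in this section).
yago_manager = ConllYagoDatasetsManager(
    train_filename="data/conll_yago.train",  # hypothetical path
    dev_filename="data/conll_yago.dev",      # hypothetical path
    test_filename="data/conll_yago.test",    # hypothetical path
    namespace_vocab_options={"NER": {"include_special_vocab": False}},  # illustrative
    column_names=["NER"],
)
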
def test_get_item(self, test_file):
    dataset = ExtractiveSummarizationDataset(
        filename=str(test_file), tokenizers={"tokens": WordTokenizer()}
    )
    doc0, label0, ref0 = dataset[0]
    assert len(doc0) == len(label0.tokens["seq_label"])