Example #1
 def test_handles_byte_encoding(self):
     tokenizer = CharacterTokenizer(byte_encoding='utf-8', start_tokens=[259], end_tokens=[260])
     word = "åøâáabe"
     tokens = [t.text_id for t in tokenizer.tokenize(word)]
     # Note that we've added one to the utf-8 encoded bytes, to account for masking.
     expected_tokens = [259, 196, 166, 196, 185, 196, 163, 196, 162, 98, 99, 102, 260]
     assert tokens == expected_tokens
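The numbers in expected_tokens can be recomputed directly from the UTF-8 bytes; a minimal sketch, assuming only the +1 shift described in the comment (which keeps index 0 free for padding/masking):

word = "åøâáabe"
body = [b + 1 for b in word.encode("utf-8")]
# body == [196, 166, 196, 185, 196, 163, 196, 162, 98, 99, 102]
# Prepending/appending the configured start/end ids reproduces expected_tokens:
assert [259] + body + [260] == [259, 196, 166, 196, 185, 196, 163, 196, 162, 98, 99, 102, 260]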
Example #2
 def test_splits_into_characters(self):
     tokenizer = CharacterTokenizer(start_tokens=['<S1>', '<S2>'], end_tokens=['</S2>', '</S1>'])
     sentence = "A, small sentence."
     tokens = [t.text for t in tokenizer.tokenize(sentence)]
     expected_tokens = ["<S1>", "<S2>", "A", ",", " ", "s", "m", "a", "l", "l", " ", "s", "e",
                        "n", "t", "e", "n", "c", "e", ".", '</S2>', '</S1>']
     assert tokens == expected_tokens
Example #3
 def test_non_word_ending_suffix(self):
     tokenizer = CharacterTokenizer(non_word_end_suffix="##")
     sent = "A, small sentence."
     tokens = [t.text for t in tokenizer.tokenize(sent)]
     expected_tokens = [
         "A##", ",", "s##", "m##", "a##", "l##", "l", "s##", "e##", "n##",
         "t##", "e##", "n##", "c##", "e##", "."
     ]
     assert tokens == expected_tokens
Example #4
 def test_splits_into_characters(self):
     tokenizer = CharacterTokenizer(start_tokens=[u'<S1>', u'<S2>'],
                                    end_tokens=[u'</S2>', u'</S1>'])
     sentence = u"A, small sentence."
     tokens = [t.text for t in tokenizer.tokenize(sentence)]
     expected_tokens = [
         u"<S1>", u"<S2>", u"A", u",", u" ", u"s", u"m", u"a", u"l", u"l",
         u" ", u"s", u"e", u"n", u"t", u"e", u"n", u"c", u"e", u".",
         u'</S2>', u'</S1>'
     ]
     assert tokens == expected_tokens
Example #5
 def test_to_params(self):
     tokenizer = CharacterTokenizer(byte_encoding="utf-8",
                                    start_tokens=[259],
                                    end_tokens=[260])
     params = tokenizer.to_params()
     assert isinstance(params, Params)
     assert params.params == {
         "type": "character",
         "byte_encoding": "utf-8",
         "end_tokens": [260],
         "start_tokens": [259],
         "lowercase_characters": False,
     }
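Because the serialized params carry "type": "character", the same dictionary can be fed back through the registry to rebuild an equivalent tokenizer. A hedged round-trip sketch, assuming the standard AllenNLP setup in which CharacterTokenizer is registered under the name "character":

from allennlp.common import Params
from allennlp.data.tokenizers import Tokenizer, CharacterTokenizer

rebuilt = Tokenizer.from_params(Params({
    "type": "character",
    "byte_encoding": "utf-8",
    "start_tokens": [259],
    "end_tokens": [260],
}))
assert isinstance(rebuilt, CharacterTokenizer)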
Example #6
def read_dataset(all_chars: Set[str] = None) -> List[List[Token]]:
    tokenizer = CharacterTokenizer()
    sentences = []
    with open('data/mt/sentences.eng.10k.txt') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            line = re.sub(' +', ' ', line)
            tokens = tokenizer.tokenize(line)
            if all_chars:
                tokens = [token for token in tokens if token.text in all_chars]
            sentences.append(tokens)

    return sentences
Example #7
    def __init__(
        self,
        bpps_dir: Optional[str] = None,
        max_sequence_length: Optional[int] = None,
        add_start_end_tokens: bool = True,
        lazy: bool = False,
    ) -> None:
        super().__init__(lazy=lazy)

        self._bpps_dir = bpps_dir if bpps_dir is None else Path(bpps_dir)
        self._max_sequence_length = max_sequence_length or math.inf
        self._add_start_end_tokens = add_start_end_tokens
        self._tokenizer = CharacterTokenizer()
        self._start_token = Token(START_TOKEN)
        self._end_token = Token(END_TOKEN)
Example #8
    def __init__(self, model_path, top_k=3, cuda_device=-1):
        archive = load_archive(model_path, cuda_device=cuda_device)

        config = archive.config
        prepare_environment(config)
        model = archive.model
        model.eval()

        self.model = model

        self._tokenizer = CharacterTokenizer()
        self._token_indexers = {'tokens': SingleIdTokenIndexer()}
        self._id_to_label = model.vocab.get_index_to_token_vocabulary(
            namespace='labels')
        self._top_k = top_k
Example #9
class SentenceSegmentPredictor(Predictor):
    def __init__(self, model: Model, dataset_reader: DatasetReader) -> None:
        super().__init__(model, dataset_reader)
        self._character_tokenizer = CharacterTokenizer()

    def predict(self, sentence: str) -> JsonDict:
        results = self.predict_json({"sentence": sentence})
        return results

    @overrides
    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        sentence = json_dict['sentence']
        character_tokens = self._character_tokenizer.tokenize(sentence)
        return self._dataset_reader.text_to_instance(character_tokens)

    @overrides
    def predict_json(self, inputs: JsonDict) -> JsonDict:
        instance = self._json_to_instance(inputs)
        result = self.predict_instance(instance)
        sentence = inputs['sentence']
        tags = result['tags']
        segment: List = []
        seg_idx = -1
        for idx, word in enumerate(sentence):
            if tags[idx] == 'S' or tags[idx] == 'B':
                segment.append(word)
                seg_idx += 1
            else:
                segment[seg_idx] += word
        result['segment'] = segment
        return result
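The tag-merging loop above assumes one tag per character, with 'B' or 'S' opening a new segment and every other tag extending the current one. A standalone sketch of the same idea, using a made-up sentence and made-up tags purely for illustration:

sentence = "今天天气很好"
tags = ["B", "E", "B", "E", "S", "S"]  # hypothetical predictions, not real model output

segment = []
for ch, tag in zip(sentence, tags):
    if tag in ("S", "B"):
        segment.append(ch)   # 'S'/'B' start a new word
    else:
        segment[-1] += ch    # 'M'/'E' extend the current word
# segment == ['今天', '天气', '很', '好']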
Example #10
 def __init__(self,
              cache_path: Union[str, Path],
              audio_transformer: AudioTransformer = None,
              feature_name: str = 'mag',
              force: bool = False,
              lazy: bool = False) -> None:
     super(VCTK, self).__init__(lazy)
     self.cache_path = Path(cache_path)
     self.audio_transformer = audio_transformer or MelSpectrogram()
     self.feature_name = feature_name
     self.tokenizer = CharacterTokenizer()
     self.token_indexers = {
         'character': SingleIdTokenIndexer(namespace='character')
     }
     if force and self.cache_path.exists():
         self.cache_path.unlink()
Example #11
    def __init__(
        self,
        namespace: str = "token_characters",
        character_tokenizer: CharacterTokenizer = CharacterTokenizer(),
        start_tokens: List[str] = None,
        end_tokens: List[str] = None,
        min_padding_length: int = 0,
        token_min_padding_length: int = 0,
    ) -> None:
        super().__init__(token_min_padding_length)
        if min_padding_length == 0:
            url = "https://github.com/allenai/allennlp/issues/1954"
            warnings.warn(
                "You are using the default value (0) of `min_padding_length`, "
                f"which can cause some subtle bugs (more info see {url}). "
                "Strongly recommend to set a value, usually the maximum size "
                "of the convolutional layer size when using CnnEncoder.",
                UserWarning,
            )
        self._min_padding_length = min_padding_length
        self._namespace = namespace
        self._character_tokenizer = character_tokenizer

        self._start_tokens = [Token(st) for st in (start_tokens or [])]
        self._end_tokens = [Token(et) for et in (end_tokens or [])]
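The warning above boils down to: make min_padding_length at least as large as the widest character n-gram the downstream encoder uses. A minimal sketch of that pairing (the filter sizes are illustrative, not taken from the original code):

from allennlp.data.token_indexers import TokenCharactersIndexer

# If a CnnEncoder convolves over character n-grams of width up to 5,
# every token must be padded to at least 5 characters:
indexer = TokenCharactersIndexer(min_padding_length=5)
# ...and the matching encoder config would use something like
# ngram_filter_sizes=(2, 3, 4, 5) in its CnnEncoder.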
Example #12
 def __init__(self,
              source_tokenizer: Tokenizer = None,
              target_tokenizer: Tokenizer = None,
              source_token_indexer: Dict[str, TokenIndexer] = None,
              target_token_indexer: Dict[str, TokenIndexer] = None,
              max_tokens: int = None,
              target_add_start_token: bool = True,
              target_add_end_token: bool = True,
              end_symbol: str = END_SYMBOL,
              start_symbol: str = START_SYMBOL,
              **kwargs) -> None:
     super().__init__(**kwargs)
     self._source_tokenizer = source_tokenizer or CharacterTokenizer(
         start_tokens=[
             START_SYMBOL,
         ],
         end_tokens=[
             END_SYMBOL,
         ],
         lowercase_characters=True)
     self._target_tokenizer = target_tokenizer or WhitespaceTokenizer()
     self._source_token_indexer = source_token_indexer or {
         'tokens': SingleIdTokenIndexer()
     }
     self._target_token_indexer = target_token_indexer or {
         'tokens': SingleIdTokenIndexer()
     }
     self._max_tokens = max_tokens
     self._start_symbol = start_symbol
     self._end_symbol = end_symbol
     self._target_add_start_token = target_add_start_token
     self._target_add_end_token = target_add_end_token
Example #13
 def test_batch_tokenization(self):
     tokenizer = CharacterTokenizer()
     sentences = [
         "This is a sentence", "This isn't a sentence.",
         "This is the 3rd sentence.",
         "Here's the 'fourth' sentence."
     ]
     batch_tokenized = tokenizer.batch_tokenize(sentences)
     separately_tokenized = [
         tokenizer.tokenize(sentence) for sentence in sentences
     ]
     assert len(batch_tokenized) == len(separately_tokenized)
     for batch_sentence, separate_sentence in zip(batch_tokenized,
                                                  separately_tokenized):
         assert len(batch_sentence) == len(separate_sentence)
         for batch_word, separate_word in zip(batch_sentence,
                                              separate_sentence):
             assert batch_word.text == separate_word.text
Example #14
 def __init__(self,
              lazy: bool = False,
              tokenizer: Tokenizer = None,
              token_indexers: Dict[str, TokenIndexer] = None) -> None:
     super().__init__(lazy)
     self._tokenizer = tokenizer or CharacterTokenizer()
     self._token_indexers = token_indexers or {
         'tokens': TokenCharactersIndexer()
     }
Example #15
    def __init__(self,
                 tokenizer=None,
                 token_indexers: Dict[str, TokenIndexer] = None) -> None:
        super().__init__(lazy=False)
        self.tokenizer = tokenizer or CharacterTokenizer()

        self.token_indexers = token_indexers or {
            "tokens": SingleIdTokenIndexer()
        }
Example #16
    def __init__(self,
                 lazy: bool = False,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 max_token_len: int = 512):
        super().__init__(lazy)

        self._tokenizer = tokenizer or CharacterTokenizer()
        self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}
        self._max_token_len = max_token_len
Example #17
    def __init__(self,
                 lazy: bool = False,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None) -> None:
        super().__init__(lazy)
        self._tokenizer = tokenizer or CharacterTokenizer()
        self._token_indexers = token_indexers or {
            "tokens": SingleIdTokenIndexer()
        }

        relation_vocab_path = './raw_data/chinese/relation_vocab.json'
        with open(relation_vocab_path, 'r') as f:
            self.relation_vocab = json.load(f)
Example #18
 def __init__(
     self,
     token_indexers: Dict[str, TokenIndexer] = None,
     tokenizer: Tokenizer = None,
     max_tokens: int = None,
     **kwargs,
 ):
     super().__init__(**kwargs)
     self.tokenizer = tokenizer or CharacterTokenizer()
     self.token_indexers = token_indexers or {
         "tokens": SingleIdTokenIndexer()
     }
     self.max_tokens = max_tokens
Example #19
    def __init__(self,
                 lazy: bool = False,
                 max_sequence_length: int = 256,
                 tokenizer: Tokenizer = None,
                 token_indexer: Dict[str, TokenIndexer] = None):
        super().__init__(lazy)

        self._character_tokenizer = tokenizer or CharacterTokenizer()
        self._token_indexer = token_indexer or {
            'tokens': SingleIdTokenIndexer()
        }
        self._max_sequence_length = max_sequence_length
        self._tags = ['B', 'M', 'E', 'S']
Example #20
def construct_reader(is_pretrain):
    character_tokenizer = CharacterTokenizer(byte_encoding="utf-8",
                                             start_tokens=[259],
                                             end_tokens=[260])
    token_character_indexer = TokenCharactersIndexer(character_tokenizer=character_tokenizer,
                                                     min_padding_length=5)
    token_indexer = SingleIdTokenIndexer(lowercase_tokens=True)
    reader = FollowUpDataReader(token_indexer={
        "token_words": token_indexer
    }, char_indexer={
        "token_characters": token_character_indexer,
    }, is_pretrain=is_pretrain)
    return reader
Example #21
    def __init__(self,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None) -> None:
        super().__init__(lazy=False)

        self._tok = tokenizer or CharacterTokenizer()
        if token_indexers and 'bert' in token_indexers:
            wordpiece_tok = token_indexers['bert'].wordpiece_tokenizer
            token_indexers['bert'].wordpiece_tokenizer = \
             lambda s: ['[UNK]'] if s.isspace() else wordpiece_tok(s)
        self._token_indexers = token_indexers or {
            "tokens": SingleIdTokenIndexer()
        }
Example #22
    def __init__(
        self,
        token_indexers: Dict[str, TokenIndexer],
        tokenizer: Tokenizer = CharacterTokenizer(),
        max_sequence_length: Optional[int] = None,
        lazy: bool = False,
    ) -> None:
        super().__init__(lazy)
        self.token_indexers = token_indexers
        self.tokenizer = tokenizer
        self._max_sequence_length = max_sequence_length or math.inf

        logger.info("Creating LevenshteinReader")
        logger.info("max_sequence_length=%s", max_sequence_length)
Example #23
class NamesDatasetReader(DatasetReader):
    def __init__(self, lazy: bool = False) -> None:

        super().__init__(lazy)
        self._c_tokenizer = CharacterTokenizer()

        self._c_token_indexers = {"tokens": SingleIdTokenIndexer()}

    @overrides
    def text_to_instance(self, language: str,
                         name: str) -> Instance:  # type: ignore
        # pylint: disable=arguments-differ

        tokenized_name = self._c_tokenizer.tokenize(name)

        name_field = TextField(tokenized_name, self._c_token_indexers)

        fields = {'name': name_field}

        if language is not None:
            print('language', language)
            fields['label'] = LabelField(language)

        return Instance(fields)

    @overrides
    def _read(self, file_path: str = './data/names/*.txt'):

        all_filenames = glob.glob(file_path)

        category_lines = {}
        all_categories = []

        for filename in all_filenames:
            category = filename.split('/')[-1].split('.')[0]
            all_categories.append(category)
            lines = readLines(filename)
            category_lines[category] = lines

        def random_training_pair():
            category = random.choice(all_categories)
            line = random.choice(category_lines[category])
            return category, line

        # Yield one randomly sampled (language, name) pair per name in the data,
        # rather than one per language, so the epoch size matches the corpus size.
        total_names = sum(len(names) for names in category_lines.values())

        for _ in range(total_names):
            language, name = random_training_pair()
            yield self.text_to_instance(language, name)
Example #24
    def __init__(self,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 padding_length: int = None,
                 char_level: bool = False) -> None:
        super().__init__(lazy=False)

        self._padding_length = padding_length
        self._tokenizer = tokenizer or (CharacterTokenizer()
                                        if char_level else WordTokenizer())
        self._token_indexers = token_indexers or {
            "tokens":
            SingleIdTokenIndexer(
                token_min_padding_length=(self._padding_length or 0))
        }
Example #25
 def __init__(self,
              tokenizer: Tokenizer = None,
              token_indexers: Dict[str, TokenIndexer] = None,
              max_tokens: int = None,
              **kwargs) -> None:
     super().__init__(**kwargs)
      self._tokenizer = tokenizer or CharacterTokenizer(
          start_tokens=[START_SYMBOL],
          end_tokens=[END_SYMBOL],
      )
     self._token_indexers = token_indexers or {
         'tokens': SingleIdTokenIndexer()
     }
     self._max_tokens = max_tokens
Example #26
 def from_params(cls, params: Params) -> 'TokenCharactersIndexer':
     """
     Parameters
     ----------
     namespace : ``str``, optional (default=``token_characters``)
         We will use this namespace in the :class:`Vocabulary` to map the characters in each token
         to indices.
     character_tokenizer : ``Params``, optional (default=``Params({})``)
         We use a :class:`CharacterTokenizer` to handle splitting tokens into characters, as it has
         options for byte encoding and other things.  These parameters get passed to the character
         tokenizer.  The default is to use unicode characters and to retain casing.
     """
     namespace = params.pop('namespace', 'token_characters')
     character_tokenizer_params = params.pop('character_tokenizer', {})
     character_tokenizer = CharacterTokenizer.from_params(character_tokenizer_params)
     params.assert_empty(cls.__name__)
     return cls(namespace=namespace, character_tokenizer=character_tokenizer)
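Following the docstring, the nested 'character_tokenizer' block is just a CharacterTokenizer config; a short usage sketch with illustrative values:

from allennlp.common import Params
from allennlp.data.token_indexers import TokenCharactersIndexer

params = Params({
    "namespace": "token_characters",
    "character_tokenizer": {"byte_encoding": "utf-8"},
})
indexer = TokenCharactersIndexer.from_params(params)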
Example #27
    def __init__(self,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 combine_input_fields: Optional[bool] = False,
                 add_special_symbols: Optional[bool] = False,
                 **kwargs) -> None:
        super().__init__(**kwargs)
        self._tokenizer = tokenizer or CharacterTokenizer(
            lowercase_characters=True)
        self._token_indexer = token_indexers or {
            'tokens': SingleIdTokenIndexer()
        }
        self._combine_input_fields = combine_input_fields
        self._add_special_symbols = add_special_symbols

        self._start_symbol = START_SYMBOL
        self._end_symbol = END_SYMBOL
Example #28
def construct_reader():
    from data_reader.dialogue_reader import FollowUpDataReader
    character_tokenizer = CharacterTokenizer(byte_encoding="utf-8",
                                             start_tokens=[259],
                                             end_tokens=[260])
    token_character_indexer = TokenCharactersIndexer(
        character_tokenizer=character_tokenizer, min_padding_length=5)
    token_indexer = SingleIdTokenIndexer(lowercase_tokens=True)
    reader = FollowUpDataReader(
        token_indexer={
            # "elmo": elmo_indexer,
            "token_words": token_indexer
        },
        char_indexer={
            "token_characters": token_character_indexer,
        },
        is_pretrain=True)
    return reader
Example #29
    def test_saving_and_loading_works_with_byte_encoding(self):
        # We're going to set a vocabulary from a TextField using byte encoding, index it, save the
        # vocab, load the vocab, then index the text field again, and make sure we get the same
        # result.
        tokenizer = CharacterTokenizer(byte_encoding='utf-8')
        token_indexer = TokenCharactersIndexer(character_tokenizer=tokenizer)
        tokens = [Token(t) for t in ["Øyvind", "für", "汉字"]]
        text_field = TextField(tokens, {"characters": token_indexer})
        dataset = Batch([Instance({"sentence": text_field})])
        vocab = Vocabulary.from_instances(dataset)
        text_field.index(vocab)
        indexed_tokens = deepcopy(text_field._indexed_tokens)  # pylint: disable=protected-access

        vocab_dir = os.path.join(self.TEST_DIR, 'vocab_save')
        vocab.save_to_files(vocab_dir)
        vocab2 = Vocabulary.from_files(vocab_dir)
        text_field2 = TextField(tokens, {"characters": token_indexer})
        text_field2.index(vocab2)
        indexed_tokens2 = deepcopy(text_field2._indexed_tokens)  # pylint: disable=protected-access
        assert indexed_tokens == indexed_tokens2
Example #30
    def prepare_dataset_reader(args):
        assert args.transformer is not None
        token_indexers = {
            'transformer':
            TransformerIndexer(
                model_name=args.transformer,
                use_starting_offsets=True,
                truncate_long_sequences=True,
                max_pieces=510,
                do_lowercase="uncased" in args.transformer,
            )
        }

        tokenizer = CharacterTokenizer() if args.character else None

        dataset_reader = TextClassificationReader(
            tokenizer=tokenizer,
            token_indexers=token_indexers,
            max_length=args.max_length,
            multi_label=args.classification_type == 'bce',
        )

        return dataset_reader