Example #1
    def __init__(self):
        """
        Transliteration of Thai words.
        Currently supports Thai-to-Latin (romanization).
        """
        # Download the model if it's not on your machine.
        self.__filemodel = get_corpus_path("thai2rom-pytorch")
        if not self.__filemodel:
            download("thai2rom-pytorch")
            self.__filemodel = get_corpus_path("thai2rom-pytorch")
        loader = torch.load(self.__filemodel)
        self._n_h = 64  # hidden dimensions for encoder
        self._n_s = 64  # hidden dimensions for decoder
        self._emb_dim = 64  # character embedding size
        self._maxlength = 100
        self._char_to_ix = loader['char_to_ix']
        self._ix_to_char = loader['ix_to_char']
        self._target_char_to_ix = loader['target_char_to_ix']
        self._ix_to_target_char = loader['ix_to_target_char']
        # Restore the model and construct the encoder and decoder.
        self._encoder = Encoder(len(self._char_to_ix), self._n_h,
                                self._emb_dim).to(device)
        self._encoder.load_state_dict(loader['encoder_state_dict'])
        self._decoder = OneStepDecoder(len(self._target_char_to_ix), self._n_s,
                                       self._emb_dim).to(device)
        self._decoder.load_state_dict(loader['decoder_state_dict'])
Example #2
    def test_corpus(self):
        self.assertIsInstance(thai_negations(), frozenset)
        self.assertIsInstance(thai_stopwords(), frozenset)
        self.assertIsInstance(thai_syllables(), frozenset)
        self.assertIsInstance(thai_words(), frozenset)

        self.assertIsInstance(countries(), frozenset)
        self.assertIsInstance(provinces(), frozenset)
        self.assertIsInstance(provinces(details=True), list)
        self.assertEqual(len(provinces(details=False)),
                         len(provinces(details=True)))
        self.assertIsInstance(thai_family_names(), frozenset)
        self.assertIsInstance(list(thai_family_names())[0], str)
        self.assertIsInstance(thai_female_names(), frozenset)
        self.assertIsInstance(thai_male_names(), frozenset)

        self.assertIsInstance(
            get_corpus_db("https://example.com/XXXXXX0lkjasd/SXfmskdjKKXXX"),
            Response,
        )  # URL does not exist, should get 404 response
        self.assertIsNone(get_corpus_db("XXXlkja3sfdXX"))  # Invalid URL

        self.assertEqual(get_corpus_db_detail("XXXmx3KSXX"),
                         {})  # corpus does not exist
        self.assertEqual(get_corpus_db_detail("XXXmx3KSXX", version="0.2"),
                         {})  # corpus does not exist

        self.assertTrue(download("test"))  # download the first time
        self.assertTrue(download(name="test", force=True))  # force download
        self.assertTrue(download(name="test"))  # download when already present
        self.assertFalse(download(name="test",
                                  url="wrongurl"))  # URL does not exist
        self.assertFalse(
            download(name="XxxXXxxx817d37sf"))  # corpus name does not exist
        self.assertIsNotNone(get_corpus_db_detail("test"))  # corpus exists
        self.assertIsNotNone(get_corpus_path("test"))  # corpus exists
        self.assertTrue(remove("test"))  # remove existing
        self.assertFalse(remove("test"))  # remove non-existing
        self.assertIsNone(get_corpus_path("XXXkdjfBzc"))  # query non-existing
        self.assertFalse(download(name="test", version="0.0"))
        self.assertFalse(download(name="test", version="0.0.0"))
        self.assertFalse(download(name="test", version="0.0.1"))
        self.assertFalse(download(name="test", version="0.0.2"))
        self.assertFalse(download(name="test", version="0.0.3"))
        self.assertFalse(download(name="test", version="0.0.4"))
        self.assertIsNotNone(download(name="test", version="0.0.5"))
        self.assertTrue(download("test"))
        self.assertIsNotNone(remove("test"))  # remove existing
        self.assertIsNotNone(download(name="test", version="0.0.6"))
        self.assertIsNotNone(download(name="test", version="0.0.7"))
        self.assertIsNotNone(download(name="test", version="0.0.8"))
        self.assertIsNotNone(download(name="test", version="0.0.9"))
        self.assertIsNotNone(download(name="test", version="0.0.10"))
        with self.assertRaises(Exception) as context:
            self.assertIsNotNone(download(name="test", version="0.0.11"))
        self.assertTrue(
            "Hash does not match expected." in str(context.exception))
        self.assertIsNotNone(download(name="test", version="0.1"))
        self.assertIsNotNone(remove("test"))
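The test above walks the full corpus-management lifecycle. A minimal usage sketch of that API, reusing the "test" corpus name from the test itself:

from pythainlp.corpus import download, get_corpus_path, remove

download("test")                # fetch; no-op if already present
path = get_corpus_path("test")  # local path, or None when missing
remove("test")                  # returns False when there is nothing to remove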
Example #3
def _download_install(name: str) -> None:
    # Download the corpus archive if it is not already present.
    if get_corpus_path(name) is None:
        download(name, force=True, version="1.0")
        tar = tarfile.open(get_corpus_path(name), "r:gz")
        tar.extractall()
        tar.close()
    # Extract the archive into its own data directory on first use.
    if not os.path.exists(get_full_data_path(name)):
        os.mkdir(get_full_data_path(name))
        with tarfile.open(get_corpus_path(name)) as tar:
            tar.extractall(path=get_full_data_path(name))
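A usage sketch for the helper above; the corpus name is a hypothetical placeholder:

# "some-model" is a hypothetical corpus name, for illustration only.
_download_install("some-model")
model_dir = get_full_data_path("some-model")  # extracted data lives here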
Example #4
    def __init__(self):
        super().__init__()
        self.graphemes = hp.graphemes
        self.phonemes = hp.phonemes
        self.g2idx, self.idx2g, self.p2idx, self.idx2p = _load_vocab()
        self.checkpoint = get_corpus_path(_MODEL_NAME)
        if self.checkpoint is None:
            download(_MODEL_NAME)
            self.checkpoint = get_corpus_path(_MODEL_NAME)
        self._load_variables()
Example #5
def _get_path(fname: str) -> str:
    """
    Download (if necessary) and return the path of a file
    from the pythainlp-corpus repository.

    :param str fname: file name
    :return: path to downloaded file
    """
    path = get_corpus_path(fname)
    if not path:
        download(fname)
        path = get_corpus_path(fname)
    return path
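Note that get_corpus_path returns a falsy value when the corpus is absent, which is why the helper retries after download. A short usage sketch with a hypothetical file name:

# "some-model-file" is a hypothetical name, for illustration only.
model_path = _get_path("some-model-file")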
Example #7
def _lst20_tagger():
    global _LST20_TAGGER
    # Lazy-load the tagger once and cache it in a module-level global.
    if not _LST20_TAGGER:
        path = get_corpus_path(_LST20_TAGGER_NAME)
        with open(path, "rb") as fh:
            _LST20_TAGGER = pickle.load(fh)
    return _LST20_TAGGER
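The lazy module-level cache above (also used in Examples #12 and #29) avoids re-reading the model on every call. A self-contained sketch of the idiom, with illustrative names:

import pickle
from typing import Any, Optional

_CACHED_MODEL: Optional[Any] = None  # illustrative module-level cache


def _cached_model(path: str) -> Any:
    """Load the pickled model on first use; reuse it afterwards."""
    global _CACHED_MODEL
    if _CACHED_MODEL is None:
        with open(path, "rb") as fh:
            _CACHED_MODEL = pickle.load(fh)
    return _CACHED_MODEL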
Example #8
    def __init__(self):
        """
        Thai named-entity recognizer
        """
        self.__data_path = get_corpus_path("thainer")
        if not self.__data_path:
            download("thainer")
            self.__data_path = get_corpus_path("thainer")
        self.crf = sklearn_crfsuite.CRF(
            algorithm="lbfgs",
            c1=0.1,
            c2=0.1,
            max_iterations=500,
            all_possible_transitions=True,
            model_filename=self.__data_path,
        )
Example #9
    def __init__(self, version: str = "1.5") -> None:
        """
        Thai named-entity recognizer.

        :param str version: Thai NER version.
                            Supports Thai NER 1.4 and 1.5.
                            The default value is `1.5`.
        """
        self.crf = CRFTagger()

        if version == "1.4":
            self.crf.open(get_corpus_path("thainer-1.4", version="1.4"))
            self.pos_tag_name = "orchid_ud"
        else:
            self.crf.open(get_corpus_path(_CORPUS_NAME, version="1.5"))
            self.pos_tag_name = "lst20"
Example #10
    def __init__(self):
        # Get the model; it will be downloaded if it's not available locally.
        self.__model_filename = get_corpus_path(_MODEL_NAME)

        loader = torch.load(self.__model_filename, map_location=device)

        INPUT_DIM, E_EMB_DIM, E_HID_DIM, E_DROPOUT = loader["encoder_params"]
        OUTPUT_DIM, D_EMB_DIM, D_HID_DIM, D_DROPOUT = loader["decoder_params"]

        self._maxlength = 100

        self._char_to_ix = loader["char_to_ix"]
        self._ix_to_char = loader["ix_to_char"]
        self._target_char_to_ix = loader["target_char_to_ix"]
        self._ix_to_target_char = loader["ix_to_target_char"]

        # Restore the model and construct the encoder and decoder.
        self._encoder = Encoder(INPUT_DIM, E_EMB_DIM, E_HID_DIM, E_DROPOUT)

        self._decoder = AttentionDecoder(OUTPUT_DIM, D_EMB_DIM, D_HID_DIM,
                                         D_DROPOUT)

        self._network = Seq2Seq(
            self._encoder,
            self._decoder,
            self._target_char_to_ix["<start>"],
            self._target_char_to_ix["<end>"],
            self._maxlength,
        ).to(device)

        self._network.load_state_dict(loader["model_state_dict"])
        self._network.eval()
Example #11
    def __init__(self):
        """
        Thai named-entity recognizer
        """
        self.__data_path = get_corpus_path("thainer-1-3")
        if not self.__data_path:
            download("thainer-1-3")
            self.__data_path = get_corpus_path("thainer-1-3")
        self.crf = sklearn_crfsuite.CRF(
            algorithm="lbfgs",
            c1=0.1,
            c2=0.1,
            max_iterations=500,
            all_possible_transitions=True,
            model_filename=self.__data_path,
        )
Example #12
def _lst20_tagger():
    global _LST20_TAGGER
    if not _LST20_TAGGER:
        path = get_corpus_path(_LST20_TAGGER_NAME)
        with open(path, encoding="utf-8-sig") as fh:
            _LST20_TAGGER = json.load(fh)
    return _LST20_TAGGER
Example #13
def get_model() -> Word2VecKeyedVectors:
    """
    Get word vector model.

    :return: `gensim` word2vec model
    :rtype: gensim.models.keyedvectors.Word2VecKeyedVectors
    """
    path = get_corpus_path(_MODEL_NAME)
    return KeyedVectors.load_word2vec_format(path, binary=True)
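A usage sketch; most_similar and item access are standard gensim KeyedVectors API, and the query word is illustrative:

model = get_model()
print(model.most_similar("ประเทศ", topn=5))  # nearest neighbours
vec = model["ประเทศ"]                        # raw word vector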
Example #14
def trigram_word_freqs() -> defaultdict:
    """
    Get trigram word frequencies from the Thai National Corpus (TNC).
    """
    _path = get_corpus_path(_TRIGRAM)
    _word_freqs = defaultdict(int)
    with open(_path, "r", encoding="utf-8-sig") as fh:
        for i in fh.readlines():
            _temp = i.strip().split("\t")
            _word_freqs[(_temp[0], _temp[1], _temp[2])] = int(_temp[-1])

    return _word_freqs
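The result is keyed by word triples, and unseen triples read as 0 because the mapping is a defaultdict(int). A short usage sketch with an illustrative trigram:

freqs = trigram_word_freqs()
print(freqs[("ผม", "ไป", "โรงเรียน")])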
Example #15
    def __init__(self):
        """
        Transliteration of Thai words
        Now supports Thai to Latin (romanization)
        """
        # Download the model, if it's not on your machine.
        self.__filemodel = get_corpus_path("thai2rom-pytorch-attn")
        if not self.__filemodel:
            download("thai2rom-pytorch-attn")
            self.__filemodel = get_corpus_path("thai2rom-pytorch-attn")

        loader = torch.load(self.__filemodel, map_location=device)

        INPUT_DIM, E_EMB_DIM, E_HID_DIM, E_DROPOUT = loader["encoder_params"]
        OUTPUT_DIM, D_EMB_DIM, D_HID_DIM, D_DROPOUT = loader["decoder_params"]

        self._maxlength = 100

        self._char_to_ix = loader["char_to_ix"]
        self._ix_to_char = loader["ix_to_char"]
        self._target_char_to_ix = loader["target_char_to_ix"]
        self._ix_to_target_char = loader["ix_to_target_char"]

        # Restore the model and construct the encoder and decoder.
        self._encoder = Encoder(INPUT_DIM, E_EMB_DIM, E_HID_DIM, E_DROPOUT)

        self._decoder = AttentionDecoder(OUTPUT_DIM, D_EMB_DIM, D_HID_DIM,
                                         D_DROPOUT)

        self._network = Seq2Seq(
            self._encoder,
            self._decoder,
            self._target_char_to_ix["<start>"],
            self._target_char_to_ix["<end>"],
            self._maxlength,
        ).to(device)

        self._network.load_state_dict(loader["model_state_dict"])
        self._network.eval()
Example #16
def get_model() -> Word2VecKeyedVectors:
    """
    **DEPRECATED: use WordVector class instead**

    Get word vector model.

    :return: `gensim` word2vec model
    :rtype: gensim.models.keyedvectors.Word2VecKeyedVectors
    """
    warnings.warn(
        "get_model is deprecated, use WordVector class instead",
        DeprecationWarning,
    )
    path = get_corpus_path(_MODEL_NAME)
    return KeyedVectors.load_word2vec_format(path, binary=True)
Example #17
def unigram_word_freqs() -> defaultdict:
    """
    Get unigram word frequencies from the OSCAR corpus
    (tokenized with the ICU word tokenizer).
    """
    _path = get_corpus_path(_FILENAME)
    _word_freqs = defaultdict(int)
    with open(_path, "r", encoding="utf-8-sig") as fh:
        _data = fh.readlines()
        del _data[0]  # skip the first line
        for i in _data:
            _temp = i.strip().split(",")
            if _temp[0] != " " and '"' not in _temp[0]:
                _word_freqs[_temp[0]] = int(_temp[-1])
            elif _temp[0] == " ":
                _word_freqs["<s/>"] = int(_temp[-1])

    return _word_freqs
Example #18
def word_freqs() -> List[Tuple[str, int]]:
    """
    Get word frequencies from the OSCAR corpus
    (tokenized with the ICU word tokenizer).
    """
    word_freqs = []
    _path = get_corpus_path(_FILENAME)
    with open(_path, "r", encoding="utf-8") as f:
        _data = f.readlines()
        del _data[0]  # skip the first line
        for line in _data:
            _temp = line.strip().split(",")
            if len(_temp) >= 2:
                if _temp[0] != " " and '"' not in _temp[0]:
                    word_freqs.append((_temp[0], int(_temp[1])))
                elif _temp[0] == " ":
                    word_freqs.append(("<s/>", int(_temp[1])))

    return word_freqs
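A short usage sketch for ranking the most frequent words:

freqs = word_freqs()
top10 = sorted(freqs, key=lambda pair: pair[1], reverse=True)[:10]
for word, count in top10:
    print(word, count)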
Example #19
    def load_wordvector(self, model_name: str):
        """
        Load word vector model.

        :param str model_name: model name
        """
        self.model_name = model_name
        self.model = KeyedVectors.load_word2vec_format(
            get_corpus_path(self.model_name),
            binary=True,
            unicode_errors="ignore"
        )
        self.WV_DIM = self.model.vector_size

        if self.model_name == "thai2fit_wv":
            self.tokenize = THAI2FIT_TOKENIZER.word_tokenize
        else:
            self.tokenize = word_tokenize
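A usage sketch; similarity is standard gensim KeyedVectors API, the enclosing class is assumed to be the WordVector class mentioned in Example #16, and the words are illustrative:

wv = WordVector()
wv.load_wordvector("thai2fit_wv")
print(wv.model.similarity("แมว", "หมา"))  # cosine similarity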
Example #20
    def __init__(self):
        """
        Transliteration of Thai words
        Now supports Thai to Latin (romanization)
        """
        self.__batch_size = 64
        self.__epochs = 100
        self.__latent_dim = 256
        self.__num_samples = 648241
        self.__data_path = get_corpus_path("thai2rom-dataset")
        if not self.__data_path:
            download("thai2rom-dataset")
            self.__data_path = get_corpus_path("thai2rom-dataset")

        self.__input_texts = []
        self.__target_texts = []
        self.__input_characters = set()
        self.__target_characters = set()

        with open(self.__data_path, "r", encoding="utf-8-sig") as self.__fh:
            self.__lines = self.__fh.read().split("\n")

        for line in self.__lines[: min(self.__num_samples, len(self.__lines) - 1)]:
            input_text, target_text = line.split("\t")
            if len(input_text) < 30 and len(target_text) < 90:
                target_text = "\t" + target_text + "\n"
                self.__input_texts.append(input_text)
                self.__target_texts.append(target_text)
                for char in input_text:
                    if char not in self.__input_characters:
                        self.__input_characters.add(char)
                for char in target_text:
                    if char not in self.__target_characters:
                        self.__target_characters.add(char)

        self.__input_characters = sorted(list(self.__input_characters))
        self.__target_characters = sorted(list(self.__target_characters))
        self.__num_encoder_tokens = len(self.__input_characters)
        self.__num_decoder_tokens = len(self.__target_characters)
        self.__max_encoder_seq_length = max([len(text) for text in self.__input_texts])
        self.__max_decoder_seq_length = max([len(text) for text in self.__target_texts])
        """print('Number of samples:', len(self.input_texts))
        print('Number of unique input tokens:', self.num_encoder_tokens)
        print('Number of unique output tokens:', self.num_decoder_tokens)
        print('Max sequence length for inputs:', self.max_encoder_seq_length)
        print('Max sequence length for outputs:', self.max_decoder_seq_length)"""
        self.__input_token_index = dict(
            [(char, i) for i, char in enumerate(self.__input_characters)]
        )
        self.__target_token_index = dict(
            [(char, i) for i, char in enumerate(self.__target_characters)]
        )
        self.__encoder_input_data = np.zeros(
            (
                len(self.__input_texts),
                self.__max_encoder_seq_length,
                self.__num_encoder_tokens,
            ),
            dtype="float32",
        )
        for i, input_text in enumerate(self.__input_texts):
            for t, char in enumerate(input_text):
                self.__encoder_input_data[
                    i, t, self.__input_token_index[char]
                ] = 1.0

        # Restore the model and construct the encoder and decoder.
        self.__filemodel = get_corpus_path("thai2rom")
        if not self.__filemodel:
            download("thai2rom")
            self.__filemodel = get_corpus_path("thai2rom")
        self.__model = load_model(self.__filemodel)
        self.__encoder_inputs = self.__model.input[0]  # input_1
        self.__encoder_outputs, self.__state_h_enc, self.__state_c_enc = (
            self.__model.layers[2].output  # lstm_1
        )
        self.__encoder_states = [self.__state_h_enc, self.__state_c_enc]
        self.__encoder_model = Model(self.__encoder_inputs, self.__encoder_states)
        self.__decoder_inputs = self.__model.input[1]  # input_2
        self.__decoder_state_input_h = Input(shape=(self.__latent_dim,), name="input_3")
        self.__decoder_state_input_c = Input(shape=(self.__latent_dim,), name="input_4")
        self.__decoder_states_inputs = [
            self.__decoder_state_input_h,
            self.__decoder_state_input_c,
        ]
        self.__decoder_lstm = self.__model.layers[3]
        self.__decoder_outputs, self.__state_h_dec, self.__state_c_dec = (
            self.__decoder_lstm(
                self.__decoder_inputs, initial_state=self.__decoder_states_inputs
            )
        )
        self.__decoder_states = [self.__state_h_dec, self.__state_c_dec]
        self.__decoder_dense = self.__model.layers[4]
        self.__decoder_outputs = self.__decoder_dense(self.__decoder_outputs)
        self.__decoder_model = Model(
            [self.__decoder_inputs] + self.__decoder_states_inputs,
            [self.__decoder_outputs] + self.__decoder_states,
        )

        self.__reverse_input_char_index = dict(
            (i, char) for char, i in self.__input_token_index.items()
        )
        self.__reverse_target_char_index = dict(
            (i, char) for char, i in self.__target_token_index.items()
        )
Example #21
def _download() -> str:
    path = get_corpus_path("thai2fit_wv")
    if not path:
        download_data("thai2fit_wv")
        path = get_corpus_path("thai2fit_wv")
    return path
Example #22
    def __init__(self):
        """
        Transliteration of Thai words
        Now supports Thai to Latin (romanization)
        """
        self.__input_token_index = {
            ' ': 0,
            '!': 1,
            '"': 2,
            '(': 3,
            ')': 4,
            '-': 5,
            '.': 6,
            '0': 7,
            '1': 8,
            '2': 9,
            '3': 10,
            '4': 11,
            '5': 12,
            '6': 13,
            '7': 14,
            '8': 15,
            '9': 16,
            '\xa0': 17,
            'ก': 18,
            'ข': 19,
            'ฃ': 20,
            'ค': 21,
            'ฅ': 22,
            'ฆ': 23,
            'ง': 24,
            'จ': 25,
            'ฉ': 26,
            'ช': 27,
            'ซ': 28,
            'ฌ': 29,
            'ญ': 30,
            'ฎ': 31,
            'ฏ': 32,
            'ฐ': 33,
            'ฑ': 34,
            'ฒ': 35,
            'ณ': 36,
            'ด': 37,
            'ต': 38,
            'ถ': 39,
            'ท': 40,
            'ธ': 41,
            'น': 42,
            'บ': 43,
            'ป': 44,
            'ผ': 45,
            'ฝ': 46,
            'พ': 47,
            'ฟ': 48,
            'ภ': 49,
            'ม': 50,
            'ย': 51,
            'ร': 52,
            'ฤ': 53,
            'ล': 54,
            'ฦ': 55,
            'ว': 56,
            'ศ': 57,
            'ษ': 58,
            'ส': 59,
            'ห': 60,
            'ฬ': 61,
            'อ': 62,
            'ฮ': 63,
            'ฯ': 64,
            'ะ': 65,
            'ั': 66,
            'า': 67,
            'ำ': 68,
            'ิ': 69,
            'ี': 70,
            'ึ': 71,
            'ื': 72,
            'ุ': 73,
            'ู': 74,
            'ฺ': 75,
            'เ': 76,
            'แ': 77,
            'โ': 78,
            'ใ': 79,
            'ไ': 80,
            'ๅ': 81,
            'ๆ': 82,
            '็': 83,
            '่': 84,
            '้': 85,
            '๊': 86,
            '๋': 87,
            '์': 88,
            'ํ': 89,
            '๙': 90
        }
        self.__target_token_index = {
            '\t': 0,
            '\n': 1,
            ' ': 2,
            '!': 3,
            '"': 4,
            '(': 5,
            ')': 6,
            '-': 7,
            '0': 8,
            '1': 9,
            '2': 10,
            '3': 11,
            '4': 12,
            '5': 13,
            '6': 14,
            '7': 15,
            '8': 16,
            '9': 17,
            'a': 18,
            'b': 19,
            'c': 20,
            'd': 21,
            'e': 22,
            'f': 23,
            'g': 24,
            'h': 25,
            'i': 26,
            'k': 27,
            'l': 28,
            'm': 29,
            'n': 30,
            'o': 31,
            'p': 32,
            'r': 33,
            's': 34,
            't': 35,
            'u': 36,
            'w': 37,
            'y': 38
        }
        self.__reverse_input_char_index = dict(
            (i, char) for char, i in self.__input_token_index.items())
        self.__reverse_target_char_index = dict(
            (i, char) for char, i in self.__target_token_index.items())
        self.__batch_size = 64
        self.__epochs = 100
        self.__latent_dim = 256
        self.__num_encoder_tokens = 91
        self.__num_decoder_tokens = 39
        self.__max_encoder_seq_length = 20
        self.__max_decoder_seq_length = 22

        # Restore the model and construct the encoder and decoder.
        self.__filemodel = get_corpus_path("thai2rom-v2")
        if not self.__filemodel:
            download("thai2rom-v2")
            self.__filemodel = get_corpus_path("thai2rom-v2")
        self.__model = load_model(self.__filemodel)
        self.__encoder_inputs = self.__model.input[0]  # input_1
        self.__encoder_outputs, self.__state_h_enc, self.__state_c_enc = (
            self.__model.layers[2].output  # lstm_1
        )
        self.__encoder_states = [self.__state_h_enc, self.__state_c_enc]
        self.__encoder_model = Model(self.__encoder_inputs,
                                     self.__encoder_states)
        self.__decoder_inputs = self.__model.input[1]  # input_2
        self.__decoder_state_input_h = Input(shape=(self.__latent_dim, ),
                                             name="input_3")
        self.__decoder_state_input_c = Input(shape=(self.__latent_dim, ),
                                             name="input_4")
        self.__decoder_states_inputs = [
            self.__decoder_state_input_h,
            self.__decoder_state_input_c,
        ]
        self.__decoder_lstm = self.__model.layers[3]
        self.__decoder_outputs, self.__state_h_dec, self.__state_c_dec = (
            self.__decoder_lstm(
                self.__decoder_inputs,
                initial_state=self.__decoder_states_inputs,
            )
        )
        self.__decoder_states = [self.__state_h_dec, self.__state_c_dec]
        self.__decoder_dense = self.__model.layers[4]
        self.__decoder_outputs = self.__decoder_dense(self.__decoder_outputs)
        self.__decoder_model = Model(
            [self.__decoder_inputs] + self.__decoder_states_inputs,
            [self.__decoder_outputs] + self.__decoder_states,
        )
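The snippet builds separate inference models but stops before decoding. For context, a minimal greedy decoding loop in the classic Keras seq2seq style; this is an illustrative sketch, not the library's actual method, written as a standalone function over the objects constructed above:

import numpy as np

def greedy_decode(encoder_model, decoder_model, input_seq,
                  target_token_index, reverse_target_char_index,
                  num_decoder_tokens, max_len=22):
    # Encode the one-hot input into the initial LSTM states.
    states = encoder_model.predict(input_seq)

    # Seed decoding with the "\t" start-of-sequence character.
    target_seq = np.zeros((1, 1, num_decoder_tokens))
    target_seq[0, 0, target_token_index["\t"]] = 1.0

    decoded = ""
    while True:
        output_tokens, h, c = decoder_model.predict([target_seq] + states)
        # Greedily take the most probable next character.
        idx = int(np.argmax(output_tokens[0, -1, :]))
        char = reverse_target_char_index[idx]
        if char == "\n" or len(decoded) >= max_len:
            break  # "\n" marks end-of-sequence in this dataset
        decoded += char
        # Feed the chosen character and the updated states back in.
        target_seq = np.zeros((1, 1, num_decoder_tokens))
        target_seq[0, 0, idx] = 1.0
        states = [h, c]
    return decoded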
Example #23
    def __init__(self):
        self.thai2fit_wv = get_corpus_path('thai2fit_wv')
        self.load_w2v()
Example #25
        features["word.next_isspace"] = next_word.isspace()
        features["word.next_isdigit"] = next_word.isdigit()
        features["word.next_postag"] = next_pos
    else:
        features["EOS"] = True  # End of Sequence

    return features


def _extract_features(doc):
    return [_doc2features(doc, i) for i in range(len(doc))]


_CORPUS_NAME = "lst20-cls"
tagger = pycrfsuite.Tagger()
tagger.open(get_corpus_path(_CORPUS_NAME))


def segment(doc: List[str]) -> List[List[str]]:
    word_tags = pos_tag(doc, corpus="lst20")
    features = _extract_features(word_tags)
    word_markers = list(zip(doc, tagger.tag(features)))

    clauses = []
    temp = []
    len_doc = len(doc) - 1
    for i, word_marker in enumerate(word_markers):
        word, marker = word_marker
        if marker == "E_CLS" or i == len_doc:
            # "E_CLS" marks the last word of a clause.
            temp.append(word)
            clauses.append(temp)
            temp = []
        else:
            temp.append(word)

    return clauses
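A hedged usage sketch; the tokens are illustrative, and the lst20 corpora loaded above must be installed:

words = ["เขา", "ไป", "โรงเรียน", "และ", "กลับ", "บ้าน"]
print(segment(words))  # a list of clauses, each a list of words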
Example #26
# (Import head truncated in the source; reconstructed from the names used below.)
import torch

from pythainlp.corpus import get_corpus_path
from pythainlp.ulmfit.preprocess import (
    fix_html,
    replace_rep_after,
    replace_url,
    rm_brackets,
    rm_useless_newlines,
    rm_useless_spaces,
    spec_add_spaces,
    ungroup_emoji,
)
from pythainlp.util import reorder_vowels

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

_MODEL_NAME_LSTM = "wiki_lm_lstm"
_ITOS_NAME_LSTM = "wiki_itos_lstm"


# Pretrained model paths
THWIKI_LSTM = dict(
    wgts_fname=get_corpus_path(_MODEL_NAME_LSTM),
    itos_fname=get_corpus_path(_ITOS_NAME_LSTM),
)

# Preprocessing rules for Thai text
# dense features
pre_rules_th = [
    replace_rep_after,
    fix_html,
    reorder_vowels,
    spec_add_spaces,
    rm_useless_spaces,
    rm_useless_newlines,
    rm_brackets,
    replace_url,
]
Example #27
    def __init__(self):
        """
        Thai named-entity recognizer.
        """
        self.crf = CRFTagger()
        self.crf.open(get_corpus_path(_CORPUS_NAME))
Example #29
def _lst20_tagger():
    global _LST20_TAGGER
    if not _LST20_TAGGER:
        _LST20_TAGGER = PerceptronTagger(
            path=get_corpus_path(_LST20_TAGGER_NAME, version="0.2.3"))
    return _LST20_TAGGER
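A usage sketch, assuming the cached tagger exposes the usual tag(tokens) method; the tokens are illustrative:

tagger = _lst20_tagger()
print(tagger.tag(["ผม", "กิน", "ข้าว"]))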
Example #30
from typing import List
from symspellpy import SymSpell, Verbosity
from pythainlp.corpus import get_corpus_path
from pythainlp.corpus import path_pythainlp_corpus
from pythainlp.tokenize import word_tokenize

_UNIGRAM = "tnc_freq.txt"
_BIGRAM = "tnc_bigram_word_freqs"

sym_spell = SymSpell()
sym_spell.load_dictionary(path_pythainlp_corpus(_UNIGRAM),
                          0,
                          1,
                          separator='\t',
                          encoding="utf-8-sig")
sym_spell.load_bigram_dictionary(get_corpus_path(_BIGRAM),
                                 0,
                                 2,
                                 separator='\t',
                                 encoding="utf-8-sig")


def spell(text: str, max_edit_distance: int = 2) -> List[str]:
    # Each SymSpell suggestion prints as "term, distance, count";
    # keep only the suggested term.
    return [
        str(i).split(',')[0] for i in list(
            sym_spell.lookup(
                text, Verbosity.CLOSEST, max_edit_distance=max_edit_distance))
    ]


def correct(text: str, max_edit_distance: int = 1) -> str:
    # Body truncated in the source; minimal completion returning
    # the top-ranked suggestion.
    return spell(text, max_edit_distance=max_edit_distance)[0]
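A usage sketch (the misspelled input is illustrative):

print(spell("เหตการณ์"))    # candidate corrections, closest first
print(correct("เหตการณ์"))  # single best correction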