Exemplo n.º 1
0
    def __init__(self, udic='', udic_enc='utf8', udic_type='ipadic', max_unknown_length=1024, wakati=False, mmap=False, dotfile=''):
        """
        Initialize Tokenizer object with optional arguments.

        :param udic: (Optional) user dictionary file (CSV format, path ending in '.csv') or directory path
                     to compiled dictionary data. Any other non-empty value results in no user dictionary.
        :param udic_enc: (Optional) character encoding for user dictionary. default is 'utf8'
        :param udic_type: (Optional) user dictionary type. supported types are 'ipadic' and 'simpledic'. default is 'ipadic'
        :param max_unknown_length: (Optional) max unknown word length. default is 1024.
        :param wakati: (Optional) if given True load minimum sysdic data for 'wakati' mode.
        :param mmap: (Optional) if given True use memory-mapped file for dictionary data.
        :param dotfile: (Optional) accepted for interface compatibility; this initializer does not read it.

        .. seealso:: See http://mocobeta.github.io/janome/en/#use-with-user-defined-dictionary for details for user dictionary.
        """
        self.wakati = wakati

        # Both system dictionary flavours take the same trailing arguments;
        # only the class and the entries loader depend on ``mmap``.
        if mmap:
            dic_class, load_entries = MMapSystemDictionary, mmap_entries
        else:
            dic_class, load_entries = SystemDictionary, entries
        self.sys_dic = dic_class(all_fstdata(), load_entries(wakati), connections, chardef.DATA, unknowns.DATA)

        if udic and udic.endswith('.csv'):
            # build user dictionary from CSV
            loaded_udic = UserDictionary(udic, udic_enc, udic_type, connections)
        elif udic and os.path.isdir(udic):
            # load compiled user dictionary
            loaded_udic = CompiledUserDictionary(udic, connections)
        else:
            # nothing given, or a path that is neither a CSV file nor a directory
            loaded_udic = None
        self.user_dic = loaded_udic

        self.max_unknown_length = max_unknown_length
Exemplo n.º 2
0
    def test_property_types(self):
        """Check the exact Python types of the fields returned by each dictionary kind.

        Looked-up entries must have a ``str`` at index 1 and ``int`` at
        indices 0, 2, 3 and 4; extra entries must have ``str`` at indices 0-5;
        unknown entries must have ``int`` at indices 0-2 and ``str`` at index 3.
        """
        def check_entry(item):
            # str field is checked first, then the int fields, as separate assertions
            self.assertTrue(type(item[1]) is str)
            for pos in (0, 2, 3, 4):
                self.assertTrue(type(item[pos]) is int)

        def check_extra(item):
            for pos in range(6):
                self.assertTrue(type(item[pos]) is str)

        # entry in the system dictionary
        sys_dic = SystemDictionary(all_fstdata(), entries(), connections,
                                   chardef.DATA, unknowns.DATA)
        sys_entry = sys_dic.lookup('すもも'.encode('utf8'))[0]
        check_entry(sys_entry)
        check_extra(sys_dic.lookup_extra(sys_entry[0]))

        # unknown entry
        unknown_entry = sys_dic.unknowns.get(u'HIRAGANA')[0]
        self.assertTrue(type(unknown_entry[3]) is str)
        for pos in (0, 1, 2):
            self.assertTrue(type(unknown_entry[pos]) is int)

        # mmap dict entry
        mmap_dic = MMapSystemDictionary(all_fstdata(), mmap_entries(),
                                        connections, chardef.DATA,
                                        unknowns.DATA)
        mmap_entry = mmap_dic.lookup(u'すもも'.encode('utf8'))[0]
        check_entry(mmap_entry)
        check_extra(mmap_dic.lookup_extra(mmap_entry[0]))

        # entry in the user defined dictionary
        user_dic_path = os.path.join(parent_dir, 'tests/user_ipadic.csv')
        user_dic = UserDictionary(user_dict=user_dic_path,
                                  enc='utf8',
                                  type='ipadic',
                                  connections=connections)
        check_entry(user_dic.lookup('東京スカイツリー'.encode('utf8'))[0])
Exemplo n.º 3
0
    def __init__(self,
                 udic: str = '',
                 *,
                 udic_enc: str = 'utf8',
                 udic_type: str = 'ipadic',
                 max_unknown_length: int = 1024,
                 wakati: bool = False,
                 mmap: bool = DEFAULT_MMAP_MODE,
                 dotfile: str = ''):
        """
        Initialize Tokenizer object with optional arguments.

        :param udic: (Optional) user dictionary file (CSV format, path ending in '.csv') or directory path
                     to compiled dictionary data. Any other non-empty value results in no user dictionary.
        :param udic_enc: (Optional) character encoding for user dictionary. default is 'utf8'
        :param udic_type: (Optional) user dictionary type. supported types are 'ipadic' and 'simpledic'.
                          default is 'ipadic'
        :param max_unknown_length: (Optional) max unknown word length. default is 1024.
        :param wakati: (Optional) if given True load minimum sysdic data for 'wakati' mode.
        :param mmap: (Optional) if given False, memory-mapped file mode is disabled.
                     Set this option to False on any environments that do not support mmap.
                     Default is True on 64bit architecture; otherwise False.
        :param dotfile: (Optional) accepted for interface compatibility; this initializer does not read it.

        .. seealso:: http://mocobeta.github.io/janome/en/#use-with-user-defined-dictionary
        """
        self.sys_dic: Union[SystemDictionary, MMapSystemDictionary]
        self.user_dic: Optional[Union[UserDictionary, CompiledUserDictionary]]
        self.wakati = wakati

        # The FST data is needed by both dictionary flavours, so fetch it once
        # before choosing between the memory-mapped and in-memory class.
        fst_data = all_fstdata()
        if mmap:
            self.sys_dic = MMapSystemDictionary(fst_data, mmap_entries(wakati),
                                                connections, chardef.DATA,
                                                unknowns.DATA)
        else:
            self.sys_dic = SystemDictionary(fst_data, entries(wakati),
                                            connections, chardef.DATA,
                                            unknowns.DATA)

        # Stays None unless ``udic`` names a CSV file or an existing directory.
        user_dic = None
        if udic:
            if udic.endswith('.csv'):
                # build user dictionary from CSV
                user_dic = UserDictionary(udic, udic_enc, udic_type,
                                          connections)
            elif os.path.isdir(udic):
                # load compiled user dictionary
                user_dic = CompiledUserDictionary(udic, connections)
        self.user_dic = user_dic

        self.max_unknown_length = max_unknown_length
Exemplo n.º 4
0
 def test_system_dictionary_ipadic(self):
     """Exercise lookup, transition cost, character categories and unknown-word
     settings of a ``SystemDictionary`` built from the bundled data."""
     sys_dic = SystemDictionary(all_fstdata(), entries(), connections,
                                chardef.DATA, unknowns.DATA)
     self.assertEqual(7, len(sys_dic.lookup('形態素'.encode('utf-8'))))
     self.assertEqual(1, sys_dic.get_trans_cost(0, 1))

     # (character, expected categories), checked in this order.
     # NOTE(review): several characters are listed twice in a row; possibly
     # half-width/full-width variants were intended -- confirm against upstream.
     category_cases = [
         ('は', {'HIRAGANA': []}),
         ('ハ', {'KATAKANA': []}),
         ('ハ', {'KATAKANA': []}),
         ('葉', {'KANJI': []}),
         ('C', {'ALPHA': []}),
         ('C', {'ALPHA': []}),
         ('#', {'SYMBOL': []}),
         ('#', {'SYMBOL': []}),
         ('5', {'NUMERIC': []}),
         ('5', {'NUMERIC': []}),
         ('五', {'KANJI': [], 'KANJINUMERIC': ['KANJI']}),
         ('Γ', {'GREEK': []}),
         ('Б', {'CYRILLIC': []}),
         ('𠮷', {'DEFAULT': []}),
         ('한', {'DEFAULT': []}),
     ]
     for char, expected in category_cases:
         self.assertEqual(expected, sys_dic.get_char_categories(char))

     # unknown-word handling flags and length per category
     always_alpha = sys_dic.unknown_invoked_always('ALPHA')
     self.assertTrue(always_alpha)
     always_kanji = sys_dic.unknown_invoked_always('KANJI')
     self.assertFalse(always_kanji)
     grouping_numeric = sys_dic.unknown_grouping('NUMERIC')
     self.assertTrue(grouping_numeric)
     grouping_kanji = sys_dic.unknown_grouping('KANJI')
     self.assertFalse(grouping_kanji)
     self.assertEqual(2, sys_dic.unknown_length('HIRAGANA'))
Exemplo n.º 5
0
    def test_system_dictionary_cache(self):
        """Repeated lookups of the same surface must return the same number of entries.

        Presumably this exercises a lookup cache inside ``SystemDictionary``
        (per the test name); the assertions only check result counts.
        """
        sys_dic = SystemDictionary(all_fstdata(), entries(), connections, chardef.DATA, unknowns.DATA)

        # (expected number of entries, surface), looked up in this order
        lookups = (
            (11, u'小書き'),
            (11, u'小書き'),
            (11, u'小書きにしました'),
            (10, u'みんなと'),
            (10, u'みんなと'),
            (2, u'叩く'),
            (2, u'叩く'),
        )
        for expected_count, surface in lookups:
            found = sys_dic.lookup(surface.encode('utf8'))
            self.assertEqual(expected_count, len(found))
Exemplo n.º 6
0
def generate_abc_dic(
    sysdic: typing.Optional[typing.Iterable[typing.Iterable[typing.Any]]] = None
) -> typing.Iterable[JanomeLexEntry]:
    """
    Generate custom Janome lexical entries for this parser.
    Parameters
    ----------
    sysdic : internal list of lexical entries in janome.dic.SystemDictionary, optional
        An iterable of internal representation of Janome lexical entries.
        Optional.
        If ``None`` (the default), this function will retrieve one from Janome
            by building a ``janome.dic.SystemDictionary``.
        An explicitly supplied iterable is always used as given, even when
            it is empty.
        Giving a reference to the system lexical entries is recommended
            for performance reasons
            whenever you have obtained a relevant instance
            which contains a Janome system dictionary.
    Returns
    -------
    abc_entries : iterable of JanomeLexEntry
        Our custom lexical entries.
        When ``sysdic`` is given, the result of ``_gen_abc_dic(sysdic)`` is
            returned as-is; when it is omitted, the entries derived from
            Janome's system dictionary are collected into a ``set``.
    Notes
    -----
    The authors choose a set, rather than a generator, for the fallback result
        since this subroutine is intended to be externally cached (not implemented yet).
    Examples
    --------
    Illustrative only; requires Janome's dictionary data.

    >>> import janome.tokenizer as jt
    >>> tokenizer = jt.Tokenizer()
    >>> abc_entries = dic.generate_abc_dic(
    ...     sysdic = tokenizer.sys_dic.entries.values()
    ... )
    >>> next(iter(abc_entries)).surface  # doctest: +SKIP
    "筈もあれ"
    """

    # Compare against None rather than relying on truthiness: an empty
    # iterable is a legitimate argument and must not trigger the (costly)
    # fallback that loads Janome's whole system dictionary.
    if sysdic is not None:
        return _gen_abc_dic(sysdic)

    # Imported here so that Janome's dictionary modules are only pulled in
    # when the caller did not supply the entries.
    import janome.dic
    from janome.sysdic import (
        all_fstdata, entries,
        connections, chardef, unknowns
    )

    janome_sys_dic = janome.dic.SystemDictionary(
        all_fstdata(),
        entries(None),
        connections,
        chardef.DATA,
        unknowns.DATA
    )

    return set(_gen_abc_dic(janome_sys_dic.entries.values()))
Exemplo n.º 7
0
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import sys
import unittest
from janome.sysdic import all_fstdata, entries, mmap_entries, connections, chardef, unknowns
from janome.dic import SystemDictionary, MMapSystemDictionary
from janome.lattice import Lattice, BOS, EOS, SurfaceNode

# TODO: better way to find package...
# Put the parent of this file's directory at the front of sys.path.
# NOTE(review): the janome imports above have already been executed by the
# time this line runs, so the insertion cannot affect how they were
# resolved -- confirm whether it should come before them.
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0, parent_dir)

# Dictionary instances built once at import time: one from entries(), one
# memory-mapped variant from mmap_entries().
SYS_DIC = SystemDictionary(all_fstdata(), entries(), connections, chardef.DATA,
                           unknowns.DATA)
MMAP_SYS_DIC = MMapSystemDictionary(all_fstdata(), mmap_entries(), connections,
                                    chardef.DATA, unknowns.DATA)


class TestLattice(unittest.TestCase):
    def test_initialize_lattice(self):
        """A fresh ``Lattice(5, SYS_DIC)`` has 7 ``snodes`` slots and 8 ``enodes``
        slots, with a BOS node at ``snodes[0][0]`` and ``enodes[1][0]``."""
        lattice = Lattice(5, SYS_DIC)

        start_slots = lattice.snodes
        self.assertEqual(7, len(start_slots))
        self.assertTrue(isinstance(start_slots[0][0], BOS))

        end_slots = lattice.enodes
        self.assertEqual(8, len(end_slots))
        self.assertTrue(isinstance(end_slots[1][0], BOS))

    def test_add_forward_end(self):
        s = 'すもも'