def __func_tokenizer(text: str,
                     tokenizer_obj: MecabWrapper,
                     pos_condition: List[Tuple[str, ...]] = None,
                     is_surface: bool = False) -> List[str]:
    """* What you can do
    - This is the base tokenizer function.
    - Use it together with functools.partial to fix the tokenizer and filter settings.
    """
    if pos_condition is None:
        return tokenizer_obj.tokenize(sentence=text, is_surface=is_surface, return_list=False).convert_list_object()
    else:
        return tokenizer_obj.tokenize(sentence=text, is_surface=is_surface, return_list=False)\
            .filter(pos_condition=pos_condition).convert_list_object()
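# A minimal usage sketch (not part of the original example) showing how the base
# tokenizer function is meant to be used with functools.partial; the 'neologd'
# dictionary, the POS tuples, and the sentence below are only illustrative and
# assume MecabWrapper is importable in this module.
from functools import partial

tokenizer_obj = MecabWrapper(dictType='neologd')
get_token = partial(__func_tokenizer,
                    tokenizer_obj=tokenizer_obj,
                    pos_condition=[('名詞',), ('動詞', '自立')],
                    is_surface=False)
tokens = get_token('イランの首都はテヘランである。')
assert isinstance(tokens, list)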
def basic_example():
    # ========================================================
    # TOKENIZE
    # ========================================================
    # The input is `unicode` type (in Python 2.x).
    # In Python 3.x you do not need to care about this.
    sentence = u'テヘラン(ペルシア語: تهران ; Tehrān Tehran.ogg 発音[ヘルプ/ファイル]/teɦˈrɔːn/、英語:Tehran)は、西アジア、イランの首都でありかつテヘラン州の州都。人口12,223,598人。都市圏人口は13,413,348人に達する。'

    # make a MecabWrapper object
    # osType is either "generic" or "centos"; Mecab uses a different system command on CentOS.
    # If you are running on CentOS, put "centos".
    osType = "generic"

    # you can choose from "neologd", "all", "ipadic", "user", ""
    # "ipadic" and "" are equivalent
    dictType = ""

    mecab_wrapper = MecabWrapper(dictType=dictType, osType=osType)

    # tokenize the sentence. The returned object is a list of tuples.
    tokenized_obj = mecab_wrapper.tokenize(sentence=sentence, return_list=True)
    assert isinstance(tokenized_obj, list)

    # The returned object is a "TokenizedSenetence" instance if you pass return_list=False.
    tokenized_obj = mecab_wrapper.tokenize(sentence=sentence, return_list=False)
    assert isinstance(tokenized_obj, TokenizedSenetence)

    # ========================================================
    # FILTERING
    # ========================================================
    # You can filter tokens by stopwords or POS conditions.

    # stopwords is a list object
    stopwords = [u'テヘラン']
    assert isinstance(tokenized_obj, TokenizedSenetence)
    # the returned object is a "FilteredObject" instance
    filtered_obj = mecab_wrapper.filter(
        parsed_sentence=tokenized_obj,
        stopwords=stopwords
    )
    assert isinstance(filtered_obj, FilteredObject)

    # pos_condition is a list of tuples
    # You can set POS conditions following the "ChaSen 品詞体系 (IPA品詞体系)" table on this page:
    # http://www.unixuser.org/~euske/doc/postag/#chasen
    pos_condition = [(u'名詞', u'固有名詞'), (u'動詞', u'自立')]
    filtered_obj = mecab_wrapper.filter(
        parsed_sentence=tokenized_obj,
        pos_condition=pos_condition
    )
    assert isinstance(filtered_obj, FilteredObject)
def basic_example_3x():
    # ========================================================
    # TOKENIZE
    # ========================================================
    # In Python 3.x a plain str is fine as input.
    sentence = 'テヘラン(ペルシア語: تهران ; Tehrān Tehran.ogg 発音[ヘルプ/ファイル]/teɦˈrɔːn/、英語:Tehran)は、西アジア、イランの首都でありかつテヘラン州の州都。人口12,223,598人。都市圏人口は13,413,348人に達する。'

    # make a MecabWrapper object
    # path_mecab_config is the directory where the `mecab-config` command exists.
    # You can check it with `which mecab-config`. The default value is '/usr/local/bin'.
    path_mecab_config = '/usr/local/bin'

    # you can choose from "neologd", "all", "ipadic", "user", ""
    # "ipadic" and "" are equivalent
    dictType = ""

    mecab_wrapper = MecabWrapper(dictType=dictType, path_mecab_config=path_mecab_config)

    # tokenize the sentence. The returned object is a list of tuples.
    tokenized_obj = mecab_wrapper.tokenize(sentence=sentence, return_list=True)
    assert isinstance(tokenized_obj, list)

    # The returned object is a "TokenizedSenetence" instance if you pass return_list=False.
    tokenized_obj = mecab_wrapper.tokenize(sentence=sentence, return_list=False)
    assert isinstance(tokenized_obj, TokenizedSenetence)

    # ========================================================
    # FILTERING
    # ========================================================
    # You can filter tokens by stopwords or POS conditions.

    # stopwords is a list object
    stopwords = ['テヘラン']
    assert isinstance(tokenized_obj, TokenizedSenetence)
    # the returned object is a "FilteredObject" instance
    filtered_obj = mecab_wrapper.filter(
        parsed_sentence=tokenized_obj,
        stopwords=stopwords
    )
    assert isinstance(filtered_obj, FilteredObject)

    # pos_condition is a list of tuples
    # You can set POS conditions following the "ChaSen 品詞体系 (IPA品詞体系)" table on this page:
    # http://www.unixuser.org/~euske/doc/postag/#chasen
    pos_condition = [('名詞', '固有名詞'), ('動詞', '自立')]
    filtered_obj = mecab_wrapper.filter(
        parsed_sentence=tokenized_obj,
        pos_condition=pos_condition
    )
    assert isinstance(filtered_obj, FilteredObject)
def advanced_example_3x():
    # ========================================================
    # USE YOUR OWN DICTIONARY
    # With your own dictionary you can force Mecab to treat certain words as one token.
    # ========================================================
    # Make your own "user dictionary" as a CSV file.
    # To learn more about this file, see this page (sorry, Japanese only):
    # https://mecab.googlecode.com/svn/trunk/mecab/doc/dic.html
    example_user_dict = "userdict.csv"

    # set dictType='user' or dictType='all'
    # set pathUserDictCsv
    mecab_wrapper = MecabWrapper(dictType='user', pathUserDictCsv=example_user_dict)

    sentence = 'テヘランは、西アジア、イランの首都でありかつテヘラン州の州都。人口12,223,598人。都市圏人口は13,413,348人に達する。'
    tokenized_obj = mecab_wrapper.tokenize(sentence, return_list=False)
    assert isinstance(tokenized_obj, TokenizedSenetence)

    for token_obj in tokenized_obj.tokenized_objects:
        assert isinstance(token_obj, TokenizedResult)
        if token_obj.word_stem == 'ペルシア語':
            print(token_obj.word_stem)
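# The advanced examples assume a "userdict.csv" file already exists next to the script.
# Below is a minimal sketch (not from the original examples) of how such a file could be
# produced, assuming the standard MeCab user-dictionary CSV layout described on the page
# linked above: surface,left-id,right-id,cost,POS,POS1,POS2,POS3,conjugation-type,
# conjugation-form,base-form,reading,pronunciation. The entry itself is made up for
# illustration; the empty id/cost fields assume the dictionary compiler can estimate them.
with open('userdict.csv', 'w', encoding='utf-8') as f:
    f.write('テヘラン州,,,,名詞,固有名詞,地域,一般,*,*,テヘラン州,テヘランシュウ,テヘランシュウ\n')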
def basic_example_3x():
    # ========================================================
    # TOKENIZE
    # ========================================================
    # In Python 3.x a plain str is fine as input.
    sentence = 'テヘラン(ペルシア語: تهران ; Tehrān Tehran.ogg 発音[ヘルプ/ファイル]/teɦˈrɔːn/、英語:Tehran)は、西アジア、イランの首都でありかつテヘラン州の州都。人口12,223,598人。都市圏人口は13,413,348人に達する。'

    # make a MecabWrapper object
    # path_mecab_config is the directory where the `mecab-config` command exists.
    # You can check it with `which mecab-config`. The default value is '/usr/local/bin'.
    path_mecab_config = '/usr/local/bin'

    # you can choose from "neologd", "all", "ipadic", "user", ""
    # "ipadic" and "" are equivalent
    dictType = ""

    mecab_wrapper = MecabWrapper(dictType=dictType)

    # tokenize the sentence. The returned object is a list of tuples.
    tokenized_obj = mecab_wrapper.tokenize(sentence=sentence, return_list=True)
    assert isinstance(tokenized_obj, list)

    # The returned object is a "TokenizedSenetence" instance if you pass return_list=False.
    tokenized_obj = mecab_wrapper.tokenize(sentence=sentence, return_list=False)
    assert isinstance(tokenized_obj, TokenizedSenetence)

    # ========================================================
    # FILTERING
    # ========================================================
    # You can filter tokens by stopwords or POS conditions.

    # stopwords is a list object
    stopwords = ['テヘラン']
    assert isinstance(tokenized_obj, TokenizedSenetence)
    # the returned object is a "FilteredObject" instance
    filtered_obj = mecab_wrapper.filter(parsed_sentence=tokenized_obj, stopwords=stopwords)
    assert isinstance(filtered_obj, FilteredObject)

    # pos_condition is a list of tuples
    # You can set POS conditions following the "ChaSen 品詞体系 (IPA品詞体系)" table on this page:
    # http://www.unixuser.org/~euske/doc/postag/#chasen
    pos_condition = [('名詞', '固有名詞'), ('動詞', '自立')]
    filtered_obj = mecab_wrapper.filter(parsed_sentence=tokenized_obj, pos_condition=pos_condition)
    assert isinstance(filtered_obj, FilteredObject)

    ### You can write a chained expression: init instance -> tokenize -> filter -> list ###
    filtered_result = MecabWrapper(dictType=dictType, path_mecab_config=path_mecab_config)\
        .tokenize(sentence, return_list=False)\
        .filter(pos_condition=pos_condition)\
        .convert_list_object()
    assert isinstance(filtered_result, list)
    print(filtered_result)
def advanced_example_3x():
    # ========================================================
    # USE YOUR OWN DICTIONARY
    # With your own dictionary you can force Mecab to treat certain words as one token.
    # ========================================================
    # Make your own "user dictionary" as a CSV file.
    # To learn more about this file, see this page (sorry, Japanese only):
    # https://mecab.googlecode.com/svn/trunk/mecab/doc/dic.html
    example_user_dict = "userdict.csv"

    # set dictType='user' or dictType='all'
    # set pathUserDictCsv
    mecab_wrapper = MecabWrapper(
        dictType='user',
        pathUserDictCsv=example_user_dict
    )

    sentence = 'テヘラン(ペルシア語: تهران ; Tehrān Tehran.ogg 発音[ヘルプ/ファイル]/teɦˈrɔːn/、英語:Tehran)は、西アジア、イランの首都でありかつテヘラン州の州都。人口12,223,598人。都市圏人口は13,413,348人に達する。'
    tokenized_obj = mecab_wrapper.tokenize(sentence, return_list=False)
    assert isinstance(tokenized_obj, TokenizedSenetence)

    for token_obj in tokenized_obj.tokenized_objects:
        assert isinstance(token_obj, TokenizedResult)
        if token_obj.word_stem == 'ペルシア語':
            print(token_obj.word_stem)
class MecabTokenizer:
    def __init__(self, dict_type=None):
        if dict_type:
            self.tokenizer = MecabWrapper(dictType=dict_type)
        else:
            self.tokenizer = MecabWrapper(dictType=None)

    def tokenize(self, text):
        # return_list=False keeps the TokenizedSenetence object so that
        # tokenized_objects is available.
        tokenized_objects = self.tokenizer.tokenize(text, return_list=False).tokenized_objects
        return [
            dict(analyzed_line=obj.analyzed_line,
                 word_surface=obj.word_surface,
                 word_stem=obj.word_stem,
                 pos=list(obj.tuple_pos),
                 misc_info=obj.misc_info)
            for obj in tokenized_objects
        ]
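# A short usage sketch (not part of the original snippet) for the MecabTokenizer class
# above; the 'neologd' dictionary name and the sentence are only illustrative and assume
# neologd is installed.
tokenizer = MecabTokenizer(dict_type='neologd')
for token in tokenizer.tokenize('イランの首都はテヘランである。'):
    print(token['word_surface'], token['word_stem'], token['pos'])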
def basic_example_mecab_2x():
    # ========================================================
    # TOKENIZE
    # ========================================================
    # The input is `unicode` type (in Python 2.x).
    sentence = u'テヘランは、西アジア、イランの首都でありかつテヘラン州の州都。人口12,223,598人。都市圏人口は13,413,348人に達する。'

    # make a MecabWrapper object
    # you can choose from "neologd", "all", "ipadic", "user", "", None
    # "ipadic" and "" are equivalent
    dictType = "neologd"
    mecab_wrapper = MecabWrapper(dictType=dictType)

    # tokenize the sentence. The returned object is a list of tuples.
    tokenized_obj = mecab_wrapper.tokenize(sentence=sentence, return_list=True)
    assert isinstance(tokenized_obj, list)

    # The returned object is a "TokenizedSenetence" instance if you pass return_list=False.
    tokenized_obj = mecab_wrapper.tokenize(sentence=sentence, return_list=False)
    assert isinstance(tokenized_obj, TokenizedSenetence)

    # ========================================================
    # FILTERING
    # ========================================================
    # You can filter tokens by stopwords or POS conditions.

    # stopwords is a list object
    stopwords = [u'テヘラン']
    assert isinstance(tokenized_obj, TokenizedSenetence)
    # the returned object is a "FilteredObject" instance
    filtered_obj = mecab_wrapper.filter(parsed_sentence=tokenized_obj, stopwords=stopwords)
    assert isinstance(filtered_obj, FilteredObject)

    print('-' * 30)
    print(u'Mecab Demo')
    for token_obj in filtered_obj.tokenized_objects:
        assert isinstance(token_obj, TokenizedResult)
        print(u'word_stem:{}, word_surface:{}, pos:{}'.format(
            token_obj.word_stem, token_obj.word_surface, token_obj.tuple_pos))

    # pos_condition is a list of tuples
    # You can set POS conditions following the "ChaSen 品詞体系 (IPA品詞体系)" table on this page:
    # http://www.unixuser.org/~euske/doc/postag/#chasen
    pos_condition = [(u'名詞', u'固有名詞'), (u'動詞', u'自立')]
    filtered_obj = mecab_wrapper.filter(parsed_sentence=tokenized_obj, pos_condition=pos_condition)
    assert isinstance(filtered_obj, FilteredObject)

    print('-' * 30)
    print(u'Mecab Filtering Demo')
    for token_obj in filtered_obj.tokenized_objects:
        assert isinstance(token_obj, TokenizedResult)
        print(u'word_stem:{}, word_surface:{}, pos:{}'.format(
            token_obj.word_stem, token_obj.word_surface, token_obj.tuple_pos))

    ### You can write a chained expression: init instance -> tokenize -> filter -> list ###
    filtered_result = MecabWrapper(dictType=dictType)\
        .tokenize(sentence, return_list=False)\
        .filter(pos_condition=pos_condition)\
        .convert_list_object()
    assert isinstance(filtered_result, list)
    print(filtered_result)
"""In this example, you see how to get wikipedia-liked information from Japanese sentence """ # ------------------------------------------------------------ # PARAMETERS path_model_file = '../bin/entity_vector/entity_vector.model.bin' dict_type = 'neologd' path_mecab_config = '/usr/local/bin/' pos_condition = [('名詞', )] mysql_username = '******' mysql_hostname = 'localhost' mysql_password = '******' mysql_db_name = 'wikipedia' # ------------------------------------------------------------ entity_linking_model = load_entity_model(path_model_file) mecab_tokenizer = MecabWrapper(dict_type, path_mecab_config=path_mecab_config) model_object = load_entity_model(path_entity_model=path_model_file, is_use_cache=True) # type: Word2Vec mysql_connector = initialize_pymysql_connector(hostname=mysql_hostname, user_name=mysql_username, password=mysql_password, dbname=mysql_db_name) input_sentence = "かつてはイルモア、WCMといったプライベーターがオリジナルマシンで参戦していたほか、カワサキがワークス・チームを送り込んでいたが、2016年現在出場しているのはヤマハ、ホンダ、スズキ、ドゥカティ、アプリリアの5メーカーと、ワークスマシンの貸与等を受けられるサテライトチームとなっている。" filtered_nouns = mecab_tokenizer.filter( parsed_sentence=mecab_tokenizer.tokenize(sentence=input_sentence,return_list=False), pos_condition=pos_condition).convert_list_object() sequence_score_ojects = predict_japanese_wiki_names_with_wikidump(input_tokens=filtered_nouns, wikipedia_db_connector=mysql_connector, entity_vector_model=entity_linking_model, is_use_cache=True,
PATH_TRAINING_TEXT = './wikipedia_data/wikipedia-full.json'
PATH_TEST_TEXT = './wikipedia_data/wikipedia-evaluation-full.json'
PATH_ENTITY_VECTOR = './entity_vector/entity_vector.model.bin'
PATH_SAVE_TRAINED_MODEL = './trained_auto_encoder.h5'
POS_CONDITION = [('名詞',), ('動詞', '自立'), ('形容詞', '自立'), ('副詞',), ('助動詞',), ('連体詞',)]

## check that the input files exist ##
if not os.path.exists(PATH_TRAINING_TEXT):
    raise FileNotFoundError(PATH_TRAINING_TEXT)
if not os.path.exists(PATH_ENTITY_VECTOR):
    raise FileNotFoundError(PATH_ENTITY_VECTOR)

## initialize the tokenizer function ##
tokenizer_obj = MecabWrapper(dictType='neologd')
get_token = partial(__func_tokenizer,
                    tokenizer_obj=tokenizer_obj,
                    pos_condition=POS_CONDITION,
                    is_surface=False)

## load the word embedding ##
# newer gensim exposes load_word2vec_format on KeyedVectors; fall back to Word2Vec for older versions
try:
    embedding_model = KeyedVectors.load_word2vec_format(PATH_ENTITY_VECTOR, binary=True, unicode_errors='ignore')
except Exception:
    embedding_model = Word2Vec.load_word2vec_format(PATH_ENTITY_VECTOR, binary=True, unicode_errors='ignore')

## make training data ##
def basic_example_mecab_2x():
    # ========================================================
    # TOKENIZE
    # ========================================================
    # The input is `unicode` type (in Python 2.x).
    sentence = u'テヘラン(ペルシア語: تهران ; Tehrān Tehran.ogg 発音[ヘルプ/ファイル]/teɦˈrɔːn/、英語:Tehran)は、西アジア、イランの首都でありかつテヘラン州の州都。人口12,223,598人。都市圏人口は13,413,348人に達する。'

    # make a MecabWrapper object
    # path_mecab_config is the directory where the `mecab-config` command exists.
    # You can check it with `which mecab-config`. The default value is '/usr/local/bin'.
    path_mecab_config = '/usr/local/bin'

    # you can choose from "neologd", "all", "ipadic", "user", ""
    # "ipadic" and "" are equivalent
    dictType = "neologd"

    mecab_wrapper = MecabWrapper(dictType=dictType, path_mecab_config=path_mecab_config)

    # tokenize the sentence. The returned object is a list of tuples.
    tokenized_obj = mecab_wrapper.tokenize(sentence=sentence, return_list=True)
    assert isinstance(tokenized_obj, list)

    # The returned object is a "TokenizedSenetence" instance if you pass return_list=False.
    tokenized_obj = mecab_wrapper.tokenize(sentence=sentence, return_list=False)
    assert isinstance(tokenized_obj, TokenizedSenetence)

    # ========================================================
    # FILTERING
    # ========================================================
    # You can filter tokens by stopwords or POS conditions.

    # stopwords is a list object
    stopwords = [u'テヘラン']
    assert isinstance(tokenized_obj, TokenizedSenetence)
    # the returned object is a "FilteredObject" instance
    filtered_obj = mecab_wrapper.filter(
        parsed_sentence=tokenized_obj,
        stopwords=stopwords
    )
    assert isinstance(filtered_obj, FilteredObject)

    print('-' * 30)
    print(u'Mecab Demo')
    for token_obj in filtered_obj.tokenized_objects:
        assert isinstance(token_obj, TokenizedResult)
        print(u'word_stem:{}, word_surface:{}, pos:{}'.format(
            token_obj.word_stem, token_obj.word_surface, token_obj.tuple_pos))

    # pos_condition is a list of tuples
    # You can set POS conditions following the "ChaSen 品詞体系 (IPA品詞体系)" table on this page:
    # http://www.unixuser.org/~euske/doc/postag/#chasen
    pos_condition = [(u'名詞', u'固有名詞'), (u'動詞', u'自立')]
    filtered_obj = mecab_wrapper.filter(
        parsed_sentence=tokenized_obj,
        pos_condition=pos_condition
    )
    assert isinstance(filtered_obj, FilteredObject)

    print('-' * 30)
    print(u'Mecab Filtering Demo')
    for token_obj in filtered_obj.tokenized_objects:
        assert isinstance(token_obj, TokenizedResult)
        print(u'word_stem:{}, word_surface:{}, pos:{}'.format(
            token_obj.word_stem, token_obj.word_surface, token_obj.tuple_pos))