def tokenize(self, sentence):
    search_tags = []
    t = Tokenizer()
    self.get_logger().info("\n[tokens]")
    for token in t.tokenize(sentence):
        self.get_logger().info(str(token))
    self.get_logger().info("\n[search_tags]")
    # Materialize the tokens so the previous one can be looked up by index
    # (tokenize() returns a generator in recent janome versions).
    tokens = list(t.tokenize(sentence))
    i = 0
    for token in tokens:
        if token.part_of_speech.split(',')[0] == u'動詞':
            search_tags.append(token.base_form)
        elif token.part_of_speech.split(',')[0] == u'名詞' and \
                token.part_of_speech.split(',')[1] == u'接尾':
            # Join a suffix noun onto the preceding token's surface.
            search_tags.append(tokens[i - 1].surface + token.surface)
        elif token.part_of_speech.split(',')[0] == u'名詞':
            if token.base_form != '*':
                search_tags.append(token.base_form)
            else:
                search_tags.append(token.surface)
        i += 1
    search_tags.append(sentence)
    for search_tag in search_tags:
        self.get_logger().info(str(search_tag))
    return search_tags
class JanomeParserV2:

    def __init__(self, dic_path=None, base_form=False):
        if dic_path:
            self.t = Tokenizer(dic_path, udic_enc="cp932")
        else:
            self.t = Tokenizer()
        self.base_form_key = base_form

    def parser(self, value_str, tag=u"名詞"):
        res = self.t.tokenize(value_str)
        if self.base_form_key:
            if isinstance(tag, list):
                return [token.base_form for token in res
                        if token.part_of_speech.split(",")[0] in tag]
            return [token.base_form for token in res
                    if token.part_of_speech.split(",")[0] == tag]
        if isinstance(tag, list):
            return [token.surface for token in res
                    if token.part_of_speech.split(",")[0] in tag]
        return [token.surface for token in res
                if token.part_of_speech.split(",")[0] == tag]

    def tokens(self, value_str):
        res = self.t.tokenize(value_str)
        return [{token.surface: token.part_of_speech} for token in res
                if token.part_of_speech.split(",")[0]]

    def wakati(self, value_str):
        return self.t.tokenize(value_str, wakati=True)
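# A minimal usage sketch for JanomeParserV2 above (not part of the original
# source). It assumes `from janome.tokenizer import Tokenizer` at module scope,
# as the class itself does; the sample sentence is arbitrary and the exact
# output depends on the installed dictionary.
if __name__ == '__main__':
    p = JanomeParserV2(base_form=True)
    # Base forms of nouns and verbs; with IPADIC this yields e.g. ['庭', '犬', '走る'].
    print(p.parser(u'庭で犬が走った', tag=[u'名詞', u'動詞']))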
import gc
import json

import certifi
import urllib3


def FaaS_janome(url="", fields={}):
    ret = ""
    if url == "":  # fallback: tokenize locally
        t = Tokenizer()  # './neologd'
        if 'speech' in fields:
            # Replace characters that are unsafe to echo back.
            target = fields['speech'].translate(str.maketrans("\"\'\\/<>%`?;", '””¥_〈〉%”?;'))
            target = target.translate(str.maketrans(", ", '__'))
            for token in t.tokenize(target):
                ret += token.part_of_speech.split(',')[0] + ","
            del t
            gc.collect()
            return ret.strip(',')
        if 'surface' in fields:
            target = fields['surface'].translate(str.maketrans("\"\'\\/<>%`?;", '””¥_〈〉%”?;'))
            target = target.translate(str.maketrans(", ", '__'))
            for token in t.tokenize(target):
                ret += token.surface + ","
            del t
            gc.collect()
            return ret.strip(',')
        if 'phonetic' in fields:
            target = fields['phonetic'].translate(str.maketrans("\"\'\\/<>%`?;", '””¥_〈〉%”?;'))
            target = target.translate(str.maketrans(", ", '__'))
            for token in t.tokenize(target):
                ret += token.phonetic + ","
            del t
            gc.collect()
            return ret.strip(',')
    https = urllib3.PoolManager(cert_reqs='CERT_REQUIRED', ca_certs=certifi.where(), headers={})
    try:
        html = https.request('POST', url, body=json.dumps(fields),
                             headers={'Content-Type': 'application/json'})
    except Exception:
        return "ERROR:invalid endpoint"
    return html.data.decode('utf-8').translate(
        str.maketrans("", "", "\"\'\\/<>%`?;"))  # Not_secure_filename!
def conv_str_to_kana(str_list, answer_list):
    # Convert the strings read from file to katakana using Janome.
    t = Tokenizer()
    kana_list = []
    kana_ans_list = []
    for i in str_list:
        kana = ""
        for token in t.tokenize(i):
            if token.reading == "*":
                # No reading available (unknown word): fall back to the base form.
                kana = kana + token.base_form
            else:
                kana = kana + token.reading
        kana_list.append(kana)
    for j in answer_list:
        kana_ans = ""
        for token in t.tokenize(j):
            if token.reading == "*":
                kana_ans = kana_ans + token.base_form
            else:
                kana_ans = kana_ans + token.reading
        kana_ans_list.append(kana_ans)
    return kana_list, kana_ans_list
def run(self, force=None):
    print('start')
    # Fetch all sites and de-duplicate them by URL.
    sites = {}
    for site in Site.get_all():
        sites[site.url] = site

    # Collect subjects, dropping ignored and keyword-matched entries.
    sure = []
    for key in sites:
        site = sites[key]
        response = requests.get(site.subjects_url)
        assert (response.status_code == 200), response.text
        # parse
        data = list(response.text.split('\n'))
        for line in data:
            try:
                _ = Subject(site, line)
                sure.append(_)
            except Exception:
                pass
    print(sure)

    # Count surface frequencies over the thread titles.
    t = Tokenizer()
    r = defaultdict(int)
    r2 = defaultdict(list)
    r3 = defaultdict(int)
    for _sure in sure:
        try:
            for token in t.tokenize(_sure.title):
                if not token_filter(token):
                    r[token.surface] += 1
                    r2[token.surface] += [_sure]
                    r3[token] += 0
        except Exception:
            pass

    # Sort by title and print the threads whose score passes the filter.
    sure = sorted(sure, key=lambda x: x.title)
    for _sure in sure:
        try:
            point = 0
            for token in t.tokenize(_sure.title):
                if not token_filter(token):
                    point += r[token.surface]
            if not filter_title(point, _sure):
                print(_sure.title, _sure.count_res)
        except Exception:
            pass
def cntjp():
    m = Frame.m
    txt = m.get()
    root23 = tk.Tk()
    root23.title('Result(CounterJP)')
    t = Tokenizer()
    for token in t.tokenize(txt):
        print(token)
    # Count word frequencies from the wakati (word-split) output.
    c = collections.Counter(t.tokenize(txt, wakati=True))
    label23 = tk.Label(root23, text=c, font=16)
    label23.pack(fill="x")
    root23.mainloop()
class JanomeTokenizer(Tokenizer):

    def __init__(self, udic='', udic_enc='utf8', udic_type='ipadic',
                 max_unknown_length=1024, wakati=False, mmap=False):
        self.udic = udic
        self.udic_enc = udic_enc
        self.udic_type = udic_type
        self.max_unknown_length = max_unknown_length
        self.wakati = wakati
        self.mmap = mmap
        self.tagger = Janome(udic=self.udic, udic_enc=self.udic_enc,
                             udic_type=self.udic_type,
                             max_unknown_length=self.max_unknown_length,
                             wakati=self.wakati, mmap=self.mmap)

    def __getstate__(self):
        # Exclude the tagger so instances can be pickled; it is rebuilt below.
        return {k: v for k, v in self.__dict__.items() if k != "tagger"}

    def __setstate__(self, state):
        self.__dict__.update(state)
        self.tagger = Janome(udic=self.udic, udic_enc=self.udic_enc,
                             udic_type=self.udic_type,
                             max_unknown_length=self.max_unknown_length,
                             wakati=self.wakati, mmap=self.mmap)
        self.tagger.tokenize('')

    def __call__(self, value, positions=False, chars=False, keeporiginal=False,
                 removestops=True, start_pos=0, start_char=0, tokenize=True,
                 mode='', **kwargs):
        assert isinstance(value, text_type), '%s is not unicode' % repr(value)
        token = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)
        if not tokenize:
            token.original = token.text = value
            token.boost = 1.0
            if positions:
                token.pos = start_pos
            if chars:
                token.startchar = start_char
                token.endchar = start_char + len(value)
            yield token
        else:
            pos = start_pos
            for janome_token in self.tagger.tokenize(value):
                token.text = janome_token.surface
                token.boost = 1.0
                if keeporiginal:
                    token.original = token.text
                token.stopped = False
                if positions:
                    token.pos = pos
                    pos += 1
                if chars:
                    token.startchar = start_char + janome_token.start
                    token.endchar = token.startchar + len(janome_token.surface)
                yield token
def wakati(text: str):
    """Split the text into words and find the most frequent ones."""
    t = Tokenizer()
    # Word frequencies over the whole text.
    c = collections.Counter(t.tokenize(text, wakati=True))
    print(*c.most_common()[:15], sep='\n')
    print()
    # Restrict to a specific part of speech (proper nouns).
    c = collections.Counter(token.base_form for token in t.tokenize(text)
                            if token.part_of_speech.startswith('名詞,固有名詞'))
    # Print in descending order of frequency.
    print(*c.most_common()[:15], sep='\n')
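# Hypothetical driver for wakati() above (the file handling is an assumption,
# not part of the original): read a UTF-8 text file named on the command line
# and print its frequent words.
if __name__ == '__main__':
    import sys
    with open(sys.argv[1], encoding='utf-8') as f:
        wakati(f.read())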
def generate_tweet(sentense):
    tokenizer = Tokenizer()
    # Materialize the token lists; tokenize() returns a generator in recent janome.
    word_list = list(tokenizer.tokenize(sentense, wakati=True))
    part_list = [token.part_of_speech.split(',')[0]
                 for token in tokenizer.tokenize(sentense)]
    base_word_list = [token.base_form for token in tokenizer.tokenize(sentense)]
    # Drop everything from the first hashtag or URL onwards.
    try:
        del word_list[word_list.index('#'):len(word_list)]
        del base_word_list[base_word_list.index('#'):len(base_word_list)]
    except ValueError:
        pass
    try:
        del word_list[word_list.index('https'):len(word_list)]
        del base_word_list[base_word_list.index('https'):len(base_word_list)]
    except ValueError:
        pass
    part_cnt = 0
    cnt = 0
    gen_sentense = ''
    for (word, part) in zip(word_list, part_list):
        if part == '名詞':
            part_cnt += 1
            gen_sentense += word
            if part_cnt == part_list.count('名詞'):
                # After the last noun, append the words up to the next verb or
                # adjective, then that word's base form.
                for i in range(cnt, len(word_list)):
                    if part_list[i] == '動詞' or part_list[i] == '形容詞':
                        for j in range(cnt + 1, i):
                            gen_sentense += word_list[j]
                        gen_sentense += (base_word_list[i] + "の")
                        break
                break
        cnt += 1
    gen_sentense += "って普通だよね……?" if random.random() > 0.1 else "って普通じゃなかったんだ……"
    return gen_sentense
def _morphological_analysis(sentence):
    u"""
    Morphologically analyze a single sentence.
    @param sentence  a sentence
    @return  list of surface forms split into morphemes
    """
    t = Tokenizer()
    result = []
    for token in t.tokenize(sentence):
        result.append(token.surface)
    return result
class MainTranslator(object):

    def __init__(self):
        self.janome = Tokenizer()

    def get_gobi(self, n):
        f = n.part_of_speech.split(',')
        if n.surface in ['だ', 'です', 'た', 'だろ', 'ある']:
            if f[0] == '助動詞':
                return 'ハゲ'
        if n.surface in ['無い', 'ない', 'ぬ']:
            if f[0] == '助動詞':
                return 'ぬハゲ'
            if f[0] == '形容詞':
                return 'なしハゲ'

    def Translator(self, text):
        # `converter` is an external surface-to-replacement mapping defined elsewhere.
        tokens = self.janome.tokenize(text)
        text = ''
        for n in tokens:
            f = n.part_of_speech.split(',')
            if n.surface in converter:
                text += converter[n.surface]
            elif len(f) > 3:
                gobi = self.get_gobi(n)
                if gobi is not None:
                    text += gobi
                else:
                    text += n.surface
            else:
                text += n.surface
        return text
def tokenize(text):
    t = Tokenizer()
    # Remove the header and footer of an Aozora Bunko text.
    text = re.split(r'\-{5,}', text)[2]
    text = re.split(r'底本:', text)[0]
    text = text.strip()
    # Remove ruby annotations.
    text = text.replace('|', '')
    text = re.sub(r'《.+?》', '', text)
    # Remove inline editorial notes such as ［＃...］.
    text = re.sub(r'［＃.+?］', '', text)
    # Process line by line.
    lines = text.split("\r\n")
    results = []
    for line in lines:
        res = []
        tokens = t.tokenize(line)
        for tok in tokens:
            bf = tok.base_form  # base form
            if bf == "*":
                bf = tok.surface
            ps = tok.part_of_speech  # part-of-speech info
            hinsi = ps.split(',')[0]
            if hinsi in ['名詞', '動詞', '形容詞', '記号']:
                res.append(bf)
        l = " ".join(res)
        results.append(l)
    return results
def separatewords(text):
    # Python 2 snippet: surfaces are encoded/compared as UTF-8 byte strings.
    separatedWord = []
    t = Tokenizer()
    tokens = t.tokenize(unicode(text, "utf-8"))
    for token in tokens:
        posList = token.part_of_speech.split(",")
        pos1 = posList[0]
        if isinstance(pos1, unicode):
            pos1 = pos1.encode("utf-8")
        pos2 = posList[1]
        if isinstance(pos2, unicode):
            pos2 = pos2.encode("utf-8")
        ruby = token.reading
        if isinstance(ruby, unicode):
            ruby = ruby.encode("utf-8")
        if pos1 == "名詞":
            if pos2 != "接尾" and pos2 != "代名詞" and pos2 != "非自立" and pos2 != "数" and pos2 != "形容動詞語幹":
                if ruby != "*":
                    separatedWord.append(token.surface.lower())
                    print token.surface.lower()
                elif pos2 != "サ変接続" and len(token.surface) > 3:
                    # For English words, only handle words of 4+ characters.
                    separatedWord.append(token.surface.lower())
                    print token.surface.lower()
    return separatedWord
def generateToken(df):
    df_token = pd.DataFrame(index=[], columns=[
        'meeting_no', 'meeting_date', 'monitoring_index', 'line_number',
        'token', 'part_of_speech', 'part_of_speech2', 'part_of_speech3',
        'part_of_speech4', 'infl_type', 'base_form'
    ])
    # Build the tokenizer once; constructing it per row reloads the dictionary
    # and is very slow.
    t = Tokenizer()
    for row in df.itertuples(name=None):
        meeting_no = row[1]
        meeting_date = row[2]
        monitoring_index = row[3]
        line_number = row[4]
        tokens = t.tokenize(row[5])
        for token in tokens:
            if not re.search(r'[、。I,%%~~##※\\\(\)\.\-\/]', token.surface) and \
                    token.surface not in ['ア', 'イ', 'ウ', 'エ', 'オ', 'カ', 'キ']:
                word_category = token.part_of_speech.split(',')[0]
                word_type = token.part_of_speech.split(',')[1]
                if word_category == '名詞' and word_type != '数' and \
                        word_type != '代名詞' and word_type != '非自立' and word_type != '接尾':
                    df_token.loc[len(df_token.index)] = [
                        meeting_no, meeting_date, monitoring_index, line_number,
                        token.surface
                    ] + token.part_of_speech.split(',') + [
                        token.infl_type, token.base_form
                    ]
    df_token = df_token.replace('*', np.nan)
    return df_token
def tokenize(self, text):
    tokens = []
    t = Tokenizer()
    pre_tokens = t.tokenize(text)
    for token in pre_tokens:
        tokens.append(token.surface)
    return tokens
def get_token(text):
    t = Tokenizer()
    tokens = t.tokenize(text, wakati=True)  # split into surface forms only
    # Join the words with single spaces.
    return " ".join(tokens)
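# Example call for get_token() above (not part of the original source); the
# sample sentence is arbitrary and the segmentation depends on the dictionary.
if __name__ == '__main__':
    print(get_token('今日はいい天気です'))  # e.g. '今日 は いい 天気 です'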
class JapaneseTokenizer(object):

    def __init__(self, cls, nlp=None):
        self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
        try:
            from janome.tokenizer import Tokenizer
        except ImportError:
            raise ImportError("The Japanese tokenizer requires the Janome "
                              "library: https://github.com/mocobeta/janome")
        self.tokenizer = Tokenizer()

    def __call__(self, text):
        words = [x.surface for x in self.tokenizer.tokenize(text)]
        return Doc(self.vocab, words=words, spaces=[False] * len(words))

    # Dummy methods for to_bytes, from_bytes, to_disk and from_disk to
    # allow serialization (see #1557).
    def to_bytes(self, **exclude):
        return b''

    def from_bytes(self, bytes_data, **exclude):
        return self

    def to_disk(self, path, **exclude):
        return None

    def from_disk(self, path, **exclude):
        return self
def main():
    # Take the file name (csv only) from the command line.
    FILENAME = sys.argv[1]
    # Load the data to preprocess.
    df = pd.read_csv('dataset/original/' + FILENAME)
    t = Tokenizer()
    wakati = []  # word-split titles
    length = []  # number of keywords per title
    # Split every title in df into words.
    for i in range(len(df)):
        tmp = df['title'][i]
        tmp = [token.surface for token in t.tokenize(tmp)
               if ((token.part_of_speech.split(',')[0] in ['形容詞', '副詞', '名詞'])
                   or ((token.part_of_speech.split(',')[0] in ['動詞'])
                       and (token.part_of_speech.split(',')[1] not in ['接尾'])))]
        length.append(len(tmp))
        wakati.append(tmp)
    df_wakati = pd.DataFrame({"title": wakati, "len": length})
    df_wakati.to_csv("dataset/preprocessed/pre_" + FILENAME)
def annotate_using_janome(sentences, tokenize=False):
    assert tokenize, 'no support for using janome with pre-tokenized inputs'
    try:
        from janome.tokenizer import Tokenizer
    except ImportError:
        logger.error('failed to import janome. please install it by "pip install janome".')
        exit(1)
    logger.info('use Janome to tokenize and annotate POS infos.')
    tokenizer = Tokenizer()
    res = []
    raw_sentences = []
    for sentence in sentences:
        sentence = ''.join(sentence)
        tokenized = list(tokenizer.tokenize(sentence))
        tokens = []
        for token in tokenized:
            pos, pos1, pos2, pos3 = token.part_of_speech.split(',')
            token = Token(word=token.surface,
                          surf=token.surface,
                          pos=pos,
                          pos1=pos1,
                          pos2=pos2,
                          pos3=pos3,
                          inflectionForm=token.infl_form,
                          inflectionType=token.infl_type,
                          reading=token.reading,
                          base=token.base_form)
            tokens.append(token)
        raw_sentence = [token.surface for token in tokenized]
        res.append(tokens)
        raw_sentences.append(raw_sentence)
    return res, raw_sentences
def conv_str_to_kana(str_list):
    """
    Convert the strings read from a file to katakana using Janome.

    Args:
        str_list (list of str): titles
    Returns:
        list of str: kana_list, the katakana conversions
    """
    t = Tokenizer()
    kana_list = []
    for i in str_list:
        kana = ""
        for token in t.tokenize(i):
            if token.reading == "*":
                # No reading for unknown words: fall back to the base form.
                kana = kana + token.base_form
            else:
                kana = kana + token.reading
        kana_list.append(kana)
    return kana_list
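# Example for conv_str_to_kana() above (not part of the original source); the
# sample title is arbitrary and the readings depend on the dictionary.
if __name__ == '__main__':
    print(conv_str_to_kana(['吾輩は猫である']))  # e.g. ['ワガハイハネコデアル']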
def chunk_with_kanji(istr):
    t = Tokenizer()
    # Materialize the tokens; they are iterated more than once below.
    tokens = list(t.tokenize(istr))
    # Give each element a flag (jiritsu or fuzoku).
    flags = [judge_jifu(x.part_of_speech) for x in tokens]
    surface = [x.surface for x in tokens]
    # Split into chunks, delimited by the KUGIRI flag.
    # Very ugly; should be rewritten using a tree structure etc.
    cflags = insert_chunkflg(flags)
    rstr = u""
    i = 0
    for j, f in enumerate(flags):
        if i >= len(cflags):
            break
        if cflags[i] == KUGIRI:
            if f == KUTOU:
                rstr += surface[j]
                i += 1
            else:
                rstr += u" "
                rstr += surface[j]
                i += 2
        else:
            rstr += surface[j]
            i += 1
    # Don't know why this is necessary.
    if flags != [] and j == 0 and len(surface) != 1:
        while j < len(surface):
            rstr += surface[j]
            j += 1
    return rstr
def text_tokenize(from_file, to_file, encoding='utf-8'):
    """
    :param from_file:
    :param to_file:
    :param encoding:
    :return:
    """
    _original = from_text(from_file, mode='r', encoding=encoding)
    # Remove useless characters.
    _process = re.sub("《[^》]+》", "", _original)  # remove ruby
    _process = re.sub("［[^］]+］", "", _process)  # remove reading notes
    _process = re.sub("[｜| 　「」\n]", "", _process)  # remove ｜, half/full-width spaces, 「」 and newlines
    # Split into sentences.
    seperator = "。"  # use 。 as the separator
    _process_list = _process.split(seperator)
    _process_list.pop()  # erase the last element of the list because it's empty
    processed_list = [x + seperator for x in _process_list]  # re-append 。 to each sentence
    # Janome tokenize
    t = Tokenizer()
    processed_words = []
    for sentence in processed_list:
        # Materialize the word list so it can be pickled (wakati mode returns
        # a generator in recent janome versions).
        processed_words.append(list(t.tokenize(sentence, wakati=True)))
    save_pickle(to_file, processed_words)
def get_words(text, customdict=None):
    """
    Morphologically analyze the given text and return the nouns it contains.
    """
    def _filter(s):
        """Filter tokens down to nouns."""
        reg = re.compile(r'名詞')
        ignore_reg = re.compile(r'非自立')
        if (reg.search(s.part_of_speech) and
                not ignore_reg.search(s.part_of_speech)):
            return True

    if customdict:
        t = Tokenizer(customdict)
    else:
        t = DEFAULT_TOKENIZER
    __word = ''
    __results = []
    __text = re.sub(r'[()!?.,]+', ' ', unicodedata.normalize('NFKC', text))
    for s in t.tokenize(__text):
        if _filter(s):
            # Concatenate consecutive nouns into one compound word.
            __word += s.surface
        elif __word:
            __results.append(__word)
            __word = ''
    if __word:
        __results.append(__word)
    return __results
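# Usage sketch for get_words() above (not part of the original source). It
# assumes the module-level DEFAULT_TOKENIZER the function refers to is a janome
# Tokenizer. Consecutive nouns come out as one compound keyword:
if __name__ == '__main__':
    print(get_words('自然言語処理を学ぶ'))  # e.g. ['自然言語処理']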
def parse(text):
    t = Tokenizer()
    tokens = t.tokenize(text)
    result = []
    for token in tokens:
        result.append(token.surface)
    return result
def default_func(message):
    # f = open("plugins_difficult/polarity.yml", "r+")
    # polarity = yaml.load(f)
    text = message.body['text']  # extract the message text
    # Build the reply; newlines and triple-backquote blocks are allowed.
    t = Tokenizer()
    # m = MeCab.Tagger("-d /usr/local/lib/mecab/dic/mecab-ipadic-neologd")
    tokens = t.tokenize(text)
    # Sum the polarity values of the words found in the polarity dictionary.
    pol_val = 0
    for token in tokens:
        word = token.surface
        pos = token.part_of_speech.split(',')[0]  # part of speech
        if word in polarity:
            pol_val = pol_val + float(polarity[word])
    message.reply("```Sentence you input is " + text +
                  ". Sentence polarity is " + str(pol_val) + "```")  # mention
    if pol_val > 0.2:
        message.react('+1')
        message.reply("それはいいね!!")
    elif pol_val < -0.2:
        message.react('cry')
        message.reply("そうか、どんまい")
    else:
        message.reply("なるほど、そうなんですね〜")
def split_for_markovify(text):
    """split text to sentences by newline, and split sentence to words by space."""
    # Separate words using janome.
    tagger = Tokenizer()
    splitted_text = ""
    # These chars might break markovify:
    # https://github.com/jsvine/markovify/issues/84
    breaking_chars = [
        '(',
        ')',
        '[',
        ']',
        '"',
        "'",
    ]
    # Split whole text to sentences by newline, and sentences to words by space.
    for line in text.split():
        for token in tagger.tokenize(line):
            try:
                # Skip if the node is a markovify breaking char.
                if token.surface not in breaking_chars:
                    splitted_text += token.surface
                if token.surface != '。' and token.surface != '、':
                    splitted_text += ' '  # split words by space
                if token.surface == '。':
                    splitted_text += '\n'  # represent sentence end by newline
            except UnicodeDecodeError:
                # Errors occur occasionally.
                print(line)
    return splitted_text
def get_wakati_ja(s):
    """
    ==usage==
    df["ja_new_text"] = df["ja_text"].apply(get_wakati_ja)
    """
    tknz = Tokenizer()
    # Collect all emoji characters across languages.
    UNICODE_EMOJI = []
    for lang, value in emoji.UNICODE_EMOJI.items():
        UNICODE_EMOJI += list(value.keys())
    UNICODE_EMOJI = list(set(UNICODE_EMOJI))
    l = []
    speech = ["名詞", "動詞", "形容詞"]
    # Strip punctuation and other symbols.
    code_regex = re.compile(
        '[!"#$%&\'\\\\()*+,-./:;<=>?@[\\]^_`{|}~「」〔〕“”〈〉『』【】&*・()$#@。、?!`+¥%]')
    s = code_regex.sub(' ', s)
    for token in tknz.tokenize(s):
        if token.part_of_speech.split(",")[0] in speech:
            t = token.base_form
            t = t.replace(' ', '')
            if len(t) == 0:
                continue
            if len(t) == 1:
                # Drop single hiragana characters and single emoji.
                if len(re.findall('[\u3041-\u309F]+', t)) > 0:
                    continue
                if t in UNICODE_EMOJI:
                    continue
            l.append(t)
    l = [s.replace(' ', '') for s in l]
    l = [s for s in l if len(s) > 0]
    s = " ".join(l)
    return s
def janome_tokenizer(sentence):
    # Python 2 snippet: byte strings are decoded/encoded explicitly.
    t = Janome_Tokenizer()
    sentence = sentence.decode("utf-8")
    try:
        tokens = t.tokenize(sentence)
    except:
        try:
            # Retry with the non-breaking space replaced.
            tokens = t.tokenize(sentence.replace(u"\xa0", u"、"))
        except:
            try:
                tokens = t.tokenize(sentence.replace(u"\xa0", u""))
            except:
                print("Tokenization error at sentence: " + sentence.encode("utf-8"))
                return [sentence]
    return [dic.surface.encode("utf-8") for dic in tokens]
def make_word2vec(sentence, word_list):
    """
    Take one sentence and word_list; convert the verbs, adjectives, and nouns
    that appear in word_list into their distributed representations and return
    the average of those vectors.
    """
    # Morphologically analyze the sentence first.
    t = Tokenizer()
    W2V_sum = np.zeros(300)
    # Count the number of words used.
    counter = 0
    for token in t.tokenize(sentence, stream=True):
        hinshi = token.part_of_speech.split(',')[0]
        # Verbs, adjectives, and nouns that are not numbers.
        if (hinshi == "動詞" or hinshi == "形容詞" or
                (hinshi == "名詞" and token.part_of_speech.split(',')[1] != "数")):
            # Only words contained in the vocabulary list.
            if token.base_form in word_list:
                # Ignore words missing from the embedding dictionary; only
                # count a word after its vector lookup succeeds.
                try:
                    W2V_sum += W2V_model[token.base_form]
                    counter += 1
                except KeyError:
                    continue
    if counter == 0:
        # No known words: return the zero vector instead of dividing by zero.
        return W2V_sum
    # Compute the average vector.
    W2V_ave = W2V_sum / float(counter)
    return W2V_ave
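# make_word2vec() above relies on a module-level `W2V_model`, a 300-dimensional
# word-embedding lookup. A hypothetical setup using gensim (the model file name
# is an assumption, not part of the original):
if __name__ == '__main__':
    from gensim.models import KeyedVectors
    W2V_model = KeyedVectors.load_word2vec_format('ja_word2vec.bin', binary=True)
    vocab = list(W2V_model.key_to_index)  # gensim 4.x vocabulary listing
    print(make_word2vec('犬が走った', vocab))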
class JanomeTokenizer(object):

    def __init__(self, user_dic_path='', user_dic_enc='utf8'):
        self._t = Tokenizer(udic=user_dic_path, udic_enc=user_dic_enc)

    def wakati(self, sent):
        words = [token.surface for token in self.tokenize(sent)]
        return words

    def wakati_baseform(self, sent):
        words = [token.base_form if token.base_form != '*' else token.surface
                 for token in self.tokenize(sent)]
        return words

    def tokenize(self, sent):
        token = namedtuple('Token',
                           'surface, pos, pos_detail1, pos_detail2, pos_detail3,'
                           ' infl_type, infl_form, base_form, reading, phonetic')
        for t in self._t.tokenize(sent):
            poses = t.part_of_speech.split(',')
            yield token(t.surface, poses[0], poses[1], poses[2], poses[3],
                        t.infl_type, t.infl_form, t.base_form, t.reading, t.phonetic)

    def filter_by_pos(self, sent, pos=('名詞', )):
        tokens = [token for token in self.tokenize(sent) if token.pos in pos]
        return tokens
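# Usage sketch for the JanomeTokenizer wrapper above (not part of the original
# source); the sample sentence is arbitrary and the results shown are what
# IPADIC typically produces.
if __name__ == '__main__':
    tokenizer = JanomeTokenizer()
    print(tokenizer.wakati(u'彼は本を読んだ'))           # e.g. ['彼', 'は', '本', 'を', '読ん', 'だ']
    print(tokenizer.wakati_baseform(u'彼は本を読んだ'))  # e.g. ['彼', 'は', '本', 'を', '読む', 'だ']
    nouns = tokenizer.filter_by_pos(u'彼は本を読んだ', pos=(u'名詞',))
    print([n.surface for n in nouns])                    # e.g. ['彼', '本']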
def analyze(text):
    t = Tokenizer()
    tokens = t.tokenize(text)
    result = []
    for token in tokens:
        result.append([token.surface, token.part_of_speech])
    return result
def parser(value_str, tag=u"名詞"):
    t = Tokenizer()
    res = t.tokenize(value_str)
    if isinstance(tag, list):
        return [token.surface for token in res
                if token.part_of_speech.split(",")[0] in tag]
    return [token.surface for token in res
            if token.part_of_speech.split(",")[0] == tag]
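# Example for parser() above (not part of the original source): a list selects
# several POS classes at once; the sample sentence is arbitrary.
if __name__ == '__main__':
    print(parser(u'赤い花が咲く', tag=[u'名詞', u'動詞']))  # e.g. ['花', '咲く']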
def Wakachi(self, sentence):
    # Split the sentence into space-separated words with janome.
    words = []
    t = Tokenizer()
    for token in t.tokenize(sentence):
        words.append(token.surface)
    Wakachi_sentence = ' '.join(words)
    return Wakachi_sentence
def listen_func(message):
    t = Tokenizer()
    tokens = t.tokenize(u'pythonの本を読んだ')
    res = ""
    for token in tokens:
        res += str(token) + os.linesep
    message.send("```" + res + "```")
def text_to_array_ja(textdata, wordtypes):
    # `filter` here appears to be a project-local cleaning helper
    # (it shadows the builtin).
    textdata = filter(textdata)
    t = Tokenizer()
    tokens = t.tokenize(textdata)
    words = sorted([token.surface for token in tokens
                    if token.part_of_speech.split(',')[0] in wordtypes])
    return words
def _tokenize(text):
    from collections import namedtuple
    Token = namedtuple("Token", ["t", "surface", "pos"])
    t = Tokenizer()
    tokens = t.tokenize(text)
    # Use a distinct loop variable so the tokenizer `t` is not shadowed.
    for tok in tokens:
        nt = Token(tok, tok.surface, tok.part_of_speech.split(","))
        yield nt
def makekeywords(text):
    from janome.tokenizer import Tokenizer
    t = Tokenizer()
    tokens = t.tokenize(text)
    keywords = []
    for token in tokens:
        # Keep nouns, but skip numbers, dependent nouns, and suffixes.
        if (token.part_of_speech.find("名詞") >= 0
                and token.part_of_speech.find("数") == -1
                and token.part_of_speech.find("非自立") == -1
                and token.part_of_speech.find("接尾") == -1):
            keywords.append(token.surface)
    return keywords
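# Example for makekeywords() above (not part of the original source): numbers,
# counter suffixes, and dependent nouns are filtered out, so with IPADIC:
if __name__ == '__main__':
    print(makekeywords('東京で3匹の猫を見た'))  # e.g. ['東京', '猫']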
def output_ja_text(data, wordtypes):
    textdata = filter(data)
    t = Tokenizer()
    tokens = t.tokenize(textdata)
    words = sorted([token.surface for token in tokens
                    if token.part_of_speech.split(',')[0] in wordtypes])
    dictionary = count_words(words)
    return pyaml.dump(dictionary, sys.stdout, vspacing=[0, 1])
def split(self, text):
    result = []
    t = Tokenizer()
    malist = t.tokenize(text)
    for w in malist:
        sf = w.surface    # the word exactly as delimited
        bf = w.base_form  # the word's base form
        if bf == '' or bf == "*":
            bf = sf
        result.append(bf)
    return result
def test_func():
    t = Tokenizer()
    temp = ""
    for token in t.tokenize(u'この腫瘍は間葉系組織から生ずると考えられ、ビメンチンを発現する。'):
        if not re.search('^(助詞|助動詞|記号)', token.part_of_speech):
            temp = temp + token.surface
        else:
            temp = temp + token.surface
    print(temp)
    temp = ""
def understand_move(self, text):
    generator = Tokenizer()
    tokens = []
    for t in generator.tokenize(text):
        tokens.append(t)
    direction = self._understand_direction(tokens)
    distance = self._understand_distance(tokens)
    return direction, distance
class JapaneseTokenizer(object):

    def __init__(self, cls, nlp=None):
        self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
        try:
            from janome.tokenizer import Tokenizer
        except ImportError:
            raise ImportError("The Japanese tokenizer requires the Janome library: "
                              "https://github.com/mocobeta/janome")
        self.tokenizer = Tokenizer()

    def __call__(self, text):
        words = [x.surface for x in self.tokenizer.tokenize(text)]
        return Doc(self.vocab, words=words, spaces=[False] * len(words))
class MyWindow(QWidget, Ui_window):
    loaded_words = Signal(str)
    refresh_words = Signal()

    def __init__(self):
        QWidget.__init__(self)
        self.setupUi(self)
        self.setWindowTitle(
            QApplication.translate("Widget", "%s %s" % (__app_name__, __version__),
                                   None, QApplication.UnicodeUTF8))
        self.jacome_token = Tokenizer()
        self.words_container = []

    @Slot()
    def analysis_janome(self, s):
        # Echo the text being analyzed to stdout for debugging.
        print s
        # Clear the widgets that display the previous results.
        self.refresh_words.emit()
        # Reset the result array (used for saving to csv).
        self.words_container = []
        # Run the morphological analysis.
        tokens = self.jacome_token.tokenize(s)
        for token in tokens:
            # Cast each result to a unicode string.
            print_str = str(token).decode('utf8')
            # Store the result for csv export.
            self.words_container.append(print_str)
        # Emit the results for display.
        self.loaded_words.emit('\n'.join(self.words_container))

    @Slot()
    def save_csv(self):
        filename = 'result.csv'
        filename = os.path.normpath(filename)
        # Pick the encoding based on the OS.
        if os.name == 'nt':
            code = 'cp932'
        else:
            code = 'utf-8'
        print 'save_csv: code = %s' % code
        with open(filename, 'wb') as f:  # must be 'wb', or odd newlines are inserted
            writer = csv.writer(f, delimiter=',')
            for words in self.words_container:
                out_word = words.encode(code)
                writer.writerow([out_word])
def main():
    """
    >>> main()
    すもも	名詞,一般,*,*,*,*,すもも,スモモ,スモモ
    も	助詞,係助詞,*,*,*,*,も,モ,モ
    もも	名詞,一般,*,*,*,*,もも,モモ,モモ
    も	助詞,係助詞,*,*,*,*,も,モ,モ
    もも	名詞,一般,*,*,*,*,もも,モモ,モモ
    の	助詞,連体化,*,*,*,*,の,ノ,ノ
    うち	名詞,非自立,副詞可能,*,*,*,うち,ウチ,ウチ
    """
    t = Tokenizer()
    for token in t.tokenize(u'すもももももももものうち'):
        print(token)
def get_morphs(string):
    t = Tokenizer()
    dicts = []
    for token in t.tokenize(unicode(string, 'utf-8')):
        # Read the fields from the token attributes directly instead of
        # re-parsing str(token).
        dic = {}
        dic["surface"] = token.surface
        dic["base"] = token.base_form
        pos = token.part_of_speech.split(",")
        dic["pos"] = pos[0]
        dic["pos1"] = pos[1]
        dicts.append(dic)
    return dicts
def do_analysis(analyzed_file):
    result_file_name = "result/" + str(analyzed_file)
    # Matches everything after the first whitespace in str(token),
    # leaving just the surface form for the word check.
    p = re.compile(r'\s(.*)')
    t = Tokenizer()
    with open(analyzed_file, mode='r', encoding='utf-8') as read_file:
        texts = read_file.read()
    with open(result_file_name, mode='a', encoding='utf-8') as result_file:
        for token in t.tokenize(str(texts)):
            check_word = p.sub('', str(token))
            if not word_match.word_check(check_word):
                result_file.write(str(token) + "\n")
    return result_file_name
def analyze_keyword(posts):
    """
    Morphologically analyze the posts, weight them by frequent words, and
    build a reverse index from keyword to occurrence count and posts.

    :param posts: dict{int: Posted}
    :rtype: list(KeywordReverseIndex)
    """
    t = Tokenizer()
    tfidf2 = defaultdict(int)
    tfidf2_post = defaultdict(list)
    # Weight per word (pairs of adjacent surfaces).
    for key in posts:
        post = posts[key]
        for message in post.parse_post_message:
            # Strip A tags.
            soup = BeautifulSoup(message, "lxml")
            # janome
            _prev_token = None
            try:
                for token in t.tokenize(soup.text):
                    # Count the pair unless the token is filtered out.
                    if final_filter(_prev_token, token):
                        tfidf2[_prev_token.surface + token.surface] += 1
                        if post not in tfidf2_post[_prev_token.surface + token.surface]:
                            tfidf2_post[_prev_token.surface + token.surface] += [post]
                    _prev_token = token
                    # If the token is a particle, exclude it from starting
                    # the next pair.
                    if token_is_sub(token):
                        _prev_token = None
            except Exception:
                pass
    # Build the reverse index.
    r_indexes = []
    for key in tfidf2:
        _index = KeywordReverseIndex(key, tfidf2[key], tfidf2_post[key])
        # Only index keywords whose occurrence count is high enough.
        if _index.is_enable:
            r_indexes.append(_index)
    return r_indexes
def callback():
    messages = request.json['result']
    for message in messages:
        text = message['content']['text']
        for matcher, action in commands:
            if matcher.search(text):
                response = action(text)
                break
        else:
            post_text(message['content']['from'], '解析中...')
            # Morphological analysis.
            response = ''
            t = Tokenizer()
            for token in t.tokenize(message['content']['text']):
                response += str(token) + '\n'
        post_text(message['content']['from'], response)
    return ''
class Mave(object):

    def __init__(self, name=u'メイ'):
        self.name = name
        self.msg_que = Queue()
        self.tokenizer = Tokenizer()
        self.markov = Markov(ngram=2)

    def wakeUp(self):
        try:
            self.markov.load(u'mave_%s.json' % self.name)
        except Exception as e:
            print 'markov load failure'
            print e
            self.markov = Markov(ngram=2)

    def goToBed(self):
        self.markov.save(u'mave_%s.json' % self.name)

    def listenTo(self, message, talker):
        tokens = self.tokenizer.tokenize(message.decode('utf-8'))
        for tok in tokens:
            print '%10s (%10s) ... %s' % (tok.surface, tok.reading, tok.part_of_speech)
        self.markov.learn(tokens)
        # Collect general nouns that do not start with hiragana, plus proper nouns.
        meishi_list = [tok.surface for tok in tokens
                       if u'名詞' in tok.part_of_speech.split(',')
                       and ((u'一般' in tok.part_of_speech.split(',')
                             and not u'あ' <= tok.surface[0] <= u'ん')
                            or u'固有名詞' in tok.part_of_speech.split(','))]
        key = random.choice(meishi_list) if len(meishi_list) != 0 else None
        rsp = self.markov.generate(key)
        if rsp is not None:
            self.msg_que.put(rsp)
        else:
            self.msg_que.put('はいはい > %s' % talker)

    def speak(self):
        if self.msg_que.empty():
            return None
        return self.msg_que.get()
def add_yomi(string):
    t = Tokenizer()
    tokens = t.tokenize(string)
    rstring = ''
    for token in tokens:
        s = token.surface
        r = token.reading
        # Consume (surface, reading) chunk pairs from split_at_hiragana
        # until the token is exhausted.
        while True:
            res = split_at_hiragana(s, r)
            if len(res) > 2:
                rstring += create_yomi(res[0], res[1])
                s, r = res[2], res[3]
            else:
                break
        rstring += create_yomi(res[0], res[1])
    return rstring
# -*- coding: utf-8 -*-
from janome.tokenizer import Tokenizer
from janome.dic import UserDictionary
import sysdic

print('Compile user dictionary (MeCab IPADIC format)')
user_dict = UserDictionary("user_ipadic.csv", "utf8", "ipadic", sysdic.connections)
user_dict.save("/tmp/userdic")
t = Tokenizer("/tmp/userdic")
for token in t.tokenize(u'東京スカイツリーへのお越しは、東武スカイツリーライン「とうきょうスカイツリー駅」が便利です。'):
    print(token)
print('')

print('Compile user dictionary (simplified format)')
user_dict = UserDictionary("user_simpledic.csv", "utf8", "simpledic", sysdic.connections)
user_dict.save("/tmp/userdic_simple")
t = Tokenizer("/tmp/userdic_simple")
for token in t.tokenize(u'東京スカイツリーへのお越しは、東武スカイツリーライン「とうきょうスカイツリー駅」が便利です。'):
    print(token)
# Fragment: `line` is assumed to come from iterating over an open file `f`.
try:
    tmp = line.split("screen_name")
    tmpstr = str(tmp[1])
    tmp = tmpstr.split(" ")
    print tmp[0]
    sc = tmp[0]
except:
    pass
try:
    t = Tokenizer()
    tokens = t.tokenize(line.decode("utf-8"))
    for token in tokens:
        tmpstr = str(token)
        print "tmpstr:" + tmpstr
        tmp = tmpstr.split(" ")
        print sc + ":" + tmp[0]
except:
    pass
f.close()
from janome.tokenizer import Tokenizer

t = Tokenizer()
tokens = t.tokenize("エンジニアか美容師の彼女が欲しいエンジニア")
for token in tokens:
    partSpeech = token.part_of_speech.split(',')[0]
    if partSpeech == "名詞":
        print(token.surface)
# -*- coding: utf-8 -*-

# Copyright [2015] [moco_beta]
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from janome.tokenizer import Tokenizer

if __name__ == '__main__':
    import sys
    text = ''.join(sys.argv[1:])
    t = Tokenizer()
    tokens = t.tokenize(text)
    for token in tokens:
        print(token)
def chunk_with_hira(istr, keep_katakana=False):
    t = Tokenizer()
    # Materialize the tokens; they are iterated several times below.
    tokens = list(t.tokenize(istr))
    readings = [x.reading.decode('utf-8') for x in tokens]
    surfaces = [x.surface for x in tokens]
    pos = []
    for token in tokens:
        p = token.part_of_speech.split(',')[0]
        if isinstance(p, unicode):
            pos.append(p)
        else:
            pos.append(p.decode('utf-8'))
    pos2 = []
    for token in tokens:
        p = token.part_of_speech.split(',')[1]
        if isinstance(p, unicode):
            pos2.append(p)
        else:
            pos2.append(p.decode('utf-8'))
    rstr = u''
    for i, z in enumerate(zip(readings, surfaces, pos, pos2)):
        r, s, p, p2 = z
        if r == u'*':
            # No reading: keep the surface (optionally converting katakana).
            if not keep_katakana and re.match(TOKENS_KATAKANA, s):
                rstr += jctconv.kata2hira(s) + u' '
            else:
                rstr += s + u' '
            continue
        # Check the three-token patterns first, then the two-token ones.
        if i < len(pos) - 2 and (
                (pos[i] == u'助動詞' and pos[i+1] == u'助詞' and pos[i+2] == u'助詞') or
                (pos[i] == u'助動詞' and pos[i+1] == u'名詞' and pos[i+2] == u'助動詞')):
            rstr += jctconv.kata2hira(r)
        elif i < len(pos) - 1:
            if pos[i] == u'助動詞' and pos[i+1] == u'助動詞':
                rstr += jctconv.kata2hira(r)
            elif pos[i] == u'動詞' and pos[i+1] in (u'助動詞', u'助詞', u'動詞'):
                rstr += jctconv.kata2hira(r)
            elif pos[i] == u'接頭詞' and pos[i+1] == u'名詞':
                rstr += jctconv.kata2hira(r)
            elif pos2[i] == u'代名詞' and pos2[i+1] == u'副助詞/並立助詞/終助詞':
                rstr += jctconv.kata2hira(r)
            elif pos2[i+1] == u'接尾':
                rstr += jctconv.kata2hira(r)
            else:
                rstr += jctconv.kata2hira(r) + u' '
        else:
            rstr += jctconv.kata2hira(r) + u' '
    return rstr
# -*- coding: utf-8 -*-
from janome.tokenizer import Tokenizer

print(u'Tokenize (system dictionary)')
t = Tokenizer()
for token in t.tokenize(u'すもももももももものうち'):
    print(token)
print('')

print(u'Tokenize (mmap system dictionary)')
t = Tokenizer(mmap=True)
for token in t.tokenize(u'すもももももももものうち'):
    print(token)
print('')

print(u'Tokenize (wakati mode)')
for token in t.tokenize(u'すもももももももものうち', wakati=True):
    print(token)
print('')

print(u'Tokenize with user dictionary')
t = Tokenizer("user_ipadic.csv", udic_enc="utf8")
for token in t.tokenize(u'東京スカイツリーへのお越しは、東武スカイツリーライン「とうきょうスカイツリー駅」が便利です。'):
    print(token)
print('')

print(u'Tokenize with user dictionary (wakati mode)')
t = Tokenizer("user_ipadic.csv", udic_enc="utf8")
for token in t.tokenize(u'東京スカイツリーへのお越しは、東武スカイツリーライン「とうきょうスカイツリー駅」が便利です。', wakati=True):
    print(token)
import sys
from janome.tokenizer import Tokenizer

argvs = sys.argv
f = open(argvs[1])
# Build the tokenizer once, outside the loop, and iterate over every line
# (the original while-readline loop skipped the first line).
t = Tokenizer()
for line in f:
    print line
    try:
        tokens = t.tokenize(line.decode('utf-8'))
        for token in tokens:
            print(token)
    except:
        pass
f.close()