def tokenize(self, sentence):
        search_tags = []
        t = Tokenizer()
        self.get_logger().info("\n[tokens]")
        for token in t.tokenize(sentence):
            self.get_logger().info(str(token))

        self.get_logger().info("\n[search_tags]")
        tokens = t.tokenize(sentence)
        i = 0
        for token in tokens:
            if token.part_of_speech.split(',')[0] == u'動詞':
                search_tags.append(token.base_form)
            elif token.part_of_speech.split(',')[0] == u'名詞' and \
                token.part_of_speech.split(',')[1] == u'接尾':
                search_tags.append(tokens[i - 1].surface + token.surface)
            elif token.part_of_speech.split(',')[0] == u'名詞':
                if token.base_form != '*':
                    search_tags.append(token.base_form)
                else:
                    search_tags.append(token.surface)
            i += 1

        search_tags.append(sentence)
        for search_tag in search_tags:
            self.get_logger().info(str(search_tag))

        return search_tags
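For reference, a minimal standalone sketch of the same search-tag extraction without the ROS-style logger; it assumes only that janome is installed, and the function name is illustrative:

from janome.tokenizer import Tokenizer

def extract_search_tags(sentence):
    # Collect verb base forms, noun base forms, noun+suffix compounds, and the raw sentence.
    tokens = list(Tokenizer().tokenize(sentence))
    tags = []
    for i, token in enumerate(tokens):
        pos = token.part_of_speech.split(',')
        if pos[0] == u'動詞':
            tags.append(token.base_form)
        elif pos[0] == u'名詞' and pos[1] == u'接尾' and i > 0:
            tags.append(tokens[i - 1].surface + token.surface)
        elif pos[0] == u'名詞':
            tags.append(token.base_form if token.base_form != '*' else token.surface)
    tags.append(sentence)
    return tags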
Example #2
class JanomeParserV2:
    def __init__(self, dic_path=None, base_form=False):
        if dic_path:
            self.t = Tokenizer(dic_path, udic_enc="cp932")
        else:
            self.t = Tokenizer()
        self.base_form_key = base_form

    def parser(self, value_str, tag=u"名詞"):
        res = self.t.tokenize(value_str)

        if self.base_form_key:
            if isinstance(tag, list):
                return [token.base_form for token in res if token.part_of_speech.split(",")[0] in tag]
            else:
                return [token.base_form for token in res if token.part_of_speech.split(",")[0] == tag]
        else:
            if isinstance(tag, list):
                return [token.surface for token in res if token.part_of_speech.split(",")[0] in tag]
            else:
                return [token.surface for token in res if token.part_of_speech.split(",")[0] == tag]

    def tokens(self, value_str):
        res = self.t.tokenize(value_str)
        return [{token.surface: token.part_of_speech} for token in res if token.part_of_speech.split(",")[0]]

    def wakati(self, value_str):
        return self.t.tokenize(value_str, wakati=True)
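A short usage sketch for JanomeParserV2 above (the sample sentence is illustrative; it assumes janome is installed):

parser = JanomeParserV2(base_form=True)
print(parser.parser('東京へ行って本を買った'))                      # noun base forms (default tag)
print(parser.parser('東京へ行って本を買った', tag=['名詞', '動詞']))  # nouns and verbs
print(parser.tokens('東京へ行って本を買った'))                      # {surface: part_of_speech} dicts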
Example #3
def FaaS_janome(url="", fields={}):
    ret = ""
    if url == "":  # fallback: tokenize locally
        t = Tokenizer()  # './neologd'
        if 'speech' in fields:
            target = fields['speech'].translate(str.maketrans("\"\'\\/<>%`?;", '””¥_〈〉%”?;'))
            target = target.translate(str.maketrans(", ", '__'))
            for token in t.tokenize(target):
                ret += token.part_of_speech.split(',')[0] + ","
            del t
            gc.collect()
            return ret.strip(',')
        if 'surface' in fields:
            target = fields['surface'].translate(str.maketrans("\"\'\\/<>%`?;", '””¥_〈〉%”?;'))
            target = target.translate(str.maketrans(", ", '__'))
            for token in t.tokenize(target):
                ret += token.surface + ","
            del t
            gc.collect()
            return ret.strip(',')
        if 'phonetic' in fields:
            target = fields['phonetic'].translate(str.maketrans("\"\'\\/<>%`?;", '””¥_〈〉%”?;'))
            target = target.translate(str.maketrans(", ", '__'))
            for token in t.tokenize(target):
                ret += token.phonetic + ","
            del t
            gc.collect()
            return ret.strip(',')
    https = urllib3.PoolManager(cert_reqs='CERT_REQUIRED', ca_certs=certifi.where(), headers={})
    try:
        html = https.request('POST', url,
                             body=json.dumps(fields), headers={'Content-Type': 'application/json'})
    except:
        return "ERROR:invalid endpoint"
    return html.data.decode('utf-8').translate(str.maketrans("", "", "\"\'\\/<>%`?;"))  # Not_secure_filename!
Example #4
def conv_str_to_kana(str_list, answer_list):

    # Convert the strings read from the file to katakana using Janome
    t = Tokenizer()
    kana_list = []
    kana_ans_list = []
    for i in str_list:
        kana = ""
        for token in t.tokenize(i):
            if token.reading == "*":
                kana = kana + (token.base_form)
            else:
                kana = kana + (token.reading)
        kana_list.append(kana)

    for j in answer_list:
        kana_ans = ""
        for token in t.tokenize(j):
            if token.reading == "*":
                kana_ans = kana_ans + (token.base_form)
            else:
                kana_ans = kana_ans + (token.reading)
        kana_ans_list.append(kana_ans)

    return kana_list, kana_ans_list
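A possible usage sketch for conv_str_to_kana above (the input lists are illustrative):

titles = ['吾輩は猫である', '走れメロス']
answers = ['わがはいはねこである', 'はしれめろす']
kana_titles, kana_answers = conv_str_to_kana(titles, answers)
print(kana_titles)
print(kana_answers)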
Example #5
    def run(self, force=None):
        print('start')
        # fetch all sites and remove duplicates
        sites = {}
        for site in Site.get_all():
            sites[site.url] = site

        # drop ignored entries and keyword matches from the list
        sure = []
        for key in sites:
            site = sites[key]
            response = requests.get(site.subjects_url)
            assert (response.status_code == 200), response.text

            # parse
            data = list(response.text.split('\n'))
            for line in data:
                try:
                    _ = Subject(site, line)
                    sure.append(_)
                except:
                    pass

        print(sure)

        # output the list
        t = Tokenizer()
        r = defaultdict(int)
        r2 = defaultdict(list)
        r3 = defaultdict(int)
        for _sure in sure:
            try:
                for token in t.tokenize(_sure.title):
                    if not token_filter(token):
                        r[token.surface] += 1
                        r2[token.surface] += [_sure]
                        r3[token] += 0
            except:
                pass

        # sort
        sure = sorted(sure, key=lambda x: x.title)

        for _sure in sure:
            try:
                point = 0
                for token in t.tokenize(_sure.title):
                    if not token_filter(token):
                        point += r[token.surface]
                if not filter_title(point, _sure):
                    print(_sure.title, _sure.count_res)

            except:
                pass
Example #6
 def cntjp():
     m = Frame.m
     txt = m.get()
     root23 = tk.Tk()
     root23.title('Result(CounterJP)')
     t = Tokenizer()
     for token in t.tokenize(txt):
         print(token)
     c = collections.Counter(t.tokenize(txt, wakati=True))
     label23 = tk.Label(root23, text=c, font=16)
     label23.pack(fill="x")
     root23.mainloop()
Example #7
class JanomeTokenizer(Tokenizer):
    def __init__(self, udic='', udic_enc='utf8', udic_type='ipadic', max_unknown_length=1024, wakati=False, mmap=False):
        self.udic = udic
        self.udic_enc = udic_enc
        self.udic_type = udic_type
        self.max_unknown_length = max_unknown_length
        self.wakati = wakati
        self.mmap = mmap

        self.tagger = Janome(udic=self.udic, udic_enc=self.udic_enc, udic_type=self.udic_type,
                             max_unknown_length=self.max_unknown_length, wakati=self.wakati, mmap=self.mmap)

    def __getstate__(self):
        return {k: v for k, v in self.__dict__.items() if k != "tagger"}

    def __setstate__(self, state):
        self.__dict__.update(state)
        self.tagger = Janome(udic=self.udic, udic_enc=self.udic_enc, udic_type=self.udic_type,
                             max_unknown_length=self.max_unknown_length, wakati=self.wakati, mmap=self.mmap)
        self.tagger.tokenize('')

    def __call__(self, value, positions=False, chars=False, keeporiginal=False, removestops=True, start_pos=0,
                 start_char=0, tokenize=True, mode='', **kwargs):
        assert isinstance(value, text_type), '%s is not unicode' % repr(value)

        token = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)

        if not tokenize:
            token.original = token.text = value
            token.boost = 1.0
            if positions:
                token.pos = start_pos
            if chars:
                token.startchar = start_char
                token.endchar = start_char + len(value)
            yield token
        else:
            pos = start_pos
            for janome_token in self.tagger.tokenize(value):
                token.text = janome_token.surface
                token.boost = 1.0
                if keeporiginal:
                    token.original = token.text
                token.stopped = False
                if positions:
                    token.pos = pos
                    pos += 1
                if chars:
                    token.startchar = start_char + janome_token.start
                    token.endchar = token.startchar + len(janome_token.surface)
                yield token
Example #8
def wakati(text: str):
    """テキストを分かち書きにして頻出単語を求める
    """
    t = Tokenizer()
    # word frequencies
    c = collections.Counter(t.tokenize(text, wakati=True))
    print(*c.most_common()[:15], sep='\n')
    print()

    # only specific parts of speech
    c = collections.Counter(token.base_form for token in t.tokenize(text)
                            if token.part_of_speech.startswith('名詞,固有名詞'))
    # print in descending order
    print(*c.most_common()[:15], sep='\n')
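A usage sketch for wakati above; any Japanese text string works (the sample text is illustrative, and the module is assumed to import collections and janome.tokenizer as the function requires):

wakati('すもももももももものうち。李も桃も桃のうち。')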
Example #9
def generate_tweet(sentense):
    tokenizer = Tokenizer()

    # print(sentense)
    word_list = tokenizer.tokenize(sentense, wakati=True)
    part_list = [
        token.part_of_speech.split(',')[0]
        for token in tokenizer.tokenize(sentense)
    ]
    base_word_list = [
        token.base_form for token in tokenizer.tokenize(sentense)
    ]
    try:
        del word_list[word_list.index('#'):len(word_list)]
        del base_word_list[base_word_list.index('#'):len(base_word_list)]
    except:
        pass
    try:
        del word_list[word_list.index('https'):len(word_list)]
        del base_word_list[base_word_list.index('https'):len(base_word_list)]
    except:
        pass
    # print(word_list)
    # print(part_list)
    # print(base_word_list)
    part_cnt = 0
    cnt = 0
    gen_sentense = ''
    for (word, part) in zip(word_list, part_list):
        if part == '名詞':
            part_cnt += 1
        gen_sentense += word
        if part_cnt == part_list.count('名詞'):
            # print("cnt :{}".format(cnt))
            # print("word_list :{}".format(word_list))
            for i in range(cnt, len(word_list)):
                # print("i :{}".format(i))
                if part_list[i] == ('動詞') or part_list[i] == ('形容詞'):
                    for j in range(cnt + 1, i):
                        # print("j :{}".format(j))
                        # print("word_list[j] :{}".format(word_list[j]))
                        gen_sentense += word_list[j]
                    gen_sentense += (base_word_list[i] + "の")
                    break
            break
        cnt += 1
    gen_sentense += "って普通だよね……?" if random.random() > 0.1 else "って普通じゃなかったんだ……"
    # print(gen_sentense)

    return gen_sentense
Example #10
def _morphological_analysis(sentence):
    u"""
    Morphologically analyze a single sentence.
    @param sentence: the sentence
    @return: list of the sentence's morphemes (surface forms)
    """
    t = Tokenizer()
    result = []
    for token in t.tokenize(sentence):
        # print(token.surface)
        result.append(token.surface)
    return result
Example #11
class MainTranslator(object):
	def __init__ (self):
		self.janome= Tokenizer()
		
	def get_gobi(self, n):
		f = n.part_of_speech.split(',')	
		if n.surface in ['だ','です','た','だろ','ある']:
			if f[0] == '助動詞': 
				return 'ハゲ'
		
		if n.surface in ['無い','ない','ぬ']:
			if f[0] == '助動詞':
				return 'ぬハゲ' 
			if f[0] == '形容詞':
				return 'なしハゲ'
			
	
	def Translator(self, text):
		tokens = self.janome.tokenize(text)
		text = ''
		for n in tokens:
			f = n.part_of_speech.split(',')
			if n.surface in converter:
				text += converter[n.surface]
			elif len(f) > 3:
				gobi = self.get_gobi(n)
				if gobi is not None:
					text += gobi
				else:
					text += n.surface
			else:
				text += n.surface
		
		return text
Example #12
def tokenize(text):
    t = Tokenizer()
    # remove the header and footer at the start of the text
    text = re.split(r'\-{5,}', text)[2]
    text = re.split(r'底本:', text)[0]
    text = text.strip()
    # remove ruby annotations
    text = text.replace('|', '')
    text = re.sub(r'《.+?》', '', text)
    # remove editorial notes ([#...]) in the text
    text = re.sub(r'[#.+?]', '', text)
    # process line by line
    lines = text.split("\r\n")
    results = []
    for line in lines:
        res = []
        tokens = t.tokenize(line)
        for tok in tokens:
            bf = tok.base_form  # base form
            if bf == "*": bf = tok.surface
            ps = tok.part_of_speech  # part-of-speech info
            hinsi = ps.split(',')[0]
            if hinsi in ['名詞', '動詞', '形容詞', '記号']:
                res.append(bf)
        l = " ".join(res)
        results.append(l)
    return results
def separatewords(text):
    separatedWord=[]
    t=Tokenizer()
    tokens=t.tokenize(unicode(text, "utf-8"))
    
    for token in tokens:
        posList=token.part_of_speech.split(",")

        pos1=posList[0]
        if isinstance(pos1, unicode):
          pos1=pos1.encode("utf-8")

        pos2=posList[1]
        if isinstance(pos2, unicode):
          pos2=pos2.encode("utf-8")

        ruby=token.reading
        if isinstance(ruby, unicode):
          ruby=ruby.encode("utf-8")

        if pos1=="名詞":
            if pos2!="接尾" and pos2!="代名詞" and pos2!="非自立" and pos2!="数" and pos2!="形容動詞語幹":
                if ruby!="*":
                    separatedWord.append(token.surface.lower())
                    print token.surface.lower()
                elif pos2!="サ変接続" and len(token.surface)>3:
                    # for English words, only handle words of 4 or more characters
                    separatedWord.append(token.surface.lower())
                    print token.surface.lower()

    return separatedWord
def generateToken(df):
    df_token = pd.DataFrame(index=[],
                            columns=[
                                'meeting_no', 'meeting_date',
                                'monitoring_index', 'line_number', 'token',
                                'part_of_speech', 'part_of_speech2',
                                'part_of_speech3', 'part_of_speech4',
                                'infl_type', 'base_form'
                            ])
    for row in df.itertuples(name=None):
        meeting_no = row[1]
        meeting_date = row[2]
        monitoring_index = row[3]
        line_number = row[4]
        t = Tokenizer()
        tokens = t.tokenize(row[5])
        for token in tokens:
            if not re.search(r'[、。I,%%~~##※\\\(\)\.\-\/]',
                             token.surface) and token.surface not in [
                                 'ア', 'イ', 'ウ', 'エ', 'オ', 'カ', 'キ'
                             ]:
                word_category = token.part_of_speech.split(',')[0]
                word_type = token.part_of_speech.split(',')[1]
                if word_category == '名詞' and word_type != '数' and word_type != '代名詞' and word_type != '非自立' and word_type != '接尾':
                    df_token.loc[len(df_token.index)] = [
                        meeting_no, meeting_date, monitoring_index,
                        line_number, token.surface
                    ] + token.part_of_speech.split(',') + [
                        token.infl_type, token.base_form
                    ]
    df_token = df_token.replace('*', np.nan)
    return df_token
Example #15
 def tokenize(self, text):
     tokens = []
     t = Tokenizer()
     pre_tokens = t.tokenize(text)
     for token in pre_tokens:
         tokens.append(token.surface)
     return tokens
Example #16
def get_token(text):
    t = Tokenizer()
    tokens = t.tokenize(text, wakati=True)  # 分かち書きする
    word = ""
    for token in tokens:
        word += token + " "
    return word
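A usage sketch for get_token above (the sample text is illustrative):

print(get_token('明日は晴れるといいですね'))  # space-delimited surface forms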
Example #17
class JapaneseTokenizer(object):
    def __init__(self, cls, nlp=None):
        self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
        try:
            from janome.tokenizer import Tokenizer
        except ImportError:
            raise ImportError("The Japanese tokenizer requires the Janome "
                              "library: https://github.com/mocobeta/janome")
        self.tokenizer = Tokenizer()

    def __call__(self, text):
        words = [x.surface for x in self.tokenizer.tokenize(text)]
        return Doc(self.vocab, words=words, spaces=[False]*len(words))

    # add dummy methods for to_bytes, from_bytes, to_disk and from_disk to
    # allow serialization (see #1557)
    def to_bytes(self, **exclude):
        return b''

    def from_bytes(self, bytes_data, **exclude):
        return self

    def to_disk(self, path, **exclude):
        return None

    def from_disk(self, path, **exclude):
        return self
Example #18
def main():
    # get the file name from the command line (csv only)
    FILENAME = sys.argv[1]

    # load the data to preprocess
    df = pd.read_csv('dataset/original/' + FILENAME)

    t = Tokenizer()
    wakati = []  # holds the tokenized (wakati) titles
    length = []  # holds the number of keywords in each title

    # tokenize every row in df
    for i in range(len(df)):
        tmp = df['title'][i]
        tmp = [
            token.surface for token in t.tokenize(tmp)
            if ((token.part_of_speech.split(',')[0] in [
                '形容詞',
                '副詞',
                '名詞',
            ]) or ((token.part_of_speech.split(',')[0] in ['動詞'])
                   & (token.part_of_speech.split(',')[1] not in ['接尾'])))
        ]
        length.append(len(tmp))
        wakati.append(tmp)

    df_wakati = pd.DataFrame({"title": wakati, "len": length})
    df_wakati.to_csv("dataset/preprocessed/pre_" + FILENAME)
Example #19
def annotate_using_janome(sentences, tokenize=False):
    assert tokenize, 'no support for using janome with pre-tokenized inputs'
    try:
        from janome.tokenizer import Tokenizer
    except ImportError:
        logger.error(
            'failed to import janome. please install it by "pip install janome".'
        )
        exit(1)

    logger.info('use Janome to tokenize and annotate POS infos.')
    tokenizer = Tokenizer()
    res = []
    raw_sentences = []
    for sentence in sentences:
        sentence = ''.join(sentence)
        tokenized = list(tokenizer.tokenize(sentence))
        tokens = []
        for token in tokenized:
            pos, pos1, pos2, pos3 = token.part_of_speech.split(',')
            token = Token(word=token.surface,
                          surf=token.surface,
                          pos=pos,
                          pos1=pos1,
                          pos2=pos2,
                          pos3=pos3,
                          inflectionForm=token.infl_form,
                          inflectionType=token.infl_type,
                          reading=token.reading,
                          base=token.base_form)
            tokens.append(token)
        raw_sentence = [token.surface for token in tokenized]
        res.append(tokens)
        raw_sentences.append(raw_sentence)
    return res, raw_sentences
Example #20
def conv_str_to_kana(str_list):

    # Convert the strings read from the file to katakana using Janome
    """
        str_list = titles
        Convert the strings read from the file to katakana using Janome.
    Args:
        str_list (list of str):

    Returns:
        [str]:
            kana_list = katakana strings

    """
    t = Tokenizer()
    kana_list = []
    for i in str_list:
        kana = ""
        for token in t.tokenize(i):
            if token.reading == "*":
                kana = kana + (token.base_form)
            else:
                kana = kana + (token.reading)
        kana_list.append(kana)

    return kana_list
Example #21
def chunk_with_kanji(istr):
    t = Tokenizer()
    tokens = t.tokenize(istr)

    # give each element flags (jiritsu or fuzoku)
    flags = [judge_jifu(x.part_of_speech) for x in tokens]
    
    surface = [x.surface for x in tokens]

    # split to chunks, delimited by KUGIRI flag
    # very ugly. should be rewritten using tree structure etc.
    cflags = insert_chunkflg(flags)
    rstr = u""
    i = 0
    for j, f in enumerate(flags):
        if i >= len(cflags): break
        if cflags[i] == KUGIRI:
            if f == KUTOU: 
                rstr += surface[j]
                i += 1
            else:
                rstr += u" "
                rstr += surface[j]
                i += 2
        else:
            rstr += surface[j]
            i += 1

    # don't know why this is necessary
    if flags != [] and j == 0 and len(surface) != 1: 
        while j  < len(surface):
            rstr += surface[j]    
            j += 1

    return rstr
Example #22
def text_tokenize(from_file, to_file, encoding='utf-8'):
    """

    :param from_file:
    :param to_file:
    :param encoding:
    :return:
    """
    _original = from_text(from_file, mode='r', encoding=encoding)

    # remove useless characters
    _process = re.sub("《[^》]+》", "", _original)  # remove ruby
    _process = re.sub("[[^]]+]", "", _process)  # remove reading notes ([...])
    _process = re.sub("[|  「」\n]", "", _process)  # | と全角半角スペース、「」と改行の削除

    # split into sentences
    seperator = "。"  # use 。 as the separator
    _process_list = _process.split(seperator)  # split the text into a list of sentences using the separator
    _process_list.pop()  # the last element is an empty string, so remove it
    processed_list = [x + seperator for x in _process_list]  # re-append 。 to the end of each sentence

    # Janome tokenize
    t = Tokenizer()

    processed_words = []
    for sentence in processed_list:
        processed_words.append(t.tokenize(sentence,
                                          wakati=True))  # split each sentence into words and store them in the list

    save_pickle(to_file, processed_words)
Example #23
def get_words(text, customdict=None):
    """ 与えられたテキストを形態素解析して、含まれる名詞のリストを返す """

    def _filter(s):
        """ 名詞だけにフィルタリングする """
        reg = re.compile(r'名詞')
        ignore_reg = re.compile(r'非自立')
        if (reg.search(s.part_of_speech) and
                not ignore_reg.search(s.part_of_speech)):
            return True

    if customdict:
        t = Tokenizer(customdict)
    else:
        t = DEFAULT_TOKENIZER

    __word = ''
    __results = []
    __text = re.sub(r'[()!?.,]+', ' ', unicodedata.normalize('NFKC', text))
    for s in t.tokenize(__text):
        if _filter(s):
            __word += s.surface
        elif __word:
            __results.append(__word)
            __word = ''
    if __word:
        __results.append(__word)
    return __results
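A usage sketch for get_words above; it assumes the module defines DEFAULT_TOKENIZER (as the default branch implies) and imports re, unicodedata, and janome.tokenizer.Tokenizer:

DEFAULT_TOKENIZER = Tokenizer()
print(get_words('東京タワーと東京スカイツリーを見に行った'))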
Example #24
def parse(text):
    t = Tokenizer()
    tokens = t.tokenize(text)
    result = []
    for token in tokens:
        result.append(token.surface)
    return (result)
Example #25
def default_func(message):
    #f = open("plugins_difficult/polarity.yml", "r+")
    #polarity = yaml.load(f)

    text = message.body['text']  # extract the message text
    # build the reply message; newlines and triple-backquote blocks are also possible
    t = Tokenizer()
    #m = MeCab.Tagger ("-d /usr/local/lib/mecab/dic/mecab-ipadic-neologd")
    tokens = t.tokenize(text)
    #msg = 'あなたの送ったメッセージをmecabで解析します。\n```' + m.parse(text) + '```'
    pol_val = 0
    for token in tokens:
        word = token.surface
        # get the part of speech
        pos = token.part_of_speech.split(',')[0]
        if word in polarity:
            pol_val = pol_val + float(polarity[word])
        #print('{0} , {1}'.format(word, pos))
        # move on to the next word
    message.reply("```Sentence you input is " + text +
                  ". Sentence polarity is " + str(pol_val) + "```")  # メンション
    #message.send("Sentence tag is"+','.join(tags)+"```")
    if pol_val > 0.2:
        message.react('+1')
        message.reply("それはいいね!!")
    elif pol_val < -0.2:
        message.react('cry')
        message.reply("そうか、どんまい")
    else:
        message.reply("なるほど、そうなんですね〜")
Example #26
def split_for_markovify(text):
    """split text to sentences by newline, and split sentence to words by space.
    """
    # separate words using janome
    tagger = Tokenizer()
    splitted_text = ""

    # these chars might break markovify
    # https://github.com/jsvine/markovify/issues/84
    breaking_chars = [
        '(',
        ')',
        '[',
        ']',
        '"',
        "'",
    ]

    # split whole text to sentences by newline, and split sentence to words by space.
    for line in text.split():
        for token in tagger.tokenize(line):
            try:
                if token.surface not in breaking_chars:
                    splitted_text += token.surface  # skip if node is markovify breaking char
                if token.surface != '。' and token.surface != '、':
                    splitted_text += ' '  # split words by space
                if token.surface == '。':
                    splitted_text += '\n'  # represent a sentence boundary by a newline
            except UnicodeDecodeError as e:
                # sometimes error occurs
                print(line)

    return splitted_text
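A hedged usage sketch showing how the output of split_for_markovify above could be fed to markovify (assumes the markovify package is installed; the corpus and parameters are illustrative):

import markovify

corpus = split_for_markovify('今日はいい天気です。明日は雨が降るそうです。')
model = markovify.NewlineText(corpus, state_size=2)
print(model.make_sentence(tries=100))  # may return None on a corpus this small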
Example #27
def get_wakati_ja(s):
    """
    ==usage==
    df["ja_new_text"] = df["ja_text"].apply(get_wakati_ja)
    """
    tknz = Tokenizer()

    UNICODE_EMOJI = []
    for lang, value in emoji.UNICODE_EMOJI.items():
        UNICODE_EMOJI += list(value.keys())
    UNICODE_EMOJI = list(set(UNICODE_EMOJI))

    l = []
    speech = ["名詞", "動詞", "形容詞"]
    code_regex = re.compile(
        '[!"#$%&\'\\\\()*+,-./:;<=>?@[\\]^_`{|}~「」〔〕“”〈〉『』【】&*・()$#@。、?!`+¥%]')
    s = code_regex.sub(' ', s)
    for token in tknz.tokenize(s):
        if token.part_of_speech.split(",")[0] in speech:
            t = token.base_form
            t = t.replace(' ', '')
            if len(t) == 0:
                continue
            if len(t) == 1:
                if len(re.findall('[\u3041-\u309F]+', t)) > 0:
                    continue
            if t in UNICODE_EMOJI:
                continue
            #t = token.surface
            l.append(t)
    l = [s.replace(' ', '') for s in l]
    l = [s for s in l if len(s) > 0]
    s = " ".join(l)
    return s
def janome_tokenizer(sentence):
    t = Janome_Tokenizer()
    sentence = sentence.decode("utf-8")
    try:
        tokens = t.tokenize(sentence)
    except:
        try:
            tokens = t.tokenize(sentence.replace(u"\xa0", u"、"))
        except:
            try:
                tokens = t.tokenize(sentence.replace(u"\xa0", u""))
            except:
                print ("Tokenization error at sentence: "+sentence.encode("utf-8"))
                return  [sentence]

    return [dic.surface.encode("utf-8") for dic in tokens]
Example #29
def make_word2vec(sentence, word_list):

	"""
	1文とword_listを受け取り、word_listに含まれている、かつ、動詞形容詞名刺のみを分散表現にしてそれらの
	平均をとったベクトルを返す
	"""

	# first, morphologically analyze the sentence
	t = Tokenizer()
	W2V_sum=np.zeros(300)
	# count the number of words
	counter = 0

	for token in t.tokenize(sentence, stream=True):
		hinshi = token.part_of_speech.split(',')[0]

		# verbs, adjectives, or nouns other than numbers
		if(hinshi == "動詞" or hinshi == "形容詞" or (hinshi == "名詞" and token.part_of_speech.split(',')[1] != "数")):

			# only words contained in the vocabulary list
			if(token.base_form in word_list):
				# if the word is not in the embedding dictionary, ignore it
				try:
					counter += 1
					W2V_sum += W2V_model[token.base_form]
				except:
					continue

	# compute the average vector
	W2V_ave = W2V_sum / float(counter)
	return (W2V_ave)
class JanomeTokenizer(object):
    def __init__(self, user_dic_path='', user_dic_enc='utf8'):
        self._t = Tokenizer(udic=user_dic_path, udic_enc=user_dic_enc)

    def wakati(self, sent):
        words = [token.surface for token in self.tokenize(sent)]
        return words

    def wakati_baseform(self, sent):
        words = [
            token.base_form if token.base_form != '*' else token.surface
            for token in self.tokenize(sent)
        ]
        return words

    def tokenize(self, sent):
        token = namedtuple(
            'Token', 'surface, pos, pos_detail1, pos_detail2, pos_detail3,\
                                             infl_type, infl_form, base_form, reading, phonetic'
        )
        for t in self._t.tokenize(sent):
            poses = t.part_of_speech.split(',')
            yield token(t.surface, poses[0], poses[1], poses[2], poses[3],
                        t.infl_type, t.infl_form, t.base_form, t.reading,
                        t.phonetic)

    def filter_by_pos(self, sent, pos=('名詞', )):
        tokens = [token for token in self.tokenize(sent) if token.pos in pos]
        return tokens
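A usage sketch for the JanomeTokenizer wrapper above (the sample sentences are illustrative):

jt = JanomeTokenizer()
print(jt.wakati('吾輩は猫である'))                        # surface forms
print(jt.wakati_baseform('東京へ行った'))                 # base forms where available
print(jt.filter_by_pos('吾輩は猫である', pos=('名詞',)))  # namedtuple tokens whose pos is 名詞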
def tokenize(text):
    t = Tokenizer()
    #delete header and footer
    text = re.split(r'\-{5,}', text)[2]
    text = re.split(r'底本:', text)[0]
    text = text.strip()
    #delete ruby annotations
    text = text.replace('|', '')
    text = re.sub(r'《.+?》', '', text)
    #delete editorial notes ([#...])
    text = re.sub(r'[#.+?]', '', text)
    #process line by line
    lines = text.split("\r\n")
    results = []
    for line in lines:
        res = []
        tokens = t.tokenize(line)
        for tok in tokens:
            bf = tok.base_form
            if bf == "*": bf = tok.surface
            ps = tok.part_of_speech
            hinsi = ps.split(',')[0]
            if hinsi in [
                    '名詞', '動詞', '形容詞', '記号'
            ]:  # add other POS types here if you want to keep more kinds of words
                res.append(bf)
        l = " ".join(res)
        results.append(l)
    return results
Example #32
def analyze(text):
    t = Tokenizer()
    tokens = t.tokenize(text)
    result = []
    for token in tokens:
        result.append([token.surface, token.part_of_speech])
    return (result)
Example #33
def parser(value_str, tag=u"名詞"):
    t = Tokenizer()
    res = t.tokenize(value_str)
    if isinstance(tag, list):
        return [token.surface for token in res if token.part_of_speech.split(",")[0] in tag]
    else:
        return [token.surface for token in res if token.part_of_speech.split(",")[0] == tag]
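A usage sketch for the module-level parser function above (the sample text is illustrative):

print(parser('東京へ行って買い物をした'))                      # nouns only (default tag)
print(parser('東京へ行って買い物をした', tag=['名詞', '動詞']))  # nouns and verbs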
Example #34
 def Wakachi(self, sentence):  # split a string into space-separated words
     words = []
     t = Tokenizer()
     for token in t.tokenize(sentence):  # tokenize with janome
         words.append(token.surface)
     Wakachi_sentence = ' '.join(words)
     return Wakachi_sentence
Example #35
def tokenize(text):
    t = Tokenizer()
    # remove the header and footer at the start of the text
    text = re.split(r'\-{5,}', text)[2]
    text = re.split(r'底本:', text)[0]
    text = text.strip()
    # remove ruby annotations
    text = text.replace('|', '')
    text = re.sub(r'《.+?》', '', text)
    # remove editorial notes ([#...]) in the text
    text = re.sub(r'[#.+?]', '', text)
    # process line by line
    lines = text.split("\r\n")
    results = []
    for line in lines:
        res = []
        tokens = t.tokenize(line)
        for tok in tokens:
            bf = tok.base_form  # base form
            if bf == "*": bf = tok.surface
            ps = tok.part_of_speech  # part-of-speech info
            hinsi = ps.split(',')[0]
            if hinsi in ['名詞', '動詞', '形容詞', '記号']:
                res.append(bf)
        l = " ".join(res)
        results.append(l)
    return results
Example #36
def listen_func(message):
    t = Tokenizer()
    tokens = t.tokenize(u'pythonの本を読んだ')
    res = ""
    for token in tokens:
        res += str(token) + os.linesep
    message.send("```" + res + "```")
Example #37
def text_to_array_ja(textdata, wordtypes):
    textdata = filter(textdata)
    t = Tokenizer()
    tokens = t.tokenize(textdata)
    words = sorted([token.surface
                    for token in tokens
                    if token.part_of_speech.split(',')[0] in wordtypes])
    return words
Example #38
def _tokenize(text):
    from collections import namedtuple
    Token = namedtuple("Token", ["t", "surface", "pos"])

    t = Tokenizer()
    tokens = t.tokenize(text)
    for t in tokens:
        nt = Token(t, t.surface, t.part_of_speech.split(","))
        yield nt
Example #39
def makekeywords(text):
    from janome.tokenizer import Tokenizer
    t = Tokenizer()
    tokens = t.tokenize(text)
    keywords = []
    for token in tokens:
        if token.part_of_speech.find("名詞") >= 0 and token.part_of_speech.find("数") == -1 and token.part_of_speech.find("非自立") == -1 and token.part_of_speech.find("接尾") == -1:
            keywords.append(token.surface)
    return keywords
Example #40
def output_ja_text(data, wordtypes):
    textdata = filter(data)
    t = Tokenizer()
    tokens = t.tokenize(textdata)
    words = sorted([token.surface
                    for token in tokens
                    if token.part_of_speech.split(',')[0] in wordtypes])
    dictionary = count_words(words)
    return pyaml.dump(dictionary, sys.stdout, vspacing=[0, 1])
Example #41
 def split(self, text):
     result = []
     t = Tokenizer()
     malist = t.tokenize(text)
     for w in malist:
         sf = w.surface   # the segmented word as-is
         bf = w.base_form # the word's base form
         if bf == '' or bf == "*": bf = sf
         result.append(bf)
     return result
Example #42
File: main.py Project: okkn/Omsoba
def test_func():
    t = Tokenizer()
    temp = ""
    for token in t.tokenize(u'この腫瘍は間葉系組織から生ずると考えられ、ビメンチンを発現する。'):
        if (not re.search('^(助詞|助動詞|記号)', token.part_of_speech)):
            temp = temp + token.surface
        else:
            temp = temp + token.surface
            print(temp)
            temp = ""
Example #43
    def understand_move(self, text):
        generator = Tokenizer()
        tokens = []

        for t in generator.tokenize(text):
            tokens.append(t)

        direction = self._understand_direction(tokens)
        distance = self._understand_distance(tokens)

        return direction, distance
Example #44
class JapaneseTokenizer(object):
    def __init__(self, cls, nlp=None):
        self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
        try:
            from janome.tokenizer import Tokenizer
        except ImportError:
            raise ImportError("The Japanese tokenizer requires the Janome library: "
                              "https://github.com/mocobeta/janome")
        self.tokenizer = Tokenizer()

    def __call__(self, text):
        words = [x.surface for x in self.tokenizer.tokenize(text)]
        return Doc(self.vocab, words=words, spaces=[False]*len(words))
Example #45
class MyWindow(QWidget, Ui_window):
    loaded_words = Signal(str)
    refresh_words = Signal()

    def __init__(self):
        QWidget.__init__(self)
        self.setupUi(self)
        self.setWindowTitle(
            QApplication.translate("Widget", "%s %s" % (__app_name__, __version__), None, QApplication.UnicodeUTF8))

        self.jacome_token = Tokenizer()
        self.words_container = []

    @Slot()
    def analysis_janome(self, s):
        # print the text being analyzed to stdout, for debugging
        print s

        # initialize (clear) the widgets that display the analysis result
        self.refresh_words.emit()
        # initialize the array that stores the analysis result (for saving to csv)
        self.words_container = []
        # run the morphological analysis
        tokens = self.jacome_token.tokenize(s)

        for token in tokens:
            # cast the analysis result to a (unicode) string
            print_str = str(token).decode('utf8')
            # store the analysis result for saving to csv
            self.words_container.append(print_str)

        # emit the analysis result
        self.loaded_words.emit('\n'.join(self.words_container))

    @Slot()
    def save_csv(self):
        filename = 'result.csv'
        filename = os.path.normpath(filename)

        # choose the encoding based on the OS
        if os.name == 'nt':
            code = 'cp932'
        else:
            code = 'utf-8'
        print 'save_csv: code = %s' % code

        with open(filename, 'wb') as f:  # without 'wb', stray blank lines get inserted
            writer = csv.writer(f, delimiter=',')
            for words in self.words_container:
                out_word = words.encode(code)
                writer.writerow([out_word])
Example #46
def main():
    """
    >>> main()
    すもも	名詞,一般,*,*,*,*,すもも,スモモ,スモモ
    も	助詞,係助詞,*,*,*,*,も,モ,モ
    もも	名詞,一般,*,*,*,*,もも,モモ,モモ
    も	助詞,係助詞,*,*,*,*,も,モ,モ
    もも	名詞,一般,*,*,*,*,もも,モモ,モモ
    の	助詞,連体化,*,*,*,*,の,ノ,ノ
    うち	名詞,非自立,副詞可能,*,*,*,うち,ウチ,ウチ
    """
    t = Tokenizer()
    for token in t.tokenize(u'すもももももももものうち'):
        print(token)
Example #47
def get_morphs(string):
    t = Tokenizer()
    dicts=[]
    for token in t.tokenize(unicode(string, 'utf-8')):
        dic = {}
        token_list = str(token).replace("	", ",").split(",")
        dic["surface"] = token_list[0]
        dic["base"] = token_list[7]
        dic["pos"] = token_list[1]
        dic["pos1"] = token_list[2]

        dicts.append(dic)

    return dicts
Example #48
def do_analysis(analyzed_file):
    result_file_name = "result/" + str(analyzed_file)

    p = re.compile(r'\s(.*)')

    t = Tokenizer()
    
    with open(analyzed_file,mode='r', encoding='utf-8') as read_file:
        texts = read_file.read()
    
    with open(result_file_name, mode='a', encoding='utf-8') as result_file:
        for token in t.tokenize(str(texts)):
            check_word = p.sub('',str(token))
            if not word_match.word_check(check_word):
                result_file.write(str(token) + "\n")
        
    return result_file_name
Example #49
def analyze_keyword(posts):
    """
    投稿を形態素解析して頻出ワードで重み付けして
    キーワードから出現数と投稿の逆索引を生成する。
    :param posts: dict{int: Posted}
    :rtype: list(KeywordReverseIndex)
    """
    t = Tokenizer()
    tfidf2 = defaultdict(int)
    tfidf2_post = defaultdict(list)

    # weighting per word
    for key in posts:
        post = posts[key]
        for message in post.parse_post_message:
            # strip <a> tags
            soup = BeautifulSoup(message, "lxml")

            # janome
            _prev_token = None
            try:
                for token in t.tokenize(soup.text):
                    # ignore the token if it is a particle
                    if final_filter(_prev_token, token):
                        tfidf2[_prev_token.surface + token.surface] += 1
                        if post not in tfidf2_post[_prev_token.surface + token.surface]:
                            tfidf2_post[_prev_token.surface + token.surface] += [post]

                    _prev_token = token

                    # if the token is a particle, do not let it start a tfidf2 key
                    if token_is_sub(token):
                        _prev_token = None
            except:
                pass

    # build the reverse index
    r_indexes = []
    for key in tfidf2:
        _index = KeywordReverseIndex(key, tfidf2[key], tfidf2_post[key])

        # only create an index for keywords whose occurrence count is high enough
        if _index.is_enable:
            r_indexes.append(_index)
    return r_indexes
Example #50
def callback():
    messages = request.json['result']

    for message in messages:
        text = message['content']['text']
        for matcher, action in commands:
            if matcher.search(text):
                response = action(text)
                break
        else:
            post_text(message['content']['from'], '解析中...')
            # morphological analysis
            response = ''
            t = Tokenizer()
            for token in t.tokenize(message['content']['text']):
                response += str(token) + '\n'
        post_text(message['content']['from'], response)
    return ''
Example #51
class Mave(object):
    def __init__(self, name=u'メイ'):
        self.name = name
        self.msg_que = Queue()
        self.tokenizer = Tokenizer()

        self.markov = Markov(ngram=2)

    def wakeUp(self):
        try:
            self.markov.load(u'mave_%s.json' % self.name)
        except Exception as e:
            print 'markov load failure'
            print e
            self.markov = Markov(ngram=2)

    def goToBed(self):
        self.markov.save(u'mave_%s.json' % self.name)


    def listenTo(self, message, talker):
        tokens = self.tokenizer.tokenize(message.decode('utf-8'))
        for tok in tokens:
            print '%10s (%10s) ... %s' % (tok.surface, tok.reading, tok.part_of_speech)

        self.markov.learn(tokens)

        meishi_list = [tok.surface for tok in tokens 
                       if u'名詞' in tok.part_of_speech.split(',') and
                          ((u'一般' in tok.part_of_speech.split(',') and  not u'あ' <= tok.surface[0] <= u'ん')
                           or u'固有名詞' in tok.part_of_speech.split(','))]

        key = random.choice(meishi_list) if len(meishi_list) != 0 else None
        rsp = self.markov.generate(key)
        if rsp != None:
            self.msg_que.put(rsp)
        else:
            self.msg_que.put('はいはい > %s' % talker)

    def speak(self):
        if self.msg_que.empty():
            return None

        return self.msg_que.get()
Example #52
def add_yomi(string):
    t = Tokenizer()
    tokens = t.tokenize(string)

    rstring = ''
    for token in tokens:
        s = token.surface
        r = token.reading

        while True:
            res = split_at_hiragana(s, r)
            if len(res) > 2:
                rstring += create_yomi(res[0], res[1])
                s, r = res[2], res[3]
            else:
                break

        rstring += create_yomi(res[0], res[1])

    return rstring
Example #53
# -*- coding: utf-8 -*-

from janome.tokenizer import Tokenizer
from janome.dic import UserDictionary
import sysdic

print('Compile user dictionary (MeCab IPADIC format)')
user_dict = UserDictionary("user_ipadic.csv", "utf8", "ipadic", sysdic.connections)
user_dict.save("/tmp/userdic")

t = Tokenizer("/tmp/userdic")
for token in t.tokenize(u'東京スカイツリーへのお越しは、東武スカイツリーライン「とうきょうスカイツリー駅」が便利です。'):
  print(token)


print('')
print('Compile user dictionary (simplified format)')
user_dict = UserDictionary("user_simpledic.csv", "utf8", "simpledic", sysdic.connections)
user_dict.save("/tmp/userdic_simple")

t = Tokenizer("/tmp/userdic_simple")
for token in t.tokenize(u'東京スカイツリーへのお越しは、東武スカイツリーライン「とうきょうスカイツリー駅」が便利です。'):
  print(token)
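For reference, a sketch of what user_simpledic.csv might contain; Janome's simplified dictionary format is a three-column CSV of surface form, part-of-speech label, and reading (the entries below are illustrative):

with open('user_simpledic.csv', 'w', encoding='utf8') as f:
    # surface,POS label,reading
    f.write('東京スカイツリー,カスタム名詞,トウキョウスカイツリー\n')
    f.write('東武スカイツリーライン,カスタム名詞,トウブスカイツリーライン\n')
    f.write('とうきょうスカイツリー駅,カスタム名詞,トウキョウスカイツリーエキ\n')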
Example #54
    try:
        tmp = line.split("screen_name")
        # print tmp[1]

        tmpstr = str(tmp[1])
        tmp = tmpstr.split(" ")
        print tmp[0]
        sc = tmp[0]

    except:
        pass

    try:
        t = Tokenizer()
        tokens = t.tokenize(line.decode("utf-8"))

        for token in tokens:
            tmpstr = str(token)
            print "tmpstr:" + tmpstr
            tmp = tmpstr.split("  ")

            print sc + ":" + tmp[0]

            # print tmp[0] + ":" + tmp2[0]
            # print token

    except:
        pass

f.close()
Example #55
from janome.tokenizer import Tokenizer
 
t = Tokenizer()
tokens = t.tokenize("エンジニアか美容師の彼女が欲しいエンジニア")
  
for token in tokens:
    partSpeech = token.part_of_speech.split(',')[0]
    if partSpeech == "名詞":
        print(token.surface)
    else:
        pass
Example #56
# -*- coding: utf-8 -*-

# Copyright [2015] [moco_beta]
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from janome.tokenizer import Tokenizer

if __name__ == '__main__':
    import sys
    text = ''.join(sys.argv[1:])

    t = Tokenizer()
    tokens = t.tokenize(text)
    for token in tokens:
        print(token)

Example #57
def chunk_with_hira(istr, keep_katakana=False):
    t = Tokenizer()
    tokens = t.tokenize(istr)

    readings = [x.reading.decode('utf-8') for x in tokens]
    surfaces = [x.surface for x in tokens]

    pos = []
    for token in tokens:
        p = token.part_of_speech.split(',')[0]
        if isinstance(p, unicode):
            pos.append(p)
        else:
            pos.append(p.decode('utf-8'))

    pos2 = []
    for token in tokens:
        p = token.part_of_speech.split(',')[1]
        if isinstance(p, unicode):
            pos2.append(p)
        else:
            pos2.append(p.decode('utf-8'))

    rstr = u''
    for i, z in enumerate(zip(readings, surfaces, pos, pos2)):
        r, s, p, p2 = z

        if r == u'*':
            if not keep_katakana:
                if re.match(TOKENS_KATAKANA, s):
                    rstr += jctconv.kata2hira(s) + u' '
                else:
                    rstr += s + u' '
            else:
                rstr += s + u' '
            continue

        if i < len(pos) - 1:
            if pos[i] == u'助動詞' and pos[i+1] == u'助動詞':
                rstr += jctconv.kata2hira(r)
            elif pos[i] == u'動詞' and pos[i+1] == u'助動詞':
                rstr += jctconv.kata2hira(r)
            elif pos[i] == u'動詞' and pos[i+1] == u'助詞':
                rstr += jctconv.kata2hira(r)
            elif pos[i] == u'動詞' and pos[i+1] == u'動詞':
                rstr += jctconv.kata2hira(r)
            elif pos[i] == u'接頭詞' and pos[i+1] == u'名詞':
                rstr += jctconv.kata2hira(r)
            elif pos2[i] == u'代名詞' and pos2[i+1] == u'副助詞/並立助詞/終助詞':
                rstr += jctconv.kata2hira(r)
            elif pos2[i+1] == u'接尾':
                rstr += jctconv.kata2hira(r)
            else:
                rstr += jctconv.kata2hira(r) + u' '

        elif i < len(pos) - 2:
            if pos[i] == u'助動詞' and pos[i+1] == u'助詞' and pos[i+2] == u'助詞':
                rstr += jctconv.kata2hira(r)
            elif pos[i] == u'助動詞' and pos[i+1] == u'名詞' and pos[i+2] == u'助動詞':
                rstr += jctconv.kata2hira(r)
            else:
                rstr += jctconv.kata2hira(r) + u' '

        else:
            rstr += jctconv.kata2hira(r) + u' '

    return rstr
Example #58
# -*- coding: utf-8 -*-
from janome.tokenizer import Tokenizer

print(u'Tokenize (system dictionary)')
t = Tokenizer()
for token in t.tokenize(u'すもももももももものうち'):
  print(token)

print('')
print(u'Tokenize (mmap system dictionary)')
t = Tokenizer(mmap=True)
for token in t.tokenize(u'すもももももももものうち'):
  print(token)

print('')
print(u'Tokenize (wakati mode)')
for token in t.tokenize(u'すもももももももものうち', wakati = True):
  print(token)

print('')
print(u'Tokenize with user dictionary')
t = Tokenizer("user_ipadic.csv", udic_enc="utf8")
for token in t.tokenize(u'東京スカイツリーへのお越しは、東武スカイツリーライン「とうきょうスカイツリー駅」が便利です。'):
  print(token)

print('')
print(u'Tokenize with user dictionary (wakati mode)')
t = Tokenizer("user_ipadic.csv", udic_enc="utf8")
for token in t.tokenize(u'東京スカイツリーへのお越しは、東武スカイツリーライン「とうきょうスカイツリー駅」が便利です。', wakati = True):
  print(token)
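For reference, a sketch of what user_ipadic.csv might contain; the MeCab IPADIC user-dictionary format is surface form, left/right context IDs, cost, and the IPADIC feature columns (the IDs and costs below are illustrative):

with open('user_ipadic.csv', 'w', encoding='utf8') as f:
    # surface,left_id,right_id,cost,POS columns,base form,reading,pronunciation
    f.write('東京スカイツリー,1288,1288,4569,名詞,固有名詞,一般,*,*,*,'
            '東京スカイツリー,トウキョウスカイツリー,トウキョウスカイツリー\n')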
Example #59
import sys 
from janome.tokenizer import Tokenizer

argvs = sys.argv
 
f = open(argvs[1])

line = f.readline() 

while line:

    line = f.readline()
    print line

    try:
        t = Tokenizer()
        tokens = t.tokenize(line.decode('utf-8'))

        for token in tokens:
            print(token)

    except:
        pass

f.close()