Example #1
def main():
    args = get_args()
    vocab = Vocab(args.vocab_path, args.vocab_size)  # create a vocabulary
    hps = get_hps()
    if args.data_path != "":
        batcher = Batcher(args.data_path, vocab, hps, args.single_pass)
        import pdb
        pdb.set_trace()  # breakpoint: inspect the batcher
        x = batcher.next_batch()
        pdb.set_trace()  # breakpoint: inspect the first batch
    else:
        with open(args.json_path) as f:
            art = json.load(f)
        article = neologdn.normalize(art['body'])
        abstract = neologdn.normalize(art['title'])
        m = MeCab('-Owakati')
        parsed_article = m.parse(article)
        abs_words = m.parse(abstract).split()
        ex = B.Example(parsed_article, abs_words, vocab, hps)
        b = B.Batch([ex], hps, vocab)
        import pdb
        pdb.set_trace()  # breakpoint: inspect the single-example batch
Example #2
def json_batch(fname, hps, vocab):
    with open(fname) as f:
        art = json.load(f)
    article = neologdn.normalize(art['body'])
    abstract = neologdn.normalize(art['title'])
    m = MeCab('-Owakati')
    parsed_article = m.parse(article)
    abs_words = m.parse(abstract).split()
    ex = B.Example(parsed_article, abs_words, vocab, hps)
    b = B.Batch([ex], hps, vocab)
    return b
Example #3
class MecabTagger(object):
    """docstring, MecabTagger"""
    # TAGSET = set("""NNG NNP NNB NNBC NR NP VV VA VX VCP VCN MM MAG MAJ IC
    #                 JKS JKC JKG JKO JKB JKV JKQ JX JC EP EF EC ETN ETM
    #                 XPN XSN XSV XSA XR SF SE SSO SSC SC SY SL SH SN
    #                 UNKNOWN EOS""".split())

    def __init__(self, **kwargs):
        self.tagger = MeCab(kwargs)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        del self.tagger

    @staticmethod
    def tagged_tuple(node):
        surface = node.surface
        features = node.feature.split(',')
        first_pos = features[0].partition('+')[0]
        lemma = (features[7].partition('/')[0]
                    if features[4].startswith('Inflect')
                    else surface.lower())
        return Word(decode(surface, True), decode(lemma, True),
                    first_pos.decode('ascii'), node.cost)

    def parse(self, text):  # follow NLTK naming
        return [MecabTagger.tagged_tuple(node)
                    for node in self.tagger.parse(text.encode(settings.DEFAULT_ENCODING), as_nodes=True)
                        if not node.is_eos()]
Example #4
class KoreanTokenizer(DummyTokenizer):
    def __init__(self, cls, nlp=None):
        self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
        MeCab = try_mecab_import()
        self.mecab_tokenizer = MeCab("-F%f[0],%f[7]")

    def __del__(self):
        self.mecab_tokenizer.__del__()

    def __call__(self, text):
        dtokens = list(self.detailed_tokens(text))
        surfaces = [dt["surface"] for dt in dtokens]
        doc = Doc(self.vocab,
                  words=surfaces,
                  spaces=list(check_spaces(text, surfaces)))
        for token, dtoken in zip(doc, dtokens):
            first_tag, sep, eomi_tags = dtoken["tag"].partition("+")
            token.tag_ = first_tag  # stem(어간) or pre-final(선어말 어미)
            token.lemma_ = dtoken["lemma"]
        doc.user_data["full_tags"] = [dt["tag"] for dt in dtokens]
        return doc

    def detailed_tokens(self, text):
        # feature fields: POS tag[0], semantic class[1], final consonant (jongseong)[2], reading[3],
        # type[4], start POS[5], end POS[6], expression[7], *
        for node in self.mecab_tokenizer.parse(text, as_nodes=True):
            if node.is_eos():
                break
            surface = node.surface
            feature = node.feature
            tag, _, expr = feature.partition(",")
            lemma, _, remainder = expr.partition("/")
            if lemma == "*":
                lemma = surface
            yield {"surface": surface, "lemma": lemma, "tag": tag}
Example #5
def parse2df(text, sysdic="/usr/local/lib/mecab/dic/mecab-ipadic-neologd"):
    df = pd.DataFrame(
        index=[],
        columns=['文番号', '表層', '品詞1', '品詞2', '品詞3', '品詞4', '原型', 'posID'])

    text = text.split("\n")  # split on newlines into a list
    while '' in text:  # drop empty lines
        text.remove('')

    parser = MeCab("-d " + sysdic)

    for index, sentence in enumerate(text):
        nodes = parser.parse(sentence, as_nodes=True)
        for node in nodes:
            if not node.is_eos():
                # split the part-of-speech features
                feature = node.feature.split(',')
                # append to the DataFrame
                # (note: DataFrame.append was removed in pandas 2.0; use pd.concat there)
                series = pd.Series(
                    [
                        index,  # sentence number
                        node.surface,  # surface form
                        feature[0],  # POS 1
                        feature[1],  # POS 2
                        feature[2],  # POS 3
                        feature[3],  # POS 4
                        feature[6],  # base form
                        node.posid  # POS id
                    ],
                    index=df.columns)
                df = df.append(series, ignore_index=True)
    return df
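A minimal usage sketch for parse2df above; the sample text and the filter on the 品詞1 column are illustrative assumptions, and the default sysdic path must exist on your machine.

# Hypothetical usage of parse2df: analyze two sentences and keep only the nouns.
result = parse2df("吾輩は猫である。\n名前はまだ無い。")
nouns = result[result['品詞1'] == '名詞']        # rows whose first POS field is "noun"
print(nouns[['文番号', '表層', '原型']])          # sentence number, surface form, base form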
Example #6
def text_segmentation_and_pronunciation(sender, instance, *args, **kwargs):
    if instance.language == 'zh-hans' or instance.language == 'zh-hant':

        # separate Chinese words with spaces
        import jieba

        # from config.settings.base import JIEBA_DICT_PATH
        # jieba.set_dictionary(JIEBA_DICT_PATH)

        def cut(s):
            return jieba.cut(s, cut_all=False)

        seg_list = cut(instance.body)
        instance.body = " ".join(seg_list)

        # # generate Pinyin
        # import ChineseTone as ct
        #
        # pinyin_list = ct.PinyinHelper.convertToPinyinFromSentence(instance.body, pinyinFormat=ct.PinyinFormat.WITH_TONE_MARK, segment=cut)
        #
        # instance.pronunciation = ' '.join(pinyin_list)
        instance.save()

    elif instance.language == 'ja':

        # separate Japanese words with spaces
        from natto import MeCab
        nm = MeCab('-Owakati')

        output = nm.parse(instance.body)
        instance.body = output
        instance.save()
Example #7
def nlp(data):
    points = 0
    nm = MeCab()
    negaposi_dic = getNegaPosiDic()
    sentenses = re.split('[。!!♪♫★☆>??()w]', data)
    try:
        for sentense in sentenses:
            negaposi = 0
            result_all = nm.parse(sentense)
            result_words = result_all.split('\n')[:-1]

            for word in result_words:
                try:
                    word_toarray = re.split('[\t,]', word)
                    if word_toarray[7] in negaposi_dic:
                        negaposi = int(negaposi_dic[word_toarray[7]])
                        print(word_toarray[7], negaposi_dic[word_toarray[7]], \
                            flush=True)
                except Exception as e:
                    print('%r' % e, flush=True)
            points += negaposi

    except Exception as e:
        print('%r' % e, flush=True)
        print(data, flush=True)

    return points
Example #8
def nlp(data):
    nm = MeCab()  # create a MeCab instance named nm
    points = 0  # score for the whole text
    negaposi_dic = getNegaPosiDic()  # load the sentiment dictionary (calls the function defined earlier)
    sentenses = re.split("[。!!♪♫★☆>??()w]", data)  # split into individual sentences
    try:
        for sentense in sentenses:  # iterate over the sentences
            negaposi = 0
            result_all = nm.parse(sentense)  # morphological analysis / POS segmentation
            result_words = result_all.split("\n")[:-1]  # split into words
            for word in result_words:
                try:
                    word_toarray = re.split('[\t,]', word)
                    if word_toarray[7] in negaposi_dic:
                        negaposi = int(
                            negaposi_dic[word_toarray[7]])  # sentiment score of this sentence
                        print(word_toarray[7],
                              negaposi_dic[word_toarray[7]],
                              flush=True)  # word found in the sentiment dictionary and its score
                except Exception as e:
                    print('%r' % e, flush=True)
            points += negaposi  # add to the overall score
    except Exception as e:
        print('%r' % e, flush=True)
        print(data, flush=True)
    return points  # return the score for the whole text
Example #9
def mecab_analysis(text):
    import os
    mecab_flags = [
        f'-d {os.popen("mecab-config --dicdir").read().strip()}/mecab-ipadic-neologd/',
        '-u username.dic',
    ]
    t = MeCab(' '.join(mecab_flags))
    enc_text = text.strip()  # always keep the string passed to MeCab in a variable: https://shogo82148.github.io/blog/2012/12/15/mecab-python/
    t.parse('')  # workaround for UnicodeDecodeError: http://taka-say.hateblo.jp/entry/2015/06/24/183748
    # node = t.parseToNode(enc_text)
    output = []
    for node in t.parse(enc_text, as_nodes=True):
        if node.surface != "":  # ヘッダとフッタを除外
            word_type = node.feature.split(",")[0]
            if word_type in ["形容詞", "名詞", "副詞"]:
                output.append(node.surface)
    return output
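A hedged usage sketch for mecab_analysis; it assumes the neologd dictionary and the user dictionary username.dic referenced above are actually installed, and the sample sentence is made up.

# Hypothetical usage: tally the content words returned by mecab_analysis.
from collections import Counter

content_words = mecab_analysis("今日はとても良い天気なので、公園をゆっくり散歩した。")
print(Counter(content_words).most_common(5))  # five most frequent nouns/adjectives/adverbs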
Example #10
def main_mecab():
    mc = MeCab()
    with open('./data/neko.txt', 'r') as f:
        with open(save_path, 'w') as fw:
            for line in f:
                for one_sent in line.strip().split():
                    # handle lines that contain more than one sentence
                    fw.write(mc.parse(one_sent.strip()) + "\n")
    print("save mecab file {}".format(save_path))
Example #11
    def keitaiso_kaiseki(self, sentence):
        nm = MeCab()
        terms = []
        for node in nm.parse(sentence, as_nodes=True):
            features = node.feature.split(',')
            if features[0] == '名詞' or features[0] == '形容詞':
                terms.append(node.surface)

        return terms
Example #12
def word_frequencies(text):
    from manabi.apps.reading_level.word_frequencies import WORD_FREQUENCIES

    mecab = MeCab()
    frequencies = []
    for node in mecab.parse(text.encode('utf8'), as_nodes=True):
        frequency = WORD_FREQUENCIES.get(node.surface.decode('utf8'))
        if frequency is None:
            continue
        frequencies.append(frequency)
    return frequencies
Example #13
class MeCabParser(object):
    def __init__(self):
        if os.name == 'nt':
            self.engine = RemoteMecabParser()
        else:
            self.engine = MeCab()

    def parse(self, text):
        masked_text, urls = self.url_mask(text)
        masked_text, figures = self.figure_mask(masked_text)
        masked_text, digits = self.digit_mask(masked_text)

        mc_ret = self.engine.parse(masked_text)
        mc_lines = mc_ret.split('\n')
        dic = {}
        for line in mc_lines:
            s = line.split('\t')
            if len(s) >= 2:
                buf = s[1].split(',')
                meta = {'part1': buf[0], 'part2': buf[1], 'part3': buf[2]}
                if len(buf) > 9:
                    meta['add1'] = buf[9]
                key = s[0]
                if key == 'PACMECABURL':
                    key = urls.pop(0)
                if key == 'PACMECABFIGURE':
                    key = figures.pop(0)
                if key == 'PACMECABDIGIT':
                    key = digits.pop(0)
                dic[key] = meta  # identical words are collapsed into a single dict key

        return dic

    def url_mask(self, text):
        match = re.findall(PTN_URL, text)
        for m in match:
            text = text.replace(m, 'PACMECABURL')

        return text, match

    def figure_mask(self, text):
        match_list = []
        for ptn in PTN_FIGURE:
            match = re.findall(ptn, text)
            for m in match:
                text = text.replace(m, 'PACMECABFIGURE')
            match_list += match
        return text, match_list

    def digit_mask(self, text):
        match = re.findall(r'\d{5,}', text)
        for m in match:
            text = text.replace(m, 'PACMECABDIGIT')
        return text, match
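A small usage sketch for MeCabParser; it assumes the module-level patterns PTN_URL and PTN_FIGURE exist and that a local MeCab (or RemoteMecabParser on Windows) is available.

# Hypothetical usage: URLs, figures and long digit runs are masked before parsing,
# then restored as keys of the returned dictionary.
parser = MeCabParser()
result = parser.parse("詳細は https://example.com を参照。注文番号は 1234567 です。")
for word, meta in result.items():
    print(word, meta['part1'], meta.get('add1', ''))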
Example #15
def parse_word_list(text):
    """
    Split the text into a list of words and return it.
    """
    words = []
    with MeCab('-F%m,%f[0]') as nm:
        for n in nm.parse(text, as_nodes=True):
            node = n.feature.split(',')
            if node[0] != 'EOS' and is_valid_speech(node[1]):
                words.append(node[0])

    return words
Example #16
class MeCabTokenizer:
    DEFAULT_DICTIONARY = "/usr/local/lib/mecab/dic/mecab-ipadic-neologd"

    def __init__(self, dic=DEFAULT_DICTIONARY):
        self._mecab = MeCab(f"-d {dic} -F%f[0],%f[1],%f[2],%f[3],%f[6]")
        self._tokenizer = tf.keras.preprocessing.text.Tokenizer()

    def tokenize(self, text):
        tokens = []

        for node in self._mecab.parse(text, as_nodes=True):
            if node.is_eos():
                continue

            feature = node.feature.split(",")
            part_of_speech, lemma = feature[0:4], feature[4]

            if part_of_speech[0] not in ["名詞", "動詞", "形容詞"]:
                continue

            if part_of_speech[0:2] == ["名詞", "数"]:
                continue

            tokens.append(lemma)

        return " ".join(tokens)

    def fit_on_texts(self, texts):
        texts = [
            self.tokenize(text)
            for text in progress.track(texts,
                                       description="Fitting on texts...")
        ]
        self._tokenizer.fit_on_texts(texts)
        return self._tokenizer.texts_to_sequences(texts)

    def texts_to_matrix(self, texts):
        texts = [self.tokenize(text) for text in texts]
        return self._tokenizer.texts_to_matrix(texts, mode="tfidf")

    def sequences_to_matrix(self, sequences):
        return self._tokenizer.sequences_to_matrix(sequences, mode="tfidf")

    def save(self, path):
        with open(path, "w") as f:
            f.write(self._tokenizer.to_json())

    def load(self, path):
        with open(path) as f:
            self._tokenizer = tf.keras.preprocessing.text.tokenizer_from_json(
                f.read())
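A minimal usage sketch for the MeCabTokenizer above; the sample sentences are placeholders, and the class default dictionary path must point at an installed mecab-ipadic-neologd.

# Hypothetical usage: fit on a small corpus, then vectorize new text as TF-IDF.
tokenizer = MeCabTokenizer()
corpus = ["猫が好きです。", "犬も好きです。", "今日は冷たい雨が降っている。"]
sequences = tokenizer.fit_on_texts(corpus)              # word-index sequences
matrix = tokenizer.texts_to_matrix(["猫と犬が好きだ。"])  # TF-IDF matrix
print(matrix.shape)
tokenizer.save("tokenizer.json")                        # persist the fitted vocabulary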
Example #17
    def from_sentence(klass, sentence: str, mecab: MeCab) -> 'Iterable[Word]':
        normalized = jaconv.normalize(sentence)
        for mec_node in mecab.parse(normalized, as_nodes=True):
            if mec_node.is_eos():
                break

            res = AnalyzeMorp(mec_node)
            if res.is_symbol():
                continue

            if TagWord.is_include(res.surface()):
                yield TagWord(res.surface())
            else:
                yield Word(surface=res.surface(), yomi=res.yomi())
Example #18
def txt2words(txt) -> list:
    posid = [
        36, 37, 38, 40, 41, 42, 43, 44, 45, 46, 47, 50, 51, 52, 66, 67, 2, 31,
        36, 10, 34
    ]
    words = []
    parser = MeCab()
    nodes = parser.parse(txt, as_nodes=True)
    for node in nodes:
        if not node.is_eos():
            feature = node.feature.split(',')
            if node.posid in posid and feature[6] != "*":
                words.append(feature[6])
    return words
Example #19
class MeCabTokenizer(BaseTokenizer):
    def __init__(
        self,
        user_dictionary_path: Optional[str] = None,
        system_dictionary_path: Optional[str] = None,
        dictionary_format: Optional[str] = None,
    ) -> None:
        from natto import MeCab
        super().__init__(name="mecab")
        options = []
        if isinstance(user_dictionary_path, str):
            options.append("-u {}".format(user_dictionary_path))
        if isinstance(system_dictionary_path, str):
            options.append("-d {}".format(system_dictionary_path))
        self._tokenizer = MeCab(" ".join(options))

        # If dictionary format is not specified,
        # konoha detects it by checking the name of the system dictionary.
        # For instance, system_dictionary_path=mecab-ipadic-xxxx -> ipadic and
        #               system_dictionary_path=mecab-unidic-xxxx -> unidic.
        # If system_dictionary_path and dictionary_format are not given,
        # konoha assumes it uses mecab-ipadic (de facto standard).
        # Currently, konoha only supports ipadic. (TODO: unidic)
        if dictionary_format is None:
            if system_dictionary_path is None or "ipadic" in system_dictionary_path.lower():
                self._parse_feature = parse_feature_for_ipadic
            elif "unidic" in system_dictionary_path.lower():
                self._parse_feature = parse_feature_for_unidic
            else:
                raise ValueError(
                    f"Unsupported system dictionary: {system_dictionary_path}")

        else:
            if "ipadic" == dictionary_format.lower():
                self._parse_feature = parse_feature_for_ipadic
            elif "unidic" == dictionary_format.lower():
                self._parse_feature = parse_feature_for_unidic
            else:
                raise ValueError(
                    f"Unsupported dictionary format: {dictionary_format}")

    def tokenize(self, text: str) -> List[Token]:
        return_result = []
        parse_result = self._tokenizer.parse(text).rstrip(" ")
        for elem in parse_result.split("\n")[:-1]:
            return_result.append(self._parse_feature(elem))
        return return_result
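A usage sketch for the konoha-style MeCabTokenizer above; the unidic path is an assumption, and parse_feature_for_ipadic / parse_feature_for_unidic come from the surrounding module.

# Hypothetical usage: the default constructor assumes mecab-ipadic, while an explicit
# system dictionary path (or dictionary_format) selects the unidic feature parser.
ipadic_tokenizer = MeCabTokenizer()
print(ipadic_tokenizer.tokenize("すもももももももものうち"))

unidic_tokenizer = MeCabTokenizer(
    system_dictionary_path="/usr/local/lib/mecab/dic/mecab-unidic",  # assumed path
)
print(unidic_tokenizer.tokenize("すもももももももものうち"))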
Example #20
def get_tokens(text):
    mecab = MeCab()
    tokens = []
    pos_word_dict = {}
    for t in text:
        res_raw = mecab.parse(t.encode('utf-8'))
        res = [r.split('\t') for r in res_raw.split('\n')]
        res = [r for r in res if len(r) == 2]
        res = [[r[0], r[1].split(',')[0]] for r in res]

        for r in res:
            if r[1] in pos_word_dict:
                pos_word_dict[r[1]].append(r[0])
            else:
                pos_word_dict[r[1]] = [r[0]]
        tokens.append(' '.join([r[0] for r in res]))
    return tokens, pos_word_dict
Example #21
def main(text):
    """
    MeCabで分かち書きした後に作成したモデルを読み込み、判定
    MeCabのneologdの保存されているpathはmacなら大抵ここになるはずではあるが、エラーが出た際は調べて修正してください。
    """
    nm = MeCab("-Owakati -d /usr/local/lib/mecab/dic/mecab-ipadic-neologd")
    words = nm.parse(text)
    print('\n', words)

    classifier = ft.load_model('./model.bin')
    estimate = classifier.predict([words], k=2)
    estimate_2 = classifier.predict_proba([words], k=2)
    print('estimate:', estimate_2[0][0][1])
    if estimate[0][0] == '__label__2,':
        return ['ネガティブ', str(estimate_2[0][0][1])]
    elif estimate[0][0] == '__label__1,':
        return ['ポジティブ', str(estimate_2[0][0][1])]
Example #22
def split_into_words(doc, name=''):
    #    mecab = MeCab.Tagger("-Ochasen")
    # morphological analysis
    mecab = MeCab("-Ochasen")
    # extract only the body of the work
    valid_doc = trim_doc(doc)
    # split into words (lines is a list)
    lines = mecab.parse(valid_doc).splitlines()
    words = []
    for line in lines:
        # split on tab characters
        chunks = line.split('\t')
        if len(chunks) > 3 and (
                chunks[3].startswith('動詞') or chunks[3].startswith('形容詞') or
            (chunks[3].startswith('名詞') and not chunks[3].startswith('名詞-数'))):
            # in other words, keep only the content words
            words.append(chunks[0])
    return LabeledSentence(words=words, tags=[name])
Example #23
def parse2df(text, sysdic="/usr/local/lib/mecab/dic/naist-jdic"):
    """文毎に形態素解析を行い、結果をdataframeに格納して返す
    Args:
      text:形態素解析対象のテキスト
    Returns:
      形態素解析結果を格納したdataframe
      カラムは['文番号','表層', '品詞1','品詞2','品詞3','品詞4','原型','posID']
    """
    # empty DataFrame that will hold the results
    df = pd.DataFrame(
        index=[],
        columns=['文番号', '表層', '品詞1', '品詞2', '品詞3', '品詞4', '原型', 'posID'])
    text = re.sub(r"\.", "。\n", text)
    text = re.sub(r"。", "。\n", text)
    text = text.split("\n")  # split on newlines into a list
    while '' in text:  # drop empty lines
        text.remove('')

    parser = MeCab("-d " + sysdic)

    for index, sentence in enumerate(text):
        logging.debug(sentence)
        nodes = parser.parse(sentence, as_nodes=True)
        for node in nodes:
            if not node.is_eos():
                # split the part-of-speech features
                feature = node.feature.split(',')
                # append to the DataFrame
                # (note: DataFrame.append was removed in pandas 2.0; use pd.concat there)
                series = pd.Series(
                    [
                        index,  # sentence number
                        node.surface,  # surface form
                        feature[0],  # POS 1
                        feature[1],  # POS 2
                        feature[2],  # POS 3
                        feature[3],  # POS 4
                        feature[6],  # base form
                        node.posid  # POS id
                    ],
                    index=df.columns)
                df = df.append(series, ignore_index=True)
    logging.debug("End : parse2df")
    return df
Example #24
def process(x):
    tokenized_text = []
    token_count = 0
    no_parsing = get_no_parsing()
    known_words = get_knowledge("moi")
    words = {}
    nm = MeCab("-Owakati")
    for line in x.readlines():
        for n in nm.parse(line, as_nodes=True):
            tokenized_text.append(n.surface)
            if n.surface not in known_words and n.surface not in no_parsing:
                words[n.surface] = 0
            elif n.surface in known_words:
                words[n.surface] = known_words[n.surface]
            token_count += 1
        tokenized_text.append(u'<br>')
    return {'tokenized_text': tokenized_text,
            'words': words,
            'token_count': str(token_count)}
Example #25
class KoreanTokenizer(DummyTokenizer):
    def __init__(self, vocab: Vocab):
        self.vocab = vocab
        MeCab = try_mecab_import()  # type: ignore[func-returns-value]
        self.mecab_tokenizer = MeCab("-F%f[0],%f[7]")

    def __reduce__(self):
        return KoreanTokenizer, (self.vocab,)

    def __del__(self):
        self.mecab_tokenizer.__del__()

    def __call__(self, text: str) -> Doc:
        dtokens = list(self.detailed_tokens(text))
        surfaces = [dt["surface"] for dt in dtokens]
        doc = Doc(self.vocab, words=surfaces, spaces=list(check_spaces(text, surfaces)))
        for token, dtoken in zip(doc, dtokens):
            first_tag, sep, eomi_tags = dtoken["tag"].partition("+")
            token.tag_ = first_tag  # stem(어간) or pre-final(선어말 어미)
            token.pos = TAG_MAP[token.tag_][POS]
            token.lemma_ = dtoken["lemma"]
        doc.user_data["full_tags"] = [dt["tag"] for dt in dtokens]
        return doc

    def detailed_tokens(self, text: str) -> Iterator[Dict[str, Any]]:
        # feature fields: POS tag[0], semantic class[1], final consonant (jongseong)[2], reading[3],
        # type[4], start POS[5], end POS[6], expression[7], *
        for node in self.mecab_tokenizer.parse(text, as_nodes=True):
            if node.is_eos():
                break
            surface = node.surface
            feature = node.feature
            tag, _, expr = feature.partition(",")
            lemma, _, remainder = expr.partition("/")
            if lemma == "*":
                lemma = surface
            yield {"surface": surface, "lemma": lemma, "tag": tag}

    def score(self, examples):
        validate_examples(examples, "KoreanTokenizer.score")
        return Scorer.score_tokenization(examples)
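A hedged usage sketch for the spaCy KoreanTokenizer above; it assumes natto-py and a mecab-ko dictionary are installed, and constructs the tokenizer directly with an empty Vocab for illustration.

# Hypothetical usage: tokenize a Korean sentence and inspect tags and lemmas.
from spacy.vocab import Vocab

tokenizer = KoreanTokenizer(Vocab())
doc = tokenizer("안녕하세요. 자연어 처리를 공부하고 있어요.")
for token in doc:
    print(token.text, token.tag_, token.lemma_)
print(doc.user_data["full_tags"])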
Example #26
def run_ma(text, stop_path='', nBest=1):
    """
    Returns the dataframe of all Information of morpheme analyzer.
    - input : string, {stopword file path}, {nbest number}
    - output : dataframe
    """
    options = r'-F%m,%f[0],%f[1],%f[2],%f[3],%f[4],%f[5],%f[6],%f[7]\n'
    options += " -N" + str(nBest)

    stopword_flag = False

    if stop_path != '':
        stopword_flag = True
    try:
        _me = MeCab(options)

        _df = pd.DataFrame(None,
                           columns=[
                               'surface', 'tag', 'meaning_class',
                               'final_consonant', 'reading', 'type',
                               'first_tag', 'final_tag', 'expression'
                           ])

        if stopword_flag:
            trie = load_stopword(stop_path)

        i = 0
        for term_str in str(_me.parse(text)).split('\n'):
            term_list = re.split(',', term_str)

            if stopword_flag and is_stopword(term_list[0], trie):
                continue
            if len(term_list) < 2:
                continue

            _df.loc[i] = term_list
            i += 1
    except Exception as e:
        print("[run_ma] messages of error : ", e)

    return _me, _df
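A hedged usage sketch for run_ma; it assumes a Korean MeCab dictionary (e.g. mecab-ko-dic) whose feature layout matches the nine fields requested in the format string.

# Hypothetical usage: analyze one sentence and inspect part of the resulting DataFrame.
_, df = run_ma("아버지가 방에 들어가신다", nBest=1)
print(df[['surface', 'tag', 'expression']])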
Example #27
def parseText(text, sysdic='/usr/local/lib/mecab/dic/mecab-ipadic-neologd'):
    text = text.split("\n")  # split on newlines into a list
    while '' in text:  # drop empty lines
        text.remove('')

    parser = MeCab("-d " + sysdic)
    lst = []

    for sentence in text:
        logging.debug(sentence)
        nodes = parser.parse(sentence, as_nodes=True)

        for node in nodes:
            features = node.feature.split(',')
            parts = features[0]
            if parts == '名詞':
                lst.append(node.surface)
            if parts in {'動詞', '形容詞', '形容動詞'}:
                lst.append(features[6])

    return lst
Example #28
class Parser(object):
    def __init__(self):
        self.mc = MeCab()

    def parse_sentence(self, line):
        '''
        We receive a line of Japanese text, pass it through the MeCab morphological
        analyzer, and include furigana when needed (only for kanji).
        '''
        new_line = ''

        for node in self.mc.parse(line, as_nodes=True):
            if not node.is_eos():
                word = Word(node)

                if word.kanji:
                    new_line += '{}[{}]'.format(word.content, word.furigana)
                else:
                    new_line += word.content

        return new_line
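A small usage sketch for Parser; the Word helper class used above is assumed to expose content, kanji and furigana, so the exact output depends on that class.

# Hypothetical usage: annotate kanji words with furigana in brackets.
parser = Parser()
print(parser.parse_sentence("日本語を勉強しています。"))
# expected shape of the output: 日本語[にほんご]を勉強[べんきょう]しています。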
Example #29
    def dissasembly(self, tweet_data: pd.DataFrame):
        nm = MeCab('-Owakati')
        first_word = []
        word_ls = []

        for t in range(len(tweet_data)):
            content = tweet_data.values[t].item()
            if '@' in content or '時報' in content or 'http' in content:
                pass
            else:
                parsed_content = nm.parse(content)
                ls = list(parsed_content.split())

                first_word.append(ls[0] + ',')
                word_ls.append(ls[1:])

        df_f = pd.DataFrame(first_word)
        df_w = pd.DataFrame(word_ls)

        df_f.to_csv(path['first_word'], index=False)
        df_w.to_csv(path['words'], index=False)
Example #30
class mecab_owakatikun(Twitter_syusyukun, MeCab):
    def owakatikun(self):
        self.nm = MeCab('-Owakati')
        self.result = ''
        self.syusyu(auth)
        self.tweet_ls = self.nm.parse(str(self.df.values))
        i = len(self.tweet_ls)
        for h in range(i):
            if '@' in str(self.tweet_ls[h]):
                h += 1
            elif '時報' in str(self.tweet_ls[h]):
                h += 1
            elif 'RT' in str(self.tweet_ls[h]):
                h += 1
            else:
                self.result += self.tweet_ls[h]
                h += 1
        self.write_txt = ''.join(self.result)
        with open('/mnt/c/users/user/awesome/my_ai/tweets.txt', 'a') as f:
            f.write(str(self.write_txt) + '\n')
        print(self.write_txt)
Example #31
def main():
    nm = MeCab('-Owakati')
    word = "MeCabは 京都大学情報学研究科−日本電信電話株式会社コミュニケーション科学基礎研究所 共同研究ユニットプロジェクトを通じて開発されたオープンソース 形態素解析エンジンです。 言語, 辞書,コーパスに依存しない汎用的な設計を 基本方針としています。 パラメータの推定に Conditional Random Fields (CRF) を用 いており, ChaSenが採用している 隠れマルコフモデルに比べ性能が向上しています。また、平均的に ChaSen, Juman, KAKASIより高速に動作します。 ちなみに和布蕪(めかぶ)は, 作者の好物です。"
    print(nm.parse(word))
    lis = [n.surface for n in nm.parse(word, as_nodes=True) if n.is_nor()]
    print(lis)
Example #32
	nm = MeCab()
	for doc_id, syubunPart in rows:
		print "--------------"
		print "id:", doc_id
	# remove newlines, tabs and (half/full-width) spaces
		syubunPart = re.sub(r'(\n|\t| | )', '', syubunPart)
	# split into sentences
		sensp = sensplit.SenSplit(syubunPart)
		syubun_list = sensp()
		
		for sentence in syubun_list:
			if sentence == '':
				continue
			morph_list = []		# list of the sentence's morphemes
			sentence = sentence.encode('utf_8')	# unicode -> str (utf-8)
			for n in nm.parse(sentence, as_nodes=True):
				if not n.is_eos():
#					print n.surface
					morph_list.append(n.surface)
			x = []
			y = []
			for i in range(0, len(morph_list)):
				if i == 0:
					x.append('<BOS>')
					y.append(morph_list[i])
				elif  i == len(morph_list)-1:
					x.append(morph_list[i])
					y.append('<EOS>')
				else:
					x.append(morph_list[i])
					y.append(morph_list[i+1])
Example #33
from gensim import corpora, matutils

mc = MeCab()
txt_word_list = []

# read the folder that holds the text files
files = os.listdir(os.path.dirname(__file__)+'/path/txt')

# read the text files under the folder one by one
for file in files:

  # build a list of the noun and verb words extracted from the text file (same as the processing in Q11-1)
  with open(os.path.dirname(__file__) + '/path/txt/'+file, 'r') as f:
    txt = f.read()
    word_list = []
    for n in mc.parse(txt, as_nodes=True):
      if not (n.is_bos() or n.is_eos()):
        part, word = n.feature.split(',', 1)
        if part == "名詞" or part == "動詞":
          word_list.append(n.surface)

  # append the word list for this text file
  txt_word_list.append(word_list)

# to build a bag of words, collect every distinct word and create a dictionary that assigns each one an ID
corpus_dic = corpora.Dictionary(txt_word_list)

# convert each document's word list into a corpus list (dictionary word IDs and occurrence counts)
corpus_list = [corpus_dic.doc2bow(word_in_text) for word_in_text in txt_word_list]

# convert the corpus list into a sparse matrix (CSC format)
Example #34
# import the library for using MeCab from Python
import os
from natto import MeCab

# meros holds the text data of Melos
# create the object that runs MeCab
mc = MeCab()

# when reading from a text file, do it as shown below
with open(os.path.dirname(__file__) + '/path/txt/meros.txt', 'r') as f:
  txt = f.read()

word_list = []
# run morphological analysis with MeCab
for part_and_word in mc.parse(txt, as_nodes=True):

  # check that the node part_and_word is not a begin/end (BOS/EOS) object
  if not (part_and_word.is_bos() or part_and_word.is_eos()):

    # get the part of speech and the word from the analysis result
    part, word = part_and_word.feature.split(',', 1)

    # extract noun and verb words
    if part == '名詞' or part == '動詞':
      word_list.append(part_and_word.surface)