예제 #1
0
def test_h2z():
    """Check ``jctconv.h2z`` on a literal, on each character table and on a mixed string."""
    # NOTE(review): as written, the literal input is already full-width, so
    # these two assertions only verify that it passes through unchanged --
    # confirm whether a half-width literal was intended here.
    sample = 'ティロフィナーレ'
    assert_equal(jctconv.h2z(sample), 'ティロフィナーレ')
    assert_equal(jctconv.h2z(sample, ignore='ィ'), 'ティロフィナーレ')

    # One converter per character class, each compared against its
    # half-width / full-width table pair.
    _compare(jctconv.h2z, HALF_KANA, FULL_KANA)
    ascii_converter = partial(jctconv.h2z, ascii=True)
    _compare(ascii_converter, HALF_ASCII, FULL_ASCII)
    digit_converter = partial(jctconv.h2z, digit=True)
    _compare(digit_converter, HALF_DIGIT, FULL_DIGIT)

    # All three classes enabled at once on the concatenated tables.
    mixed_half = _concat(HALF_KANA, HALF_ASCII, HALF_DIGIT)
    converted = jctconv.h2z(mixed_half, ascii=True, digit=True, kana=True)
    assert_equal(converted, _concat(FULL_KANA, FULL_ASCII, FULL_DIGIT))
예제 #2
0
def test_h2z():
    """Check ``jctconv.h2z`` on a literal, per character table, and for every flag combination."""
    # NOTE(review): as written, the literal input is already full-width, so
    # these two assertions only verify that it passes through unchanged --
    # confirm whether a half-width literal was intended here.
    sample = 'ティロフィナーレ'
    assert_equal(jctconv.h2z(sample), 'ティロフィナーレ')
    assert_equal(jctconv.h2z(sample, ignore='ィ'), 'ティロフィナーレ')

    _compare(jctconv.h2z, HALF_KANA, FULL_KANA)
    _compare(partial(jctconv.h2z, ascii=True), HALF_ASCII, FULL_ASCII)
    _compare(partial(jctconv.h2z, digit=True), HALF_DIGIT, FULL_DIGIT)

    # Every (ascii, digit, kana) on/off combination, in the same order as
    # three nested loops over (True, False).
    flag_combinations = [
        (use_ascii, use_digit, use_kana)
        for use_ascii in (True, False)
        for use_digit in (True, False)
        for use_kana in (True, False)
    ]
    for use_ascii, use_digit, use_kana in flag_combinations:
        # A class whose flag is off is fed in full-width form already, so the
        # expected output is fully full-width for every combination.
        kana_part = HALF_KANA if use_kana else FULL_KANA
        ascii_part = HALF_ASCII if use_ascii else FULL_ASCII
        digit_part = HALF_DIGIT if use_digit else FULL_DIGIT
        converted = jctconv.h2z(_concat(kana_part, ascii_part, digit_part),
                                ascii=use_ascii, digit=use_digit, kana=use_kana)
        assert_equal(converted, _concat(FULL_KANA, FULL_ASCII, FULL_DIGIT))
예제 #3
0
    def hanKana2zenKana(strings):
        """
        Convert half-width kana to full-width kana.

        All other characters are left as they are.  The input is first passed
        through ``MultiBytes.convert2unicode`` and the result is handed to
        ``jctconv.h2z`` with its default options.
        """
        return jctconv.h2z(MultiBytes.convert2unicode(strings))
예제 #4
0
 def prepare(self, text):
     """Pre-process ``text`` and return the cleaned-up string.

     Steps, in order: ``normalize.shorten_repeat(text, 3)``, ``jctconv.h2z``
     with default options, removal of every match of the ``re_a_tag`` and
     ``kigou`` patterns, and finally the literal replacements listed in
     ``self.paraphrases['before']`` (old -> new).
     """
     widened = jctconv.h2z(normalize.shorten_repeat(text, 3))
     cleaned = kigou.sub('', re_a_tag.sub('', widened))
     # Replacements are applied one after another in the mapping's
     # iteration order, so an earlier pair can affect a later one.
     for source, target in self.paraphrases['before'].items():
         cleaned = cleaned.replace(source, target)
     return cleaned
예제 #5
0
def normalize_askfm(unicode_string, h2z):
    """Normalize ``unicode_string`` and optionally widen it.

    Every whitespace run is collapsed to a single space, the result is passed
    through ``normalize_neologd`` and, when ``h2z`` is truthy, through
    ``jctconv.h2z`` with kana, ASCII and digit conversion all enabled.

    Args:
        unicode_string: text to normalize; must be a ``unicode`` instance.
        h2z: truthy to apply the half-width to full-width conversion.

    Returns:
        The normalized (and possibly widened) string.
    """
    assert isinstance(unicode_string, unicode)
    collapsed = re.sub(r'\s+', ' ', unicode_string)
    normalized = normalize_neologd(collapsed)
    if not h2z:
        return normalized
    return jctconv.h2z(normalized, kana=True, ascii=True, digit=True)
예제 #6
0
def normalize(text, emoticon=False, repeat=None):
    """Unescape and clean ``text``.

    Args:
        text: input string; HTML entities are unescaped and every ``\\r`` is
            replaced by ``\\n``.
        emoticon: only when this is exactly ``False`` are the symbol
            clean-up, the ``jctconv.h2z`` conversion and the drawn-out "yo"
            replacements applied.
        repeat: when truthy, passed to ``shorten_repeat`` as its second
            argument.

    Returns:
        The normalized string.
    """
    text = HTMLParser().unescape(text).replace('\r', '\n')
    if emoticon is False:
        text = remove_useless_symbol(text)
        text = jctconv.h2z(text.replace(u'γ⌒ヽ', ''))
        # Each pattern is listed twice on purpose: a second pass can catch a
        # match that only appears after the first one (e.g. a doubled
        # trailing vowel).
        for drawn_out in (u'よぉ', u'よぉ', u'よお', u'よお'):
            text = text.replace(drawn_out, u'よ')
    return shorten_repeat(text, repeat) if repeat else text
예제 #7
0
def normalize(string):
    """Run ``string`` through the normalization pipeline and return the result.

    Order of operations: ``convert_expr``; ``Keep(expressions_keep).encode``;
    ``preprocess``; ``convert_digit``; ``jctconv.normalize(..., 'NFKC')``;
    ``jctconv.h2z`` with digit and ASCII conversion enabled;
    ``convert_two_digit``; and finally ``restore`` on the same ``Keep``
    object that did the encoding.
    """
    string = convert_expr(string)

    # presumably Keep shields the listed expressions from the steps between
    # encode() and restore() -- confirm against the Keep implementation
    protector = Keep(expressions_keep)
    shielded = convert_digit(preprocess(protector.encode(string)))

    composed = jctconv.normalize(shielded, 'NFKC')
    widened = jctconv.h2z(composed, digit=True, ascii=True)

    return protector.restore(convert_two_digit(widened))
예제 #8
0
    def get_lines(self, xs, ys, sent_idx, doc_fn):
        """See base class."""
        # Only the first element of xs (the surface forms) is used; ys is not
        # used at all.
        surf_seq, *_ignored = xs

        if not (sent_idx != self.sentence_id or doc_fn != self.file_name):
            # Same sentence of the same file as the previous call: nothing is
            # produced.
            return None

        # A position that is already known means the input is out of order.
        if (sent_idx, doc_fn) in self.set_known_position:
            raise RuntimeError("Can't covert because filename and sentence id is not sorted.")

        self.sentence_id, self.file_name = sent_idx, doc_fn
        widened = (jctconv.h2z(surf, digit=True, ascii=True) for surf in surf_seq)
        return " ".join(widened)
예제 #9
0
def mecab_maintenance():
    """Render the MeCab maintenance page, running at most one requested action.

    On a POST request the form is examined in a fixed order and only the
    first matching action runs:

    1. ``ma``: analyse the submitted text line by line with MeCab.
    2. ``add`` + ``term``: add a dictionary term via ``add_term``.
    3. ``del`` + ``del_line``: remove a term via ``delete_term``.
    4. ``chk_matrix`` + ``lid`` + ``rid``: call ``check_matrix``.
    5. ``matrix`` + ``lid`` + ``rid`` + ``cost``: call ``edit_matrix``.
    6. ``search`` + ``search_query``: call ``search``.
    7. ``update``: call ``update_dic``.

    Returns:
        The rendered ``mecab.html`` template; every result/message that was
        not produced by the request is an empty string.
    """
    context = {
        'ma_result': '',
        'term_added_message': '',
        'check_matrix_message': '',
        'edit_matrix_message': '',
        'search_result': '',
        # Queried up front, i.e. before any of the actions below runs.
        'updating_dic_now': is_updating_dic_now(),
    }

    if request.method == 'POST':
        form = request.form
        if form.get('ma'):
            nbest = int(form.get('nbest'))
            nbest_option = '-N %s' % nbest if nbest > 1 else ''
            tagger = MeCab.Tagger(DEFALUT_FORMAT + nbest_option)
            analysis = ''
            for line in jctconv.h2z(form.get('ma')).splitlines():
                # Each input line is echoed before its analysis.
                analysis += line + '\n'
                if nbest <= 2:
                    analysis += tagger.parse(line)
                    continue
                # Explicit n-best enumeration is only used above two
                # candidates; it stops early once the tagger runs dry.
                tagger.parseNBestInit(line)
                for _ in range(nbest):
                    candidate = tagger.next()
                    if not candidate:
                        break
                    analysis += candidate
            context['ma_result'] = analysis
        elif form.get('add') and form.get('term'):
            input_pos = form.get('pos')
            term = form.get('term')
            # An empty lemma falls back to the term itself; an empty reading
            # is looked up with get_yomi().
            lemma = form['lemma'] or term
            yomi = form['yomi'] or get_yomi(term)
            context['term_added_message'] = add_term(input_pos, term, lemma, yomi)
        elif form.get('del') and form.get('del_line'):
            delete_term(form.get('del_line'))
        elif all(form.get(field) for field in ('chk_matrix', 'lid', 'rid')):
            context['check_matrix_message'] = check_matrix(form['lid'], form['rid'])
        elif all(form.get(field) for field in ('matrix', 'lid', 'rid', 'cost')):
            context['edit_matrix_message'] = edit_matrix(
                form.get('lid'), form.get('rid'), form.get('cost'))
        elif form.get('search') and form.get('search_query'):
            context['search_result'] = search(form.get('search_query'))
        elif form.get('update'):
            update_dic()

    return render_template('mecab.html', PARTS_OF_SPEECH=PARTS_OF_SPEECH, **context)
예제 #10
0
def main(file):
    """Convert a ``.docx`` file into a ``new_<name>.txt`` file next to it.

    The input path is split on backslashes to find the file name.  A folder
    named ``<name without extension>_img`` is created beside the document and
    passed to ``docx2txt.process`` together with the document.  Digits in the
    extracted text are converted to full-width, the text is run through
    ``anaryze`` and the result is written between ``[simpleruby]`` tags.

    Args:
        file: backslash-separated path of the ``.docx`` document.

    Raises:
        OSError: from ``os.mkdir`` when the image folder cannot be created
            (for example because it already exists).
    """
    name = file.split('\\')[-1]
    # Cut the file name off the END of the path.  Using the name as a regular
    # expression (re.sub(name, '', file)) misbehaves for names containing
    # metacharacters such as "(", "+" or "[", and also deletes any directory
    # segment that happens to match the name.
    PATH = file[:len(file) - len(name)]
    name = name.replace(".docx", ".txt")

    folder = PATH + name.replace(".txt", "") + '_img'
    os.mkdir(folder)
    text = docx2txt.process(file, folder)
    # Convert digits (and only digits) to full-width.
    text = jctconv.h2z(text, kana=False, digit=True, ascii=False)
    result = anaryze(text)

    with open(PATH + 'new_' + name, mode='w') as f:
        f.write('[simpleruby]\n')
        f.write(result)
        f.write('\n[/simpleruby]')
0
# Single-letter options that are accepted on the command line and forwarded
# verbatim to the MeCab tagger.
MECAB_ARGS_KEYS = 'rdulDOapmMFUBESxbPCtco'

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    for key in MECAB_ARGS_KEYS:
        parser.add_argument('-%s' % key)
    # -N is handled by this script (n-best enumeration), not forwarded.
    parser.add_argument('-N', type=int)
    args = parser.parse_args()

    mecab_arg = ''
    for key in MECAB_ARGS_KEYS:
        arg = getattr(args, key)
        if arg:
            mecab_arg += ' -%s%s' % (key, arg)
    # Fall back to the default output format unless -F was given.
    if not args.F:
        mecab_arg += DEFALUT_FORMAT
    mecab = MeCab.Tagger(mecab_arg)

    while True:
        try:
            sentence = input()
        except EOFError:
            # stdin was closed (piped input exhausted or Ctrl-D): stop
            # cleanly instead of terminating with a traceback.
            break
        sentence = jctconv.h2z(sentence)
        if args.N:
            mecab.parseNBestInit(sentence)
            for i in range(args.N):
                result = mecab.next()
                if result:
                    print(result)
                else:
                    break
        else:
            result = mecab.parse(sentence)
            print(result)
예제 #12
0
            sys.stderr.write('whit '+repr(lno)+": "+line.encode('utf-8')+"\n");
        # normalize to null
        line = re.sub(x_null,r'',line)
        if verbose>1:
            sys.stderr.write('null '+repr(lno)+": "+line.encode('utf-8')+"\n");

    if despacing:
        # e.g. use case: before japanese tokenization, remove suspect white space
        # note that wide comma normalization occurs here
        despace(line)
        if verbose>1:
            sys.stderr.write('despace '+repr(lno)+": "+line.encode('utf-8')+"\n");

    if widecase:
        # wide-casing for specified classes
        line = h2z(line,kana=jctkana,digit=jctsym,ascii=jctalpha)
        if verbose>1:
            sys.stderr.write('wide '+repr(lno)+": "+line.encode('utf-8')+"\n");

    if narrowcase:
        # narrow-casing for specified classes
        line = z2h(line,kana=jctkana,digit=jctsym,ascii=jctalpha)
        if verbose>1:
            sys.stderr.write('narrow '+repr(lno)+": "+line.encode('utf-8')+"\n");


    # tokenize using pipe
    
    ptok.sendline(line.encode('utf-8'))
    if verbose>1:
        sys.stderr.write('reading...\n')
예제 #13
0
def test_h2z():
    """Check ``jctconv.h2z`` in its default mode and in the 'ASCII' and 'DIGIT' modes."""
    # (input, keyword options, expected output)
    # NOTE(review): the first literal is already full-width as written, so
    # that case only checks pass-through -- confirm the intended input.
    cases = (
        (u'ティロフィナーレ', {}, u'ティロフィナーレ'),
        (HALF_KANA, {}, FULL_KANA),
        (HALF_ASCII, {'mode': 'ASCII'}, FULL_ASCII),
        (HALF_DIGIT, {'mode': 'DIGIT'}, FULL_DIGIT),
    )
    for source, options, expected in cases:
        assert_equal(jctconv.h2z(source, **options), expected)