def test_h2z():
    assert_equal(jctconv.h2z('ﾃｨﾛﾌｨﾅｰﾚ'), 'ティロフィナーレ')
    assert_equal(jctconv.h2z('ﾃｨﾛﾌｨﾅｰﾚ', ignore='ｨ'), 'テｨロフｨナーレ')
    _compare(jctconv.h2z, HALF_KANA, FULL_KANA)
    _compare(partial(jctconv.h2z, ascii=True), HALF_ASCII, FULL_ASCII)
    _compare(partial(jctconv.h2z, digit=True), HALF_DIGIT, FULL_DIGIT)
    assert_equal(jctconv.h2z(_concat(HALF_KANA, HALF_ASCII, HALF_DIGIT),
                             ascii=True, digit=True, kana=True),
                 _concat(FULL_KANA, FULL_ASCII, FULL_DIGIT))
def test_h2z():
    assert_equal(jctconv.h2z('ﾃｨﾛﾌｨﾅｰﾚ'), 'ティロフィナーレ')
    assert_equal(jctconv.h2z('ﾃｨﾛﾌｨﾅｰﾚ', ignore='ｨ'), 'テｨロフｨナーレ')
    _compare(jctconv.h2z, HALF_KANA, FULL_KANA)
    _compare(partial(jctconv.h2z, ascii=True), HALF_ASCII, FULL_ASCII)
    _compare(partial(jctconv.h2z, digit=True), HALF_DIGIT, FULL_DIGIT)
    for ascii in (True, False):
        for digit in (True, False):
            for kana in (True, False):
                assert_equal(
                    jctconv.h2z(_concat(HALF_KANA if kana else FULL_KANA,
                                        HALF_ASCII if ascii else FULL_ASCII,
                                        HALF_DIGIT if digit else FULL_DIGIT),
                                ascii=ascii, digit=digit, kana=kana),
                    _concat(FULL_KANA, FULL_ASCII, FULL_DIGIT))
def hanKana2zenKana(strings):
    """Convert half-width kana to full-width kana.

    Other characters are left unchanged.
    """
    strings = MultiBytes.convert2unicode(strings)
    return jctconv.h2z(strings)
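# Usage sketch (illustrative, not from the original source; assumes
# MultiBytes.convert2unicode passes unicode input through). With jctconv's
# default flags (kana=True, ascii=False, digit=False, as exercised by the
# tests above), only half-width kana is widened; ASCII letters and digits
# pass through unchanged.
print(hanKana2zenKana(u'ｱｲｳｴｵ abc 123'))  # => アイウエオ abc 123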
def prepare(self, text):
    text = normalize.shorten_repeat(text, 3)
    text = jctconv.h2z(text)
    text = re_a_tag.sub('', text)
    text = kigou.sub('', text)
    for (old, new) in self.paraphrases['before'].items():
        text = text.replace(old, new)
    return text
def normalize_askfm(unicode_string, h2z):
    assert isinstance(unicode_string, unicode)
    s = unicode_string
    s = re.sub(r'\s+', ' ', s)
    s = normalize_neologd(s)
    if h2z:
        return jctconv.h2z(s, kana=True, ascii=True, digit=True)
    else:
        return s
def normalize(text, emoticon=False, repeat=None):
    text = HTMLParser().unescape(text)
    text = text.replace('\r', '\n')
    if emoticon is False:
        text = remove_useless_symbol(text)
        text = text.replace(u'γ⌒ヽ', '')
    text = jctconv.h2z(text)
    text = text.replace(u'よぉ', u'よ').replace(u'よぉ', u'よ')
    text = text.replace(u'よお', u'よ').replace(u'よお', u'よ')
    if repeat:
        text = shorten_repeat(text, repeat)
    return text
def normalize(string):
    string = convert_expr(string)
    keep = Keep(expressions_keep)
    string = keep.encode(string)
    string = preprocess(string)
    string = convert_digit(string)
    string = jctconv.normalize(string, 'NFKC')
    string = jctconv.h2z(string, digit=True, ascii=True)
    string = convert_two_digit(string)
    string = keep.restore(string)
    return string
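# Sketch of the two jctconv calls above in isolation (illustrative; the
# sample string and expected values are assumptions, not from the source).
# NFKC first narrows full-width ASCII and digits to their ASCII forms, then
# h2z(digit=True, ascii=True) re-widens them, so mixed input ends up in one
# canonical full-width form.
import jctconv

s = jctconv.normalize(u'Ｘは12３', 'NFKC')   # => u'Xは123'
s = jctconv.h2z(s, digit=True, ascii=True)   # => u'Ｘは１２３'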
def get_lines(self, xs, ys, sent_idx, doc_fn):
    """See base class."""
    surf_seq, *rest = xs
    if sent_idx != self.sentence_id or doc_fn != self.file_name:
        # Check overlap
        position = (sent_idx, doc_fn)
        if position in self.set_known_position:
            raise RuntimeError("Can't convert because filename and sentence id are not sorted.")
        self.sentence_id = sent_idx
        self.file_name = doc_fn
    tokens = " ".join([jctconv.h2z(surf, digit=True, ascii=True)
                       for surf in surf_seq])
    return tokens
def mecab_maintenance():
    ma_result = ''
    term_added_message = ''
    check_matrix_message = ''
    edit_matrix_message = ''
    search_result = ''
    updating_dic_now = is_updating_dic_now()
    if request.method == 'POST':
        if request.form.get('ma'):
            nbest = int(request.form.get('nbest'))
            mecab_arg = '-N %s' % nbest if nbest > 1 else ''
            tagger = MeCab.Tagger(DEFALUT_FORMAT + mecab_arg)
            text = jctconv.h2z(request.form.get('ma'))
            for line in text.splitlines():
                ma_result += line + '\n'
                if nbest > 2:
                    tagger.parseNBestInit(line)
                    for i in range(nbest):
                        result = tagger.next()
                        if result:
                            ma_result += result
                        else:
                            break
                else:
                    ma_result += tagger.parse(line)
        elif request.form.get('add') and request.form.get('term'):
            input_pos = request.form.get('pos')
            term = request.form.get('term')
            lemma = request.form['lemma'] or term
            yomi = request.form['yomi'] or get_yomi(term)
            term_added_message = add_term(input_pos, term, lemma, yomi)
        elif request.form.get('del') and request.form.get('del_line'):
            delete_term(request.form.get('del_line'))
        elif all(request.form.get(v) for v in ('chk_matrix', 'lid', 'rid')):
            check_matrix_message = check_matrix(request.form['lid'],
                                                request.form['rid'])
        elif all(request.form.get(v) for v in ('matrix', 'lid', 'rid', 'cost')):
            edit_matrix_message = edit_matrix(request.form.get('lid'),
                                              request.form.get('rid'),
                                              request.form.get('cost'))
        elif request.form.get('search') and request.form.get('search_query'):
            search_result = search(request.form.get('search_query'))
        elif request.form.get('update'):
            update_dic()
    return render_template('mecab.html', ma_result=ma_result,
                           term_added_message=term_added_message,
                           check_matrix_message=check_matrix_message,
                           edit_matrix_message=edit_matrix_message,
                           search_result=search_result,
                           updating_dic_now=updating_dic_now,
                           PARTS_OF_SPEECH=PARTS_OF_SPEECH)
def main(file):
    result = ''
    name = file.split('\\')[-1]
    PATH = re.sub(name, '', file)
    name = name.replace(".docx", ".txt")
    folder = PATH + name.replace(".txt", "") + '_img'
    os.mkdir(folder)
    text = docx2txt.process(file, folder)
    text = jctconv.h2z(text, kana=False, digit=True, ascii=False)  # convert digits to full-width
    result = anaryze(text)
    with open(PATH + 'new_' + name, mode='w') as f:
        f.write('[simpleruby]\n')
        f.write(result)
        f.write('\n[/simpleruby]')
MECAB_ARGS_KEYS = 'rdulDOapmMFUBESxbPCtco'

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    for key in MECAB_ARGS_KEYS:
        parser.add_argument('-%s' % key)
    parser.add_argument('-N', type=int)
    args = parser.parse_args()
    mecab_arg = ''
    for key in MECAB_ARGS_KEYS:
        arg = getattr(args, key)
        if arg:
            mecab_arg += ' -%s%s' % (key, arg)
    if not args.F:
        mecab_arg += DEFALUT_FORMAT
    mecab = MeCab.Tagger(mecab_arg)
    while True:
        sentence = input()
        sentence = jctconv.h2z(sentence)
        if args.N:
            mecab.parseNBestInit(sentence)
            for i in range(args.N):
                result = mecab.next()
                if result:
                    print(result)
                else:
                    break
        else:
            result = mecab.parse(sentence)
            print(result)
if verbose > 1:
    sys.stderr.write('whit ' + repr(lno) + ": " + line.encode('utf-8') + "\n")

# normalize to null
line = re.sub(x_null, r'', line)
if verbose > 1:
    sys.stderr.write('null ' + repr(lno) + ": " + line.encode('utf-8') + "\n")

if despacing:
    # e.g. use case: before japanese tokenization, remove suspect white space
    # note that wide comma normalization occurs here
    line = despace(line)  # keep the result; a bare despace(line) call would discard it
    if verbose > 1:
        sys.stderr.write('despace ' + repr(lno) + ": " + line.encode('utf-8') + "\n")

if widecase:
    # wide-casing for specified classes
    line = h2z(line, kana=jctkana, digit=jctsym, ascii=jctalpha)
    if verbose > 1:
        sys.stderr.write('wide ' + repr(lno) + ": " + line.encode('utf-8') + "\n")

if narrowcase:
    # narrow-casing for specified classes
    line = z2h(line, kana=jctkana, digit=jctsym, ascii=jctalpha)
    if verbose > 1:
        sys.stderr.write('narrow ' + repr(lno) + ": " + line.encode('utf-8') + "\n")

# tokenize using pipe
ptok.sendline(line.encode('utf-8'))
if verbose > 1:
    sys.stderr.write('reading...\n')
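# Round-trip sketch of the wide/narrow casing pair used above (illustrative;
# the sample string is an assumption, and the flag semantics are taken from
# the h2z tests in this section):
from jctconv import h2z, z2h

wide = h2z(u'ﾃﾞｰﾀ123abc', kana=True, digit=True, ascii=True)
# wide == u'データ１２３ａｂｃ' (half-width dakuten marks are composed)
narrow = z2h(wide, kana=True, digit=True, ascii=True)
# narrow == u'ﾃﾞｰﾀ123abc'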
def test_h2z():
    assert_equal(jctconv.h2z(u'ﾃｨﾛﾌｨﾅｰﾚ'), u'ティロフィナーレ')
    assert_equal(jctconv.h2z(HALF_KANA), FULL_KANA)
    assert_equal(jctconv.h2z(HALF_ASCII, mode='ASCII'), FULL_ASCII)
    assert_equal(jctconv.h2z(HALF_DIGIT, mode='DIGIT'), FULL_DIGIT)
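# The mode= keyword here appears to come from an older jctconv release; the
# flag-based API used in the other tests in this section would express the
# same checks as:
#     assert_equal(jctconv.h2z(HALF_ASCII, ascii=True), FULL_ASCII)
#     assert_equal(jctconv.h2z(HALF_DIGIT, digit=True), FULL_DIGIT)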