def filter(x):
    """Normalize a raw text string before further processing.

    Steps, in order:
      1. Pass the text through ``gezi.filter_quota``.
      2. Replace carriage returns and the literal marker ``<R>`` with the
         control character ``\\x01``; replace newlines and the literal
         marker ``<N>`` with ``\\x02``; replace tabs with a single space.
      3. If ``FLAGS.to_simplify`` is set, run ``gezi.to_simplify`` on it.
      4. Lower-case the result.

    Args:
      x: the input text.

    Returns:
      The normalized, lower-cased text.
    """
    # Order matters only in that it mirrors the substitution sequence one by
    # one; each pair is (substring to find, replacement).
    substitutions = (
        ('\r', '\x01'),
        ('\n', '\x02'),
        ('<R>', '\x01'),
        ('<N>', '\x02'),
        ('\t', ' '),
    )

    text = gezi.filter_quota(x)
    for old, new in substitutions:
        text = text.replace(old, new)

    # Simplifying does not seem to help quality, but might help diversity.
    if FLAGS.to_simplify:
        text = gezi.to_simplify(text)

    # TODO: if needed, check whether keeping case is useful. For sentiment
    # (as opposed to reading) lower-casing is considered acceptable: it does
    # not lose important information for tokens such as NIKE.
    return text.lower()
# ==============================================================================
#          \file   to-simplify.py
#        \author   chenghuige
#          \date   2018-10-19 12:58:07.505225
#   \Description   Read a JSON-lines file (path in argv[1]), run
#                  gezi.to_simplify over the 'passage', 'query' and
#                  'alternatives' fields of each record, and print the
#                  converted record as UTF-8 encoded JSON, one per line.
# ==============================================================================

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import sys
import os

from tqdm import tqdm
import pandas as pd

import gezi

import traceback
import json

import six

# You may need to symlink ~/soft/bseg/ data, conf and lib into the current
# path and run inside a pyenv (python2) environment.
assert six.PY2, 'must using py2 env to do simplify'

# Fields of every record that are passed through gezi.to_simplify.
_FIELDS = ('passage', 'query', 'alternatives')

# Use a context manager so the input file is closed deterministically instead
# of being left to the garbage collector.
with open(sys.argv[1]) as _input_file:
    for line in _input_file:
        m = json.loads(line.rstrip('\n'))
        for _field in _FIELDS:
            m[_field] = gezi.to_simplify(m[_field])
        # ensure_ascii=False keeps non-ASCII characters readable; the result
        # is encoded explicitly to UTF-8 bytes before printing (Python 2).
        print(json.dumps(m, ensure_ascii=False).encode('utf8'))
# Script section: run gezi.to_simplify over one column of a CSV file and
# report how many rows changed and how many raised an error.
#
# Command line:
#   argv[1]  path of the CSV file to read
#   argv[3]  optional column name (defaults to 'content')
# NOTE(review): argv[2] is never read in this section and the updated frame is
# not written anywhere here -- confirm whether an output step follows.
assert six.PY2, 'must using py2 env to do simplify'

# Column to convert; an optional third CLI argument overrides the default.
key = sys.argv[3] if len(sys.argv) > 3 else 'content'

df = pd.read_csv(sys.argv[1], lineterminator='\n')
contents = df[key].values

num_modified = 0
num_errs = 0
for i in tqdm(range(len(contents)), ascii=True):
    original = contents[i]
    try:
        scontent = gezi.to_simplify(original)
    except Exception:
        # Deliberate best-effort: log the failure, count it, and leave the
        # row untouched rather than aborting the whole run.
        num_errs += 1
        print(traceback.format_exc())
        continue
    if scontent != original:
        contents[i] = scontent
        num_modified += 1

df[key] = contents

# Force true division: this script requires Python 2, where int / int
# truncates unless `from __future__ import division` is in effect. Also avoid
# ZeroDivisionError when the CSV has no rows.
num_rows = len(df)
modify_ratio = float(num_modified) / num_rows if num_rows else 0.0
print('modify ratio', modify_ratio)
print('num_errs', num_errs)
# Script section: read JSON records from stdin (one per line), run
# gezi.to_simplify over the question, every answer, and every document title
# and paragraph, then print each converted record as UTF-8 encoded JSON.
flags = tf.app.flags
FLAGS = flags.FLAGS

import sys
import os

import numpy as np

import gezi

import json
import traceback

# Module-level names kept as-is; they are not used in this section, but other
# parts of the file may rely on them.
START_WORD = '<S>'
END_WORD = '</S>'

num = 0
num_errs = 0

for line in sys.stdin:
    record_text = line.rstrip()
    m = json.loads(record_text)

    m['question'] = gezi.to_simplify(m['question'])

    # Convert answers in place, position by position.
    answers = m['answers']
    for answer_idx, answer in enumerate(answers):
        answers[answer_idx] = gezi.to_simplify(answer)

    # Convert each document's title, then each of its paragraphs in place.
    for document in m['documents']:
        document['title'] = gezi.to_simplify(document['title'])
        paragraphs = document['paragraphs']
        for paragraph_idx, paragraph in enumerate(paragraphs):
            paragraphs[paragraph_idx] = gezi.to_simplify(paragraph)

    # ensure_ascii=False keeps non-ASCII characters readable; the result is
    # encoded explicitly to UTF-8 bytes before printing.
    print(json.dumps(m, ensure_ascii=False).encode('utf8'))