예제 #1
0
def filter(x):
    """Normalize a raw text string before further processing.

    Steps, in order:
      1. Pass the input through ``gezi.filter_quota``.
      2. Replace line-break characters and their ``<R>`` / ``<N>`` textual
         stand-ins with the control characters ``\\x01`` / ``\\x02``, and
         tabs with a single space.
      3. If ``FLAGS.to_simplify`` is set, run ``gezi.to_simplify`` on the text.
      4. Lowercase the result.

    Args:
        x: the input text.

    Returns:
        The normalized, lowercased text.
    """
    # Ordered (old, new) substitutions; applied one after another.
    substitutions = (
        ('\r', '\x01'),
        ('\n', '\x02'),
        ('<R>', '\x01'),
        ('<N>', '\x02'),
        ('\t', ' '),
    )
    text = gezi.filter_quota(x)
    for old, new in substitutions:
        text = text.replace(old, new)

    # Simplifying does not seem to help accuracy, but might help diversity.
    if FLAGS.to_simplify:
        text = gezi.to_simplify(text)

    # TODO: check whether keeping case is useful. The working assumption is
    # that for sentiment (as opposed to reading) lowercasing does not lose
    # important information, e.g. for a token like "NIKE".
    return text.lower()
예제 #2
0
# ==============================================================================
#          \file   to-simplify.py
#        \author   chenghuige  
#          \date   2018-10-19 12:58:07.505225
#   \Description  
# ==============================================================================

  
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import sys 
import os

from tqdm import tqdm
import pandas as pd
import gezi
import traceback 
import json
import six  
#you may need to ln ~/soft/bseg/ data,conf,lib to current path and run in pyenv(python2)
assert six.PY2, 'must using py2 env to do simplify'

# Read the file named by the first command-line argument one line at a time,
# parse each line as JSON, run gezi.to_simplify over the 'passage', 'query'
# and 'alternatives' fields, and print the record back out as UTF-8 encoded
# JSON (one record per line).
# The input is opened in a `with` block so the file handle is closed
# deterministically, even if a record fails to parse or convert.
with open(sys.argv[1]) as f:
  for line in f:
    m = json.loads(line.rstrip('\n'))
    for field in ('passage', 'query', 'alternatives'):
      m[field] = gezi.to_simplify(m[field])
    print(json.dumps(m, ensure_ascii=False).encode('utf8'))
예제 #3
0
assert six.PY2, 'must using py2 env to do simplify'

# Column to convert: defaults to 'content', overridden by the third
# command-line argument when one is given.
key = sys.argv[3] if len(sys.argv) > 3 else 'content'

# Load the CSV named by the first command-line argument.
df = pd.read_csv(sys.argv[1], lineterminator='\n')
contents = df[key].values

num_modified = 0
num_errs = 0
for i in tqdm(range(len(contents)), ascii=True):
    original = contents[i]
    try:
        scontent = gezi.to_simplify(original)
    except Exception:
        # Best effort: count the failure, show the traceback, and leave this
        # row unchanged.
        num_errs += 1
        print(traceback.format_exc())
    else:
        # Only write back (and count) rows that the conversion changed.
        if scontent != original:
            contents[i] = scontent
            num_modified += 1

df[key] = contents

# Summary: fraction of rows changed and number of rows that raised.
print('modify ratio', num_modified / len(df))
print('num_errs', num_errs)
예제 #4
0
# Short aliases for the flags module exposed at tf.app.flags and its FLAGS
# object. NOTE(review): `tf` is imported outside this view (presumably
# TensorFlow 1.x) -- confirm.
flags = tf.app.flags
FLAGS = flags.FLAGS

import sys, os
import numpy as np
import gezi

import json
import traceback

START_WORD = '<S>'
END_WORD = '</S>'

num = 0
num_errs = 0
# Stream records from stdin, one JSON object per line. For each record run
# gezi.to_simplify over the question, every answer, and every document's
# title and paragraphs, then print the record as UTF-8 encoded JSON.
for line in sys.stdin:
    line = line.rstrip()
    m = json.loads(line)

    m['question'] = gezi.to_simplify(m['question'])

    # Lists are updated in place, element by element.
    answers = m['answers']
    for idx, answer in enumerate(answers):
        answers[idx] = gezi.to_simplify(answer)

    for doc in m['documents']:
        doc['title'] = gezi.to_simplify(doc['title'])
        paragraphs = doc['paragraphs']
        for idx, paragraph in enumerate(paragraphs):
            paragraphs[idx] = gezi.to_simplify(paragraph)

    print(json.dumps(m, ensure_ascii=False).encode('utf8'))