def test_sublexicalize(self):
    """Check ``sublexicalize`` output for a single order and for a tuple of orders.

    The expected strings show the contract being pinned: the result is a
    space-separated sequence of character n-grams in which the spaces of
    the input appear as ``_``.  With ``order=(3, 4)`` the 3-grams and
    4-grams are interleaved in the output.
    """
    # assertEqual replaces the deprecated alias assertEquals, which was
    # removed from unittest in Python 3.12.
    result = sublexicalize("abc def ghi", order=3)
    self.assertEqual(result, "abc bc_ c_d _de def ef_ f_g _gh ghi")

    result = sublexicalize("abc def ghi", order=4)
    self.assertEqual(result, "abc_ bc_d c_de _def def_ ef_g f_gh _ghi")

    result = sublexicalize("abc def ghi", order=(3, 4))
    self.assertEqual(
        result,
        "abc bc_ abc_ c_d bc_d _de c_de def _def ef_ def_ f_g ef_g _gh f_gh ghi _ghi",
    )
def test_sublexicalize(self):
    """Verify ``sublexicalize`` on a fixed sentence for several ``order`` values.

    Each case pairs an ``order`` argument (an int, or a tuple of ints) with
    the exact string expected back for the input ``"abc def ghi"``.  In the
    expected strings, n-grams are separated by spaces and the spaces of the
    input show up as ``_``.
    """
    cases = (
        (3, "abc bc_ c_d _de def ef_ f_g _gh ghi"),
        (4, "abc_ bc_d c_de _def def_ ef_g f_gh _ghi"),
        (
            (3, 4),
            "abc bc_ abc_ c_d bc_d _de c_de def _def ef_ def_ f_g ef_g _gh f_gh ghi _ghi",
        ),
    )
    for order, expected in cases:
        result = sublexicalize("abc def ghi", order=order)
        # assertEqual is used instead of the deprecated alias assertEquals
        # (removed in Python 3.12).  The message identifies the failing case.
        self.assertEqual(result, expected, "order=%r" % (order,))
def process(args):
    """Join, optionally clean, and sublexicalize one packed work item.

    Args:
        args: A 3-item sequence ``(text, clean_func, order)``.  ``text`` is
            an iterable of strings that gets joined with single spaces;
            ``clean_func`` is a callable applied to the joined string, or
            a falsy value to skip cleaning; ``order`` is forwarded to
            ``sublexicalize``.

    Returns:
        Whatever ``sublexicalize(..., order=order, join=False)`` returns.
        NOTE(review): presumably an unjoined sequence of n-grams given
        ``join=False`` -- confirm against ``sublexicalize``.
    """
    pieces, clean_func, order = args
    joined = ' '.join(pieces)
    prepared = clean_func(joined) if clean_func else joined
    return sublexicalize(prepared, order=order, join=False)
def main():
    """Command-line filter: sublexicalize each line of stdin onto stdout.

    Options:
        -e / --encoding: optional codec name; when given, stdin is decoded
            and stdout is encoded with it.
        -o / --order: n-gram order specification (default ``"3"``), parsed
            by ``parse_ngram_order``.

    Each input line is passed to ``sublexicalize`` as read (including its
    line terminator, if any) and the result is written followed by a
    newline.
    """
    parser = ArgumentParser()
    parser.add_argument('-e', '--encoding')
    parser.add_argument('-o', '--order', default="3")
    args = parser.parse_args()

    encoding = args.encoding
    order = parse_ngram_order(args.order)

    if encoding:
        # The codecs stream wrappers read and write bytes on the underlying
        # stream.  On Python 3 sys.stdin/sys.stdout are text streams, so
        # wrapping them directly raises TypeError on first use; wrap their
        # binary ``buffer`` instead.  Python 2 streams have no ``buffer``
        # attribute, so fall back to the stream itself there.
        raw_stdout = getattr(sys.stdout, 'buffer', sys.stdout)
        raw_stdin = getattr(sys.stdin, 'buffer', sys.stdin)
        sys.stdout = codecs.getwriter(encoding)(raw_stdout)
        sys.stdin = codecs.getreader(encoding)(raw_stdin)

    for text in sys.stdin:
        text = sublexicalize(text, order=order)
        sys.stdout.write(text)
        sys.stdout.write('\n')
def clean_c6(text_str):
    """Clean ``text_str`` with ``mahoney_clean`` and sublexicalize it at order 6.

    Args:
        text_str: The raw text to process.

    Returns:
        The result of ``sublexicalize`` applied to the cleaned text with
        ``order=6``.
    """
    cleaned = mahoney_clean(text_str)
    return sublexicalize(cleaned, order=6)
from optparse import OptionParser
import os
import re
import sys

# Make the sibling ``Experiments`` directory importable so that
# ``experiment_support`` can be found when this script is run directly.
cur_path, _ = os.path.split(__file__)
sys.path.append(os.path.join(cur_path, '..', 'Experiments'))

from experiment_support.preprocessing import sublexicalize

# Number of characters read from stdin per iteration.
BUF_SIZE = 8192

if __name__ == '__main__':
    # Stream stdin through sublexicalize in BUF_SIZE chunks, writing the
    # n-grams of each chunk to stdout followed by a single space.
    parser = OptionParser()
    parser.add_option("-n", "--ngram-order", default=3)
    opts, args = parser.parse_args()
    order = int(opts.ngram_order)

    # The last (order - 1) characters of each chunk's output are prepended
    # to the next chunk so that n-grams spanning a chunk boundary are
    # produced as well.
    carry_len = order - 1

    in_str = sys.stdin.read(BUF_SIZE)
    rest_str = ""
    while len(in_str) > 0:
        out_str = sublexicalize(rest_str + in_str.rstrip('\n'), order=order)
        if carry_len > 0:
            # '_' is turned back into ' ' before re-feeding the tail --
            # presumably undoing sublexicalize's encoding of spaces.
            rest_str = re.sub('_', ' ', out_str[-carry_len:])
        else:
            # With order == 1 the slice out_str[-0:] would be the WHOLE
            # output, so every chunk would re-emit all previous output.
            # No carry-over is needed in that case.
            rest_str = ""
        sys.stdout.write(out_str + " ")
        in_str = sys.stdin.read(BUF_SIZE)
from optparse import OptionParser
import os
import re
import sys

# Add the sibling ``Experiments`` directory to the import path so that
# ``experiment_support`` resolves when the script is executed directly.
cur_path, _ = os.path.split(__file__)
sys.path.append(os.path.join(cur_path, '..', 'Experiments'))

from experiment_support.preprocessing import sublexicalize

# Chunk size (in characters) for each read from stdin.
BUF_SIZE = 8192

if __name__ == '__main__':
    # Read stdin chunk by chunk, sublexicalize each chunk, and write the
    # result to stdout with a trailing space after every chunk.
    parser = OptionParser()
    parser.add_option("-n", "--ngram-order", default=3)
    opts, args = parser.parse_args()
    order = int(opts.ngram_order)

    # Number of trailing output characters carried into the next chunk so
    # that n-grams crossing a chunk boundary are not lost.
    overlap = order - 1

    rest_str = ""
    in_str = sys.stdin.read(BUF_SIZE)
    while len(in_str) > 0:
        out_str = sublexicalize(rest_str + in_str.rstrip('\n'), order=order)
        # Guard the slice: for order == 1, out_str[-0:] is the entire
        # string, which would feed all previous output back in and
        # duplicate it on every iteration.
        if overlap > 0:
            # Underscores are mapped back to spaces before the tail is
            # re-fed -- presumably reversing sublexicalize's space marker.
            rest_str = re.sub('_', ' ', out_str[-overlap:])
        else:
            rest_str = ""
        sys.stdout.write(out_str + " ")
        in_str = sys.stdin.read(BUF_SIZE)