def test_sublexicalize(self):
        """Check sublexicalize output for single and combined n-gram orders.

        The expected strings are space-separated character n-grams in which
        the spaces of the input text appear as ``_``.
        """
        # assertEqual is used instead of assertEquals: the latter is a
        # deprecated alias that was removed from unittest in Python 3.12.
        result = sublexicalize("abc def ghi", order=3)
        self.assertEqual(result, "abc bc_ c_d _de def ef_ f_g _gh ghi")

        result = sublexicalize("abc def ghi", order=4)
        self.assertEqual(result, "abc_ bc_d c_de _def def_ ef_g f_gh _ghi")

        # A tuple requests several orders at once; the expected output mixes
        # the 3-grams and 4-grams in a single string.
        result = sublexicalize("abc def ghi", order=(3, 4))
        self.assertEqual(
            result,
            "abc bc_ abc_ c_d bc_d _de c_de def _def ef_ def_ f_g ef_g _gh f_gh ghi _ghi"
        )
    def test_sublexicalize(self):
        """Check sublexicalize output for single and combined n-gram orders.

        The expected strings are space-separated character n-grams in which
        the spaces of the input text appear as ``_``.
        """
        # assertEqual is used instead of assertEquals: the latter is a
        # deprecated alias that was removed from unittest in Python 3.12.
        result = sublexicalize("abc def ghi", order=3)
        self.assertEqual(result, "abc bc_ c_d _de def ef_ f_g _gh ghi")

        result = sublexicalize("abc def ghi", order=4)
        self.assertEqual(result, "abc_ bc_d c_de _def def_ ef_g f_gh _ghi")

        # A tuple requests several orders at once; the expected output mixes
        # the 3-grams and 4-grams in a single string.
        result = sublexicalize("abc def ghi", order=(3, 4))
        self.assertEqual(
            result,
            "abc bc_ abc_ c_d bc_d _de c_de def _def ef_ def_ f_g ef_g _gh f_gh ghi _ghi"
        )
def process(args):
    """Join, optionally clean, and sublexicalize one packed work item.

    ``args`` is a 3-item sequence ``(text, clean_func, order)``:
    ``text`` is an iterable of strings that is joined with single spaces,
    ``clean_func`` is an optional callable applied to the joined string
    (skipped when falsy), and ``order`` is forwarded to ``sublexicalize``.

    Returns whatever ``sublexicalize(..., join=False)`` returns.
    """
    pieces, clean_func, order = args

    joined = ' '.join(pieces)
    prepared = clean_func(joined) if clean_func else joined

    return sublexicalize(prepared, order=order, join=False)
def process(args):
    """Join, optionally clean, and sublexicalize one packed work item.

    ``args`` is a 3-item sequence ``(text, clean_func, order)``:
    ``text`` is an iterable of strings that is joined with single spaces,
    ``clean_func`` is an optional callable applied to the joined string
    (skipped when falsy), and ``order`` is forwarded to ``sublexicalize``.

    Returns whatever ``sublexicalize(..., join=False)`` returns.
    """
    pieces, clean_func, order = args

    joined = ' '.join(pieces)
    prepared = clean_func(joined) if clean_func else joined

    return sublexicalize(prepared, order=order, join=False)
예제 #5
0
def main():
    """Read lines from stdin and write their sublexicalized form to stdout.

    Command-line options:
        -e / --encoding: optional codec name; when given, stdin is decoded
            and stdout is encoded with it.
        -o / --order: n-gram order specification handed to
            ``parse_ngram_order`` (default ``"3"``).

    Each input line is passed to ``sublexicalize`` and the result is written
    followed by a newline.
    """
    parser = ArgumentParser()
    parser.add_argument('-e', '--encoding')
    parser.add_argument('-o', '--order', default="3")
    args = parser.parse_args()

    encoding = args.encoding
    order = parse_ngram_order(args.order)

    if encoding:
        # codecs stream wrappers read and write bytes. On Python 3 the
        # byte-level streams are exposed as ``.buffer``; wrapping the text
        # streams directly raises TypeError there. Where ``.buffer`` is
        # absent (Python 2) the stream itself is wrapped, as before.
        raw_stdout = getattr(sys.stdout, 'buffer', sys.stdout)
        raw_stdin = getattr(sys.stdin, 'buffer', sys.stdin)
        sys.stdout = codecs.getwriter(encoding)(raw_stdout)
        sys.stdin = codecs.getreader(encoding)(raw_stdin)

    for text in sys.stdin:
        text = sublexicalize(text, order=order)

        sys.stdout.write(text)
        sys.stdout.write('\n')
def clean_c6(text_str):
    """Run ``mahoney_clean`` on ``text_str`` and sublexicalize the result with order 6."""
    cleaned = mahoney_clean(text_str)
    return sublexicalize(cleaned, order=6)
예제 #7
0
from optparse import OptionParser
import os
import re
import sys

# Append ``../Experiments`` (relative to this file's directory) to sys.path.
# This must run before the import below; presumably that directory is where
# the ``experiment_support`` package lives -- confirm against the repo layout.
cur_path, _ = os.path.split(__file__)
sys.path.append(os.path.join(cur_path, '..', 'Experiments'))

from experiment_support.preprocessing import sublexicalize

# Size argument passed to each sys.stdin.read() call in the main loop.
BUF_SIZE = 8192

if __name__ == '__main__':
    # Stream stdin through sublexicalize in BUF_SIZE-sized chunks and write
    # each chunk's output to stdout followed by a single space.
    parser = OptionParser()
    parser.add_option("-n", "--ngram-order", default=3)
    opts, args = parser.parse_args()

    # The option value is a string when given on the command line.
    order = int(opts.ngram_order)

    in_str = sys.stdin.read(BUF_SIZE)
    rest_str = ""

    while in_str:
        out_str = sublexicalize(rest_str + in_str.rstrip('\n'), order=order)

        # Carry the last (order - 1) output characters into the next chunk so
        # that n-grams spanning the chunk boundary can be formed; '_' is
        # mapped back to ' ' before the text is fed to sublexicalize again.
        # With order == 1 nothing is carried: out_str[-0:] would be the whole
        # output string, not an empty tail.
        if order > 1:
            rest_str = out_str[-(order - 1):].replace('_', ' ')
        else:
            rest_str = ""

        sys.stdout.write(out_str + " ")

        in_str = sys.stdin.read(BUF_SIZE)
예제 #8
0
from optparse import OptionParser
import os
import re
import sys

# Append ``../Experiments`` (relative to this file's directory) to sys.path.
# This must run before the import below; presumably that directory is where
# the ``experiment_support`` package lives -- confirm against the repo layout.
cur_path, _ = os.path.split(__file__)
sys.path.append(os.path.join(cur_path, '..', 'Experiments'))

from experiment_support.preprocessing import sublexicalize

# Size argument passed to each sys.stdin.read() call in the main loop.
BUF_SIZE = 8192

if __name__ == '__main__':
    # Stream stdin through sublexicalize in BUF_SIZE-sized chunks and write
    # each chunk's output to stdout followed by a single space.
    parser = OptionParser()
    parser.add_option("-n", "--ngram-order", default=3)
    opts, args = parser.parse_args()

    # The option value is a string when given on the command line.
    order = int(opts.ngram_order)

    in_str = sys.stdin.read(BUF_SIZE)
    rest_str = ""

    while in_str:
        out_str = sublexicalize(rest_str + in_str.rstrip('\n'), order=order)

        # Carry the last (order - 1) output characters into the next chunk so
        # that n-grams spanning the chunk boundary can be formed; '_' is
        # mapped back to ' ' before the text is fed to sublexicalize again.
        # With order == 1 nothing is carried: out_str[-0:] would be the whole
        # output string, not an empty tail.
        if order > 1:
            rest_str = out_str[-(order - 1):].replace('_', ' ')
        else:
            rest_str = ""

        sys.stdout.write(out_str + " ")

        in_str = sys.stdin.read(BUF_SIZE)
def clean_c6(text_str):
    """Run ``mahoney_clean`` on ``text_str`` and sublexicalize the result with order 6."""
    cleaned = mahoney_clean(text_str)
    return sublexicalize(cleaned, order=6)