# Example #1
            all_texts.append(texts)

# Load the sentence encoder configured by the command-line arguments.
enc = EncodeLoad(args)

# Ensure the output directory exists before writing any files.
# NOTE: os.mkdir fails when intermediate directories are missing, and when
# args.output has no directory component out_dir is '' (os.path.exists('')
# is False), so the original os.mkdir('') raised FileNotFoundError.
# Guard on a non-empty out_dir and use makedirs with exist_ok=True.
out_dir = os.path.dirname(args.output)
if out_dir and not os.path.exists(out_dir):
    print(' - creating directory {}'.format(out_dir))
    os.makedirs(out_dir, exist_ok=True)

# Accumulators filled once per language in the loop below.
all_data = []
all_index = []
for l in args.lang:
    Token(os.path.join(args.base_dir, args.data + '.' + l),
          os.path.join(args.base_dir, args.output + '.tok.' + l),
          lang=l,
          romanize=True if l == 'el' else False,
          lower_case=True,
          verbose=args.verbose,
          over_write=False)
    BPEfastApply(os.path.join(args.base_dir, args.output + '.tok.' + l),
                 os.path.join(args.base_dir, args.output + '.bpe.' + l),
                 args.bpe_codes,
                 verbose=args.verbose,
                 over_write=False)
    EncodeFile(enc,
               os.path.join(args.base_dir, args.output + '.bpe.' + l),
               os.path.join(args.base_dir, args.output + '.enc.' + l),
               verbose=args.verbose,
               over_write=False)
    d, idx = IndexCreate(os.path.join(args.base_dir,
                                      args.output + '.enc.' + l),
# Example #2
############################################################
# Input preprocessing: tokenize the raw sentences, then
# apply byte-pair encoding.
############################################################

# File produced by each preprocessing stage.
input_file = 'data/test_sentences.txt'
tokenized_f = 'data/test_tokenized.txt'
bpe_f = 'data/test_bpe.txt'

# Preprocessing configuration.
BPE_CODES = LASER + '/models/93langs.fcodes'
LANGUAGE_CODE = 'en'
VERBOSE = True

# Tokenization (romanization kept disabled for simplicity).
Token(input_file,
      tokenized_f,
      lang=LANGUAGE_CODE,
      romanize=False,
      lower_case=True,
      gzip=False,
      verbose=VERBOSE,
      over_write=False)

# Apply the pretrained BPE codes to the tokenized text.
BPEfastApply(tokenized_f, bpe_f, BPE_CODES,
             verbose=VERBOSE, over_write=False)

############################################################
# Load + infer model
############################################################
model_path = LASER + '/models/bilstm.93langs.2018-12-26.pt'

# Build the embedder around the pretrained BiLSTM checkpoint.
model = LASEREmbedderIV(model_path, LASERHiddenExtractor, 300, 100, 10)

# The same checkpoint also carries the BPE-token-to-index vocabulary.
state = torch.load(model_path)
bpe_to_idx = state['dictionary']