# imports assumed from the LASER source tree (source/embed.py, source/lib/),
# as set up via sys.path in LASER's task scripts
import os
from embed import EncodeLoad, EncodeFile
from text_processing import Token, BPEfastApply
from indexing import IndexCreate

all_texts.append(texts)

enc = EncodeLoad(args)

out_dir = os.path.dirname(args.output)
if out_dir and not os.path.exists(out_dir):
    print(' - creating directory {}'.format(out_dir))
    os.makedirs(out_dir)

all_data = []
all_index = []
for l in args.lang:
    # tokenize the raw input for this language
    Token(os.path.join(args.base_dir, args.data + '.' + l),
          os.path.join(args.base_dir, args.output + '.tok.' + l),
          lang=l,
          romanize=(l == 'el'),  # only Greek is romanized
          lower_case=True,
          verbose=args.verbose,
          over_write=False)
    # apply the pretrained BPE codes
    BPEfastApply(os.path.join(args.base_dir, args.output + '.tok.' + l),
                 os.path.join(args.base_dir, args.output + '.bpe.' + l),
                 args.bpe_codes,
                 verbose=args.verbose,
                 over_write=False)
    # encode the BPE file into LASER sentence embeddings
    EncodeFile(enc,
               os.path.join(args.base_dir, args.output + '.bpe.' + l),
               os.path.join(args.base_dir, args.output + '.enc.' + l),
               verbose=args.verbose,
               over_write=False)
    # build a FAISS index over the embeddings; the call was truncated here and
    # is completed as in LASER's similarity-search example ('FlatL2' is the
    # only index type IndexCreate currently supports)
    d, idx = IndexCreate(os.path.join(args.base_dir, args.output + '.enc.' + l),
                         'FlatL2',
                         verbose=args.verbose,
                         save_index=False)
    all_data.append(d)
    all_index.append(idx)
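# IndexCreate returns the embedding matrix together with a FAISS index, so
# the all_data / all_index lists built above can be queried directly. A
# minimal sketch, assuming the standard FAISS search API and (hypothetically)
# using language 0's embeddings as queries against language 1's index:
k = 1  # number of neighbours per query sentence
queries = all_data[0]                    # embeddings of the first language
D, I = all_index[1].search(queries, k)   # FAISS returns distances and row ids
for q in range(min(5, queries.shape[0])):
    print('sentence {:d} -> neighbour {:d} (L2 distance {:.3f})'.format(
        q, I[q, 0], D[q, 0]))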
############################################################
# load and preprocess input file
############################################################
import torch  # needed below to read the model checkpoint

BPE_CODES = LASER + '/models/93langs.fcodes'
LANGUAGE_CODE = 'en'
VERBOSE = True

input_file = 'data/test_sentences.txt'
tokenized_f = 'data/test_tokenized.txt'
bpe_f = 'data/test_bpe.txt'

# tokenize
Token(input_file,
      tokenized_f,
      lang=LANGUAGE_CODE,
      romanize=False,  # kept static for simplicity
      lower_case=True,
      gzip=False,
      verbose=VERBOSE,
      over_write=False)

# apply BPE
BPEfastApply(tokenized_f,
             bpe_f,
             BPE_CODES,
             verbose=VERBOSE,
             over_write=False)

############################################################
# Load + infer model
############################################################
model_path = LASER + '/models/bilstm.93langs.2018-12-26.pt'
model = LASEREmbedderIV(model_path, LASERHiddenExtractor, 300, 100, 10)
bpe_to_idx = torch.load(model_path)['dictionary']
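# The checkpoint's 'dictionary' maps BPE token strings to vocabulary indices.
# A minimal sketch of running the embedder on the preprocessed file follows;
# this is hypothetical usage, since the exact LASEREmbedderIV call signature
# is defined elsewhere, and the fairseq-style special indices (<pad>=1,
# <unk>=3) are assumptions.
with open(bpe_f, encoding='utf-8') as f:
    bpe_sentences = [line.strip().split() for line in f]

unk_idx = bpe_to_idx.get('<unk>', 3)  # assumption: fairseq-style default
batch = [torch.tensor([bpe_to_idx.get(tok, unk_idx) for tok in sent])
         for sent in bpe_sentences]
padded = torch.nn.utils.rnn.pad_sequence(batch, batch_first=True,
                                         padding_value=1)  # assumption: <pad>=1

with torch.no_grad():
    token_embeddings = model(padded)  # output shape depends on the extractor
print(token_embeddings.shape)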