def extract(encoder, token_lang, bpe_codes, ifname, output, remove=False, verbose=False):
    """Run the full LASER pipeline (tokenize -> BPE -> encode) on one file.

    Writes raw embeddings to *output* and returns them via EmbedLoad.
    All intermediate files live in a temporary directory that is removed
    automatically when the function returns.
    """
    with tempfile.TemporaryDirectory() as workdir:
        # Optional tokenization step; '--' means the input is pre-tokenized.
        if token_lang != '--':
            tokenized = os.path.join(workdir, 'tok')
            Token(ifname, tokenized,
                  lang=token_lang,
                  romanize=(token_lang == 'el'),
                  lower_case=True, gzip=False,
                  verbose=verbose, over_write=False)
            ifname = tokenized
        # Optional BPE segmentation step.
        if bpe_codes:
            segmented = os.path.join(workdir, 'bpe')
            BPEfastApply(ifname, segmented, bpe_codes,
                         verbose=verbose, over_write=True)
            ifname = segmented
        EncodeFile(encoder, ifname, output,
                   verbose=verbose, over_write=False, buffer_size=10000)
        return EmbedLoad(output)
def Embed(tmpdir, ifname, encoder, token_lang, bpe_codes, buffer_size, verbose):
    """Tokenize, BPE-encode and embed *ifname* inside *tmpdir*.

    Returns the path of the file holding the raw embeddings ('emb' in
    *tmpdir*). Tokenization is skipped when *token_lang* is '--' and BPE
    is skipped when *bpe_codes* is falsy.
    """
    emb_path = os.path.join(tmpdir, 'emb')
    if token_lang != '--':
        tokenized = os.path.join(tmpdir, 'tok')
        Token(ifname, tokenized,
              lang=token_lang,
              romanize=(token_lang == 'el'),
              lower_case=True, gzip=False,
              verbose=verbose, over_write=False)
        ifname = tokenized
    if bpe_codes:
        segmented = os.path.join(tmpdir, 'bpe')
        BPEfastApply(ifname, segmented, bpe_codes,
                     verbose=verbose, over_write=False)
        ifname = segmented
    EncodeFile(encoder, ifname, emb_path,
               verbose=verbose, over_write=False, buffer_size=buffer_size)
    return emb_path
def _bpe(self, src_input="../tmp/src.tok", tgt_input="../tmp/tgt.tok", src_tmp="../tmp/src.bpe", tgt_tmp="../tmp/tgt.bpe"):
    """Apply this object's fastBPE codes to the tokenized source and target files."""
    for tok_file, bpe_file in ((src_input, src_tmp), (tgt_input, tgt_tmp)):
        BPEfastApply(tok_file, bpe_file, self.bpe_codes,
                     verbose=True, over_write=False)
def encode_file(input_filepath, output_filepath, language, bpe_codes_path):
    """Tokenize *input_filepath* for *language*, apply BPE codes and write
    the result to *output_filepath*. The intermediate tokenized file is
    deleted before returning.
    """
    tok_path = get_temp_filepath()
    Token(str(input_filepath), str(tok_path),
          lang=language,
          romanize=(language == 'el'))  # Greek is the only romanized language
    BPEfastApply(str(tok_path), str(output_filepath), str(bpe_codes_path))
    tok_path.unlink()
def launch(self, lang):
    """Build one FAISS index per language and run a multilingual similarity search.

    For each language code in *lang*: tokenize the raw data file, apply BPE,
    encode with self.enc, and create a FlatL2 index. Then searches across all
    indices with IndexSearchMultiple and returns its results.

    NOTE(review): iterating `self.args.lang` assumes *lang* is a list/iterable
    of language codes — passing a plain string would iterate its characters;
    confirm against callers.
    """
    self.args.lang = lang
    all_data = []
    all_index = []
    for l in self.args.lang:
        # Tokenize raw input (<data>.<lang> -> <output>.tok.<lang>);
        # Greek ('el') is the only language that gets romanized.
        Token(os.path.join(self.args.base_dir, self.args.data + '.' + l),
              os.path.join(self.args.base_dir, self.args.output + '.tok.' + l),
              lang=l,
              romanize=True if l == 'el' else False,
              lower_case=True,
              verbose=self.args.verbose, over_write=False)
        # BPE segmentation (.tok -> .bpe).
        BPEfastApply(os.path.join(self.args.base_dir, self.args.output + '.tok.' + l),
                     os.path.join(self.args.base_dir, self.args.output + '.bpe.' + l),
                     self.args.bpe_codes,
                     verbose=self.args.verbose, over_write=False)
        # Sentence embeddings (.bpe -> .enc).
        EncodeFile(self.enc,
                   os.path.join(self.args.base_dir, self.args.output + '.bpe.' + l),
                   os.path.join(self.args.base_dir, self.args.output + '.enc.' + l),
                   verbose=self.args.verbose, over_write=False)
        # In-memory FlatL2 index over the embeddings (not saved to disk).
        d, idx = IndexCreate(os.path.join(self.args.base_dir, self.args.output + '.enc.' + l),
                             'FlatL2',
                             verbose=self.args.verbose, save_index=False)
        all_data.append(d)
        all_index.append(idx)
    # NOTE(review): `all_texts` is not defined in this method — presumably a
    # module-level global; verify it is populated before launch() is called.
    distances, indexes, cosine = IndexSearchMultiple(all_data, all_index,
                                                     texts=all_texts,
                                                     verbose=True, print_errors=False)
    print('D', distances)
    print('I', indexes)
    print('cosine', cosine)
    return distances, indexes, cosine
# Build one in-memory FlatL2 index per language:
# tokenize -> BPE -> encode -> index.
print(' - creating directory {}'.format(out_dir))
os.mkdir(out_dir)
all_data = []
all_index = []
for lang_code in args.lang:
    raw_path = os.path.join(args.base_dir, args.data + '.' + lang_code)
    tok_path = os.path.join(args.base_dir, args.output + '.tok.' + lang_code)
    bpe_path = os.path.join(args.base_dir, args.output + '.bpe.' + lang_code)
    enc_path = os.path.join(args.base_dir, args.output + '.enc.' + lang_code)
    # Greek ('el') is the only language that gets romanized.
    Token(raw_path, tok_path,
          lang=lang_code,
          romanize=(lang_code == 'el'),
          lower_case=True,
          verbose=args.verbose, over_write=False)
    BPEfastApply(tok_path, bpe_path, args.bpe_codes,
                 verbose=args.verbose, over_write=False)
    EncodeFile(enc, bpe_path, enc_path,
               verbose=args.verbose, over_write=False)
    d, idx = IndexCreate(enc_path, 'FlatL2',
                         verbose=args.verbose, save_index=False)
    all_data.append(d)
    all_index.append(idx)
# Preprocess the input (optional tokenization, optional BPE), then embed it
# batch by batch and L2-normalize each batch of embeddings.
# Fix: start from the raw input so `ifile` is always defined — previously it
# was unbound (NameError) when token_lang == '--' and no BPE codes were given.
ifile = args.input
if args.token_lang != '--':
    ifile = os.path.join(tmpdir, 'tok')
    Token(args.input, ifile,
          lang=args.token_lang,
          romanize=True if args.token_lang == 'el' else False,
          lower_case=True, gzip=False,
          verbose=args.verbose, over_write=False)
if args.bpe_codes:
    bpe_file = os.path.join(tmpdir, 'bpe')
    BPEfastApply(ifile, bpe_file, args.bpe_codes,
                 verbose=args.verbose, over_write=False)
    ifile = bpe_file
print(' - processing (batch size is {:d})'.format(args.buffer_size))
ifp = open(ifile, 'r')
ofp = open(args.output, 'w')
# NOTE(review): this creates a namedtuple *class* with fields 'ns np' but then
# sets unrelated class attributes nbs/nbp on it — it works only because any
# attribute can be set on a class. Kept as-is because later code presumably
# reads stats.nbs / stats.nbp; a plain mutable object would be the clean fix.
stats = namedtuple('stats', 'ns np')
stats.nbs = 0
stats.nbp = 0
t = time.time()
for sentences in buffered_read(ifp, args.buffer_size):
    embed = params.enc.encode_sentences(sentences)
    faiss.normalize_L2(embed)
    # call function for selected margin method
# Preprocess every MLDoc split for every language:
# tokenize -> split long lines -> BPE -> encode -> re-join per-sentence
# embeddings into one vector per original line.
enc = EncodeLoad(args)
print('\nProcessing:')
for part in ('train1000', 'dev', 'test'):
    # NOTE: an earlier variant restricted 'train1000' to English only.
    for lang in args.lang:
        base = os.path.join(args.data_dir, 'mldoc.' + part)
        Token(base + '.txt.' + lang, base + '.tok.' + lang,
              lang=lang,
              romanize=(True if lang == 'el' else False),
              lower_case=True, gzip=False,
              verbose=args.verbose, over_write=False)
        SplitLines(base + '.tok.' + lang,
                   base + '.split.' + lang,
                   base + '.sid.' + lang)
        BPEfastApply(base + '.split.' + lang,
                     base + '.split.bpe.' + lang,
                     args.bpe_codes,
                     verbose=args.verbose, over_write=False)
        EncodeFile(enc,
                   base + '.split.bpe.' + lang,
                   base + '.split.enc.' + lang,
                   verbose=args.verbose, over_write=False,
                   buffer_size=args.buffer_size)
        JoinEmbed(base + '.split.enc.' + lang,
                  base + '.sid.' + lang,
                  base + '.enc.' + lang)
def _vectorize(self, docs):
    """Encode sentences using the LASER model.

    Arguments:
        docs: the documents to encode, an iterable of strings

    Returns:
        numpy array of float32 embeddings, one row of 1024 dims per document.
    """
    # Fall back to English when no language was configured.
    if not self.lang:
        lang = "en"
        print("Warning: using default language English")
    else:
        lang = self.lang
    # Locate the pretrained encoder and BPE codes under $LASER/models.
    # Fix: use os.path.join — the old string concatenation produced e.g.
    # '/opt/lasermodels' whenever $LASER lacked a trailing slash.
    model_dir = os.path.join(os.environ.get('LASER'), "models")
    encoder_path = os.path.join(model_dir, "bilstm.93langs.2018-12-26.pt")
    bpe_codes_path = os.path.join(model_dir, "93langs.fcodes")
    print(f' - Encoder: loading {encoder_path}')
    encoder = SentenceEncoder(encoder_path,
                              max_sentences=None, max_tokens=12000,
                              sort_kind='mergesort', cpu=True)
    with tempfile.TemporaryDirectory() as tmp:
        tmpdir = Path(tmp)
        bpe_fname = tmpdir / 'bpe'
        bpe_oname = tmpdir / 'out.raw'
        temp_infile = tmpdir / 'temp_in_docs.txt'
        # One document per line, as the LASER pipeline expects.
        np.savetxt(temp_infile, docs, fmt="%s")
        # Fix: default to the raw file so ifname is always bound — it was
        # previously undefined when lang == '--' (pre-tokenized input).
        ifname = temp_infile
        if lang != '--':
            tok_fname = tmpdir / "tok"
            Token(str(temp_infile), str(tok_fname),
                  lang=lang,
                  romanize=(lang == 'el'),
                  lower_case=True, gzip=False,
                  verbose=True, over_write=False)
            ifname = tok_fname
        BPEfastApply(str(ifname), str(bpe_fname), str(bpe_codes_path),
                     verbose=True, over_write=False)
        EncodeFile(encoder, str(bpe_fname), str(bpe_oname),
                   verbose=True, over_write=False, buffer_size=10000)
        # The encoder writes raw float32 vectors; reshape into (n_docs, 1024).
        dim = 1024
        X = np.fromfile(str(bpe_oname), dtype=np.float32, count=-1)
        X.resize(X.shape[0] // dim, dim)
        return X
def bpe(bpecodes, inputF, outputF, verbose):
    """Apply fastBPE codes *bpecodes* to *inputF*, writing the segmented text to *outputF*.

    Thin wrapper around BPEfastApply; never overwrites an existing output file.
    """
    BPEfastApply(inputF, outputF, bpecodes, verbose=verbose, over_write=False)
############################################################
# Preprocess the test sentences: tokenize, then apply BPE
############################################################
LANGUAGE_CODE = 'en'
VERBOSE = True
input_file = 'data/test_sentences.txt'
tokenized_f = 'data/test_tokenized.txt'
bpe_f = 'data/test_bpe.txt'

# Tokenize the raw sentences (romanization disabled; kept static for simplicity).
Token(input_file, tokenized_f,
      lang=LANGUAGE_CODE,
      romanize=False,
      lower_case=True, gzip=False,
      verbose=VERBOSE, over_write=False)
# Segment the tokenized text with the pretrained BPE codes.
BPEfastApply(tokenized_f, bpe_f, BPE_CODES,
             verbose=VERBOSE, over_write=False)

############################################################
# Load + infer model
############################################################
model_path = LASER + '/models/bilstm.93langs.2018-12-26.pt'
model = LASEREmbedderIV(model_path, LASERHiddenExtractor, 300, 100, 10)
# The pretrained checkpoint also carries the BPE-token -> index dictionary.
bpe_to_idx = torch.load(model_path)['dictionary']
# Toy batch of token indices, just to exercise the forward pass.
tokens = torch.LongTensor([[1, 2, 3], [4, 5, 6], [6, 7, 8], [7, 8, 9]])
embeddings = model(tokens)
print(embeddings.size())