def Embed(tmpdir, ifname, encoder, token_lang, bpe_codes, buffer_size, verbose):
    """Chain the optional Token and BPEfastApply steps, then call EncodeFile.

    Every file this function creates is placed directly inside ``tmpdir``
    under a fixed name (``tok``, ``bpe`` and ``emb``).

    Args:
        tmpdir: Directory that receives the intermediate and final files.
        ifname: Path handed to the first step that is enabled.
        encoder: Object forwarded as first argument to ``EncodeFile``.
        token_lang: Language code passed to ``Token``; the sentinel ``'--'``
            skips the ``Token`` step. ``romanize`` is switched on only
            for ``'el'``.
        bpe_codes: Forwarded to ``BPEfastApply``; a falsy value skips
            that step.
        buffer_size: Forwarded to ``EncodeFile``.
        verbose: Forwarded to every step.

    Returns:
        The path ``<tmpdir>/emb`` that ``EncodeFile`` was given as output.
    """
    emb_path = os.path.join(tmpdir, 'emb')

    # ``current`` always names the file the next step has to read.
    current = ifname

    if token_lang != '--':
        tokenized = os.path.join(tmpdir, 'tok')
        Token(
            current,
            tokenized,
            lang=token_lang,
            romanize=(token_lang == 'el'),
            lower_case=True,
            gzip=False,
            verbose=verbose,
            over_write=False,
        )
        current = tokenized

    if bpe_codes:
        segmented = os.path.join(tmpdir, 'bpe')
        BPEfastApply(
            current,
            segmented,
            bpe_codes,
            verbose=verbose,
            over_write=False,
        )
        current = segmented

    EncodeFile(
        encoder,
        current,
        emb_path,
        verbose=verbose,
        over_write=False,
        buffer_size=buffer_size,
    )
    return emb_path
def extract(encoder, token_lang, bpe_codes, ifname, output, remove=False,
            verbose=False, buffer_size=10000):
    """Run Token / BPEfastApply / EncodeFile on ``ifname`` and load the result.

    Intermediate files (``tok`` and ``bpe``) live in a temporary directory
    that is deleted when the function returns; ``output`` is written where
    the caller points it.

    Args:
        encoder: Object forwarded as first argument to ``EncodeFile``.
        token_lang: Language code passed to ``Token``; the sentinel ``'--'``
            skips the ``Token`` step. ``romanize`` is switched on only
            for ``'el'``.
        bpe_codes: Forwarded to ``BPEfastApply``; a falsy value skips
            that step.
        ifname: Path of the input file for the first enabled step.
        output: Path handed to ``EncodeFile`` as output and afterwards
            to ``EmbedLoad``.
        remove: Accepted for backward compatibility; this function never
            reads it.
        verbose: Forwarded to every step.
        buffer_size: Forwarded to ``EncodeFile``. Defaults to ``10000``,
            the value that used to be hard-coded here.

    Returns:
        Whatever ``EmbedLoad(output)`` returns.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        if token_lang != '--':
            tok_fname = os.path.join(tmpdir, 'tok')
            Token(
                ifname,
                tok_fname,
                lang=token_lang,
                romanize=(token_lang == 'el'),
                lower_case=True,
                gzip=False,
                verbose=verbose,
                over_write=False,
            )
            ifname = tok_fname

        if bpe_codes:
            bpe_fname = os.path.join(tmpdir, 'bpe')
            BPEfastApply(
                ifname,
                bpe_fname,
                bpe_codes,
                verbose=verbose,
                over_write=True,
            )
            ifname = bpe_fname

        # NOTE(review): over_write=False is passed for the caller-supplied
        # ``output``. If EncodeFile leaves an existing file untouched in that
        # mode, the embeddings loaded below may come from an earlier run --
        # confirm against EncodeFile.
        EncodeFile(
            encoder,
            ifname,
            output,
            verbose=verbose,
            over_write=False,
            buffer_size=buffer_size,
        )
        return EmbedLoad(output)
def emb(encoder, inputF, outputF, verbose, buffer_size):
    """Call ``EncodeFile`` for ``inputF`` -> ``outputF`` with over_write off.

    Args:
        encoder: Object forwarded as first argument to ``EncodeFile``.
        inputF: Input path forwarded to ``EncodeFile``.
        outputF: Output path forwarded to ``EncodeFile``.
        verbose: Forwarded to ``EncodeFile``.
        buffer_size: Forwarded to ``EncodeFile``.

    Returns:
        None; the result of ``EncodeFile`` is discarded.
    """
    encode_options = {
        'verbose': verbose,
        'over_write': False,
        'buffer_size': buffer_size,
    }
    EncodeFile(encoder, inputF, outputF, **encode_options)
def launch(self, lang):
    """Process every language in ``lang`` and run ``IndexSearchMultiple``.

    ``lang`` is first stored on ``self.args.lang``. For each language code
    the method then runs, in this order, ``Token``, ``BPEfastApply``,
    ``EncodeFile`` and ``IndexCreate``. File names are derived from
    ``self.args.base_dir``, ``self.args.data`` and ``self.args.output`` with
    the suffixes ``.tok.<code>``, ``.bpe.<code>`` and ``.enc.<code>``.

    NOTE(review): ``all_texts`` is not bound anywhere inside this method, so
    it has to exist as a module-level name when the method runs -- confirm.

    Args:
        lang: Iterable of language codes; ``romanize`` is enabled only
            for ``'el'``.

    Returns:
        The ``(distances, indexes, cosine)`` triple unpacked from the
        ``IndexSearchMultiple`` result. The three values are also printed.
    """
    cfg = self.args
    cfg.lang = lang

    collected_data = []
    collected_indexes = []

    for code in cfg.lang:
        # Name the four files of this language before running the stages.
        raw_path = os.path.join(cfg.base_dir, cfg.data + '.' + code)
        tok_path = os.path.join(cfg.base_dir, cfg.output + '.tok.' + code)
        bpe_path = os.path.join(cfg.base_dir, cfg.output + '.bpe.' + code)
        enc_path = os.path.join(cfg.base_dir, cfg.output + '.enc.' + code)

        Token(
            raw_path,
            tok_path,
            lang=code,
            romanize=(code == 'el'),
            lower_case=True,
            verbose=cfg.verbose,
            over_write=False,
        )
        BPEfastApply(
            tok_path,
            bpe_path,
            cfg.bpe_codes,
            verbose=cfg.verbose,
            over_write=False,
        )
        EncodeFile(
            self.enc,
            bpe_path,
            enc_path,
            verbose=cfg.verbose,
            over_write=False,
        )

        data, index = IndexCreate(
            enc_path,
            'FlatL2',
            verbose=cfg.verbose,
            save_index=False,
        )
        collected_data.append(data)
        collected_indexes.append(index)

    search_result = IndexSearchMultiple(
        collected_data,
        collected_indexes,
        texts=all_texts,
        verbose=True,
        print_errors=False,
    )
    distances, indexes, cosine = search_result

    for label, value in (('D', distances), ('I', indexes), ('cosine', cosine)):
        print(label, value)

    return distances, indexes, cosine
# For every language in args.lang: run Token, BPEfastApply and EncodeFile on
# the language's file, then build a 'FlatL2' index from the encoded output.
# File names are args.output plus '.tok.<l>', '.bpe.<l>' and '.enc.<l>',
# all placed under args.base_dir.
all_data = []
all_index = []

for l in args.lang:
    raw_path = os.path.join(args.base_dir, args.data + '.' + l)
    tok_path = os.path.join(args.base_dir, args.output + '.tok.' + l)
    bpe_path = os.path.join(args.base_dir, args.output + '.bpe.' + l)
    enc_path = os.path.join(args.base_dir, args.output + '.enc.' + l)

    Token(
        raw_path,
        tok_path,
        lang=l,
        romanize=(l == 'el'),
        lower_case=True,
        verbose=args.verbose,
        over_write=False,
    )
    BPEfastApply(
        tok_path,
        bpe_path,
        args.bpe_codes,
        verbose=args.verbose,
        over_write=False,
    )
    EncodeFile(
        enc,
        bpe_path,
        enc_path,
        verbose=args.verbose,
        over_write=False,
    )

    d, idx = IndexCreate(
        enc_path,
        'FlatL2',
        verbose=args.verbose,
        save_index=False,
    )
    all_data.append(d)
    all_index.append(idx)

# Search across all collected languages and print the confusion matrix.
err = IndexSearchMultiple(all_data, all_index, verbose=False)
IndexPrintConfusionMatrix(err, args.lang)
# Load the encoder once, then process each data part for each language:
# Token -> SplitLines -> BPEfastApply -> EncodeFile -> JoinEmbed.
# All files share the prefix <args.data_dir>/mldoc.<part>.
enc = EncodeLoad(args)

print('\nProcessing:')
for part in ('train1000', 'dev', 'test'):
    # for lang in "en" if part == 'train1000' else args.lang:
    for lang in args.lang:
        cfname = os.path.join(args.data_dir, 'mldoc.' + part)

        # Name every file of this (part, lang) pair once.
        txt_file = cfname + '.txt.' + lang
        tok_file = cfname + '.tok.' + lang
        split_file = cfname + '.split.' + lang
        sid_file = cfname + '.sid.' + lang
        split_bpe_file = cfname + '.split.bpe.' + lang
        split_enc_file = cfname + '.split.enc.' + lang
        enc_file = cfname + '.enc.' + lang

        Token(
            txt_file,
            tok_file,
            lang=lang,
            romanize=(lang == 'el'),
            lower_case=True,
            gzip=False,
            verbose=args.verbose,
            over_write=False,
        )
        SplitLines(tok_file, split_file, sid_file)
        BPEfastApply(
            split_file,
            split_bpe_file,
            args.bpe_codes,
            verbose=args.verbose,
            over_write=False,
        )
        EncodeFile(
            enc,
            split_bpe_file,
            split_enc_file,
            verbose=args.verbose,
            over_write=False,
            buffer_size=args.buffer_size,
        )
        JoinEmbed(split_enc_file, sid_file, enc_file)