def preprocess(voc_path, txt_path):
    assert os.path.isfile(voc_path)
    assert os.path.isfile(txt_path)
    logger = create_logger(None, 0)
    bin_path = txt_path + ".pth"
    dico = Dictionary.read_vocab(voc_path)
    logger.info("")
    data = Dictionary.index_data(txt_path, bin_path, dico)
    logger.info("%i words (%i unique) in %i sentences." % (
        len(data['sentences']) - len(data['positions']),
        len(data['dico']),
        len(data['positions'])
    ))
    if len(data['unk_words']) > 0:
        logger.info("%i unknown words (%i unique), covering %.2f%% of the data." % (
            sum(data['unk_words'].values()),
            len(data['unk_words']),
            sum(data['unk_words'].values()) * 100. / (len(data['sentences']) - len(data['positions']))
        ))
        if len(data['unk_words']) < 30:
            for w, c in sorted(data['unk_words'].items(), key=lambda x: x[1])[::-1]:
                logger.info("%s: %i" % (w, c))
def load_xlm_embeddings(path, model_name="model"):
    """
    Load all XLM embeddings.
    Params:
        path: path to the reloaded checkpoint
        model_name: model name in the reloaded checkpoint;
            "model" for a pretrained XLM encoder,
            "encoder" for the encoder of a translation model,
            "decoder" for the decoder of a translation model
    """
    reloaded = torch.load(path)
    assert model_name in ["model", "encoder", "decoder"]
    state_dict = reloaded[model_name]

    # handle models from multi-GPU checkpoints
    state_dict = {(k[7:] if k.startswith('module.') else k): v for k, v in state_dict.items()}

    # reload dictionary and model parameters
    dico = Dictionary(reloaded['dico_id2word'], reloaded['dico_word2id'], reloaded['dico_counts'])
    pretrain_params = AttrDict(reloaded['params'])
    pretrain_params.n_words = len(dico)
    pretrain_params.bos_index = dico.index(BOS_WORD)
    pretrain_params.eos_index = dico.index(EOS_WORD)
    pretrain_params.pad_index = dico.index(PAD_WORD)
    pretrain_params.unk_index = dico.index(UNK_WORD)
    pretrain_params.mask_index = dico.index(MASK_WORD)

    # build model and reload weights
    if model_name != "decoder":
        model = TransformerModel(pretrain_params, dico, True, True)
    else:
        model = TransformerModel(pretrain_params, dico, False, True)
    model.load_state_dict(state_dict)
    return model.embeddings.weight.data, dico
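# A minimal usage sketch for load_xlm_embeddings; the checkpoint file name is
# a placeholder, and the word looked up is only illustrative.
embeddings, dico = load_xlm_embeddings("mlm_tlm_xnli15_1024.pth", model_name="model")
print(embeddings.shape)                   # (n_words, emb_dim)
print(embeddings[dico.index("the")][:5])  # first components of one word vector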
def reload(path, params):
    """
    Create a sentence embedder from a pretrained model.
    """
    # reload model
    reloaded = torch.load(path)
    state_dict = reloaded['model']

    # handle models from multi-GPU checkpoints
    if 'checkpoint' in path:
        state_dict = {(k[7:] if k.startswith('module.') else k): v for k, v in state_dict.items()}

    # reload dictionary and model parameters
    dico = Dictionary(reloaded['dico_id2word'], reloaded['dico_word2id'], reloaded['dico_counts'])
    pretrain_params = AttrDict(reloaded['params'])
    pretrain_params.n_words = len(dico)
    pretrain_params.bos_index = dico.index(BOS_WORD)
    pretrain_params.eos_index = dico.index(EOS_WORD)
    pretrain_params.pad_index = dico.index(PAD_WORD)
    pretrain_params.unk_index = dico.index(UNK_WORD)
    pretrain_params.mask_index = dico.index(MASK_WORD)

    # build model and reload weights
    model = TransformerModel(pretrain_params, dico, True, True)
    model.load_state_dict(state_dict)
    model.eval()

    # add missing parameters
    params.max_batch_size = 0

    return MyModel(model, dico, pretrain_params, params)
def initialize_model():
    """ Load the pretrained XLM model, its parameters, and its dictionary from disk. """
    print('launching model')
    chemin = getcwd()
    curPath = chemin if "xlm" in chemin else os.path.join(getcwd(), 'xlm')
    onlyfiles = [f for f in listdir(chemin) if isfile(join(chemin, f))]
    print(onlyfiles)
    model_path = os.path.normpath(os.path.join(getcwd(), './mlm_tlm_xnli15_1024.pth'))
    print(model_path)
    reloaded = torch.load(model_path)
    # alternative: download the checkpoint instead of reading it from disk
    # response = requests.get(url)
    # f = io.BytesIO(response.content)
    # reloaded = torch.load(f)
    params = AttrDict(reloaded['params'])
    print("Supported languages: %s" % ", ".join(params.lang2id.keys()))

    # build dictionary / update parameters
    dico = Dictionary(reloaded['dico_id2word'], reloaded['dico_word2id'], reloaded['dico_counts'])
    params.n_words = len(dico)
    params.bos_index = dico.index(BOS_WORD)
    params.eos_index = dico.index(EOS_WORD)
    params.pad_index = dico.index(PAD_WORD)
    params.unk_index = dico.index(UNK_WORD)
    params.mask_index = dico.index(MASK_WORD)

    # build model / reload weights
    model = TransformerModel(params, dico, True, True)
    model.load_state_dict(reloaded['model'])
    # bpe = fastBPE.fastBPE(
    #     path.normpath(path.join(curPath, "./codes_xnli_15")),
    #     path.normpath(path.join(curPath, "./vocab_xnli_15")))
    print('finished loading')
    return model, params, dico
def main(args):
    if args.table_label is None:
        args.table_label = args.table + "_label"
    if args.table_vocab is None:
        args.table_vocab = args.table + "_vocab"
    assert os.path.isfile(args.table)
    assert os.path.isfile(args.table_label)
    assert os.path.isfile(args.table_vocab)
    print_args(args)
    table_dico = Dictionary.read_vocab(args.table_vocab)
    table_data = Dictionary.index_table(args.table, args.table_label, table_dico, args.table + ".pth")
def __init__(self, model_path, tgt_lang, src_lang, dump_path="./dumped/",
             exp_name="translate", exp_id="test", batch_size=32):
    # parse parameters
    parser = argparse.ArgumentParser(description="Translate sentences")

    # main parameters
    parser.add_argument("--dump_path", type=str, default=dump_path, help="Experiment dump path")
    parser.add_argument("--exp_name", type=str, default=exp_name, help="Experiment name")
    parser.add_argument("--exp_id", type=str, default=exp_id, help="Experiment ID")
    parser.add_argument("--batch_size", type=int, default=batch_size, help="Number of sentences per batch")

    # model / output paths
    parser.add_argument("--model_path", type=str, default=model_path, help="Model path")
    # parser.add_argument("--max_vocab", type=int, default=-1, help="Maximum vocabulary size (-1 to disable)")
    # parser.add_argument("--min_count", type=int, default=0, help="Minimum vocabulary count")

    # source language / target language
    parser.add_argument("--src_lang", type=str, default=src_lang, help="Source language")
    parser.add_argument("--tgt_lang", type=str, default=tgt_lang, help="Target language")

    params = parser.parse_args()
    assert params.src_lang != '' and params.tgt_lang != '' and params.src_lang != params.tgt_lang

    # initialize the experiment
    logger = initialize_exp(params)

    # no GPU available, so load the checkpoint on CPU
    # reloaded = torch.load(params.model_path)
    reloaded = torch.load(params.model_path, map_location=torch.device('cpu'))
    model_params = AttrDict(reloaded['params'])
    self.supported_languages = model_params.lang2id.keys()
    logger.info("Supported languages: %s" % ", ".join(self.supported_languages))

    # update dictionary parameters
    for name in ['n_words', 'bos_index', 'eos_index', 'pad_index', 'unk_index', 'mask_index']:
        setattr(params, name, getattr(model_params, name))

    # build dictionary / build encoder / build decoder / reload weights
    self.dico = Dictionary(reloaded['dico_id2word'], reloaded['dico_word2id'], reloaded['dico_counts'])
    # self.encoder = TransformerModel(model_params, dico, is_encoder=True, with_output=True).cuda().eval()
    self.encoder = TransformerModel(model_params, self.dico, is_encoder=True, with_output=True).eval()
    # self.decoder = TransformerModel(model_params, dico, is_encoder=False, with_output=True).cuda().eval()
    self.decoder = TransformerModel(model_params, self.dico, is_encoder=False, with_output=True).eval()
    self.encoder.load_state_dict(reloaded['encoder'])
    self.decoder.load_state_dict(reloaded['decoder'])
    params.src_id = model_params.lang2id[params.src_lang]
    params.tgt_id = model_params.lang2id[params.tgt_lang]
    self.model_params = model_params
    self.params = params
def main(args):
    if args.summary_vocab is None:
        args.summary_vocab = args.summary + "_vocab"
    if args.summary_label is None:
        args.summary_label = args.summary + "_label"
    assert os.path.isfile(args.summary)
    assert os.path.isfile(args.summary_vocab)
    assert os.path.isfile(args.summary_label)
    print_args(args)
    summary_dico = Dictionary.read_vocab(args.summary_vocab)
    summary_data = Dictionary.index_summary(args.summary, args.summary_label, summary_dico,
                                            args.summary + ".pth", max_len=args.summary_max_length)
def reload_ar_checkpoint(path):
    """ Reload autoregressive params, dictionary, and model from a given path. """
    # load dictionary / model / datasets first
    reloaded = torch.load(path)
    params = AttrDict(reloaded['params'])

    # build dictionary / update parameters
    dico = Dictionary(reloaded['dico_id2word'], reloaded['dico_word2id'], reloaded['dico_counts'])
    params.n_words = len(dico)
    params.n_langs = 1
    params.bos_index = dico.index(BOS_WORD)
    params.eos_index = dico.index(EOS_WORD)
    params.pad_index = dico.index(PAD_WORD)
    params.unk_index = dico.index(UNK_WORD)
    params.mask_index = dico.index(MASK_WORD)

    # build Transformer model (decoder-only)
    model = TransformerModel(params, dico, is_encoder=False, with_output=True)
    model.load_state_dict(reloaded['model'])
    return params, dico, model
def reload_checkpoint(path):
    """ Reload params, dictionary, and model from a given path. """
    # load dictionary / model / datasets first
    reloaded = torch.load(path)
    params = AttrDict(reloaded['params'])
    print("Supported languages: %s" % ", ".join(params.lang2id.keys()))

    # build dictionary / update parameters
    dico = Dictionary(reloaded['dico_id2word'], reloaded['dico_word2id'], reloaded['dico_counts'])
    params.n_words = len(dico)
    params.bos_index = dico.index(BOS_WORD)
    params.eos_index = dico.index(EOS_WORD)
    params.pad_index = dico.index(PAD_WORD)
    params.unk_index = dico.index(UNK_WORD)
    params.mask_index = dico.index(MASK_WORD)

    # build model / reload weights
    model = TransformerModel(params, dico, True, True)
    model.load_state_dict(reloaded['model'])
    return params, dico, model
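# A small sketch of how reload_checkpoint might be driven; the checkpoint path
# below is illustrative, not a file shipped with the code.
params, dico, model = reload_checkpoint("dumped/xlm/checkpoint.pth")
model.eval()  # disable dropout before inference
print("vocabulary size:", params.n_words)
print("pad index:", params.pad_index)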
def load_model(params):
    # check parameters
    assert os.path.isdir(params.data_path)
    assert os.path.isfile(params.model_path)

    reloaded = torch.load(params.model_path)
    encoder_model_params = AttrDict(reloaded['enc_params'])
    decoder_model_params = AttrDict(reloaded['dec_params'])
    dico = Dictionary(reloaded['dico_id2word'], reloaded['dico_word2id'], reloaded['dico_counts'])

    params.n_langs = encoder_model_params['n_langs']
    params.id2lang = encoder_model_params['id2lang']
    params.lang2id = encoder_model_params['lang2id']
    params.n_words = len(dico)
    params.bos_index = dico.index(BOS_WORD)
    params.eos_index = dico.index(EOS_WORD)
    params.pad_index = dico.index(PAD_WORD)
    params.unk_index = dico.index(UNK_WORD)
    params.mask_index = dico.index(MASK_WORD)

    encoder = TransformerModel(encoder_model_params, dico, is_encoder=True, with_output=False)
    decoder = TransformerModel(decoder_model_params, dico, is_encoder=False, with_output=True)

    def _process_state_dict(state_dict):
        # strip the 'module.' prefix left by multi-GPU training
        return {(k[7:] if k.startswith('module.') else k): v for k, v in state_dict.items()}

    encoder.load_state_dict(_process_state_dict(reloaded['encoder']))
    decoder.load_state_dict(_process_state_dict(reloaded['decoder']))
    return encoder, decoder, dico
def create_binary(txt_path, bin_path, dico):
    data = Dictionary.index_data(txt_path, bin_path, dico)
    logger.info("%i words (%i unique) in %i sentences." % (
        len(data['sentences']) - len(data['positions']),
        len(data['dico']),
        len(data['positions'])
    ))
    if len(data['unk_words']) > 0:
        logger.info("%i unknown words (%i unique), covering %.2f%% of the data." % (
            sum(data['unk_words'].values()),
            len(data['unk_words']),
            sum(data['unk_words'].values()) * 100. / (len(data['sentences']) - len(data['positions']))
        ))
        if len(data['unk_words']) < 30:
            for w, c in sorted(data['unk_words'].items(), key=lambda x: x[1])[::-1]:
                logger.info("%s: %i" % (w, c))
    else:
        logger.info("0 unknown words.")
import os
import subprocess

import torch
from torch.nn.modules.distance import CosineSimilarity

from src.utils import AttrDict
from src.data.dictionary import Dictionary, BOS_WORD, EOS_WORD, PAD_WORD, UNK_WORD, MASK_WORD
from src.model.transformer import TransformerModel

cm = CosineSimilarity(dim=0)

# initialize the model
model_path = './mlm_tlm_xnli15_1024.pth'
reloaded = torch.load(model_path)
params = AttrDict(reloaded['params'])
print("Supported languages: %s" % ", ".join(params.lang2id.keys()))

# build dictionary / update parameters
dico = Dictionary(reloaded['dico_id2word'], reloaded['dico_word2id'], reloaded['dico_counts'])
params.n_words = len(dico)
params.bos_index = dico.index(BOS_WORD)
params.eos_index = dico.index(EOS_WORD)
params.pad_index = dico.index(PAD_WORD)
params.unk_index = dico.index(UNK_WORD)
params.mask_index = dico.index(MASK_WORD)

# build model / reload weights
model = TransformerModel(params, dico, True, True)
model.load_state_dict(reloaded['model'])

# detect whether we are running in a local terminal (avoid KeyError when unset)
local = os.environ.get("TERM_PROGRAM") == "Apple_Terminal"
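# A hedged sketch of how the pieces above can be combined to compare two
# already BPE-tokenized sentences with cm. get_embedding is a helper
# introduced here for illustration; it follows the (slen, 1) batch convention
# used throughout these scripts (sentence wrapped in </s> delimiters) and
# takes the first hidden state as the sentence embedding.
model.eval()  # disable dropout before extracting embeddings

def get_embedding(sentence, lang):
    word_ids = torch.LongTensor([dico.index(w) for w in sentence.strip().split()])
    length = len(word_ids) + 2  # +2 for the two </s> delimiters
    lengths = torch.LongTensor([length])
    x = torch.LongTensor(length, 1).fill_(params.pad_index)
    x[0, 0] = params.eos_index
    x[1:length - 1, 0].copy_(word_ids)
    x[length - 1, 0] = params.eos_index
    langs = x.clone().fill_(params.lang2id[lang])
    with torch.no_grad():
        h = model('fwd', x=x, lengths=lengths, langs=langs, causal=False)
    return h[0, 0]  # hidden state of the leading </s> token

# emb_en = get_embedding("the cat sits", 'en')
# emb_fr = get_embedding("le chat est assis", 'fr')
# print(cm(emb_en, emb_fr).item())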
import os
import sys

from src.logger import create_logger
from src.data.dictionary import Dictionary

if __name__ == "__main__":
    logger = create_logger(None, 0)
    voc_path = sys.argv[1]
    txt_path = sys.argv[2]
    bin_path = sys.argv[2] + ".pth"
    assert os.path.isfile(voc_path)
    assert os.path.isfile(txt_path)

    dico = Dictionary.read_vocab(voc_path)
    logger.info("")
    data = Dictionary.index_data(txt_path, bin_path, dico)
    logger.info("%i words (%i unique) in %i sentences." % (
        len(data["sentences"]) - len(data["positions"]),
        len(data["dico"]),
        len(data["positions"]),
    ))
    if len(data["unk_words"]) > 0:
        logger.info("%i unknown words (%i unique), covering %.2f%% of the data." % (
            sum(data["unk_words"].values()),
            len(data["unk_words"]),
            sum(data["unk_words"].values()) * 100.0 / (len(data["sentences"]) - len(data["positions"])),
        ))
torch.manual_seed(1)
torch.cuda.manual_seed_all(1)
torch.backends.cudnn.deterministic = True  # ensure reproducible results across runs

start_time = time.time()
logger.info("Loading data...")
logger.info('Building dictionary ...')
data = pd.read_csv(config.train_file, sep='\t')
if args.word:
    data = data['text'].values.tolist()
else:
    # character-level: strip whitespace, then separate every character with a space
    data = data['text'].apply(lambda x: " ".join("".join(x.split())))
if args.dictionary is None:
    dictionary = Dictionary()
    dictionary.build_dictionary(data)
    del data
    joblib.dump(dictionary, config.root_path + '/model/vocab.bin')
else:
    dictionary = joblib.load(args.dictionary)

if not args.model.isupper():
    tokenizer = config.tokenizer
else:
    tokenizer = None

logger.info('Making dataset & dataloader...')
# TODO: 1. use a custom MyDataset to build the DataLoader (a hedged sketch follows)
train_dataset = None
train_dataloader = None
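# A minimal sketch of the TODO above, assuming a MyDataset that reads the same
# TSV file and maps tokens through the dictionary built earlier. The MyDataset
# interface, the 'label' column, and the word2id mapping with a fallback id of
# 0 are assumptions for illustration, not part of the original code.
from torch.utils.data import Dataset, DataLoader

class MyDataset(Dataset):
    def __init__(self, file_path, dictionary, tokenizer=None):
        self.samples = pd.read_csv(file_path, sep='\t')
        self.dictionary = dictionary
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        row = self.samples.iloc[idx]
        tokens = self.tokenizer(row['text']) if self.tokenizer else row['text'].split()
        # map tokens to ids, falling back to a hypothetical <unk> id for unseen tokens
        ids = [self.dictionary.word2id.get(t, 0) for t in tokens]
        return torch.LongTensor(ids), int(row['label'])

# train_dataset = MyDataset(config.train_file, dictionary, tokenizer)
# train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)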
# src_txt_path = 'data/all.zh.bpe'
# tgt_voc_path = 'data/vocab.en'
# tgt_txt_path = 'data/all.en.bpe'
# bin_path = 'data/cwmt.bin'
src_voc_path = sys.argv[3]
src_txt_path = sys.argv[1]
tgt_voc_path = sys.argv[4]
tgt_txt_path = sys.argv[2]
bin_path = sys.argv[5]
assert os.path.isfile(src_voc_path)
assert os.path.isfile(src_txt_path)
assert os.path.isfile(tgt_voc_path)
assert os.path.isfile(tgt_txt_path)

src_dico = Dictionary.read_vocab(src_voc_path)
tgt_dico = Dictionary.read_vocab(tgt_voc_path)
data = Dictionary.index_data(src_txt_path, tgt_txt_path, src_dico, tgt_dico, bin_path)
if data is None:
    exit(0)
logger.info("%i words (%i unique) in %i sentences." % (
    len(data['src_sentences']) - len(data['src_positions']),
    len(data['src_dico']),
    len(data['src_positions'])
))
logger.info("%i words (%i unique) in %i sentences." % (
    len(data['tgt_sentences']) - len(data['tgt_positions']),
    len(data['tgt_dico']),
    len(data['tgt_positions'])
))
if len(data['src_unk_words']) > 0:
    logger.info("%i unknown words (%i unique), covering %.2f%% of the data." % (
        sum(data['src_unk_words'].values()),
        len(data['src_unk_words']),
        sum(data['src_unk_words'].values()) * 100. / (len(data['src_sentences']) - len(data['src_positions']))
    ))
import os
import sys

import torch

from src.logger import create_logger
from src.data.dictionary import Dictionary

if __name__ == '__main__':
    logger = create_logger(None, 0)
    voc_path = sys.argv[1]
    txt_path = sys.argv[2]
    bin_path = sys.argv[2] + '.pth'
    assert os.path.isfile(voc_path)
    assert os.path.isfile(txt_path)

    if voc_path[-4:] == '.pth':
        # the vocabulary can also come from an existing checkpoint
        reload = torch.load(voc_path)
        dico = Dictionary(id2word=reload['dico_id2word'],
                          word2id=reload['dico_word2id'],
                          counts=reload['dico_counts'])
    else:
        dico = Dictionary.read_vocab(voc_path)
    logger.info("")
    data = Dictionary.index_data(txt_path, bin_path, dico)
    logger.info("%i words (%i unique) in %i sentences." % (
        len(data['sentences']) - len(data['positions']),
        len(data['dico']),
        len(data['positions'])
    ))
    if len(data['unk_words']) > 0:
        logger.info("%i unknown words (%i unique), covering %.2f%% of the data." % (
            sum(data['unk_words'].values()),
            len(data['unk_words']),
            sum(data['unk_words'].values()) * 100. / (len(data['sentences']) - len(data['positions']))
        ))
def main(params):
    # initialize the experiment
    logger = initialize_exp(params)

    # generate parser / parse parameters
    parser = get_parser()
    params = parser.parse_args()
    reloaded = torch.load(params.model_path)
    model_params = AttrDict(reloaded['params'])
    logger.info("Supported languages: %s" % ", ".join(model_params.lang2id.keys()))

    # update dictionary parameters
    for name in ['n_words', 'bos_index', 'eos_index', 'pad_index', 'unk_index', 'mask_index']:
        setattr(params, name, getattr(model_params, name))

    # build dictionary / build encoder / build decoder / reload weights
    dico = Dictionary(reloaded['dico_id2word'], reloaded['dico_word2id'], reloaded['dico_counts'])
    encoder = TransformerModel(model_params, dico, is_encoder=True, with_output=True).cuda().eval()
    decoder = TransformerModel(model_params, dico, is_encoder=False, with_output=True).cuda().eval()
    encoder.load_state_dict(reloaded['encoder'])
    decoder.load_state_dict(reloaded['decoder'])
    params.src_id = model_params.lang2id[params.src_lang]
    params.tgt_id = model_params.lang2id[params.tgt_lang]

    # float16
    if params.fp16:
        assert torch.backends.cudnn.enabled
        encoder = network_to_half(encoder)
        decoder = network_to_half(decoder)

    # read sentences from stdin
    src_sent = []
    for line in sys.stdin.readlines():
        assert len(line.strip().split()) > 0
        src_sent.append(line)
    logger.info("Read %i sentences from stdin. Translating ..." % len(src_sent))

    f = io.open(params.output_path, 'w', encoding='utf-8')

    for i in range(0, len(src_sent), params.batch_size):

        # prepare batch
        word_ids = [torch.LongTensor([dico.index(w) for w in s.strip().split()])
                    for s in src_sent[i:i + params.batch_size]]
        lengths = torch.LongTensor([len(s) + 2 for s in word_ids])
        batch = torch.LongTensor(lengths.max().item(), lengths.size(0)).fill_(params.pad_index)
        batch[0] = params.eos_index
        for j, s in enumerate(word_ids):
            if lengths[j] > 2:  # if sentence not empty
                batch[1:lengths[j] - 1, j].copy_(s)
            batch[lengths[j] - 1, j] = params.eos_index
        langs = batch.clone().fill_(params.src_id)

        # encode source batch and translate it
        encoded = encoder('fwd', x=batch.cuda(), lengths=lengths.cuda(), langs=langs.cuda(), causal=False)
        encoded = encoded.transpose(0, 1)
        decoded, dec_lengths = decoder.generate(encoded, lengths.cuda(), params.tgt_id,
                                                max_len=int(1.5 * lengths.max().item() + 10))

        # convert sentences to words
        for j in range(decoded.size(1)):

            # remove delimiters
            sent = decoded[:, j]
            delimiters = (sent == params.eos_index).nonzero().view(-1)
            assert len(delimiters) >= 1 and delimiters[0].item() == 0
            sent = sent[1:] if len(delimiters) == 1 else sent[1:delimiters[1]]

            # output translation
            source = src_sent[i + j].strip()
            target = " ".join([dico[sent[k].item()] for k in range(len(sent))])
            sys.stderr.write("%i / %i: %s -> %s\n" % (i + j, len(src_sent), source, target))
            f.write(target + "\n")

    f.close()
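# The batch layout above is shared by all of the translate-style scripts in
# this file: token ids are stored as a (max_len, batch_size) LongTensor, each
# column is one sentence wrapped in </s> delimiters, and padding fills the
# remainder. A self-contained toy sketch of that convention (the index values
# are illustrative, not XLM's actual vocabulary):
import torch

pad_index, eos_index = 2, 1
word_ids = [torch.LongTensor([10, 11, 12]), torch.LongTensor([20, 21])]
lengths = torch.LongTensor([len(s) + 2 for s in word_ids])  # +2 for the two </s>
batch = torch.LongTensor(lengths.max().item(), len(word_ids)).fill_(pad_index)
batch[0] = eos_index                        # leading delimiter for every sentence
for j, s in enumerate(word_ids):
    batch[1:lengths[j] - 1, j].copy_(s)     # sentence tokens
    batch[lengths[j] - 1, j] = eos_index    # trailing delimiter
print(batch)  # columns: [1,10,11,12,1] and [1,20,21,1,2]  (2 = padding)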
def main(params):
    # initialize the experiment
    logger = initialize_exp(params)

    # generate parser / parse parameters
    parser = get_parser()
    params = parser.parse_args()
    reloaded = torch.load(params.model_path)
    model_params = AttrDict(reloaded['params'])
    logger.info("Supported languages: %s" % ", ".join(model_params.lang2id.keys()))

    # update dictionary parameters
    for name in ['n_words', 'bos_index', 'eos_index', 'pad_index', 'unk_index', 'mask_index']:
        setattr(params, name, getattr(model_params, name))

    # build dictionary / build encoder / reload weights (no decoder needed for encoding)
    dico = Dictionary(reloaded['dico_id2word'], reloaded['dico_word2id'], reloaded['dico_counts'])
    encoder = TransformerModel(model_params, dico, is_encoder=True, with_output=True).cuda().eval()
    encoder.load_state_dict(reloaded['encoder'])
    decoder = None
    # decoder = TransformerModel(model_params, dico, is_encoder=False, with_output=True).cuda().eval()
    # decoder.load_state_dict(reloaded['decoder'])
    params.src_id = model_params.lang2id[params.src_lang]
    params.tgt_id = model_params.lang2id[params.tgt_lang]

    # read sentences from stdin
    src_sent = []
    for line in sys.stdin.readlines():
        assert len(line.strip().split()) > 0
        src_sent.append(line)
    logger.info("Read %i sentences from stdin. Encoding ..." % len(src_sent))

    all_encodings = []

    # for each batch of sentences...
    for i in range(0, len(src_sent), params.batch_size):

        # prepare batch
        word_ids = [torch.LongTensor([dico.index(w) for w in s.strip().split()])
                    for s in src_sent[i:i + params.batch_size]]
        lengths = torch.LongTensor([len(s) + 2 for s in word_ids])
        batch = torch.LongTensor(lengths.max().item(), lengths.size(0)).fill_(params.pad_index)
        batch[0] = params.eos_index
        for j, s in enumerate(word_ids):
            if lengths[j] > 2:  # if sentence not empty
                batch[1:lengths[j] - 1, j].copy_(s)
            batch[lengths[j] - 1, j] = params.eos_index
        langs = batch.clone().fill_(params.src_id)

        # encode the source batch, dealing with padding
        encodings = encoderouts(encoder, batch, lengths, langs)

        # the batch is in original order, so append each sentence encoding
        for idx in encodings:
            all_encodings.append(idx.cpu().numpy())

    # save all encodings to an .npy file
    np.save(params.output_path, np.stack(all_encodings))
# 12.0149 | 12 | 0.3 | /checkpoint/guismay/dumped/clm_test3/10431904/train.log
# 12.5228 | 18 | 0.1 | /checkpoint/guismay/dumped/clm_test2/10403079/train.log

#%%
model_path = '/checkpoint/guismay/dumped/clm_test3/10431904/periodic-23.pth'
reloaded = torch.load(model_path)
params = AttrDict(reloaded['params'])
print("Supported languages: %s" % ", ".join(params.lang2id.keys()))

#%% [markdown]
# ## Build dictionary / update parameters / build model

#%%
# build dictionary / update parameters
dico = Dictionary(reloaded['dico_id2word'], reloaded['dico_word2id'], reloaded['dico_counts'])
assert params.n_words == len(dico)
assert params.bos_index == dico.index(BOS_WORD)
assert params.eos_index == dico.index(EOS_WORD)
assert params.pad_index == dico.index(PAD_WORD)
assert params.unk_index == dico.index(UNK_WORD)
assert params.mask_index == dico.index(MASK_WORD)

# build model / reload weights
model = TransformerModel(params, dico, True, True)
model.load_state_dict(reloaded['model'])
model.cuda()
model.eval()

#%%
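#%%
# A hedged follow-up cell: run a causal forward pass over one BPE-tokenized
# sentence and inspect the hidden states. The sentence and language code are
# illustrative; 'fwd' with causal=True matches how a CLM-trained model is fed.
sentence, lang = 'the quick brown fox', 'en'
word_ids = torch.LongTensor([dico.index(w) for w in sentence.split()])
length = len(word_ids) + 2  # +2 for the two </s> delimiters
lengths = torch.LongTensor([length])
x = torch.LongTensor(length, 1).fill_(params.pad_index)
x[0, 0] = params.eos_index
x[1:length - 1, 0].copy_(word_ids)
x[length - 1, 0] = params.eos_index
langs = x.clone().fill_(params.lang2id[lang])
with torch.no_grad():
    hidden = model('fwd', x=x.cuda(), lengths=lengths.cuda(), langs=langs.cuda(), causal=True)
print(hidden.shape)  # (slen, 1, emb_dim)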
def main(params):
    # initialize the experiment
    logger = initialize_exp(params)

    # generate parser / parse parameters
    parser = get_parser()
    params = parser.parse_args()
    reloaded = torch.load(params.model_path)
    model_params = AttrDict(reloaded['params'])
    logger.info("Supported languages: %s" % ", ".join(model_params.lang2id.keys()))

    # update dictionary parameters
    for name in ['n_words', 'bos_index', 'eos_index', 'pad_index', 'unk_index', 'mask_index']:
        setattr(params, name, getattr(model_params, name))

    # build dictionary / build encoder / build decoder / reload weights
    dico = Dictionary(reloaded['dico_id2word'], reloaded['dico_word2id'], reloaded['dico_counts'])
    encoder = TransformerModel(model_params, dico, is_encoder=True, with_output=True).cuda().eval()
    decoder = TransformerModel(model_params, dico, is_encoder=False, with_output=True).cuda().eval()
    encoder.load_state_dict(reloaded['encoder'])
    decoder.load_state_dict(reloaded['decoder'])
    params.src_id = model_params.lang2id[params.src_lang]
    params.tgt_id = model_params.lang2id[params.tgt_lang]

    # float16
    if params.fp16:
        assert torch.backends.cudnn.enabled
        encoder = network_to_half(encoder)
        decoder = network_to_half(decoder)

    input_data = torch.load(params.input)
    eval_dataset = Dataset(input_data["sentences"], input_data["positions"], params)

    if params.subset_start is not None:
        assert params.subset_end
        eval_dataset.select_data(params.subset_start, params.subset_end)

    eval_dataset.remove_empty_sentences()
    eval_dataset.remove_long_sentences(params.max_len)

    n_batch = 0
    inp_dump = io.open(os.path.join(params.dump_path, "input.txt"), "w", encoding="utf-8")
    logger.info("logging to {}".format(os.path.join(params.dump_path, 'input.txt')))

    with open(params.output_path, "w", encoding="utf-8") as out:
        for batch in eval_dataset.get_iterator(shuffle=False):
            n_batch += 1
            (x1, len1) = batch
            input_text = convert_to_text(x1, len1, input_data["dico"], params)
            inp_dump.write("\n".join(input_text))
            inp_dump.write("\n")
            langs1 = x1.clone().fill_(params.src_id)

            # cuda
            x1, len1, langs1 = to_cuda(x1, len1, langs1)

            # encode source sentence
            enc1 = encoder("fwd", x=x1, lengths=len1, langs=langs1, causal=False)
            enc1 = enc1.transpose(0, 1)

            # generate translation - translate / convert to text
            max_len = int(1.5 * len1.max().item() + 10)
            if params.beam_size == 1:
                generated, lengths = decoder.generate(enc1, len1, params.tgt_id, max_len=max_len)
            else:
                generated, lengths = decoder.generate_beam(
                    enc1, len1, params.tgt_id,
                    beam_size=params.beam_size,
                    length_penalty=params.length_penalty,
                    early_stopping=params.early_stopping,
                    max_len=max_len)
            hypotheses_batch = convert_to_text(generated, lengths, input_data["dico"], params)

            out.write("\n".join(hypotheses_batch))
            out.write("\n")

            if n_batch % 100 == 0:
                logger.info("{} batches processed".format(n_batch))

    inp_dump.close()
def run_xnlg():
    params = get_params()

    # initialize the experiment / build sentence embedder
    logger = initialize_exp(params)

    if params.tokens_per_batch > -1:
        params.group_by_size = True

    # check parameters
    assert os.path.isdir(params.data_path)
    assert os.path.isfile(params.model_path)

    # tasks
    params.transfer_tasks = params.transfer_tasks.split(',')
    assert len(params.transfer_tasks) > 0
    assert all([task in TASKS for task in params.transfer_tasks])

    reloaded = torch.load(params.model_path)
    model_params = AttrDict(reloaded['params'])
    logger.info("Supported languages: %s" % ", ".join(model_params.lang2id.keys()))

    params.n_langs = model_params['n_langs']
    params.id2lang = model_params['id2lang']
    params.lang2id = model_params['lang2id']

    if "enc_params" in reloaded:
        encoder_model_params = AttrDict(reloaded["enc_params"])
    elif params.n_enc_layers == model_params.n_layers or params.n_enc_layers == 0:
        encoder_model_params = model_params
    else:
        encoder_model_params = AttrDict(reloaded['params'])
        encoder_model_params.n_layers = params.n_enc_layers
        assert model_params.n_layers != encoder_model_params.n_layers

    if "dec_params" in reloaded:
        decoder_model_params = AttrDict(reloaded["dec_params"])
    elif params.n_dec_layers == model_params.n_layers or params.n_dec_layers == 0:
        decoder_model_params = model_params
    else:
        decoder_model_params = AttrDict(reloaded['params'])
        decoder_model_params.n_layers = params.n_dec_layers
        assert model_params.n_layers != decoder_model_params.n_layers

    params.encoder_model_params = encoder_model_params
    params.decoder_model_params = decoder_model_params

    if params.emb_dim != -1:
        encoder_model_params.emb_dim = params.emb_dim
        decoder_model_params.emb_dim = params.emb_dim

    # build dictionary / build encoder / build decoder / reload weights
    dico = Dictionary(reloaded['dico_id2word'], reloaded['dico_word2id'], reloaded['dico_counts'])
    for p in [params, encoder_model_params, decoder_model_params]:
        p.n_words = len(dico)
        p.bos_index = dico.index(BOS_WORD)
        p.eos_index = dico.index(EOS_WORD)
        p.pad_index = dico.index(PAD_WORD)
        p.unk_index = dico.index(UNK_WORD)
        p.mask_index = dico.index(MASK_WORD)

    encoder = TransformerModel(encoder_model_params, dico, is_encoder=True, with_output=False)
    decoder = TransformerModel(decoder_model_params, dico, is_encoder=False, with_output=True)

    def _process_state_dict(state_dict):
        # strip the 'module.' prefix left by multi-GPU training
        return {(k[7:] if k.startswith('module.') else k): v for k, v in state_dict.items()}

    if params.no_init == "all":
        logger.info("No model will load a state dict.")
    elif params.reload_emb != "":
        logger.info("Reloading embedding from %s ..." % params.reload_emb)
        word2id, embeddings = read_txt_embeddings(logger, params.reload_emb)
        set_pretrain_emb(logger, encoder, dico, word2id, embeddings)
        set_pretrain_emb(logger, decoder, dico, word2id, embeddings)
    else:
        if "model" in reloaded:
            if params.no_init != "encoder":
                encoder.load_state_dict(_process_state_dict(reloaded['model']), strict=False)
            if params.no_init != "decoder":
                decoder.load_state_dict(_process_state_dict(reloaded['model']), strict=False)
        else:
            if params.no_init != "encoder":
                encoder.load_state_dict(_process_state_dict(reloaded['encoder']), strict=False)
            if params.no_init != "decoder":
                decoder.load_state_dict(_process_state_dict(reloaded['decoder']))

    scores = {}

    # run
    for task in params.transfer_tasks:
        if task == "XQG":
            XQG_v3(encoder, decoder, scores, dico, params).run()
        elif task == "XSumm":
            XSumm(encoder, decoder, scores, dico, params).run()
def main(params):
    # initialize the experiment
    logger = initialize_exp(params)

    # generate parser / parse parameters
    parser = get_parser()
    params = parser.parse_args()
    reloaded = torch.load(params.model_path)
    model_params = AttrDict(reloaded['params'])
    logger.info("Supported languages: %s" % ", ".join(model_params.lang2id.keys()))

    # update dictionary parameters
    for name in ['n_words', 'bos_index', 'eos_index', 'pad_index', 'unk_index', 'mask_index']:
        setattr(params, name, getattr(model_params, name))

    # build dictionary / build encoder / build decoder / reload weights
    dico = Dictionary(reloaded['dico_id2word'], reloaded['dico_word2id'], reloaded['dico_counts'])
    encoder = TransformerModel(model_params, dico, is_encoder=True, with_output=True).cuda().eval()
    decoder = TransformerModel(model_params, dico, is_encoder=False, with_output=True).cuda().eval()
    encoder.load_state_dict(reloaded['encoder'])
    decoder.load_state_dict(reloaded['decoder'])
    params.src_id = model_params.lang2id[params.src_lang]
    params.tgt_id = model_params.lang2id[params.tgt_lang]
    logger.info("encoder: {}".format(encoder))
    logger.info("decoder: {}".format(decoder))

    # read sentences from the input file
    src_sent = []
    with open(params.sentences_path, 'r') as file1:
        for line in file1:
            src_sent.append(line)
    logger.info("Read %i sentences from the sentences file. Writing them to a src file. Translating ..." % len(src_sent))

    f = io.open(params.output_path + 'src_sent', 'w', encoding='utf-8')
    for sentence in src_sent:
        f.write(sentence.rstrip('\n') + "\n")
    f.close()
    logger.info("Wrote them to a src file")

    f = io.open(params.output_path, 'w', encoding='utf-8')

    for i in range(0, len(src_sent), params.batch_size):

        # prepare batch
        word_ids = [torch.LongTensor([dico.index(w) for w in s.strip().split()])
                    for s in src_sent[i:i + params.batch_size]]
        lengths = torch.LongTensor([len(s) + 2 for s in word_ids])
        batch = torch.LongTensor(lengths.max().item(), lengths.size(0)).fill_(params.pad_index)
        batch[0] = params.eos_index
        for j, s in enumerate(word_ids):
            if lengths[j] > 2:  # if sentence not empty
                batch[1:lengths[j] - 1, j].copy_(s)
            batch[lengths[j] - 1, j] = params.eos_index
        langs = batch.clone().fill_(params.src_id)

        # encode source batch and translate it
        encoded, _ = encoder('fwd', x=batch.cuda(), lengths=lengths.cuda(), langs=langs.cuda(),
                             causal=False, encoder_only=False, extra_adapters_flag=True)
        encoded = encoded.transpose(0, 1)
        # decoded, dec_lengths = decoder.generate(encoded, lengths.cuda(), params.tgt_id, max_len=int(1.5 * lengths.max().item() + 10))
        decoded, dec_lengths = decoder.generate_beam(
            encoded, lengths.cuda(), params.tgt_id,
            beam_size=params.beam_size,
            length_penalty=params.length_penalty,
            early_stopping=params.early_stopping,
            max_len=int(1.5 * lengths.cuda().max().item() + 10),
            extra_adapters_flag=True)

        # convert sentences to words
        for j in range(decoded.size(1)):

            # remove delimiters
            sent = decoded[:, j]
            delimiters = (sent == params.eos_index).nonzero().view(-1)
            assert len(delimiters) >= 1 and delimiters[0].item() == 0
            sent = sent[1:] if len(delimiters) == 1 else sent[1:delimiters[1]]

            # output translation
            source = src_sent[i + j].strip()
            target = " ".join([dico[sent[k].item()] for k in range(len(sent))])
            # logger.info("%i / %i: %s -> %s\n" % (i + j, len(src_sent), source, target))
            if (i + j) % 10000 == 0:
                logger.info("Translation of %i / %i:\n Source sentence: %s \n Translation: %s\n"
                            % (i + j, len(src_sent), source, target))
            # sys.stderr.write("%i / %i: %s -> %s\n" % (i + j, len(src_sent), source, target))
            f.write(target + "\n")

    f.close()
def main(args):
    rng = np.random.RandomState(0)

    # make dump path
    if not os.path.exists(args.dump_path):
        subprocess.Popen("mkdir -p %s" % args.dump_path, shell=True).wait()
    else:
        if os.listdir(args.dump_path):
            m = "Directory {} is not empty.".format(args.dump_path)
            raise ValueError(m)

    write_log = len(args.log_file) > 0

    # load model parameters
    model_dir = os.path.dirname(args.load_model)
    params_path = os.path.join(model_dir, 'params.pkl')
    with open(params_path, "rb") as f:
        params = pickle.load(f)

    # load data parameters and model parameters from checkpoint
    checkpoint_path = os.path.join(model_dir, 'checkpoint.pth')
    assert os.path.isfile(checkpoint_path)
    data = torch.load(checkpoint_path,
                      map_location=lambda storage, loc: storage.cuda(params.local_rank))
    for k, v in data["params"].items():
        params.__dict__[k] = v
    dico = Dictionary(data["dico_id2word"], data["dico_word2id"], data["dico_counts"])

    # print scores
    for k, v in data["best_metrics"].items():
        print("- {}: {}".format(k, v))

    # fix some of the params we pass to load_data
    params.debug_train = False
    params.max_vocab = -1
    params.min_count = 0
    params.tokens_per_batch = -1
    params.max_batch_size = args.batch_size
    params.batch_size = args.batch_size

    # load data
    data = load_data(args.data_path, params)

    # print data summary
    for (src, tgt), dataset in data['para'].items():
        datatype = "Para data (%s)" % ("WITHOUT labels" if dataset.labels is None else "WITH labels")
        m = '{: <27} - {: >12}:{: >10}'.format(datatype, '%s-%s' % (src, tgt), len(dataset))
        print(m)

    # fix some of the params we pass to the model builder
    params.reload_model = args.load_model

    # build model
    if params.encoder_only:
        model = build_model(params, dico)
    else:
        encoder, decoder = build_model(params, dico)
        model = encoder

    # predict
    model = model.module if params.multi_gpu else model
    model.eval()

    start = time.time()
    for (src, tgt), dataset in data['para'].items():
        path = os.path.join(args.dump_path, "{}-{}.pred".format(src, tgt))
        scores_file = open(path, "w")
        lang1_id = params.lang2id[src]
        lang2_id = params.lang2id[tgt]
        diffs = []
        nb_written = 0
        for batch in dataset.get_iterator(False, group_by_size=False, n_sentences=-1, return_indices=False):
            (sent1, len1), (sent2, len2), labels = batch
            sent1, len1 = truncate(sent1, len1, params.max_len, params.eos_index)
            sent2, len2 = truncate(sent2, len2, params.max_len, params.eos_index)
            x, lengths, positions, langs = concat_batches(
                sent1, len1, lang1_id, sent2, len2, lang2_id,
                params.pad_index, params.eos_index, reset_positions=True)
            x, lengths, positions, langs = to_cuda(x, lengths, positions, langs)
            with torch.no_grad():
                # get sentence pair embedding
                h = model('fwd', x=x, lengths=lengths, positions=positions, langs=langs, causal=False)[0]
                CLF_ID1, CLF_ID2 = 8, 9  # very hacky: use embeddings to make weights for the classifier
                emb = (model.module if params.multi_gpu else model).embeddings.weight
                pred = F.linear(h, emb[CLF_ID1].unsqueeze(0), emb[CLF_ID2, 0])
                pred = torch.sigmoid(pred)
            pred = pred.view(-1).cpu().numpy().tolist()
            for p, l1, l2 in zip(pred, len1, len2):
                if l1.item() == 0 and l2.item() == 0:
                    scores_file.write("0.00000000\n")
                else:
                    scores_file.write("{:.8f}\n".format(p))
            nb_written += len(pred)
            if nb_written % 1000 == 0:
                elapsed = int(time.time() - start)
                lpss = elapsed % 60
                lpsm = elapsed // 60
                lpsh = lpsm // 60
                lpsm = lpsm % 60
                msg = "[{:02d}:{:02d}:{:02d} {}-{}]".format(lpsh, lpsm, lpss, src, tgt)
                msg += " {}/{} ({:.2f}%) sentences processed".format(
                    nb_written, len(dataset), 100 * nb_written / len(dataset))
                print(msg)
                if write_log:
                    with open(args.log_file, "a") as fout:
                        fout.write(msg + "\n")

            # try reversing the pair order
            if TEST_REVERSE:
                x, lengths, positions, langs = concat_batches(
                    sent2, len2, lang2_id, sent1, len1, lang1_id,
                    params.pad_index, params.eos_index, reset_positions=True)
                x, lengths, positions, langs = to_cuda(x, lengths, positions, langs)
                with torch.no_grad():
                    # get sentence pair embedding
                    h = model('fwd', x=x, lengths=lengths, positions=positions, langs=langs, causal=False)[0]
                    CLF_ID1, CLF_ID2 = 8, 9  # very hacky: use embeddings to make weights for the classifier
                    emb = (model.module if params.multi_gpu else model).embeddings.weight
                    pred_rev = F.linear(h, emb[CLF_ID1].unsqueeze(0), emb[CLF_ID2, 0])
                    pred_rev = torch.sigmoid(pred_rev)
                pred_rev = pred_rev.view(-1).cpu().numpy().tolist()
                for p, pp in zip(pred, pred_rev):
                    diffs.append(p - pp)

        if TEST_REVERSE:
            print("Average absolute diff between score(l1,l2) and score(l2,l1): {}".format(
                np.mean(np.abs(diffs))))

        scores_file.close()
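# A self-contained toy sketch of the "hacky" classifier above: two rows of the
# embedding table are reused as the weight and (scalar) bias of a 1-output
# linear layer. All shapes and values here are illustrative stand-ins.
import torch
import torch.nn.functional as F

emb = torch.randn(10, 4)   # stand-in embedding table (vocab=10, dim=4)
h = torch.randn(3, 4)      # stand-in sentence-pair representations
w = emb[8].unsqueeze(0)    # row 8 -> (1, 4) classifier weight
b = emb[9, 0]              # scalar bias taken from row 9
scores = torch.sigmoid(F.linear(h, w, b)).view(-1)
print(scores.shape)        # torch.Size([3]): one score per pair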
if __name__ == '__main__':
    readme = ""
    parser = argparse.ArgumentParser(description=readme)
    parser.add_argument('--summary', help="summary data")
    parser.add_argument('--summary_vocab', help="summary data vocab")
    parser.add_argument('--summary_label', help="summary data label")
    parser.add_argument('--summary_max_length', type=int, default=600, help="summary maximum length")
    args = parser.parse_args()

    if args.summary_vocab is None:
        args.summary_vocab = args.summary + "_vocab"
    if args.summary_label is None:
        args.summary_label = args.summary + "_label"
    assert os.path.isfile(args.summary)
    assert os.path.isfile(args.summary_vocab)
    assert os.path.isfile(args.summary_label)
    print_args(args)

    summary_dico = Dictionary.read_vocab(args.summary_vocab)
    summary_data = Dictionary.index_summary(args.summary, args.summary_label, summary_dico,
                                            args.summary + ".pth", max_len=args.summary_max_length)
def main(params):
    # initialize the experiment
    logger = initialize_exp(params)

    # generate parser / parse parameters
    parser = get_parser()
    params = parser.parse_args()
    models_path = params.model_path.split(',')

    # reload the checkpoint of every model in the ensemble
    models_reloaded = []
    for model_path in models_path:
        models_reloaded.append(torch.load(model_path))
    model_params = AttrDict(models_reloaded[0]['params'])
    logger.info("Supported languages: %s" % ", ".join(model_params.lang2id.keys()))

    # update dictionary parameters
    for name in ['n_words', 'bos_index', 'eos_index', 'pad_index', 'unk_index', 'mask_index']:
        setattr(params, name, getattr(model_params, name))

    # build dictionary / build encoders / build decoders / reload weights
    dico = Dictionary(models_reloaded[0]['dico_id2word'],
                      models_reloaded[0]['dico_word2id'],
                      models_reloaded[0]['dico_counts'])
    params.src_id = model_params.lang2id[params.src_lang]
    params.tgt_id = model_params.lang2id[params.tgt_lang]

    encoders = []
    decoders = []

    def package_module(modules):
        # strip the 'module.' prefix left by multi-GPU training
        state_dict = OrderedDict()
        for k, v in modules.items():
            if k.startswith('module.'):
                state_dict[k[7:]] = v
            else:
                state_dict[k] = v
        return state_dict

    for reloaded in models_reloaded:
        encoder = TransformerModel(model_params, dico, is_encoder=True, with_output=True).to(params.device).eval()
        decoder = TransformerModel(model_params, dico, is_encoder=False, with_output=True).to(params.device).eval()
        encoder.load_state_dict(package_module(reloaded['encoder']))
        decoder.load_state_dict(package_module(reloaded['decoder']))

        # float16
        if params.fp16:
            assert torch.backends.cudnn.enabled
            encoder = network_to_half(encoder)
            decoder = network_to_half(decoder)

        encoders.append(encoder)
        decoders.append(decoder)

    # src_sent = ['Poly@@ gam@@ ie statt Demokratie .']
    src_sent = []
    for line in sys.stdin.readlines():
        assert len(line.strip().split()) > 0
        src_sent.append(line)

    f = io.open(params.output_path, 'w', encoding='utf-8')

    for i in range(0, len(src_sent), params.batch_size):

        # prepare batch
        word_ids = [torch.LongTensor([dico.index(w) for w in s.strip().split()])
                    for s in src_sent[i:i + params.batch_size]]
        lengths = torch.LongTensor([len(s) + 2 for s in word_ids])
        batch = torch.LongTensor(lengths.max().item(), lengths.size(0)).fill_(params.pad_index)
        batch[0] = params.eos_index
        for j, s in enumerate(word_ids):
            if lengths[j] > 2:  # if sentence not empty
                batch[1:lengths[j] - 1, j].copy_(s)
            batch[lengths[j] - 1, j] = params.eos_index
        langs = batch.clone().fill_(params.src_id)

        # encode the source batch with every encoder of the ensemble
        encodeds = []
        for encoder in encoders:
            encoded = encoder('fwd', x=batch.to(params.device), lengths=lengths.to(params.device),
                              langs=langs.to(params.device), causal=False)
            encoded = encoded.transpose(0, 1)
            encodeds.append(encoded)
            assert encoded.size(0) == lengths.size(0)

        # ensemble beam search over all decoders
        decoded, dec_lengths = generate_beam(
            decoders, encodeds, lengths.to(params.device), params.tgt_id,
            beam_size=params.beam,
            length_penalty=params.length_penalty,
            early_stopping=False,
            max_len=int(1.5 * lengths.max().item() + 10),
            params=params)

        # convert sentences to words
        for j in range(decoded.size(1)):

            # remove delimiters
            sent = decoded[:, j]
            delimiters = (sent == params.eos_index).nonzero().view(-1)
            assert len(delimiters) >= 1 and delimiters[0].item() == 0
            sent = sent[1:] if len(delimiters) == 1 else sent[1:delimiters[1]]

            # output translation
            source = src_sent[i + j].strip()
            target = " ".join([dico[sent[k].item()] for k in range(len(sent))])
            sys.stderr.write("%i / %i: %s -> %s\n" % (i + j, len(src_sent), source, target))
            f.write(target + "\n")

    f.close()
def main(params):
    # generate parser / parse parameters
    parser = get_parser()
    params = parser.parse_args()
    reloaded = torch.load(params.model_path)
    model_params = AttrDict(reloaded['params'])

    # update dictionary parameters
    for name in ['src_n_words', 'tgt_n_words', 'bos_index', 'eos_index', 'pad_index', 'unk_index', 'mask_index']:
        setattr(params, name, getattr(model_params, name))

    # build dictionaries / build encoder / build decoder / reload weights
    source_dico = Dictionary(reloaded['source_dico_id2word'], reloaded['source_dico_word2id'])
    target_dico = Dictionary(reloaded['target_dico_id2word'], reloaded['target_dico_word2id'])
    encoder = TransformerEncoder(model_params, source_dico, with_output=False).cuda().eval()
    decoder = TransformerDecoder(model_params, target_dico, with_output=True).cuda().eval()
    encoder.load_state_dict(reloaded['encoder'])
    decoder.load_state_dict(reloaded['decoder'])

    # read table records from the input file
    table_lines = []
    table_inf = open(params.table_path, 'r', encoding='utf-8')
    for table_line in table_inf:
        table_lines.append(table_line)

    outf = io.open(params.output_path, 'w', encoding='utf-8')

    for i in range(0, len(table_lines), params.batch_size):

        # prepare batch: each whitespace-separated record has 4 '|'-separated parts
        enc_x1_ids = []
        enc_x2_ids = []
        enc_x3_ids = []
        enc_x4_ids = []
        for table_line in table_lines[i:i + params.batch_size]:
            record_seq = [each.split('|') for each in table_line.split()]
            assert all([len(x) == 4 for x in record_seq])
            enc_x1_ids.append(torch.LongTensor([source_dico.index(x[0]) for x in record_seq]))
            enc_x2_ids.append(torch.LongTensor([source_dico.index(x[1]) for x in record_seq]))
            enc_x3_ids.append(torch.LongTensor([source_dico.index(x[2]) for x in record_seq]))
            enc_x4_ids.append(torch.LongTensor([source_dico.index(x[3]) for x in record_seq]))
        enc_xlen = torch.LongTensor([len(x) + 2 for x in enc_x1_ids])
        enc_x1 = torch.LongTensor(enc_xlen.max().item(), enc_xlen.size(0)).fill_(params.pad_index)
        enc_x1[0] = params.eos_index
        enc_x2 = torch.LongTensor(enc_xlen.max().item(), enc_xlen.size(0)).fill_(params.pad_index)
        enc_x2[0] = params.eos_index
        enc_x3 = torch.LongTensor(enc_xlen.max().item(), enc_xlen.size(0)).fill_(params.pad_index)
        enc_x3[0] = params.eos_index
        enc_x4 = torch.LongTensor(enc_xlen.max().item(), enc_xlen.size(0)).fill_(params.pad_index)
        enc_x4[0] = params.eos_index
        for j, (s1, s2, s3, s4) in enumerate(zip(enc_x1_ids, enc_x2_ids, enc_x3_ids, enc_x4_ids)):
            if enc_xlen[j] > 2:  # if sentence not empty
                enc_x1[1:enc_xlen[j] - 1, j].copy_(s1)
                enc_x2[1:enc_xlen[j] - 1, j].copy_(s2)
                enc_x3[1:enc_xlen[j] - 1, j].copy_(s3)
                enc_x4[1:enc_xlen[j] - 1, j].copy_(s4)
            enc_x1[enc_xlen[j] - 1, j] = params.eos_index
            enc_x2[enc_xlen[j] - 1, j] = params.eos_index
            enc_x3[enc_xlen[j] - 1, j] = params.eos_index
            enc_x4[enc_xlen[j] - 1, j] = params.eos_index
        enc_x1 = enc_x1.cuda()
        enc_x2 = enc_x2.cuda()
        enc_x3 = enc_x3.cuda()
        enc_x4 = enc_x4.cuda()
        enc_xlen = enc_xlen.cuda()

        # encode source batch and generate from it
        encoder_output = encoder('fwd', x1=enc_x1, x2=enc_x2, x3=enc_x3, x4=enc_x4, lengths=enc_xlen)
        encoder_output = encoder_output.transpose(0, 1)
        # max_len = int(1.5 * enc_xlen.max().item() + 10)
        max_len = 602
        if params.beam_size <= 1:
            decoded, dec_lengths = decoder.generate(encoder_output, enc_xlen, max_len=max_len)
        else:
            decoded, dec_lengths = decoder.generate_beam(encoder_output, enc_xlen, params.beam_size,
                                                         params.length_penalty, params.early_stopping,
                                                         max_len=max_len)

        # convert generated ids to words
        for j in range(decoded.size(1)):

            # remove delimiters
            sent = decoded[:, j]
            delimiters = (sent == params.eos_index).nonzero().view(-1)
            assert len(delimiters) >= 1 and delimiters[0].item() == 0
            sent = sent[1:] if len(delimiters) == 1 else sent[1:delimiters[1]]

            # output generation
            source = table_lines[i + j].strip()
            target = " ".join([target_dico[sent[k].item()] for k in range(len(sent))])
            sys.stderr.write("%i / %i: %s\n" % (i + j, len(table_lines), target))
            outf.write(target + "\n")

    outf.close()
def print_args(args):
    print("table:\t{}".format(args.table))
    print("table_label:\t{}".format(args.table_label))
    print("table_vocab:\t{}".format(args.table_vocab))


if __name__ == '__main__':
    readme = ""
    parser = argparse.ArgumentParser(description=readme)
    parser.add_argument('--table', help="table data")
    parser.add_argument('--table_label', help="table label")
    parser.add_argument('--table_vocab', help="table vocab")
    args = parser.parse_args()

    if args.table_label is None:
        args.table_label = args.table + "_label"
    if args.table_vocab is None:
        args.table_vocab = args.table + "_vocab"
    assert os.path.isfile(args.table)
    assert os.path.isfile(args.table_label)
    assert os.path.isfile(args.table_vocab)
    print_args(args)

    table_dico = Dictionary.read_vocab(args.table_vocab)
    table_data = Dictionary.index_table(args.table, args.table_label, table_dico, args.table + ".pth")
class Translate():

    def __init__(self, model_path, tgt_lang, src_lang, dump_path="./dumped/",
                 exp_name="translate", exp_id="test", batch_size=32):
        # parse parameters
        parser = argparse.ArgumentParser(description="Translate sentences")

        # main parameters
        parser.add_argument("--dump_path", type=str, default=dump_path, help="Experiment dump path")
        parser.add_argument("--exp_name", type=str, default=exp_name, help="Experiment name")
        parser.add_argument("--exp_id", type=str, default=exp_id, help="Experiment ID")
        parser.add_argument("--batch_size", type=int, default=batch_size, help="Number of sentences per batch")

        # model / output paths
        parser.add_argument("--model_path", type=str, default=model_path, help="Model path")
        # parser.add_argument("--max_vocab", type=int, default=-1, help="Maximum vocabulary size (-1 to disable)")
        # parser.add_argument("--min_count", type=int, default=0, help="Minimum vocabulary count")

        # source language / target language
        parser.add_argument("--src_lang", type=str, default=src_lang, help="Source language")
        parser.add_argument("--tgt_lang", type=str, default=tgt_lang, help="Target language")
        parser.add_argument('-d', "--text", type=str, default="", nargs='+', help="Text to be translated")

        params = parser.parse_args()
        assert params.src_lang != '' and params.tgt_lang != '' and params.src_lang != params.tgt_lang

        # initialize the experiment
        logger = initialize_exp(params)

        # no GPU available, so load the checkpoint on CPU
        # reloaded = torch.load(params.model_path)
        reloaded = torch.load(params.model_path, map_location=torch.device('cpu'))
        model_params = AttrDict(reloaded['params'])
        self.supported_languages = model_params.lang2id.keys()
        logger.info("Supported languages: %s" % ", ".join(self.supported_languages))

        # update dictionary parameters (meta-trained checkpoints store them per task)
        for name in ['n_words', 'bos_index', 'eos_index', 'pad_index', 'unk_index', 'mask_index']:
            try:
                setattr(params, name, getattr(model_params, name))
            except AttributeError:
                key = list(model_params.meta_params.keys())[0]
                attr = getattr(model_params.meta_params[key], name)
                setattr(params, name, attr)
                setattr(model_params, name, attr)

        # build dictionary / build encoder / build decoder / reload weights
        self.dico = Dictionary(reloaded['dico_id2word'], reloaded['dico_word2id'], reloaded['dico_counts'])
        # self.encoder = TransformerModel(model_params, dico, is_encoder=True, with_output=True).cuda().eval()
        self.encoder = TransformerModel(model_params, self.dico, is_encoder=True, with_output=True).eval()
        # self.decoder = TransformerModel(model_params, dico, is_encoder=False, with_output=True).cuda().eval()
        self.decoder = TransformerModel(model_params, self.dico, is_encoder=False, with_output=True).eval()
        self.encoder.load_state_dict(reloaded['encoder'])
        self.decoder.load_state_dict(reloaded['decoder'])
        params.src_id = model_params.lang2id[params.src_lang]
        params.tgt_id = model_params.lang2id[params.tgt_lang]
        self.model_params = model_params
        self.params = params

    def translate(self, src_sent=[]):
        # accept a single string as well as a list of sentences
        flag = False
        if type(src_sent) == str:
            src_sent = [src_sent]
            flag = True
        tgt_sent = []
        for i in range(0, len(src_sent), self.params.batch_size):

            # prepare batch
            word_ids = [torch.LongTensor([self.dico.index(w) for w in s.strip().split()])
                        for s in src_sent[i:i + self.params.batch_size]]
            lengths = torch.LongTensor([len(s) + 2 for s in word_ids])
            batch = torch.LongTensor(lengths.max().item(), lengths.size(0)).fill_(self.params.pad_index)
            batch[0] = self.params.eos_index
            for j, s in enumerate(word_ids):
                if lengths[j] > 2:  # if sentence not empty
                    batch[1:lengths[j] - 1, j].copy_(s)
                batch[lengths[j] - 1, j] = self.params.eos_index
            langs = batch.clone().fill_(self.params.src_id)

            # encode source batch and translate it
            # encoded = self.encoder('fwd', x=batch.cuda(), lengths=lengths.cuda(), langs=langs.cuda(), causal=False)
            encoded = self.encoder('fwd', x=batch, lengths=lengths, langs=langs, causal=False)
            encoded = encoded.transpose(0, 1)
            # decoded, dec_lengths = self.decoder.generate(encoded, lengths.cuda(), self.params.tgt_id, max_len=int(1.5 * lengths.max().item() + 10))
            decoded, dec_lengths = self.decoder.generate(
                encoded, lengths, self.params.tgt_id,
                max_len=int(1.5 * lengths.max().item() + 10))

            # convert sentences to words
            for j in range(decoded.size(1)):

                # remove delimiters
                sent = decoded[:, j]
                delimiters = (sent == self.params.eos_index).nonzero().view(-1)
                assert len(delimiters) >= 1 and delimiters[0].item() == 0
                sent = sent[1:] if len(delimiters) == 1 else sent[1:delimiters[1]]

                # output translation
                source = src_sent[i + j].strip()
                target = " ".join([self.dico[sent[k].item()] for k in range(len(sent))])
                sys.stderr.write("%i / %i: %s -> %s\n" % (i + j, len(src_sent), source, target))
                tgt_sent.append(target)

        if flag:
            return tgt_sent[0]
        return tgt_sent
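# A small usage sketch for the Translate wrapper; the checkpoint name and the
# input sentence are illustrative, and the input is expected to already be
# BPE-tokenized the same way as the training data.
if __name__ == '__main__':
    translator = Translate('mlm_enfr_1024.pth', tgt_lang='fr', src_lang='en')
    print(translator.translate("Hello , how are you ?"))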
def main(params):
    # initialize the experiment
    logger = initialize_exp(params)

    # generate parser / parse parameters
    parser = get_parser()
    params = parser.parse_args()
    torch.manual_seed(params.seed)  # set random seed; multi-GPU also needs torch.cuda.manual_seed_all(params.seed)
    assert (params.sample_temperature == 0) or (params.beam_size == 1), 'Cannot sample with beam search.'
    assert params.amp <= 1, f'params.amp == {params.amp} not yet supported.'
    reloaded = torch.load(params.model_path)
    model_params = AttrDict(reloaded['params'])
    logger.info("Supported languages: %s" % ", ".join(model_params.lang2id.keys()))

    # update dictionary parameters
    for name in ['n_words', 'bos_index', 'eos_index', 'pad_index', 'unk_index', 'mask_index']:
        setattr(params, name, getattr(model_params, name))

    # build dictionary / build encoder / build decoder / reload weights
    dico = Dictionary(reloaded['dico_id2word'], reloaded['dico_word2id'], reloaded['dico_counts'])
    encoder = TransformerModel(model_params, dico, is_encoder=True, with_output=False).cuda().eval()
    decoder = TransformerModel(model_params, dico, is_encoder=False, with_output=True).cuda().eval()
    if all([k.startswith('module.') for k in reloaded['encoder'].keys()]):
        reloaded['encoder'] = {k[len('module.'):]: v for k, v in reloaded['encoder'].items()}
    encoder.load_state_dict(reloaded['encoder'])
    if all([k.startswith('module.') for k in reloaded['decoder'].keys()]):
        reloaded['decoder'] = {k[len('module.'):]: v for k, v in reloaded['decoder'].items()}
    decoder.load_state_dict(reloaded['decoder'])

    if params.amp != 0:
        models = apex.amp.initialize([encoder, decoder], opt_level=('O%i' % params.amp))
        encoder, decoder = models

    params.src_id = model_params.lang2id[params.src_lang]
    params.tgt_id = model_params.lang2id[params.tgt_lang]

    # read sentences from stdin
    src_sent = []
    for line in sys.stdin.readlines():
        assert len(line.strip().split()) > 0
        src_sent.append(line)
    logger.info("Read %i sentences from stdin. Translating ..." % len(src_sent))

    # f = io.open(params.output_path, 'w', encoding='utf-8')
    hypothesis = [[] for _ in range(params.beam_size)]

    for i in range(0, len(src_sent), params.batch_size):

        # prepare batch
        word_ids = [torch.LongTensor([dico.index(w) for w in s.strip().split()])
                    for s in src_sent[i:i + params.batch_size]]
        lengths = torch.LongTensor([len(s) + 2 for s in word_ids])
        batch = torch.LongTensor(lengths.max().item(), lengths.size(0)).fill_(params.pad_index)
        batch[0] = params.eos_index
        for j, s in enumerate(word_ids):
            if lengths[j] > 2:  # if sentence not empty
                batch[1:lengths[j] - 1, j].copy_(s)
            batch[lengths[j] - 1, j] = params.eos_index
        langs = batch.clone().fill_(params.src_id)

        # encode source batch and translate it
        encoded = encoder('fwd', x=batch.cuda(), lengths=lengths.cuda(), langs=langs.cuda(), causal=False)
        encoded = encoded.transpose(0, 1)
        max_len = int(1.5 * lengths.max().item() + 10)
        if params.beam_size == 1:
            decoded, dec_lengths = decoder.generate(
                encoded, lengths.cuda(), params.tgt_id, max_len=max_len,
                sample_temperature=(None if params.sample_temperature == 0 else params.sample_temperature))
        else:
            decoded, dec_lengths, all_hyp_strs = decoder.generate_beam(
                encoded, lengths.cuda(), params.tgt_id,
                beam_size=params.beam_size,
                length_penalty=params.length_penalty,
                early_stopping=params.early_stopping,
                max_len=max_len,
                output_all_hyps=True)
            # hypothesis.extend(convert_to_text(decoded, dec_lengths, dico, params))

        # convert sentences to words
        for j in range(decoded.size(1)):

            # remove delimiters
            sent = decoded[:, j]
            delimiters = (sent == params.eos_index).nonzero().view(-1)
            assert len(delimiters) >= 1 and delimiters[0].item() == 0
            sent = sent[1:] if len(delimiters) == 1 else sent[1:delimiters[1]]

            # output translation
            source = src_sent[i + j].strip().replace('<unk>', '<<unk>>')
            target = " ".join([dico[sent[k].item()] for k in range(len(sent))]).replace('<unk>', '<<unk>>')
            if params.beam_size == 1:
                hypothesis[0].append(target)
            else:
                for hyp_rank in range(params.beam_size):
                    print(all_hyp_strs[j][hyp_rank if hyp_rank < len(all_hyp_strs[j]) else -1])
                    hypothesis[hyp_rank].append(
                        all_hyp_strs[j][hyp_rank if hyp_rank < len(all_hyp_strs[j]) else -1])
            sys.stderr.write("%i / %i: %s -> %s\n"
                             % (i + j, len(src_sent), source.replace('@@ ', ''), target.replace('@@ ', '')))
            # f.write(target + "\n")

    # f.close()

    # export sentences to hypothesis files / restore BPE segmentation
    save_dir, split = params.output_path.rsplit('/', 1)
    for hyp_rank in range(len(hypothesis)):
        hyp_name = (f'hyp.st={params.sample_temperature}.bs={params.beam_size}.'
                    f'lp={params.length_penalty}.es={params.early_stopping}.'
                    f'seed={params.seed if (len(hypothesis) == 1) else str(hyp_rank)}.'
                    f'{params.src_lang}-{params.tgt_lang}.{split}.txt')
        hyp_path = os.path.join(save_dir, hyp_name)
        with open(hyp_path, 'w', encoding='utf-8') as f:
            f.write('\n'.join(hypothesis[hyp_rank]) + '\n')
        restore_segmentation(hyp_path)

        # evaluate BLEU score
        if params.ref_path:
            bleu = eval_moses_bleu(params.ref_path, hyp_path)
            logger.info("BLEU %s %s : %f" % (hyp_path, params.ref_path, bleu))
import os

import torch

from logging import getLogger
from src.utils import AttrDict
from src.data.dictionary import Dictionary, BOS_WORD, EOS_WORD, PAD_WORD, UNK_WORD, MASK_WORD
from src.model.transformer import TransformerModel

logger = getLogger()

# NOTE: remember to replace the model path here
model_path = './dumped/XLM_bora_es/abcedf/checkpoint.pth'
reloaded = torch.load(model_path)
params = AttrDict(reloaded['params'])
print("Supported languages: %s" % ", ".join(params.lang2id.keys()))

# build dictionary / update parameters
dico = Dictionary(reloaded['dico_id2word'], reloaded['dico_word2id'], reloaded['dico_counts'])
params.n_words = len(dico)
params.bos_index = dico.index(BOS_WORD)
params.eos_index = dico.index(EOS_WORD)
params.pad_index = dico.index(PAD_WORD)
params.unk_index = dico.index(UNK_WORD)
params.mask_index = dico.index(MASK_WORD)

# build model / reload weights
model = TransformerModel(params, dico, True, True)
model.load_state_dict(reloaded['model'])
model.eval()

codes = "./data/processed/XLM_bora_es/60k/codes"  # path to the BPE codes of the model
fastbpe = os.path.join(os.getcwd(), 'tools/fastBPE/fast')
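# A hedged sketch of applying BPE with the fast binary above before feeding
# sentences to the model. It shells out to `fast applybpe <output> <input> <codes>`,
# which matches fastBPE's documented CLI; the /tmp paths are illustrative.
def to_bpe(sentences, tmp_in='/tmp/sentences', tmp_out='/tmp/sentences.bpe'):
    # write raw sentences to a temporary file
    with open(tmp_in, 'w', encoding='utf-8') as f:
        f.write('\n'.join(sentences) + '\n')
    # apply BPE codes with the fast binary
    os.system('%s applybpe %s %s %s' % (fastbpe, tmp_out, tmp_in, codes))
    # read the BPE-tokenized sentences back
    with open(tmp_out, 'r', encoding='utf-8') as f:
        return [line.rstrip('\n') for line in f]

# print(to_bpe(['Hello world !']))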