def load_marian_model(self) -> MarianMTModel:
    """Build a MarianMTModel from the converted Marian state dict.

    Returns:
        MarianMTModel with weights loaded from ``self.state_dict``.

    Raises:
        ValueError: if the config or converted weights are inconsistent.
        NotImplementedError: when layernorm_embedding conversion is required.
    """
    state_dict, cfg = self.state_dict, self.hf_config
    # Explicit raises instead of `assert`: asserts are stripped under `-O`,
    # silently skipping these integrity checks.
    if not cfg.static_position_embeddings:
        raise ValueError("config.static_position_embeddings should be True")
    model = MarianMTModel(cfg)

    if "hidden_size" in cfg.to_dict():
        raise ValueError("hidden_size should not be in config")
    load_layers_(
        model.model.encoder.layers,
        state_dict,
        BART_CONVERTER,
    )
    load_layers_(model.model.decoder.layers, state_dict, BART_CONVERTER, is_decoder=True)

    # Handle tensors not associated with layers.
    wemb_tensor = torch.nn.Parameter(torch.FloatTensor(self.wemb))
    bias_tensor = torch.nn.Parameter(torch.FloatTensor(self.final_bias))
    model.model.shared.weight = wemb_tensor
    # Encoder and decoder share the single embedding matrix.
    model.model.encoder.embed_tokens = model.model.decoder.embed_tokens = model.model.shared

    model.final_logits_bias = bias_tensor

    if "Wpos" in state_dict:
        print("Unexpected: got Wpos")
        wpos_tensor = torch.tensor(state_dict["Wpos"])
        model.model.encoder.embed_positions.weight = wpos_tensor
        model.model.decoder.embed_positions.weight = wpos_tensor

    if cfg.normalize_embedding:
        if "encoder_emb_ln_scale_pre" not in state_dict:
            raise ValueError("encoder_emb_ln_scale_pre is not in state dict")
        raise NotImplementedError("Need to convert layernorm_embedding")

    if self.extra_keys:
        raise ValueError(f"Failed to convert {self.extra_keys}")
    if model.model.shared.padding_idx != self.pad_token_id:
        raise ValueError(
            f"Padding tokens {model.model.shared.padding_idx} and {self.pad_token_id} mismatched"
        )
    return model
def combobox_changed(self):
    """Handle an input-language change: set text alignment and lazily load
    the matching Marian model/tokenizer (the three branches were identical
    except for language-specific values, so they are table-driven now)."""
    # language -> (attribute suffix, text alignment, pretrained model path)
    choices = {
        "English": ("en", Qt.AlignLeft, model_en_path),
        "Russian": ("ru", Qt.AlignLeft, model_ru_path),
        "Hebrew": ("he", Qt.AlignRight, model_he_path),
    }
    selected = self.inputComboBox.currentText()
    if selected in choices:
        suffix, alignment, path = choices[selected]
        self.inputTextEdit.setAlignment(alignment)
        if getattr(self, f"model_{suffix}") is None:
            # Arabic: "Loading the dictionary, please wait"
            self.label.setText('جاري تحميل القاموس، الرجاء الانتظار')
            self.repaint()
            setattr(self, f"tokenizer_{suffix}",
                    MarianTokenizer.from_pretrained(path))
            setattr(self, f"model_{suffix}",
                    MarianMTModel.from_pretrained(path))
            # Arabic: "Dictionary loaded"
            self.label.setText('تم تحميل القاموس')
    self.repaint()
def load_marian_model(self) -> MarianMTModel:
    """Build a MarianMTModel from the converted Marian state dict.

    Supports both tied and untied source/target embeddings, selected by the
    original Marian config key ``tied-embeddings-src``.

    Returns:
        MarianMTModel with converted weights loaded.

    Raises:
        ValueError: if the config or converted weights are inconsistent.
        NotImplementedError: when layernorm_embedding conversion is required.
    """
    state_dict, cfg = self.state_dict, self.hf_config
    if not cfg.static_position_embeddings:
        raise ValueError(
            "config.static_position_embeddings should be True")
    model = MarianMTModel(cfg)

    if "hidden_size" in cfg.to_dict():
        raise ValueError("hidden_size is in config")
    load_layers_(
        model.model.encoder.layers,
        state_dict,
        BART_CONVERTER,
    )
    load_layers_(model.model.decoder.layers, state_dict, BART_CONVERTER, is_decoder=True)

    # Handle tensors not associated with layers.
    if self.cfg["tied-embeddings-src"]:
        # One shared embedding matrix for encoder and decoder.
        wemb_tensor = nn.Parameter(torch.FloatTensor(self.wemb))
        bias_tensor = nn.Parameter(torch.FloatTensor(self.final_bias))
        model.model.shared.weight = wemb_tensor
        model.model.encoder.embed_tokens = model.model.decoder.embed_tokens = model.model.shared
    else:
        # Separate source and target vocabularies/embeddings.
        wemb_tensor = nn.Parameter(torch.FloatTensor(self.wemb))
        model.model.encoder.embed_tokens.weight = wemb_tensor
        decoder_wemb_tensor = nn.Parameter(torch.FloatTensor(self.dec_wemb))
        bias_tensor = nn.Parameter(torch.FloatTensor(self.final_bias))
        model.model.decoder.embed_tokens.weight = decoder_wemb_tensor
    model.final_logits_bias = bias_tensor

    if "Wpos" in state_dict:
        print("Unexpected: got Wpos")
        wpos_tensor = torch.tensor(state_dict["Wpos"])
        model.model.encoder.embed_positions.weight = wpos_tensor
        model.model.decoder.embed_positions.weight = wpos_tensor

    if cfg.normalize_embedding:
        # Idiom fix: `x not in d` instead of `not (x in d)`.
        if "encoder_emb_ln_scale_pre" not in state_dict:
            raise ValueError(
                "encoder_emb_ln_scale_pre is not in state dictionary")
        raise NotImplementedError("Need to convert layernorm_embedding")

    if self.extra_keys:
        raise ValueError(f"Failed to convert {self.extra_keys}")
    if model.get_input_embeddings().padding_idx != self.pad_token_id:
        raise ValueError(
            f"Padding tokens {model.get_input_embeddings().padding_idx} and {self.pad_token_id} mismatched"
        )
    return model
def __init__(
    self,
    context,
    translation_artifacts_english,
    translation_artifacts_spanish,
    model="microsoft/DialoGPT-small",
    tokenizer="microsoft/DialoGPT-small",
    translate=True,
    sentiment_analisis=False,
    seed=44,
):
    """Deep-learning chatbot with optional English<->Spanish translation.

    Args:
        context (str): initial conversation context, one utterance per line
            (it is split on "\\n" and joined with the generator's EOS token).
        translation_artifacts_english (dict): dictionary of translation artifacts
            for the English direction.
        translation_artifacts_spanish (dict): dictionary of translation artifacts
            for the Spanish direction.
        model (str, optional): Hugging Face model id for the text-generation pipeline.
        tokenizer (str, optional): Hugging Face tokenizer id for the pipeline.
        translate (bool, optional): whether input and output are translated.
        sentiment_analisis (bool, optional): whether to enable VADER sentiment analysis.
        seed (int, optional): random seed. Defaults to 44.
    """
    self.generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
    self.translate = translate
    self.context = context
    self.translation_artifacts_english = translation_artifacts_english
    self.translation_artifacts_spanish = translation_artifacts_spanish
    self.sentiment_analisis = sentiment_analisis
    # Join context lines with the EOS token so the generator sees turn boundaries.
    self.parsed_context = self.generator.tokenizer.eos_token.join(
        context.split("\n"))
    self.temporal_context = []
    set_seed(seed)
    if sentiment_analisis:
        self.sentiment_engine = SentimentIntensityAnalyzer()
    if translate:
        # ENG -> SPANISH
        self.model_name_en_t_es = "Helsinki-NLP/opus-mt-en-ROMANCE"
        self.tokenizer_en_t_es = MarianTokenizer.from_pretrained(
            self.model_name_en_t_es)
        self.model_en_t_es = MarianMTModel.from_pretrained(
            self.model_name_en_t_es)
        # ESP -> ENGLISH
        self.model_name_es_t_en = "Helsinki-NLP/opus-mt-ROMANCE-en"
        self.tokenizer_es_t_en = MarianTokenizer.from_pretrained(
            self.model_name_es_t_en)
        self.model_es_t_en = MarianMTModel.from_pretrained(
            self.model_name_es_t_en)
def get_model_tokenizer_files(romance_lang: str = "ROMANCE"):
    """Load the en->romance and romance->en Marian model/tokenizer pairs.

    Returns:
        (en_model, en_tokenizer, target_model, target_tokenizer)
    """
    forward_name = f"Helsinki-NLP/opus-mt-en-{romance_lang}"
    backward_name = f"Helsinki-NLP/opus-mt-{romance_lang}-en"

    target_tokenizer = MarianTokenizer.from_pretrained(forward_name)
    target_model = MarianMTModel.from_pretrained(forward_name)

    en_tokenizer = MarianTokenizer.from_pretrained(backward_name)
    en_model = MarianMTModel.from_pretrained(backward_name)

    return en_model, en_tokenizer, target_model, target_tokenizer
def __init__(self, language):
    """Back-translation helper loading en->language and language->en Marian models.

    Args:
        language: language code used in the Helsinki-NLP opus-mt model names
            (e.g. "fr").
    """
    self.language = language
    # Bug fix: the original hard-coded .to('cuda') and crashed on CPU-only
    # machines; fall back to CPU when CUDA is unavailable.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    target_model_name = f'Helsinki-NLP/opus-mt-en-{self.language}'
    self.target_tokenizer = MarianTokenizer.from_pretrained(target_model_name)
    self.target_model = MarianMTModel.from_pretrained(target_model_name).to(device)
    en_model_name = f'Helsinki-NLP/opus-mt-{self.language}-en'
    self.en_tokenizer = MarianTokenizer.from_pretrained(en_model_name)
    self.en_model = MarianMTModel.from_pretrained(en_model_name).to(device)
def _create_models_and_tokenizers(self):
    """Load one or two Marian model/tokenizer pairs named in self.models_list.

    Returns:
        (model1, tokenizer1, model2, tokenizer2); the second pair is
        (None, None) when only one model name is given.

    Raises:
        ValueError: if models_list does not have length 1 or 2.
    """
    # Validate first so we fail before downloading anything; also fixes the
    # garbled error message ("Is excpeted that the argiment...").
    if len(self.models_list) not in (1, 2):
        raise ValueError("Expected the argument models_list to have length 1 or 2")
    model1 = MarianMTModel.from_pretrained(self.models_list[0])
    tokenizer1 = MarianTokenizer.from_pretrained(self.models_list[0])
    model2 = None
    tokenizer2 = None
    if len(self.models_list) == 2:
        model2 = MarianMTModel.from_pretrained(self.models_list[1])
        tokenizer2 = MarianTokenizer.from_pretrained(self.models_list[1])
    return model1, tokenizer1, model2, tokenizer2
def main():
    """Translate an input file line-by-line in batches with a MarianMT model."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_name', type=str, required=True)
    parser.add_argument('--input_path', type=str, required=True)
    parser.add_argument('--output_path', type=str, required=True)
    parser.add_argument('--batch_size', type=int, default=32)
    args = parser.parse_args()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    tokenizer = MarianTokenizer.from_pretrained(args.model_name)
    model = MarianMTModel.from_pretrained(args.model_name).to(device)
    # Bug fix: the output handle was opened without a context manager and
    # leaked if translation raised; both files are now managed by `with`.
    with open(args.input_path) as src, open(args.output_path, 'w') as tgt:
        batch = []
        for s in src.read().split('\n'):
            if not s:  # skip empty lines
                continue
            batch.append(s)
            if len(batch) == args.batch_size:
                for t in translate(model, tokenizer, device, batch):
                    tgt.write(t + '\n')
                batch = []
        # Flush the final partial batch.
        if batch:
            for t in translate(model, tokenizer, device, batch):
                tgt.write(t + '\n')
def translateDocx(source, target, file):
    """Translate the text blocks of a .docx file and return a new Document.

    Args:
        source: source language code ('en' or 'de').
        target: target language code ('de' or 'en').
        file: path or file object accepted by docx2txt.process.

    Returns:
        python-docx Document containing the translated paragraphs.

    Raises:
        ValueError: for unsupported language pairs.
    """
    global model_name
    global tokenizer
    global model
    pairs = {
        ('en', 'de'): 'Helsinki-NLP/opus-mt-en-de',
        ('de', 'en'): 'Helsinki-NLP/opus-mt-de-en',
    }
    # Bug fix: an unsupported pair previously reused whatever model_name
    # was left in the global from an earlier call.
    if (source, target) not in pairs:
        raise ValueError(f"Unsupported language pair: {source}-{target}")
    model_name = pairs[(source, target)]
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = MarianMTModel.from_pretrained(model_name)
    text = docx2txt.process(file).split("\n")
    text = list(filter(None, text))
    text = [s for s in text if p.match(s)]
    # NOTE(review): the ">>de<<" target-language tag is prepended even for
    # de->en translation — confirm whether this is intended.
    text = [">>de<< " + s for s in text]
    i = 0
    document = Document()
    for textblock in chunks(text, 2):
        i = i + 1
        print("batch #%i (len: %s)" % (i, len(textblock)), file=sys.stderr)
        print("\t " + str(tuple(textblock)))
        # Renamed from `target`, which shadowed the language parameter.
        translated, duration = translat(textblock)
        document.add_paragraph(translated)
        print("\t " + str(tuple(translated)))
        print('translate took {:.3f} ms'.format(duration), file=sys.stderr)
        print("\n\n")
    end_time = time.time()
    # Bug fix: elapsed time is end - start (was start - end, always negative).
    # NOTE(review): start_time is presumably a module-level global — confirm.
    duration = (end_time - start_time) * 1000.0
    print('Total translate took {:.3f} ms'.format(duration))
    return document
def get_docx_text(lst):
    """Translate each text blob in `lst` to German and save the result as new.docx.

    Args:
        lst: iterable of multi-line strings to translate.
    """
    global model_name
    global tokenizer
    global model
    model_name = 'Helsinki-NLP/opus-mt-en-de'
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = MarianMTModel.from_pretrained(model_name)
    # Bug fix: the loop variable was named `str`, shadowing the builtin and
    # making the `str(tuple(...))` calls below raise TypeError.
    for blob in lst:
        print(blob)
        text = blob.split("\n")
        text = list(filter(None, text))
        text = [s for s in text if p.match(s)]
        text = [">>de<< " + s for s in text]
        i = 0
        document = Document()
        for textblock in chunks(text, 5):
            i = i + 1
            print("batch #%i (len: %s)" % (i, len(textblock)), file=sys.stderr)
            print("\t " + str(tuple(textblock)))
            target, duration = translat(textblock)
            document.add_paragraph(target)
            print("\t " + str(tuple(target)))
            print('translate took {:.3f} ms'.format(duration), file=sys.stderr)
            print("\n\n")
        end_time = time.time()
        # NOTE(review): each blob overwrites the same output file "new.docx" —
        # confirm whether per-blob filenames were intended.
        document.save("new.docx")
def replace_string2(filename):
    """Attempt to translate the paragraph text of a .docx file.

    NOTE(review): this function has several apparent defects, flagged inline;
    confirm intended behavior before relying on it.
    """
    global model_name
    global tokenizer
    global model
    model_name = 'Helsinki-NLP/opus-mt-en-de'
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = MarianMTModel.from_pretrained(model_name)
    # A .docx is a zip archive; the main body lives in word/document.xml.
    document = zipfile.ZipFile(filename)
    xml_content = document.read('word/document.xml')
    #document.close()
    tree = XML(xml_content)
    # using lxml instead of xml preserved the comments
    paragraphs = []  # NOTE(review): never populated or used
    i = 0
    for paragraph in tree.iter(PARA):
        i = i + 1
        # Collect the non-empty text nodes of this paragraph.
        texts = [node.text for node in paragraph.iter(TEXT) if node.text]
        if texts:
            #text = list(filter(None, text))
            #text = [s for s in text if p.match(s)]
            #text = [">>de<< " + s for s in text]
            #print("%s: %s" %(i,texts))
            target, duration = translat(texts)
            # NOTE(review): str.replace expects strings but `texts` is a list,
            # and the return value is discarded — the XML tree is never
            # actually modified here.
            paragraph.text.replace(texts, target)
    # NOTE(review): zipfile.ZipFile has no save() method — this line raises
    # AttributeError; the (unmodified) XML is never written back out.
    document.save("new.docx")
def __init__(self, source_language: str, target_language: str):
    """Translator for source_language -> target_language using the matching
    Helsinki-NLP opus-mt model, placed on GPU when one is available."""
    pretrained = f"Helsinki-NLP/opus-mt-{source_language}-{target_language}"
    use_cuda = torch.cuda.is_available()
    self.device = torch.device("cuda" if use_cuda else "cpu")
    self.model = MarianMTModel.from_pretrained(pretrained).to(self.device)
    self.tokenizer = MarianTokenizer.from_pretrained(pretrained)
def replace_string2(filename):
    """Translate every paragraph of a .docx file in place and save as test.docx.

    Returns:
        1 on completion.
    """
    global model_name
    global tokenizer
    global model
    model_name = 'Helsinki-NLP/opus-mt-en-de'
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = MarianMTModel.from_pretrained(model_name)
    doc = Document(filename)
    for paragraph in doc.paragraphs:
        # Skip empty paragraphs; non-empty ones are translated one at a time.
        for original_text in filter(None, [paragraph.text]):
            for translated in translat([original_text]):
                print(original_text)
                print(translated)
                updated = paragraph.text.replace(original_text, translated)
                # Reassigning .text drops formatting, so save/restore the style.
                saved_style = paragraph.style
                paragraph.text = updated
                paragraph.style = saved_style
    # doc.save(filename)
    doc.save('test.docx')
    return 1
def main():
    """Translate a labeled corpus file with MarianMT and write the result."""
    parser = argparse.ArgumentParser(description='translating using MarianMT in transformers huggingface library')
    parser.add_argument('-i', '--input', default=sys.stdin.fileno(), help='The input file (defaults to stdin)')
    parser.add_argument('-o', '--output', default=sys.stdout.fileno(), help='The output annotated file (defaults to stdout)')
    parser.add_argument('--encoding', default='utf-8', help='The character encoding for input/output '
                        '(it defaults to UTF-8)')
    parser.add_argument('--src', help='The source language')
    parser.add_argument('--tgt', help='The target language')
    # Bug fix: '-c, --chunks' as a single string registered one malformed
    # option whose dest was not `chunks`; short and long forms must be
    # separate arguments.
    parser.add_argument('-c', '--chunks', help='Number of chunks to divide the corpus')
    args = parser.parse_args()
    model_name = f"Helsinki-NLP/opus-mt-{args.src}-{args.tgt}"
    model = MarianMTModel.from_pretrained(model_name)
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    # Bug fix: the input file was opened twice and the first handle leaked;
    # both files are now opened once, under context managers.
    with open(args.input, 'r', encoding=args.encoding) as f:
        src_lines = f.readlines()
    labels, words = get_label_doc(src_lines)
    tgt_text = translate(words, args.chunks, model, tokenizer)
    with open(args.output, mode='w', encoding=args.encoding) as output_file:
        write_to_file(output_file, labels, tgt_text)
def main(args):
    """Interactively generate formality-conditioned translations for args.input_text.

    Loads the Marian translation model, the conditioning model from a
    checkpoint, then repeatedly runs predict_formality and drops into pdb.
    """
    with open(args.dataset_info, 'rb') as rf:
        # NOTE(review): pickle.load on an untrusted file is unsafe — only use
        # with trusted dataset_info files.
        dataset_info = pickle.load(rf)
    tokenizer = MarianTokenizer.from_pretrained(args.model_string)
    tokenizer.add_special_tokens({'pad_token': PAD_TOKEN})
    pad_id = tokenizer.encode(PAD_TOKEN)[0]
    model = MarianMTModel.from_pretrained(args.model_string, return_dict=True).to(args.device)
    model.eval()
    checkpoint = torch.load(args.ckpt, map_location=args.device)
    model_args = checkpoint['args']
    conditioning_model = Model(model_args, pad_id, len(dataset_info.index2word))
    # no need to get the glove embeddings when reloading since they're saved in model ckpt anyway
    conditioning_model.load_state_dict(checkpoint['state_dict'])
    conditioning_model = conditioning_model.to(args.device)
    conditioning_model.eval()
    print("=> loaded checkpoint '{}' (epoch {})"
          .format(args.ckpt, checkpoint['epoch']))
    print('num params', num_params(conditioning_model))
    while True:
        results = predict_formality(model, tokenizer, conditioning_model,
                                    [args.input_text],
                                    dataset_info,
                                    precondition_topk=args.precondition_topk,
                                    do_sample=args.do_sample,
                                    length_cutoff=args.length_cutoff,
                                    condition_lambda=args.condition_lambda,
                                    device=args.device)
        print(results)
        # The pdb breakpoint is the (ad-hoc) interaction point of this loop:
        # it pauses after each prediction so the user can inspect/continue.
        # Without it the `while True` would spin forever on the same input.
        import pdb; pdb.set_trace()
def load_model(self, model_name):
    """Return (tokenizer, model) for model_name, loading and caching on demand.

    The least-recently-used entry is evicted when the cache reaches
    self.max_loaded_models.

    Args:
        model_name: Hugging Face model id.

    Returns:
        (tokenizer, model) tuple.
    """
    if model_name in self.models:
        # Cache hit: refresh recency and return.
        entry = self.models[model_name]
        entry['last_loaded'] = time.time()
        return entry['tokenizer'], entry['model']

    logger.info("Load model: " + model_name)
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = MarianMTModel.from_pretrained(model_name)
    model.eval()

    if len(self.models) >= self.max_loaded_models:
        # Bug fix: the eviction loop previously reused the name `model_name`,
        # clobbering the requested model's name so the new entry was stored
        # under an evicted model's key.
        oldest_model = None
        oldest_time = time.time()
        for cached_name, entry in self.models.items():
            if entry['last_loaded'] <= oldest_time:
                oldest_model = cached_name
                oldest_time = entry['last_loaded']
        del self.models[oldest_model]

    self.models[model_name] = {
        'tokenizer': tokenizer,
        'model': model,
        'last_loaded': time.time()
    }
    return tokenizer, model
def __init__(self, model_name_or_path: str, max_seq_length: int = 128,
             task="translation",
             model_args: Optional[Dict] = None, cache_dir: Optional[str] = None,
             freeze_encoder=False):
    """Wrap a Marian encoder-decoder model for sentence-transformer-style use.

    Args:
        model_name_or_path: Hugging Face model id or local path.
        max_seq_length: maximum tokenized sequence length.
        task: task label stored on the instance (default "translation").
        model_args: extra kwargs forwarded to AutoConfig.from_pretrained
            (defaults to {}).
        cache_dir: optional Hugging Face cache directory.
        freeze_encoder: whether the encoder should be frozen (stored; used
            by add_pooling_layer / downstream logic).
    """
    super(EncDecModel, self).__init__()
    # Bug fix: `model_args: Dict = {}` was a mutable default argument shared
    # across calls; use None and create a fresh dict per call.
    if model_args is None:
        model_args = {}
    self.config_keys = ['max_seq_length']
    self.max_seq_length = max_seq_length

    config = AutoConfig.from_pretrained(model_name_or_path, **model_args, cache_dir=cache_dir)
    self.model = MarianMTModel.from_pretrained(model_name_or_path, config=config, cache_dir=cache_dir)
    self.tokenizer = MarianTokenizer.from_pretrained(model_name_or_path, cache_dir=cache_dir)

    self.config = self.model.config
    self.config_class = self.model.config_class
    #self.device = self.model.device
    self.dtype = self.model.dtype
    self.task = task
    self.output_attentions = True
    #self.output_hidden_states = True
    self.config.output_attentions = True
    #self.config.output_hidden_states = True
    self.freeze_encoder = freeze_encoder
    self.add_pooling_layer()
def __init__(self, model_name=None, device=None, half=False):
    """
    basic wrapper around MarianMT model for language translation

    Args:
        model_name(str): Helsinki-NLP model
        device(str): device to use (e.g., 'cuda', 'cpu')
        half(bool): If True, use half precision.
    """
    # Bug fix: with the default model_name=None, `'Helsinki-NLP' not in None`
    # raised TypeError instead of the intended ValueError; check None first.
    if model_name is None or 'Helsinki-NLP' not in model_name:
        raise ValueError(
            'Translator requires a Helsinki-NLP model: https://huggingface.co/Helsinki-NLP'
        )
    try:
        import torch
    except ImportError:
        raise Exception('Translator requires PyTorch to be installed.')
    self.torch_device = device
    if self.torch_device is None:
        self.torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
    from transformers import MarianMTModel, MarianTokenizer
    self.tokenizer = MarianTokenizer.from_pretrained(model_name)
    self.model = MarianMTModel.from_pretrained(model_name).to(
        self.torch_device)
    if half:
        self.model = self.model.half()
def __init__(self):
    """English -> French translator backed by the Helsinki-NLP opus-mt model."""
    self.src = 'en'
    self.trg = 'fr'
    self.mname = 'Helsinki-NLP/opus-mt-{}-{}'.format(self.src, self.trg)
    self.tokenizer = MarianTokenizer.from_pretrained(self.mname)
    self.model = MarianMTModel.from_pretrained(self.mname)
def test_generate_fp16(self):
    """Smoke-test generation, in half precision when running on CUDA."""
    config, input_dict = self.model_tester.prepare_config_and_inputs()
    input_ids = input_dict["input_ids"]
    # Attention mask: attend everywhere except positions equal to 1
    # (presumably the pad token id — confirm against the model tester).
    attention_mask = input_ids.ne(1).to(torch_device)
    model = MarianMTModel(config).eval().to(torch_device)
    if torch_device == "cuda":
        # fp16 is only exercised on GPU.
        model.half()
    model.generate(input_ids, attention_mask=attention_mask)
    # Second call has no input_ids: exercises generation from the default
    # start token with sampling and beam-search settings.
    model.generate(num_beams=4, do_sample=True, early_stopping=False, num_return_sequences=3)
def TranslatePt2En(text):
    """Translate Portuguese (Romance-language) text to English.

    Args:
        text: a string or list of strings to translate.

    Returns:
        List of translated strings.
    """
    model_id = 'Helsinki-NLP/opus-mt-roa-en'
    model = MarianMTModel.from_pretrained(model_id)
    tokenizer = MarianTokenizer.from_pretrained(model_id)

    # Tokenize, generate, and decode the translation.
    encoded = tokenizer(text, return_tensors="pt", padding=True)
    generated = model.generate(**encoded)
    return tokenizer.batch_decode(generated, skip_special_tokens=True)
def __init__(self, target_language: str, device='auto'):
    """Korean back-translation augmenter via ko->target and target->ko models.

    Args:
        target_language: pivot language code (e.g. 'en').
        device: 'auto' to pick cuda/cpu automatically, or an explicit device.
    """
    super(BackTranslation, self).__init__()
    target_model_name = f'Helsinki-NLP/opus-mt-ko-{target_language}'
    self.tar_tokenizer = MarianTokenizer.from_pretrained(target_model_name)
    self.tar_model = MarianMTModel.from_pretrained(target_model_name)
    source_model_name = f'Helsinki-NLP/opus-mt-{target_language}-ko'
    self.src_tokenizer = MarianTokenizer.from_pretrained(source_model_name)
    self.src_model = MarianMTModel.from_pretrained(source_model_name)
    if device == 'auto':
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
    else:
        self.device = device
    # Bug fix: models were moved with the raw `device` argument, so
    # device='auto' passed the literal string 'auto' to .to(); use the
    # resolved self.device instead.
    self.tar_model.to(self.device)
    self.src_model.to(self.device)
def __init__(self, src_lang: str, trg_lang: str):
    """Wrap the Helsinki-NLP opus-mt model for the src_lang->trg_lang pair,
    with custom BOS/EOS tokens on the tokenizer."""
    super(Marian, self).__init__()
    pretrained_name = f'Helsinki-NLP/opus-mt-{src_lang}-{trg_lang}'
    self.model = MarianMTModel.from_pretrained(pretrained_name,
                                               normalize_embedding=True)
    self.tokenizer = MarianTokenizer.from_pretrained(pretrained_name,
                                                     bos_token='<bos>',
                                                     eos_token='<eos>')
    # Tokens added later via the tokenizer are tracked here.
    self._added_tokens = []
def get_model(param):
    """
    Load a Hugging Face Marian machine-translation model and tokenizer.

    :param param: Huggingface MarianMT Helsinki-NLP/{model_name} to load
        (https://huggingface.co/Helsinki-NLP);
        param[0]=label - param[1]=model_name
    :return: a tuple (Huggingface MarianMT model, Marian MT tokenizer, label)
    """
    label = param[0]
    name = param[1]
    mt_model = MarianMTModel.from_pretrained(name)
    mt_tokenizer = MarianTokenizer.from_pretrained(name)
    return mt_model, mt_tokenizer, label
def __init__(self, src="en", trg="ro", use_cuda=True):
    """Translator for src->trg using the Helsinki-NLP opus-mt model.

    Args:
        src: source language code.
        trg: target language code.
        use_cuda: move the model to GPU when True.
    """
    self.src = src
    self.trg = trg
    self.use_cuda = use_cuda
    self.mname = f"Helsinki-NLP/opus-mt-{self.src}-{self.trg}"
    with torch.no_grad():
        loaded = MarianMTModel.from_pretrained(self.mname)
        # Optionally place the model on GPU.
        self.model = loaded.cuda() if self.use_cuda else loaded
        self.tok = MarianTokenizer.from_pretrained(self.mname)
def load_model(self, route):
    """Load the local opus-mt model/tokenizer for a translation route.

    Args:
        route: language-pair string such as 'en-de' (selects the
            opus-mt-{route} folder under self.models_dir).

    Returns:
        (1, success message) and caches (model, tokenizer) in self.models,
        or (0, error message) when the local model cannot be loaded.
    """
    path = os.path.join(self.models_dir, f'opus-mt-{route}')
    try:
        model = MarianMTModel.from_pretrained(path)
        tok = MarianTokenizer.from_pretrained(path)
    # Bug fix: was a bare `except:`, which also swallowed KeyboardInterrupt
    # and SystemExit; keep the best-effort behavior but narrow to Exception.
    except Exception:
        return 0, f"Make sure you have downloaded model for {route} translation"
    self.models[route] = (model, tok)
    # Fixed typo in the success message ("transation" -> "translation").
    return 1, f"Successfully loaded model for {route} translation"
def translate(sentences, inp_lang, out_lang):
    """Translate a list of sentences from inp_lang to out_lang with opus-mt.

    Args:
        sentences: iterable of strings to translate.
        inp_lang: source language code.
        out_lang: target language code.

    Returns:
        List of translated strings, one per input sentence.
    """
    name = "Helsinki-NLP/opus-mt-{}-{}".format(inp_lang, out_lang)
    tokenizer = MarianTokenizer.from_pretrained(name)
    model = MarianMTModel.from_pretrained(name)
    output = []
    for sentence in tqdm(sentences):
        # Bug fix: MarianTokenizer.prepare_translation_batch was removed from
        # transformers; calling the tokenizer directly is the supported path.
        batch = tokenizer([sentence], return_tensors="pt", padding=True)
        translated = model.generate(**batch)
        output.extend([tokenizer.decode(t, skip_special_tokens=True) for t in translated])
    return output
def get_models(
    src: str, tgt: str, verbose: int = 0
) -> Tuple[
    transformers.models.marian.tokenization_marian.MarianTokenizer,
    transformers.models.marian.tokenization_marian.MarianTokenizer,
    transformers.models.marian.modeling_marian.MarianMTModel,
    transformers.models.marian.modeling_marian.MarianMTModel,
]:
    """Load the forward (src->tgt) and backward (tgt->src) Marian models.

    Args:
        src: source language code.
        tgt: target language code.
        verbose: print the model names being loaded when > 0.

    Returns:
        (tokenizer_to, tokenizer_from, model_to, model_from)
    """
    # Bug fix: the src/tgt parameters were ignored — the names were formatted
    # from the module globals SRC_TO/TGT_TO instead of the arguments.
    model_to = "Helsinki-NLP/opus-mt-{src}-{tgt}".format(src=src, tgt=tgt)
    model_from = "Helsinki-NLP/opus-mt-{src}-{tgt}".format(src=tgt, tgt=src)
    if verbose > 0:
        print("Loading models: {} and {}".format(model_to, model_from))
    tokenizer_to = MarianTokenizer.from_pretrained(model_to)
    model_to = MarianMTModel.from_pretrained(model_to)
    tokenizer_from = MarianTokenizer.from_pretrained(model_from)
    model_from = MarianMTModel.from_pretrained(model_from)
    return tokenizer_to, tokenizer_from, model_to, model_from
def get_batch_opustranslator(src, tgt):
    """Return a callable that translates a batch of src-language strings to tgt.

    Args:
        src: source language code.
        tgt: target language code.

    Returns:
        Function mapping a list of strings to a list of translations.
    """
    from transformers import MarianTokenizer, MarianMTModel
    model_name = f'Helsinki-NLP/opus-mt-{src}-{tgt}'
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = MarianMTModel.from_pretrained(model_name)

    def translator(texts):
        # Bug fix: prepare_seq2seq_batch was deprecated and removed from
        # transformers; calling the tokenizer directly is the supported path.
        batch = tokenizer(texts, return_tensors="pt", padding=True)
        return tokenizer.batch_decode(model.generate(**batch),
                                      skip_special_tokens=True)

    return translator
def __init__(self,
             src_lang: str = "en",
             tgt_lang: str = "fr",
             device: str = "cpu"):
    """Marian-based translator for the given language pair on the given device."""
    super().__init__(src_lang, tgt_lang, device)
    self.model_name = f"Helsinki-NLP/opus-mt-{src_lang}-{tgt_lang}"
    self.tokenizer = MarianTokenizer.from_pretrained(self.model_name)
    self.model = MarianMTModel.from_pretrained(self.model_name).to(self.device)