import json

from requests.exceptions import ReadTimeout  # googletrans's HTTP layer can raise this on slow responses


def addTranslation(alljobs, outputFile, method="google", fallback=True, maxWrite=None):
    if method == "google":
        from googletrans import Translator
        modelGoogle = Translator()
    if method == "easynmt" or fallback:
        from easynmt import EasyNMT
        modelNMT = EasyNMT('opus-mt')

    # maxWrite=None processes every entry; the previous default of -1
    # silently dropped the last entry when slicing.
    for i, entry in enumerate(alljobs[:maxWrite]):
        entry["CLEANED_JOBS"] = textCleaner(entry["CONTENT"])

        # There seems to be a maximum number of characters allowed for translation,
        # so split long texts in half at a sentence boundary.
        sentences = entry["CLEANED_JOBS"].split(". ")
        chunks = [". ".join(sentences)]
        entry["TRANSLATED_JOBS"] = ""
        if len(entry["CLEANED_JOBS"]) > 5000:
            mid = int(len(sentences) / 2)
            chunks = [". ".join(sentences[:mid]), ". ".join(sentences[mid:])]

        for chunk in chunks:
            if method == "google":
                try:
                    translate = modelGoogle.translate(chunk, dest="en")
                    translate = translate.text
                except (TypeError, AttributeError, IndexError, ReadTimeout) as e:
                    print("Failed to translate entry", i)
                    print("Error was:", e)
                    if fallback:
                        print("Trying instead with EasyNMT")
                        translate = modelNMT.translate(chunk, target_lang="en")
                    else:
                        print("Leaving untranslated!")
                        # Keep only this chunk; appending the full cleaned text
                        # here duplicated the whole document per chunk.
                        translate = chunk
            else:
                translate = modelNMT.translate(chunk, target_lang="en")
            entry["TRANSLATED_JOBS"] += translate

        entry["TOKENIZED_JOBS"] = sentenceSplitter(entry["TRANSLATED_JOBS"])

    if outputFile:
        with open(outputFile, "w") as f:
            json.dump(alljobs[:maxWrite], f)
    return alljobs
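# Usage sketch (not part of the original module): the job list, file name, and
# example strings below are illustrative. Assumes textCleaner and
# sentenceSplitter are defined alongside addTranslation.
if __name__ == "__main__":
    jobs = [{"CONTENT": "Dies ist eine Stellenanzeige."},
            {"CONTENT": "Une autre offre d'emploi."}]
    # Translate with Google first, falling back to EasyNMT on errors,
    # and write the enriched entries to disk.
    translated = addTranslation(jobs, "translated_jobs.json", method="google", fallback=True)
    print(translated[0]["TRANSLATED_JOBS"])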
from easynmt import EasyNMT


def load_model(model_name='m2m_100_418M'):
    """
    Load an EasyNMT model.

    :param model_name: name of the model to load - for the list of supported models visit:
        https://github.com/UKPLab/EasyNMT#available-models
    :return: Machine translation model
    """
    return EasyNMT(model_name)
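# Quick usage sketch (illustrative): load a model and translate one sentence.
model = load_model('opus-mt')
print(model.translate('Hallo Welt', target_lang='en'))  # => "Hello world"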
from easynmt import EasyNMT
from scipy import spatial

# Vectorizer and SentenceSimilarity_abstract are project-level dependencies,
# assumed to be importable from elsewhere in this repository.


class SentenceSimilarity_translationBased(SentenceSimilarity_abstract):

    def __init__(self):
        self.vectorizer = Vectorizer()
        self.translationModel = EasyNMT('opus-mt')
        self.targetLanguage = "en"

    # This function computes the similarity between two sentences; the more
    # similar the two sentences are, the lower the computed score is.
    def compute_SentenceToSentence_similarity(self, sentenceA, sentenceB):
        sourceLanguageA = self.translationModel.language_detection(sentenceA)
        translationsA = self.translationModel.translate(
            [sentenceA], source_lang=sourceLanguageA, target_lang=self.targetLanguage)

        sourceLanguageB = self.translationModel.language_detection(sentenceB)
        translationsB = self.translationModel.translate(
            [sentenceB], source_lang=sourceLanguageB, target_lang=self.targetLanguage)

        sentences = [translationsA[0], translationsB[0]]
        self.vectorizer.bert(sentences)
        vectors = self.vectorizer.vectors
        embeddingOf_sentenceA = vectors[0]
        embeddingOf_sentenceB = vectors[1]

        print("\nsentenceA \"" + sentenceA + "\" --- sourceLanguageA=" + sourceLanguageA
              + " --- translation = " + translationsA[0])
        print("sentenceB \"" + sentenceB + "\" --- sourceLanguageB=" + sourceLanguageB
              + " --- translation = " + translationsB[0])

        distance = spatial.distance.cosine(embeddingOf_sentenceA, embeddingOf_sentenceB)
        return distance
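# Usage sketch (illustrative): the returned value is a cosine *distance*
# between embeddings of the English translations, so values closer to 0
# indicate more similar sentences.
similarity = SentenceSimilarity_translationBased()
score = similarity.compute_SentenceToSentence_similarity(
    "Berlin ist die Hauptstadt von Deutschland.",   # German input
    "Berlin is the capital of Germany.")            # English input
print(score)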
from requests.exceptions import ReadTimeout


def translateParagraph(chunk, method, fallback=True):
    if method == "google":
        from googletrans import Translator
        modelGoogle = Translator()
    if method == "easynmt" or fallback:
        from easynmt import EasyNMT
        modelNMT = EasyNMT('opus-mt')

    if method == "google":
        try:
            translate = modelGoogle.translate(chunk, dest="en")
            translate = translate.text
        except (TypeError, AttributeError, IndexError, ReadTimeout) as e:
            print("Failed to translate entry")
            print("Error was:", e)
            if fallback:
                print("Trying instead with EasyNMT")
                translate = modelNMT.translate(chunk, target_lang="en")
            else:
                print("Leaving untranslated!")
                translate = chunk
    else:
        translate = modelNMT.translate(chunk, target_lang="en")
    return translate
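# Illustrative call. Note that translateParagraph reloads its models on every
# invocation; for bulk translation, addTranslation above amortizes that cost.
paragraph = "Berlin ist eine Weltstadt der Kultur, der Politik und der Medien."
print(translateParagraph(paragraph, method="google", fallback=True))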
First documented in the 13th century and at the crossing of two important historic trade routes,[11] Berlin became the capital of the Margraviate of Brandenburg (1417–1701), the Kingdom of Prussia (1701–1918), the German Empire (1871–1918), the Weimar Republic (1919–1933), and the Third Reich (1933–1945).[12] Berlin in the 1920s was the third-largest municipality in the world.[13] After World War II and its subsequent occupation by the victorious countries, the city was divided; West Berlin became a de facto West German exclave, surrounded by the Berlin Wall (1961–1989) and East German territory.[14] East Berlin was declared capital of East Germany, while Bonn became the West German capital. Following German reunification in 1990, Berlin once again became the capital of all of Germany.

Berlin is a world city of culture, politics, media and science.[15][16][17][18] Its economy is based on high-tech firms and the service sector, encompassing a diverse range of creative industries, research facilities, media corporations and convention venues.[19][20] Berlin serves as a continental hub for air and rail traffic and has a highly complex public transportation network. The metropolis is a popular tourist destination.[21] Significant industries also include IT, pharmaceuticals, biomedical engineering, clean tech, biotechnology, construction and electronics.""",

             """This is the second document. With some test sentences.

This document contains also a list, which we want to translate.
This is a list:
- Berlin is a city in Germany
- Paris is the capital of France
- London is a large city in England

This is my final sentence.""",

             # French: "My final document is written in French. So we want to
             # check whether the language detection step works."
             """Mon document final est rédigé en français. Nous voulons donc vérifier si l'étape de détection de la langue fonctionne."""
             ]

from easynmt import EasyNMT

target_lang = 'de'  # We want to translate the sentences to German (de)

model = EasyNMT('opus-mt')

translations = model.translate(documents, target_lang=target_lang, batch_size=8, beam_size=3)
for doc in translations:
    print(doc)
    print("\n======\n")
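# Follow-up sketch (illustrative, not part of the original example): since the
# third document is French, we can confirm what EasyNMT's language detection
# reports before translating.
print(model.language_detection("Mon document final est rédigé en français."))  # expected: 'fr'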
import os
import gzip
import csv
import sys
import time
import logging

from easynmt import util, EasyNMT

if __name__ == '__main__':
    logging.basicConfig(
        format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
        level=logging.INFO,
        stream=sys.stdout,
    )

    model = EasyNMT(sys.argv[1])

    nli_dataset_path = 'AllNLI.tsv.gz'
    sentences = set()
    snli_max_sentences = 2000
    mnli_max_sentences = 2000
    snli = 0
    mnli = 0

    # Download datasets if needed
    if not os.path.exists(nli_dataset_path):
        util.http_get('https://sbert.net/datasets/AllNLI.tsv.gz', nli_dataset_path)

    with gzip.open(nli_dataset_path, 'rt', encoding='utf8') as fIn:
        reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

IS_BACKEND = os.getenv('ROLE', 'FRONT') == 'BACKEND'
BACKEND_URL = os.getenv('BACKEND_URL', 'http://*****:*****')


@app.get("/translate")
async def translate(target_lang: str, text: List[str] = Query([]), source_lang: Optional[str] = '',
                    beam_size: Optional[int] = 5, perform_sentence_splitting: Optional[bool] = True):
    """
    Translation
    :param text: Text that should be translated
    :param target_lang: Target language
    :param source_lang: Language of text (optional)
    :param beam_size: Beam size
    :param perform_sentence_splitting: Split longer documents into individual sentences for translation
import shutil
import os
import re

from werkzeug.utils import secure_filename
from flask import Flask, flash, request, redirect, send_file, render_template, url_for
from nltk.tokenize import word_tokenize
from easynmt import EasyNMT
import nltk
import pprint
import secrets
import pyconll as pc
import torch
import gc

# model = EasyNMT('m2m_100_1.2B')
model = EasyNMT('opus-mt')
nltk.download('punkt')
torch.cuda.init()

"""
sentences = ['Dies ist ein Satz in Deutsch.',    # This is a German sentence
             '这是一个中文句子',                    # This is a Chinese sentence
             'Esta es una oración en español.']  # This is a Spanish sentence
print(model.translate(sentences, target_lang='en', batch_size=1))
"""


def to_conllu(sentence):
    lines = sentence.split('\n')
    ok = ""
""" This example tests the translation with all available models. """ from easynmt import EasyNMT available_models = ['opus-mt', 'mbart50_m2m', 'm2m_100_418M', 'm2m_100_1.2B'] for model_name in available_models: print("\n\nLoad model:", model_name) model = EasyNMT(model_name) sentences = [ 'In dieser Liste definieren wir mehrere Sätze.', 'Jeder dieser Sätze wird dann in die Zielsprache übersetzt.', 'Puede especificar en esta lista la oración en varios idiomas.', 'El sistema detectará automáticamente el idioma y utilizará el modelo correcto.' ] translations = model.translate(sentences, target_lang='en') print("Translations:") for sent, trans in zip(sentences, translations): print(sent) print("=>", trans, "\n")
                  tar_filepath)
    with tarfile.open(tar_filepath, "r:gz") as tar:
        tar.extractall(path=data_folder)

with open(queries_filepath, 'r', encoding='utf8') as fIn:
    for line in fIn:
        qid, query = line.strip().split("\t")
        if qid in train_queries:
            train_queries[qid] = query.strip()

qids = [qid for qid in train_queries if train_queries[qid] is not None]
queries = [train_queries[qid] for qid in qids]

# Define our translation model
translation_model = EasyNMT('opus-mt')

print("Start translation of {} queries.".format(len(queries)))
print("This can take a while. But you can stop this script at any point")

with open(output_filename, 'a' if os.path.exists(output_filename) else 'w', encoding='utf8') as fOut:
    for qid, query, translated_query in zip(
            qids, queries,
            translation_model.translate_stream(
                queries,
                source_lang='en',
                target_lang=target_lang,
                beam_size=2,
                perform_sentence_splitting=False,
""" This script measures the translation speed. Usage: python translation_speed.py model_name """ import os from easynmt import util, EasyNMT import gzip import csv import sys import time model = EasyNMT(sys.argv[1]) nli_dataset_path = 'AllNLI.tsv.gz' sentences = set() snli_max_sentences = 1000 mnli_max_sentences = 1000 snli = 0 mnli = 0 #Download datasets if needed if not os.path.exists(nli_dataset_path): util.http_get('https://sbert.net/datasets/AllNLI.tsv.gz', nli_dataset_path) with gzip.open(nli_dataset_path, 'rt', encoding='utf8') as fIn: reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE) for row in reader:
""" This example shows how EasyNMT to stream translations. Streaming translations can be useful when you want to translate a large set of sentences / documents. The method chunks the data, translates it, and returns the results. This can be useful if you want to write it e.g. to a file. """ from easynmt import EasyNMT #First, we create a large set of sentences: sentences = ['This is sentence ' + str(i) for i in range(10000)] target_lang = 'de' # We want to translate the sentences to German (de) model = EasyNMT('opus-mt') #The method model.translate_stream chunks the data into sets of size chunk_size #It then translate these documents/sentences and yields the result for translation in model.translate_stream(sentences, show_progress_bar=False, chunk_size=16, target_lang=target_lang): print(translation)