def addTranslation(alljobs, outputFile, method="google", fallback=True, maxWrite=-1):
    # Clean, translate (to English) and sentence-tokenize each job entry,
    # optionally writing the processed entries to outputFile as JSON.
    if method == "google":
        from googletrans import Translator
        modelGoogle = Translator()
    if method == "easynmt" or fallback:
        from easynmt import EasyNMT
        modelNMT = EasyNMT('opus-mt')

    for i, entry in enumerate(alljobs[:maxWrite]):
        entry["CLEANED_JOBS"] = textCleaner(entry["CONTENT"])

        # Seems to be a maximum number of characters allowed for translation
        # Split in half at a sentence boundary
        sentences = entry["CLEANED_JOBS"].split(". ")
        chunks = [". ".join(sentences)]
        entry["TRANSLATED_JOBS"] = ""
        if len(entry["CLEANED_JOBS"]) > 5000:
            mid = int(len(sentences) / 2)
            chunks = [". ".join(sentences[:mid]), ". ".join(sentences[mid:])]

        for chunk in chunks:
            if method == "google":
                try:
                    translate = modelGoogle.translate(chunk, dest="en")
                    translate = translate.text
                except (TypeError, AttributeError, IndexError, ReadTimeout) as e:
                    print("Failed to translate entry", i)
                    print("Error was:", e)
                    if fallback:
                        print("Trying instead with EasyNMT")
                        translate = modelNMT.translate(chunk, target_lang="en")
                    else:
                        print("Leaving untranslated!")
                        translate = chunk
            else:
                translate = modelNMT.translate(chunk, target_lang="en")
            entry["TRANSLATED_JOBS"] += translate

        entry["TOKENIZED_JOBS"] = sentenceSplitter(entry["TRANSLATED_JOBS"])

    if outputFile:
        with open(outputFile, "w") as f:
            json.dump(alljobs[:maxWrite], f)
    return alljobs
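# Hypothetical usage sketch for addTranslation (not part of the original module):
# the job entry, output file name and maxWrite value below are made-up examples.
# Each entry only needs a "CONTENT" field; the cleaned, translated and tokenized
# fields are added in place, and the processed entries are also dumped to JSON.
if __name__ == "__main__":
    example_jobs = [{"CONTENT": "Wir suchen eine Softwareentwicklerin mit Python-Erfahrung in Berlin."}]
    addTranslation(example_jobs, outputFile="jobs_translated.json",
                   method="google", fallback=True, maxWrite=len(example_jobs))
    print(example_jobs[0]["TRANSLATED_JOBS"])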
class SentenceSimilarity_translationBased(SentenceSimilarity_abstract):
    def __init__(self):
        self.vectorizer = Vectorizer()
        self.translationModel = EasyNMT('opus-mt')
        self.targetLanguage = "en"

    # This function computes the similarity between two sentences; the more similar
    # the two sentences are, the lower the computed score is.
    def compute_SentenceToSentence_similarity(self, sentenceA, sentenceB):
        sourceLanguageA = self.translationModel.language_detection(sentenceA)
        translationsA = self.translationModel.translate(
            [sentenceA], source_lang=sourceLanguageA, target_lang=self.targetLanguage)

        sourceLanguageB = self.translationModel.language_detection(sentenceB)
        translationsB = self.translationModel.translate(
            [sentenceB], source_lang=sourceLanguageB, target_lang=self.targetLanguage)

        sentences = [translationsA[0], translationsB[0]]
        self.vectorizer.bert(sentences)
        vectors = self.vectorizer.vectors
        embeddingOf_sentenceA = vectors[0]
        embeddingOf_sentenceB = vectors[1]

        print("\nsentenceA \"" + sentenceA + "\" --- sourceLanguageA=" + sourceLanguageA
              + " --- translation = " + translationsA[0])
        print("sentenceB \"" + sentenceB + "\" --- sourceLanguageB=" + sourceLanguageB
              + " --- translation = " + translationsB[0])

        distance = spatial.distance.cosine(embeddingOf_sentenceA, embeddingOf_sentenceB)
        return distance
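# Hypothetical usage sketch (the two sentences below are made-up examples, not part
# of the original code): both inputs are translated to English first, so sentences in
# different languages can be compared; a lower cosine distance means more similar.
similarityScorer = SentenceSimilarity_translationBased()
distance = similarityScorer.compute_SentenceToSentence_similarity(
    "Berlin ist die Hauptstadt von Deutschland.",
    "Berlin is the capital of Germany.")
print("Cosine distance:", distance)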
def translateParagraph(chunk, method, fallback=True):
    # Translate a single text chunk to English, either with googletrans or EasyNMT.
    if method == "google":
        from googletrans import Translator
        modelGoogle = Translator()
    if method == "easynmt" or fallback:
        from easynmt import EasyNMT
        modelNMT = EasyNMT('opus-mt')

    if method == "google":
        try:
            translate = modelGoogle.translate(chunk, dest="en")
            translate = translate.text
        except (TypeError, AttributeError, IndexError, ReadTimeout) as e:
            print("Failed to translate entry")
            print("Error was:", e)
            if fallback:
                print("Trying instead with EasyNMT")
                translate = modelNMT.translate(chunk, target_lang="en")
            else:
                print("Leaving untranslated!")
                translate = chunk
    else:
        translate = modelNMT.translate(chunk, target_lang="en")
    return translate
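# Hypothetical usage sketch (the paragraph below is a made-up example, not part of
# the original code): translate one chunk of text to English with googletrans,
# falling back to EasyNMT if the Google request fails.
paragraph = "Nous recherchons un développeur logiciel avec de l'expérience en Python."
print(translateParagraph(paragraph, method="google", fallback=True))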
First documented in the 13th century and at the crossing of two important historic trade routes,[11] Berlin became the capital of the Margraviate of Brandenburg (1417–1701), the Kingdom of Prussia (1701–1918), the German Empire (1871–1918), the Weimar Republic (1919–1933), and the Third Reich (1933–1945).[12] Berlin in the 1920s was the third-largest municipality in the world.[13] After World War II and its subsequent occupation by the victorious countries, the city was divided; West Berlin became a de facto West German exclave, surrounded by the Berlin Wall (1961–1989) and East German territory.[14] East Berlin was declared capital of East Germany, while Bonn became the West German capital. Following German reunification in 1990, Berlin once again became the capital of all of Germany.

Berlin is a world city of culture, politics, media and science.[15][16][17][18] Its economy is based on high-tech firms and the service sector, encompassing a diverse range of creative industries, research facilities, media corporations and convention venues.[19][20] Berlin serves as a continental hub for air and rail traffic and has a highly complex public transportation network. The metropolis is a popular tourist destination.[21] Significant industries also include IT, pharmaceuticals, biomedical engineering, clean tech, biotechnology, construction and electronics.""",

             """This is the second document. With some test sentences. This document contains also a list, which we want to translate.
This is a list:
- Berlin is a city in Germany
- Paris is the capital of France
- London is a large city in England
This is my final sentence.""",

             """Mon document final est rédigé en français. Nous voulons donc vérifier si l'étape de détection de la langue fonctionne."""
             ]

from easynmt import EasyNMT

target_lang = 'de'  # We want to translate the sentences to German (de)

model = EasyNMT('opus-mt')

translations = model.translate(documents, target_lang=target_lang, batch_size=8, beam_size=3)

for doc in translations:
    print(doc)
    print("\n======\n")
        if row['dataset'] == 'SNLI' and snli < snli_max_sentences:
            sentences.add(row['sentence1'])
            snli += 1

        if row['dataset'] == 'MNLI' and mnli < mnli_max_sentences:
            sentences.add(row['sentence1'])
            mnli += 1

        if snli >= snli_max_sentences and mnli >= mnli_max_sentences:
            break

print("Sentences:", len(sentences))
sentences = list(sentences)

# Some warm up
model.translate(sentences[0:100], source_lang='en', target_lang='de')

# Start translation speed measure - Single process
start_time = time.time()
step_size = 4000
for start_idx in range(0, len(sentences), step_size):
    translations_single_p = model.translate(sentences[start_idx:start_idx+step_size],
                                            source_lang='en', target_lang='de', show_progress_bar=True)
end_time = time.time()

print("Single-Process translation done after {:.2f} sec. {:.2f} sentences / second".format(
    end_time - start_time, len(sentences) / (end_time - start_time)))

######## Multi-Process-Translation
del model
""" This example tests the translation with all available models. """ from easynmt import EasyNMT available_models = ['opus-mt', 'mbart50_m2m', 'm2m_100_418M', 'm2m_100_1.2B'] for model_name in available_models: print("\n\nLoad model:", model_name) model = EasyNMT(model_name) sentences = [ 'In dieser Liste definieren wir mehrere Sätze.', 'Jeder dieser Sätze wird dann in die Zielsprache übersetzt.', 'Puede especificar en esta lista la oración en varios idiomas.', 'El sistema detectará automáticamente el idioma y utilizará el modelo correcto.' ] translations = model.translate(sentences, target_lang='en') print("Translations:") for sent, trans in zip(sentences, translations): print(sent) print("=>", trans, "\n")
""" This example shows how EasyNMT can be used for sentence translation """ from easynmt import EasyNMT sentences = ['Voici un exemple d\'utilisation d\'EasyNMT.', #'This is an example how to use EasyNMT.', 'You can define a list of sentences.', 'Cada frase es luego traducida al idioma de destino seleccionado.', #'Each sentences is then translated to your chosen target language.', 'On our website, you can find various translation models.', 'New York City (NYC), often called simply New York, is the most populous city in the United States.', 'PyTorch is an open source machine learning library based on the Torch library, used for applications such as computer vision and natural language processing, primarily developed by Facebook\'s AI Research lab (FAIR).', 'A deep neural network (DNN) is an artificial neural network (ANN) with multiple layers between the input and output layers.'] target_lang = 'de' # We want to translate the sentences to German (de) model = EasyNMT('opus-mt') translations = model.translate(sentences, target_lang=target_lang, batch_size=8, beam_size=3) print(translations)
with gzip.open(nli_dataset_path, 'rt', encoding='utf8') as fIn:
    reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
    for row in reader:
        if row['sentence1'] in sentences or len(row['sentence1']) > 200:
            continue

        if len(model.sentence_splitting(row['sentence1'])) > 1:
            continue

        if row['dataset'] == 'SNLI' and snli < snli_max_sentences:
            sentences.add(row['sentence1'])
            snli += 1

        if row['dataset'] == 'MNLI' and mnli < mnli_max_sentences:
            sentences.add(row['sentence1'])
            mnli += 1

        if snli >= snli_max_sentences and mnli >= mnli_max_sentences:
            break

print("Sentences:", len(sentences))
sentences = list(sentences)

# Some warm up
model.translate(sentences[0:100], source_lang='en', target_lang='de', perform_sentence_splitting=False)

# Start translation speed measure
start_time = time.time()
model.translate(sentences, source_lang='en', target_lang='de', batch_size=64, show_progress_bar=True, perform_sentence_splitting=False)
end_time = time.time()

print("Done after {:.2f} sec. {:.2f} sentences / second".format(end_time-start_time, len(sentences) / (end_time-start_time)))