Пример #1
0
def addTranslation(alljobs,
                   outputFile,
                   method="google",
                   fallback=True,
                   maxWrite=-1):
    """Clean, translate to English and sentence-split job entries in place.

    Each processed entry gets three keys derived from its ``CONTENT`` value:
    ``CLEANED_JOBS`` (output of ``textCleaner``), ``TRANSLATED_JOBS`` (the
    English translation) and ``TOKENIZED_JOBS`` (output of
    ``sentenceSplitter`` applied to the translation).

    :param alljobs: list of dict entries, each with a ``CONTENT`` key
    :param outputFile: path of the JSON file that receives the processed
        entries; a falsy value skips writing
    :param method: ``"google"`` translates with googletrans, any other value
        translates with EasyNMT
    :param fallback: when a googletrans call fails, retry that chunk with
        EasyNMT instead of leaving it untranslated
    :param maxWrite: number of leading entries to process and write;
        ``None`` or a negative value (the default) selects every entry
    :return: the complete ``alljobs`` list; entries beyond ``maxWrite`` are
        returned unmodified
    """
    # Seems to be a maximum number of characters allowed for translation
    maxChars = 5000

    useGoogle = method == "google"
    if useGoogle:
        from googletrans import Translator
        modelGoogle = Translator()
    # EasyNMT is the engine for every non-google method as well as the
    # fallback, so it has to be loaded whenever either role can occur.
    if not useGoogle or fallback:
        from easynmt import EasyNMT
        modelNMT = EasyNMT('opus-mt')

    # Slicing with the default -1 would silently drop the last entry, so
    # "no limit" is handled explicitly.
    if maxWrite is None or maxWrite < 0:
        selected = alljobs
    else:
        selected = alljobs[:maxWrite]

    for i, entry in enumerate(selected):
        cleaned = textCleaner(entry["CONTENT"])
        entry["CLEANED_JOBS"] = cleaned

        # Texts over the limit are split in half at a sentence boundary.
        # The first half gets back the full stop consumed by the split so
        # that the boundary survives when the halves are joined again.
        sentences = cleaned.split(". ")
        if len(cleaned) > maxChars and len(sentences) > 1:
            mid = len(sentences) // 2
            chunks = [". ".join(sentences[:mid]) + ".",
                      ". ".join(sentences[mid:])]
        else:
            chunks = [cleaned]

        translated = []
        for chunk in chunks:
            if useGoogle:
                try:
                    piece = modelGoogle.translate(chunk, dest="en").text
                except (TypeError, AttributeError, IndexError,
                        ReadTimeout) as e:
                    print("Failed to translate entry", i)
                    print("Error was:", e)
                    if fallback:
                        print("Trying instead with EasyNMT")
                        piece = modelNMT.translate(chunk, target_lang="en")
                    else:
                        print("Leaving untranslated!")
                        # Keep just this chunk: using the whole cleaned text
                        # here would duplicate it once per chunk.
                        piece = chunk
            else:
                piece = modelNMT.translate(chunk, target_lang="en")
            translated.append(piece)

        entry["TRANSLATED_JOBS"] = " ".join(translated)
        entry["TOKENIZED_JOBS"] = sentenceSplitter(entry["TRANSLATED_JOBS"])

    if outputFile:
        with open(outputFile, "w") as f:
            json.dump(selected, f)

    return alljobs
Пример #2
0
def load_model(model_name='m2m_100_418M'):
    """
    Instantiate an EasyNMT machine translation model.
    :param model_name: name of the model to load - for the list of supported models see https://github.com/UKPLab/EasyNMT#available-models
    :return: the EasyNMT instance created for ``model_name``
    """
    translation_model = EasyNMT(model_name)
    return translation_model
class SentenceSimilarity_translationBased(SentenceSimilarity_abstract):
    """Compare two sentences through the embeddings of their translations."""

    def __init__(self):
        """Create the vectorizer and the 'opus-mt' model; translations target "en"."""
        self.vectorizer = Vectorizer()
        self.translationModel = EasyNMT('opus-mt')
        self.targetLanguage = "en"

    def compute_SentenceToSentence_similarity(self, sentenceA, sentenceB):
        """Return the cosine distance between the two translated sentences.

        Each sentence has its language detected and is translated to
        ``self.targetLanguage``; the translations are embedded with
        ``self.vectorizer.bert`` and the embeddings compared with
        ``spatial.distance.cosine``. The more similar the two sentences
        are, the lower the computed score is. The detected languages and
        translations are printed as a side effect.
        """
        translator = self.translationModel

        # Detect and translate sentence A first, then sentence B.
        detectedLanguages = []
        translationBatches = []
        for sentence in (sentenceA, sentenceB):
            language = translator.language_detection(sentence)
            detectedLanguages.append(language)
            translationBatches.append(
                translator.translate([sentence],
                                     source_lang=language,
                                     target_lang=self.targetLanguage))

        sourceLanguageA, sourceLanguageB = detectedLanguages
        translationA = translationBatches[0][0]
        translationB = translationBatches[1][0]

        # The vectorizer exposes its results through the ``vectors`` attribute.
        self.vectorizer.bert([translationA, translationB])
        embeddings = self.vectorizer.vectors
        embeddingA = embeddings[0]
        embeddingB = embeddings[1]

        print("\nsentenceA \"" + sentenceA + "\" --- sourceLanguageA=" +
              sourceLanguageA + " --- translation = " + translationA)
        print("sentenceB \"" + sentenceB + "\" --- sourceLanguageB=" +
              sourceLanguageB + " --- translation = " + translationB)

        return spatial.distance.cosine(embeddingA, embeddingB)
Пример #4
0
def translateParagraph(chunk, method, fallback=True):
    """Translate one piece of text to English.

    :param chunk: text to translate
    :param method: ``"google"`` uses googletrans; any other value uses
        EasyNMT
    :param fallback: when the googletrans call fails, retry with EasyNMT
        instead of returning ``chunk`` untranslated
    :return: the translated text, or ``chunk`` itself when googletrans
        failed and ``fallback`` is false
    """

    def translateWithNMT():
        # Imported and built on demand: constructing the EasyNMT model is
        # only worth it when it is actually used, not on every call in
        # which googletrans succeeds.
        from easynmt import EasyNMT
        return EasyNMT('opus-mt').translate(chunk, target_lang="en")

    if method != "google":
        return translateWithNMT()

    from googletrans import Translator
    modelGoogle = Translator()
    try:
        return modelGoogle.translate(chunk, dest="en").text
    except (TypeError, AttributeError, IndexError, ReadTimeout) as e:
        print("Failed to translate entry")
        print("Error was:", e)

    if fallback:
        print("Trying instead with EasyNMT")
        return translateWithNMT()

    print("Leaving untranslated!")
    return chunk
Пример #5
0
First documented in the 13th century and at the crossing of two important historic trade routes,[11] Berlin became the capital of the Margraviate of Brandenburg (1417–1701), the Kingdom of Prussia (1701–1918), the German Empire (1871–1918), the Weimar Republic (1919–1933), and the Third Reich (1933–1945).[12] Berlin in the 1920s was the third-largest municipality in the world.[13] After World War II and its subsequent occupation by the victorious countries, the city was divided; West Berlin became a de facto West German exclave, surrounded by the Berlin Wall (1961–1989) and East German territory.[14] East Berlin was declared capital of East Germany, while Bonn became the West German capital. Following German reunification in 1990, Berlin once again became the capital of all of Germany.

Berlin is a world city of culture, politics, media and science.[15][16][17][18] Its economy is based on high-tech firms and the service sector, encompassing a diverse range of creative industries, research facilities, media corporations and convention venues.[19][20] Berlin serves as a continental hub for air and rail traffic and has a highly complex public transportation network. The metropolis is a popular tourist destination.[21] Significant industries also include IT, pharmaceuticals, biomedical engineering, clean tech, biotechnology, construction and electronics.""",
    """This is the second document. With some test sentences. 

This document contains also a list, which we want to translate.

This is a list:
    - Berlin is a city in Germany
    - Paris is the capital of France
    - London is a large city in England

This is my final sentence.""",
    """Mon document final est rédigé en français. Nous voulons donc vérifier si l'étape de détection de la langue fonctionne."""
]

from easynmt import EasyNMT

# Translate every entry of `documents` to German (de) with the Opus-MT models.
model = EasyNMT('opus-mt')
target_lang = 'de'

translations = model.translate(
    documents,
    target_lang=target_lang,
    batch_size=8,
    beam_size=3,
)

# Print each translation followed by a "======" separator block.
for doc in translations:
    print(doc, "\n======\n", sep="\n")
Пример #6
0
import csv
import sys
import time
import logging



if __name__ == '__main__':
    # Timestamped INFO-level logging written to stdout.
    logging.basicConfig(
        format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
        level=logging.INFO,
        stream=sys.stdout,
    )

    # The model name is taken from the first command-line argument.
    model = EasyNMT(sys.argv[1])

    nli_dataset_path = 'AllNLI.tsv.gz'
    sentences = set()

    # NOTE(review): presumably per-corpus caps and running counts for SNLI /
    # MNLI sentences; the code that uses them is outside this excerpt - confirm.
    snli_max_sentences = 2000
    mnli_max_sentences = 2000
    snli = 0
    mnli = 0

    # Download datasets if needed
    if not os.path.exists(nli_dataset_path):
        util.http_get('https://sbert.net/datasets/AllNLI.tsv.gz', nli_dataset_path)

    # Read the gzip file as UTF-8 text: tab-separated rows, quote handling disabled.
    with gzip.open(nli_dataset_path, 'rt', encoding='utf8') as fIn:
        reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
Пример #7
0
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# True only when the ROLE environment variable equals 'BACKEND'; an unset ROLE defaults to 'FRONT'.
IS_BACKEND = os.getenv('ROLE', 'FRONT') == 'BACKEND'
BACKEND_URL = os.getenv('BACKEND_URL', 'http://*****:*****@app.get("/translate")
async def translate(target_lang: str,
                    text: List[str] = Query([]),
                    source_lang: Optional[str] = '',
                    beam_size: Optional[int] = 5,
                    perform_sentence_splitting: Optional[bool] = True):
    """
    Translation
    :param text: Text that should be translated
    :param target_lang: Target language
    :param source_lang: Language of text (optional)
    :param beam_size: Beam size
    :param perform_sentence_splitting: Split longer documents into individual sentences for translation
Пример #8
0
import shutil
import os
import re
from werkzeug.utils import secure_filename
from flask import Flask, flash, request, redirect, send_file, render_template, url_for
from nltk.tokenize import word_tokenize
from easynmt import EasyNMT
import nltk
import pprint
import secrets
import pyconll as pc
import torch
import gc

# Translation model shared by the module. The larger 'm2m_100_1.2B'
# checkpoint is kept below as a commented-out alternative.
# model = EasyNMT('m2m_100_1.2B')
model = EasyNMT('opus-mt')
# Download the NLTK 'punkt' resource.
nltk.download('punkt')

# Initialise CUDA eagerly only when it is usable: an unconditional
# torch.cuda.init() raises on machines without CUDA support, which would
# make importing this module fail there.
if torch.cuda.is_available():
    torch.cuda.init()
"""
sentences = ['Dies ist ein Satz in Deutsch.',  # This is a German sentence
             '这是一个中文句子',  # This is a chinese sentence
             'Esta es una oración en español.']  # This is a spanish sentence

print(model.translate(sentences, target_lang='en', batch_size=1))
"""


def to_conllu(sentence):
    lines = sentence.split('\n')
    ok = ""
Пример #9
0
import csv
import sys
import time
import logging



if __name__ == '__main__':
    # Timestamped INFO-level logging written to stdout.
    logging.basicConfig(
        format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
        level=logging.INFO,
        stream=sys.stdout,
    )

    # The model name is taken from the first command-line argument.
    model = EasyNMT(sys.argv[1])

    nli_dataset_path = 'AllNLI.tsv.gz'
    sentences = set()

    # NOTE(review): presumably per-corpus caps and running counts for SNLI /
    # MNLI sentences; the code that uses them is outside this excerpt - confirm.
    snli_max_sentences = 2000
    mnli_max_sentences = 2000
    snli = 0
    mnli = 0

    # Download datasets if needed
    if not os.path.exists(nli_dataset_path):
        util.http_get('https://sbert.net/datasets/AllNLI.tsv.gz', nli_dataset_path)

    # Read the gzip file as UTF-8 text: tab-separated rows, quote handling disabled.
    with gzip.open(nli_dataset_path, 'rt', encoding='utf8') as fIn:
        reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
 def __init__(self):
     """Create the vectorizer and the 'opus-mt' translation model; the target language is "en"."""
     self.vectorizer = Vectorizer()
     self.translationModel = EasyNMT('opus-mt')
     self.targetLanguage = "en"
Пример #11
0
"""
This example tests the translation with all available models.
"""
from easynmt import EasyNMT

# Names of the EasyNMT models exercised by this example.
available_models = ['opus-mt', 'mbart50_m2m', 'm2m_100_418M', 'm2m_100_1.2B']

for model_name in available_models:
    print("\n\nLoad model:", model_name)
    model = EasyNMT(model_name)

    # Two German and two Spanish inputs; no source language is passed to
    # translate(), only the English target.
    sentences = [
        'In dieser Liste definieren wir mehrere Sätze.',
        'Jeder dieser Sätze wird dann in die Zielsprache übersetzt.',
        'Puede especificar en esta lista la oración en varios idiomas.',
        'El sistema detectará automáticamente el idioma y utilizará el modelo correcto.',
    ]
    translations = model.translate(sentences, target_lang='en')

    # Show every input with its translation underneath.
    print("Translations:")
    for sent, trans in zip(sentences, translations):
        print(f"{sent}\n=> {trans} \n")
            tar_filepath)

    # Unpack the gzip-compressed tar archive into data_folder.
    # NOTE(review): extractall() writes whatever member paths the archive
    # contains; confirm the archive comes from a trusted source (its origin is
    # not visible in this excerpt).
    with tarfile.open(tar_filepath, "r:gz") as tar:
        tar.extractall(path=data_folder)

# Each line holds a tab-separated (qid, query) pair. Only qids that are
# already keys of train_queries get their query text stored.
with open(queries_filepath, 'r', encoding='utf8') as fIn:
    for line in fIn:
        qid, query = line.strip().split("\t")
        if qid in train_queries:
            train_queries[qid] = query.strip()

# Keep the qids whose value is not None, with the query texts in the same order.
qids = [qid for qid in train_queries if train_queries[qid] is not None]
queries = [train_queries[qid] for qid in qids]

# Define our translation model
translation_model = EasyNMT('opus-mt')

print("Start translation of {} queries.".format(len(queries)))
print("This can take a while. But you can stop this script at any point")

with open(output_filename,
          'a' if os.path.exists(output_filename) else 'w',
          encoding='utf8') as fOut:
    for qid, query, translated_query in zip(
            qids, queries,
            translation_model.translate_stream(
                queries,
                source_lang='en',
                target_lang=target_lang,
                beam_size=2,
                perform_sentence_splitting=False,
Пример #13
0
"""
This script measures the translation speed.

Usage:
python translation_speed.py model_name
"""
import os
from easynmt import util, EasyNMT
import gzip
import csv
import sys
import time

# The model name is taken from the first command-line argument.
model = EasyNMT(sys.argv[1])

nli_dataset_path = 'AllNLI.tsv.gz'
sentences = set()

# NOTE(review): presumably per-corpus caps and running counts for SNLI / MNLI
# sentences; the loop that uses them is truncated in this excerpt - confirm.
snli_max_sentences = 1000
mnli_max_sentences = 1000
snli = 0
mnli = 0

# Download datasets if needed
if not os.path.exists(nli_dataset_path):
    util.http_get('https://sbert.net/datasets/AllNLI.tsv.gz', nli_dataset_path)


with gzip.open(nli_dataset_path, 'rt', encoding='utf8') as fIn:
    reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
    for row in reader:
Пример #14
0
"""
This example shows how to use EasyNMT to stream translations.

Streaming translations can be useful when you want to translate a large set of sentences / documents.
The method chunks the data, translates it, and returns the results.

This can be useful if you want to write it e.g. to a file.
"""
from easynmt import EasyNMT

#First, we create a large set of sentences:
# Build a large set of sentences to translate.
sentences = [f'This is sentence {i}' for i in range(10000)]

# German (de) is the language we translate into.
target_lang = 'de'

model = EasyNMT('opus-mt')

# model.translate_stream chunks the data into sets of size chunk_size,
# translates each set and yields the results.
translation_stream = model.translate_stream(
    sentences,
    show_progress_bar=False,
    chunk_size=16,
    target_lang=target_lang,
)
for translation in translation_stream:
    print(translation)