def addTranslation(alljobs, outputFile, method="google", fallback=True, maxWrite=-1):
    # Clean, translate (to English) and sentence-tokenize each job entry,
    # optionally writing the processed entries to outputFile as JSON.
    if method == "google":
        from googletrans import Translator
        modelGoogle = Translator()
    if method == "easynmt" or fallback:
        from easynmt import EasyNMT
        modelNMT = EasyNMT('opus-mt')

    for i, entry in enumerate(alljobs[:maxWrite]):
        entry["CLEANED_JOBS"] = textCleaner(entry["CONTENT"])

        # Seems to be a maximum number of characters allowed for translation
        # Split in half at a sentence boundary
        sentences = entry["CLEANED_JOBS"].split(". ")
        chunks = [". ".join(sentences)]
        entry["TRANSLATED_JOBS"] = ""
        if len(entry["CLEANED_JOBS"]) > 5000:
            mid = int(len(sentences) / 2)
            chunks = [". ".join(sentences[:mid]), ". ".join(sentences[mid:])]

        for chunk in chunks:
            if method == "google":
                try:
                    translate = modelGoogle.translate(chunk, dest="en")
                    translate = translate.text
                except (TypeError, AttributeError, IndexError, ReadTimeout) as e:
                    print("Failed to translate entry", i)
                    print("Error was:", e)
                    if fallback:
                        print("Trying instead with EasyNMT")
                        translate = modelNMT.translate(chunk, target_lang="en")
                    else:
                        print("Leaving untranslated!")
                        translate = chunk
            else:
                translate = modelNMT.translate(chunk, target_lang="en")
            entry["TRANSLATED_JOBS"] += translate

        entry["TOKENIZED_JOBS"] = sentenceSplitter(entry["TRANSLATED_JOBS"])

    if outputFile:
        with open(outputFile, "w") as f:
            json.dump(alljobs[:maxWrite], f)
    return alljobs
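# Hypothetical usage sketch for addTranslation (not part of the original module):
# the job entry, output file name and maxWrite value below are made-up examples.
# Each entry only needs a "CONTENT" field; the cleaned, translated and tokenized
# fields are added in place, and the processed entries are also dumped to JSON.
if __name__ == "__main__":
    example_jobs = [{"CONTENT": "Wir suchen eine Softwareentwicklerin mit Python-Erfahrung in Berlin."}]
    addTranslation(example_jobs, outputFile="jobs_translated.json",
                   method="google", fallback=True, maxWrite=len(example_jobs))
    print(example_jobs[0]["TRANSLATED_JOBS"])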
class SentenceSimilarity_translationBased(SentenceSimilarity_abstract):
    def __init__(self):
        self.vectorizer = Vectorizer()
        self.translationModel = EasyNMT('opus-mt')
        self.targetLanguage = "en"

    # This function computes the similarity between two sentences; the more similar
    # the two sentences are, the lower the computed score is.
    def compute_SentenceToSentence_similarity(self, sentenceA, sentenceB):
        sourceLanguageA = self.translationModel.language_detection(sentenceA)
        translationsA = self.translationModel.translate(
            [sentenceA], source_lang=sourceLanguageA, target_lang=self.targetLanguage)

        sourceLanguageB = self.translationModel.language_detection(sentenceB)
        translationsB = self.translationModel.translate(
            [sentenceB], source_lang=sourceLanguageB, target_lang=self.targetLanguage)

        sentences = [translationsA[0], translationsB[0]]
        self.vectorizer.bert(sentences)
        vectors = self.vectorizer.vectors
        embeddingOf_sentenceA = vectors[0]
        embeddingOf_sentenceB = vectors[1]

        print("\nsentenceA \"" + sentenceA + "\" --- sourceLanguageA=" + sourceLanguageA
              + " --- translation = " + translationsA[0])
        print("sentenceB \"" + sentenceB + "\" --- sourceLanguageB=" + sourceLanguageB
              + " --- translation = " + translationsB[0])

        distance = spatial.distance.cosine(embeddingOf_sentenceA, embeddingOf_sentenceB)
        return distance
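# Hypothetical usage sketch (the two sentences below are made-up examples, not part
# of the original code): both inputs are translated to English first, so sentences in
# different languages can be compared; a lower cosine distance means more similar.
similarityScorer = SentenceSimilarity_translationBased()
distance = similarityScorer.compute_SentenceToSentence_similarity(
    "Berlin ist die Hauptstadt von Deutschland.",
    "Berlin is the capital of Germany.")
print("Cosine distance:", distance)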
def translateParagraph(chunk, method, fallback=True):
    # Translate a single text chunk to English, either with googletrans or EasyNMT.
    if method == "google":
        from googletrans import Translator
        modelGoogle = Translator()
    if method == "easynmt" or fallback:
        from easynmt import EasyNMT
        modelNMT = EasyNMT('opus-mt')

    if method == "google":
        try:
            translate = modelGoogle.translate(chunk, dest="en")
            translate = translate.text
        except (TypeError, AttributeError, IndexError, ReadTimeout) as e:
            print("Failed to translate entry")
            print("Error was:", e)
            if fallback:
                print("Trying instead with EasyNMT")
                translate = modelNMT.translate(chunk, target_lang="en")
            else:
                print("Leaving untranslated!")
                translate = chunk
    else:
        translate = modelNMT.translate(chunk, target_lang="en")
    return translate
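# Hypothetical usage sketch (the paragraph below is a made-up example, not part of
# the original code): translate one chunk of text to English with googletrans,
# falling back to EasyNMT if the Google request fails.
paragraph = "Nous recherchons un développeur logiciel avec de l'expérience en Python."
print(translateParagraph(paragraph, method="google", fallback=True))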
First documented in the 13th century and at the crossing of two important historic trade routes,[11] Berlin became the capital of the Margraviate of Brandenburg (1417–1701), the Kingdom of Prussia (1701–1918), the German Empire (1871–1918), the Weimar Republic (1919–1933), and the Third Reich (1933–1945).[12] Berlin in the 1920s was the third-largest municipality in the world.[13] After World War II and its subsequent occupation by the victorious countries, the city was divided; West Berlin became a de facto West German exclave, surrounded by the Berlin Wall (1961–1989) and East German territory.[14] East Berlin was declared capital of East Germany, while Bonn became the West German capital. Following German reunification in 1990, Berlin once again became the capital of all of Germany.

Berlin is a world city of culture, politics, media and science.[15][16][17][18] Its economy is based on high-tech firms and the service sector, encompassing a diverse range of creative industries, research facilities, media corporations and convention venues.[19][20] Berlin serves as a continental hub for air and rail traffic and has a highly complex public transportation network. The metropolis is a popular tourist destination.[21] Significant industries also include IT, pharmaceuticals, biomedical engineering, clean tech, biotechnology, construction and electronics.""",

             """This is the second document. With some test sentences. This document contains also a list, which we want to translate.
This is a list:
- Berlin is a city in Germany
- Paris is the capital of France
- London is a large city in England
This is my final sentence.""",

             """Mon document final est rédigé en français. Nous voulons donc vérifier si l'étape de détection de la langue fonctionne."""
             ]

from easynmt import EasyNMT

target_lang = 'de'  # We want to translate the sentences to German (de)

model = EasyNMT('opus-mt')

translations = model.translate(documents, target_lang=target_lang, batch_size=8, beam_size=3)

for doc in translations:
    print(doc)
    print("\n======\n")
        if row['dataset'] == 'SNLI' and snli < snli_max_sentences:
            sentences.add(row['sentence1'])
            snli += 1

        if row['dataset'] == 'MNLI' and mnli < mnli_max_sentences:
            sentences.add(row['sentence1'])
            mnli += 1

        if snli >= snli_max_sentences and mnli >= mnli_max_sentences:
            break

print("Sentences:", len(sentences))
sentences = list(sentences)

# Some warm up
model.translate(sentences[0:100], source_lang='en', target_lang='de')

# Start translation speed measure - Single process
start_time = time.time()
step_size = 4000
for start_idx in range(0, len(sentences), step_size):
    translations_single_p = model.translate(sentences[start_idx:start_idx+step_size],
                                            source_lang='en', target_lang='de', show_progress_bar=True)
end_time = time.time()

print("Single-Process translation done after {:.2f} sec. {:.2f} sentences / second".format(
    end_time - start_time, len(sentences) / (end_time - start_time)))

######## Multi-Process-Translation
del model
""" This example tests the translation with all available models. """ from easynmt import EasyNMT available_models = ['opus-mt', 'mbart50_m2m', 'm2m_100_418M', 'm2m_100_1.2B'] for model_name in available_models: print("\n\nLoad model:", model_name) model = EasyNMT(model_name) sentences = [ 'In dieser Liste definieren wir mehrere Sätze.', 'Jeder dieser Sätze wird dann in die Zielsprache übersetzt.', 'Puede especificar en esta lista la oración en varios idiomas.', 'El sistema detectará automáticamente el idioma y utilizará el modelo correcto.' ] translations = model.translate(sentences, target_lang='en') print("Translations:") for sent, trans in zip(sentences, translations): print(sent) print("=>", trans, "\n")
""" This example shows how EasyNMT can be used for sentence translation """ from easynmt import EasyNMT sentences = ['Voici un exemple d\'utilisation d\'EasyNMT.', #'This is an example how to use EasyNMT.', 'You can define a list of sentences.', 'Cada frase es luego traducida al idioma de destino seleccionado.', #'Each sentences is then translated to your chosen target language.', 'On our website, you can find various translation models.', 'New York City (NYC), often called simply New York, is the most populous city in the United States.', 'PyTorch is an open source machine learning library based on the Torch library, used for applications such as computer vision and natural language processing, primarily developed by Facebook\'s AI Research lab (FAIR).', 'A deep neural network (DNN) is an artificial neural network (ANN) with multiple layers between the input and output layers.'] target_lang = 'de' # We want to translate the sentences to German (de) model = EasyNMT('opus-mt') translations = model.translate(sentences, target_lang=target_lang, batch_size=8, beam_size=3) print(translations)
with gzip.open(nli_dataset_path, 'rt', encoding='utf8') as fIn:
    reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
    for row in reader:
        if row['sentence1'] in sentences or len(row['sentence1']) > 200:
            continue

        if len(model.sentence_splitting(row['sentence1'])) > 1:
            continue

        if row['dataset'] == 'SNLI' and snli < snli_max_sentences:
            sentences.add(row['sentence1'])
            snli += 1

        if row['dataset'] == 'MNLI' and mnli < mnli_max_sentences:
            sentences.add(row['sentence1'])
            mnli += 1

        if snli >= snli_max_sentences and mnli >= mnli_max_sentences:
            break

print("Sentences:", len(sentences))
sentences = list(sentences)

# Some warm up
model.translate(sentences[0:100], source_lang='en', target_lang='de', perform_sentence_splitting=False)

# Start translation speed measure
start_time = time.time()
model.translate(sentences, source_lang='en', target_lang='de', batch_size=64, show_progress_bar=True, perform_sentence_splitting=False)
end_time = time.time()

print("Done after {:.2f} sec. {:.2f} sentences / second".format(end_time-start_time, len(sentences) / (end_time-start_time)))