Пример #1
0
    def __init__(self, df, output_path, output_name, batch):
        """Set up inputs, tools, job/result queues and thread bookkeeping.

        :param df: data to process; stored as-is on the instance
        :param output_path: directory part of the output location
        :param output_name: base name joined onto ``output_path``
        :param batch: batch size value stored on the instance
        """
        # inputs and derived output locations
        self.df = df
        self.batch = batch
        self.filename = Path(output_path) / output_name
        self.raw_file = f'{self.filename}_raw.csv'

        # external tools (translator first, then the sentiment helper)
        self.translator = Translator()
        self.__initialize_senti()

        # work still to be done
        pending = self.__collect_jobs()
        self.total_job = len(pending)

        # both queues are sized to hold every pending job
        self.jobs = Queue(maxsize=self.total_job)
        self.results = Queue(maxsize=self.total_job)
        for pending_job in pending:
            self.jobs.put(pending_job)

        # thread coordination state
        self.worker_ct = 0  # number of currently spawned workers
        self.worker_ct_lock = threading.Lock()
        self.stop = threading.Event()
Пример #2
0
def translate(text, src, dst, max_retries=3):
    """Translate ``text`` from ``src`` to ``dst``, keeping emojis and links.

    Emojis are converted to delimited names before translation and turned
    back into emojis afterwards; link targets captured from the original
    text are re-attached to the links found in the translated text.

    :param text: text to translate; returned unchanged when falsy
    :param src: source language code
    :param dst: destination language code; if equal to ``src`` the text is
        returned unchanged
    :param max_retries: how many extra attempts are made after an
        ``HTTPException`` before giving up (the previous implementation
        recursed without any bound)
    :return: the translated text, or the original ``text`` on failure
    """
    if not text or src == dst:
        return text
    translator = Translator(service_urls=['translate.google.com'])
    for _attempt in range(max_retries + 1):
        try:
            # save hidden links
            links = reLink.findall(text)
            # replace emojis
            str_demoji = emoji.demojize(text, delimiters=emoji_delimiters)
            # translate
            tran = translator.translate(str_demoji, src=src, dest=dst).text
            # fix emojis
            for match in reEmoji.findall(tran):
                tran = tran.replace(match[0],
                                    emoji.emojize(f":{match[1].lower()}:"))
            # fix hidden links; match positions refer to the string as it
            # was before any insertion, hence the running offset
            offset = 0
            for i, match in enumerate(reLink.finditer(tran)):
                b, e = match.span()
                insert = f"[{match[1]}]({links[i][1]})"
                tran = f"{tran[:b + offset]}{insert}{tran[e + offset:]}"
                offset += len(insert) - e + b
            return tran
        except HTTPException:
            # transient transport failure: try again (bounded)
            continue
        except Exception:
            # best effort: any other failure falls back to the input text
            break
    return text
Пример #3
0
def create_translate(nazwa_katalogu, nazwa_pliku):
    """Translate a WebVTT subtitle file from English to Polish and save it.

    Each caption is split on '.', every non-blank fragment is translated
    separately and substituted back into the caption text. The result is
    saved next to the input, with everything from the first '(' in the
    file name replaced by '(pl)'.

    :param nazwa_katalogu: directory containing the subtitle file
    :param nazwa_pliku: subtitle file name inside that directory
    """
    # The backslash separator is escaped explicitly; the former '\{' was an
    # invalid escape sequence that only worked by accident.
    vtt = webvtt.read(f'{nazwa_katalogu}\\{nazwa_pliku}')

    translator = Translator()

    list_sentence = [element_in_vtt.text for element_in_vtt in vtt]

    translated_lines = []
    for line in list_sentence:
        new_line = line
        for s_line in line.split('.'):
            fragment = s_line.strip()
            # Skip empty AND whitespace-only fragments: replacing the empty
            # string would insert the translation between every character.
            if not fragment:
                continue
            trans = translator.translate(s_line, src='en', dest='pl')
            print(f'{trans.origin} -> {trans.text}')
            new_line = new_line.replace(fragment,
                                        trans.text.replace('.', ''))
        translated_lines.append(new_line)

    for index, translated in enumerate(translated_lines):
        vtt[index].text = translated

    print(vtt)

    nazwa_pliku = nazwa_pliku.split("(", maxsplit=1)[0]

    vtt.save(f'{nazwa_katalogu}\\{nazwa_pliku}(pl)')
def do_single_translate(cn_text):
    """
    Translate one text with source language 'zh-cn' and destination 'zh-tw'.

    :param cn_text: Text in CN
    :return: Text in TW (the ``text`` attribute of the translation result)
    """
    return Translator().translate(cn_text, src='zh-cn', dest='zh-tw').text
def do_translate(cn_texts):
    """
    Translate several texts with source 'zh-cn' and destination 'zh-tw'.

    :param cn_texts: Texts in CN
    :return: List holding the ``text`` of every translation result, in order
    """
    outcome = Translator().translate(cn_texts, src='zh-cn', dest='zh-tw')

    tw_texts = []
    for item in outcome:
        tw_texts.append(item.text)
    return tw_texts
Пример #6
0
async def translate(context):
    """ PagerMaid universal translator.

    Translates the command arguments, or the replied-to message when no
    arguments are given, into the configured application language and edits
    the result into the triggering message. Results longer than 4096
    characters are uploaded as ``translation.txt`` instead.
    """
    translator = Translator()
    reply = await context.get_reply_message()
    message = context.arguments
    ap_lang = config['application_language']
    if message:
        pass
    elif reply:
        message = reply.text
    else:
        await context.edit(lang('arg_error'))
        return

    try:
        if not silent:
            await context.edit(lang('translate_processing'))
        try:
            result = translator.translate(clear_emojis(message), dest=ap_lang)
        except Exception:
            # Primary translator failed: fall back to the `translate`
            # package. `Exception` (not a bare except) so that
            # KeyboardInterrupt / SystemExit are not swallowed.
            from translate import Translator as trans
            result = trans(to_lang=ap_lang.replace('zh-cn', 'zh')).translate(
                clear_emojis(message))
    except ValueError:
        await context.edit(lang('translate_ValueError'))
        return
    try:
        source_lang = result.src
        source_text = result.origin
        trans_lang = result.dest
    except AttributeError:
        # The result does not expose src/origin/dest
        await context.edit(lang('google_connection_error'))
        return
    result = f"**{lang('translate_hits')}**\n{lang('translate_original_lang')}: {source_lang}\n{source_text} -> {result.text}"

    if len(result) > 4096:
        await context.edit(lang('translate_tg_limit_uploading_file'))
        await attach_log(result, context.chat_id, "translation.txt",
                         context.id)
        return
    await context.edit(result)
    # Past the early return above the result is always <= 4096 characters,
    # so the former `else` branch (which also mistakenly called this
    # coroutine instead of `lang`) could never run and has been removed.
    await log(
        f"{lang('translate_get')}: `{source_text}` \n{lang('translate_from')} {source_lang} {lang('translate_to')} {trans_lang}"
    )
Пример #7
0
def transl_it(text, src, dest):
    """Translate ``text`` and report the languages involved.

    :param text: the text to translate
    :param src: source language, i.e. the language of ``text``
    :param dest: destination language
    :return: ``(source language name, destination language name,
        translated text)`` on success; on any error
        ``(False, False, "error: <exception>\\n<detection result>")``
    """
    # Bound up front so the error handler can tell whether a translator
    # exists; previously a failing constructor made the handler itself raise.
    translator = None
    try:
        # create a new object
        translator = Translator()
        a = translator.translate(text, src=src, dest=dest)
        # extract basic info from the output
        translated = (LANGUAGES[a.src], LANGUAGES[a.dest], a.text)
        return translated
    except Exception as ee:
        # Bad input text can lead here; attach language-detection info when
        # it can be obtained, without letting the handler raise again.
        confidence = None
        if translator is not None:
            try:
                confidence = translator.detect(text)
            except Exception as detect_error:
                confidence = detect_error
        translated = (False, False,
                      "error: " + str(ee) + "\n" + str(confidence))
        return translated
Пример #8
0
def translate(word, language):
    """Translate ``word`` into ``language`` and return a display string.

    ``language`` is compared case-insensitively with both the keys and the
    values of the ``'sl'`` mapping returned by ``Translator.glanguage()``.

    :param word: text to translate
    :param language: a key or a value of that mapping
    :return: ``'<word> in <language value> => <translation>'``, or
        ``'Language entered is not supported'`` when nothing matches
    """
    translator = Translator()
    source_languages = translator.glanguage()['sl']
    wanted = language.lower()

    matched_code = matched_name = None
    for code, name in source_languages.items():
        if wanted in (code.lower(), name.lower()):
            # no break: like the original, the last matching entry wins
            matched_code, matched_name = code, name

    if matched_code is None:
        return 'Language entered is not supported'

    # Read the result's `text` attribute directly. The previous approach cut
    # the text out of str(result) up to the first ', p', which truncated any
    # translation that itself contained ", p".
    translation = translator.translate(word, dest=matched_code).text
    return f'{word} in {matched_name} => {translation}'
Пример #9
0
def start_translation():
    """Translate the default-language strings into every other language.

    For each entry of ``language_list`` whose ``"value"`` differs from
    ``DEFAULT_LANGUAGE``, the values of ``resources[DEFAULT_LANGUAGE]`` are
    translated in one call and handed to
    ``add_translations_to_resource_dict``; the elapsed time is logged.
    """
    translator = Translator()
    log.info("Start of translation service")
    target_codes = [entry["value"] for entry in language_list]

    source_strings = list(resources[DEFAULT_LANGUAGE].values())

    for code in target_codes:
        if code == DEFAULT_LANGUAGE:
            continue
        started = time.time()
        translated = translator.translate(source_strings,
                                          src=DEFAULT_LANGUAGE,
                                          dest=code)
        add_translations_to_resource_dict(translated, code)
        elapsed = time.time() - started
        log.info(f"The {code} translation has taken {elapsed} seconds")
        log.info(f"End of loading for resources for: {code}")
Пример #10
0
def test_japanese_punctuation():
  """Check how Japanese full stops and commas come out of the translator."""
  from pygoogletranslation import Translator
  t = Translator()
  # This is how the Google Translate web page renders it. Sadly, with the
  # english-point replacement method the result differs, as this test shows.
  full_stops = t.translate("おはよう。げんきですか。").text
  assert full_stops == "Good morning. How are you."
  commas = t.translate("今日、明日").text
  assert commas == "Today, tomorrow"
  # assert t.translate("「こんにちは」").text == "\"Hello\""
Пример #11
0
#-*-coding:utf-8 -*-

# import requests
# import execjs
# import json
from pygoogletranslation import Translator

# Instantiate the translator. The module's default service URL cannot be used
# from mainland China, so it is replaced with a Google Translate address that
# is reachable there.
translator = Translator(service_url='translate.google.cn')
def googleTrans(text):
    """Translate ``text`` with destination 'zh-CN', print and return it.

    Any exception is caught: an error marker and the exception are printed
    and ``None`` is returned.
    """
    try:
        translated_text = translator.translate(text, dest='zh-CN').text
        print(translated_text)
        return translated_text
    except Exception as err:
        print("出错了")
        print(err)
    return None


#使用谷歌翻译
#py的JS类
'''
class Py4Js():     
  def __init__(self):  
    self.ctx = execjs.compile(""" 
    function TL(a) { 
Пример #12
0
from django.conf import settings
import Levenshtein as lev
import sys
import openpyxl
from rest_framework.parsers import MultiPartParser
from django.db import IntegrityError

from django.core.files import File as _File
sys.path.append(os.path.join(settings.BASE_DIR, 'preprocessing_python'))
from preprocessor import *
import requests
import ast
import concurrent.futures
import urllib.request
from pygoogletranslation import Translator
# Module-level translator instance, created once when this module is imported.
translator = Translator()

from django.db import transaction

import ebooklib
from ebooklib import epub
from bs4 import BeautifulSoup

# Module-level similarity model, constructed at import time with
# device='cpu' and batch_size=10.
BertSimilarityModel = ClinicalBertSimilarity(device='cpu', batch_size=10)


class TranslationMemoryViewSet(viewsets.ModelViewSet):
    queryset = TranslationMemory.objects.all().order_by('id')
    serializer_class = TranslationMemorySerializer

    def get_queryset(self):
Пример #13
0
 def translator(self):
     """Return the cached translator, creating it (sleep=1) on first use."""
     if self._translator:
         return self._translator
     self._translator = Translator(sleep=1)
     return self._translator
Пример #14
0
from pygoogletranslation import Translator

# Create a translator client.
t = Translator()

# Translate a Japanese sample sentence with destination "en" and print the
# text of the result.
print(t.translate("これはテストです", dest="en").text)
Пример #15
0
class Maestro:
    """Translate and sentiment-score rows of a DataFrame using worker threads.

    Rows are grouped into batches ("jobs"). Worker threads translate the
    batches and push them onto a results queue; a single processing thread
    drains that queue, scores the translated texts with SentiStrength and
    appends rows to ``<output>_raw.csv``. When processing ends the raw file
    is sorted by its first column and written to ``<output>.csv``.
    Rows whose ``tweetid`` already appears in the raw file are skipped, so
    an interrupted run can be resumed.
    """

    def __init__(self, df, output_path, output_name, batch):
        """Store inputs, set up tools, collect pending jobs and queues.

        :param df: DataFrame to process; it must have a ``tweetid`` column.
            Workers read ids from the first column and texts from the last.
        :param output_path: directory of the output files
        :param output_name: base name of the output files
        :param batch: number of rows per job
        """
        # storing variables
        self.df = df
        self.filename = Path(output_path) / output_name
        self.raw_file = '{}_raw.csv'.format(self.filename)
        self.batch = batch

        # initialize tools
        self.translator = Translator()
        self.__initialize_senti()

        # collect jobs
        job_list = self.__collect_jobs()
        self.total_job = len(job_list)

        # initialize queues
        self.jobs = Queue(maxsize=self.total_job)
        for job in job_list:
            self.jobs.put(job)
        self.results = Queue(maxsize=self.total_job)

        # setup threading variables
        self.stop = threading.Event()
        self.worker_ct_lock = threading.Lock()
        self.worker_ct = 0  # num_of_spawned worker

    def __initialize_senti(self):
        """Create the SentiStrength wrapper and smoke-test it."""
        self.senti = PySentiStr()
        self.senti.setSentiStrengthPath(
            str(Path.cwd() / 'lib' / 'SentiStrengthCom.jar'))
        self.senti.setSentiStrengthLanguageFolderPath(str(Path.cwd() / 'lang'))

        # simple test to make sure senti works
        test = self.senti.getSentiment(['You are beautiful'], 'dual')
        assert type(test) is list
        assert type(test[0]) is tuple

    def __collect_jobs(self):
        """Return the batches of row labels that still need processing.

        A row counts as processed when its ``tweetid`` occurs in column 1 of
        the raw output file. Without a raw file every row is pending.
        """
        try:
            out_df = pd.read_csv(self.raw_file, header=None)
            processed_ser = self.df['tweetid'].isin(out_df[1])
        except FileNotFoundError:
            zeros = np.zeros((len(self.df.index), ), dtype=bool)
            # Use the DataFrame's own index so job labels mean the same
            # thing as in the branch above (they are later used with .loc).
            processed_ser = pd.Series(zeros, index=self.df.index)

        job_list = processed_ser[~processed_ser].index
        job_list = list(grouper(job_list, self.batch))
        if len(job_list) > 0:
            # drop the None entries found in the last group
            job_list[-1] = tuple(job for job in job_list[-1]
                                 if job is not None)

        return job_list

    def __despawn_worker(self):
        """Decrement the live-worker counter under its lock."""
        with self.worker_ct_lock:
            self.worker_ct = self.worker_ct - 1

    def __translate(self, thread_num):
        """Worker loop: translate jobs until none are left or stop is set.

        :param thread_num: worker number, used only in error messages
        """
        # Local import: only this method needs the exception type.
        from queue import Empty

        with self.worker_ct_lock:
            self.worker_ct = self.worker_ct + 1
        try:
            while not self.stop.is_set():
                # Non-blocking get: checking empty() and then calling a
                # blocking get() lets another worker take the last job in
                # between, leaving this worker blocked forever.
                try:
                    job = self.jobs.get_nowait()
                except Empty:
                    break

                try:
                    mini_df = self.df.loc[job, ]  # trailing comma is needed
                    ids = mini_df.iloc[:, 0]
                    items = mini_df.iloc[:, -1].to_numpy().tolist()
                except Exception as e:
                    print('Worker #{} got pandas error: {}'.format(
                        thread_num, e))
                    break

                try:
                    if len(items) == 1:
                        translations = [self.translator.translate(items)]
                    else:
                        translations = self.translator.translate(items)
                except Exception as e:
                    print('Worker #{} got translation error: {}'.format(
                        thread_num, e))
                    break

                self.results.put((job, ids, translations))
        finally:
            # always give the slot back, even on an unexpected exception
            self.__despawn_worker()

    def __save(self, results):
        """Append result rows to the raw CSV file."""
        with open(self.raw_file, 'a', encoding='utf-8',
                  newline='') as csv_file:
            writer = csv.writer(csv_file,
                                delimiter=',',
                                quotechar='"',
                                quoting=csv.QUOTE_MINIMAL)
            writer.writerows(results)

    def __process(self, score='dual'):
        """Drain translated batches, score them, save rows, then rebuild.

        Runs until the stop event is set and the results queue is empty, or
        until scoring, row building or saving fails.

        :param score: score type passed to ``getSentiment``
        """
        total_batch = int(np.ceil(len(self.df.index) / self.batch))
        pbar = tqdm(total=total_batch, initial=(total_batch - self.total_job))

        while not self.stop.is_set() or not self.results.empty():
            time.sleep(2)
            if not self.results.empty():
                # merges all results
                job_list, id_list, translation_list = ([], [], [])
                steps = 0
                while not self.results.empty():
                    job, ids, translations = self.results.get()
                    job_list.extend(job)
                    id_list.extend(ids)
                    translation_list.extend(translations)
                    steps = steps + 1

                # analyze sentiments
                texts = [tr.text for tr in translation_list]
                try:
                    sentis = self.senti.getSentiment(texts, score)
                except Exception as e:
                    print('Process got sentistrength error:', e)
                    break

                try:
                    rows = [
                        (order, i, *senti, tr.src, text)
                        for order, i, senti, tr, text in zip(
                            job_list, id_list, sentis, translation_list, texts)
                    ]
                except Exception as e:
                    print(e)
                    break

                try:
                    self.__save(rows)
                except Exception as e:
                    print('Process got on save error:', e)
                    break

                pbar.update(steps)
            time.sleep(.1)  # prevent too much loop checking

        if not self.stop.is_set():
            self.stop.set()  # force stop all threads

        print('Rebuilding...')
        self.__rebuild()

        print('Exiting...')
        pbar.close()

    def __rebuild(self):
        """Write the raw file, sorted by 'order', to ``<filename>.csv``."""
        try:
            sf = pd.read_csv(self.raw_file,
                             header=None,
                             names=[
                                 'order', 'tweetid', '+', '-', 'src_lang',
                                 'translation'
                             ])
            sf.sort_values('order', inplace=True)
            sf.to_csv('{}.csv'.format(self.filename), index=None)
        except FileNotFoundError:
            pass
        except Exception as e:
            print(ERR_STR.format('rebuild', 'on rebuilding csv'), e)

    def play(self, n_thread=1):
        """Run ``n_thread`` translation workers plus one processing thread.

        Returns once every worker has finished and the processing thread has
        drained the results. Does nothing when ``n_thread`` is below 1.

        :param n_thread: number of translation workers
        """
        if n_thread < 1:
            return
        with ThreadPoolExecutor(max_workers=n_thread + 1) as executor:
            try:
                # Track the worker futures rather than polling worker_ct:
                # workers that finish immediately (e.g. nothing left to do)
                # take the counter straight back to 0, which made the old
                # "wait until worker_ct is non-zero" spin loop hang forever.
                workers = [
                    executor.submit(self.__translate, num)
                    for num in range(n_thread)
                ]
                print('Spawing {} workers...'.format(n_thread))
                print('Aye, Sir!')
                executor.submit(self.__process)

                # as long as there is at least one running worker
                while not all(worker.done() for worker in workers):
                    # wait for any keyboard interrupt
                    time.sleep(.5)  # power napping for half second
                # either no job left or all worker has been despawned
                self.stop.set()

                if self.jobs.empty():
                    print('All done!')
                if self.worker_ct == 0:
                    print('All workers quit their job!')
            except KeyboardInterrupt:
                print('\nKeyboard interrupt')
            except Exception as e:
                print(ERR_STR.format('play', 'something went wrong'), e)
            finally:
                self.stop.set()

        print('Byee 👋')
Пример #16
0
from pygoogletranslation import Translator
import json
import sys
import time
import datetime
import time
from elasticsearch import Elasticsearch

# Command line arguments (currently disabled)
# elastic_username = sys.argv[1:][0]
# elastic_password = sys.argv[2:][0]
# elastic_index = sys.argv[3:][0]
# NOTE(review): the handle is opened for writing (truncating the file) and no
# matching close() is visible in this excerpt; the name also shadows `file`.
file = open("languages.txt", 'w')

translator = Translator()

# All language codes that both amazon and google support
# NOTE(review): "fa" and "tl" each appear twice in this list.
countryCodes = [
    "af", "sq", "am", "ar", "hy", "az", "bn", "bs", "bg", "ca", "hr", "cs",
    "da", "nl", "en", "et", "fa", "tl", "fi", "fr", "ka", "de", "el", "gu",
    "ht", "ha", "he", "hi", "hu", "is", "id", "it", "ja", "kn", "kk", "ko",
    "lv", "lt", "mk", "ms", "ml", "mt", "mn", "no", "fa", "ps", "pl", "pt",
    "ro", "ru", "sr", "si", "sk", "sl", "so", "es", "sw", "sv", "tl", "ta",
    "te", "th", "tr", "uk", "ur", "uz", "vi", "cy"
]

result_dict = dict()
for code in countryCodes:

    result_dict[code] = {}
Пример #17
0
from pygoogletranslation import Translator
import json
import sys
import time
import datetime
import time
from elasticsearch import Elasticsearch

# Command line arguments: username, password and index, in that order.
# `sys.argv[k:][0]` selects the same element as `sys.argv[k]` and raises
# IndexError when that argument is missing.
elastic_username = sys.argv[1:][0]
elastic_password = sys.argv[2:][0]
elastic_index = sys.argv[3:][0]

translator = Translator()

# All language codes that both amazon and google support
# NOTE(review): "fa" and "tl" each appear twice in this list.
countryCodes = [
    "af", "sq", "am", "ar", "hy", "az", "bn", "bs", "bg", "ca", "hr", "cs",
    "da", "nl", "en", "et", "fa", "tl", "fi", "fr", "ka", "de", "el", "gu",
    "ht", "ha", "he", "hi", "hu", "is", "id", "it", "ja", "kn", "kk", "ko",
    "lv", "lt", "mk", "ms", "ml", "mt", "mn", "no", "fa", "ps", "pl", "pt",
    "ro", "ru", "sr", "si", "sk", "sl", "so", "es", "sw", "sv", "tl", "ta",
    "te", "th", "tr", "uk", "ur", "uz", "vi", "cy"
]

for code in countryCodes:

    result_dict = dict()

    result_dict['language'] = code