def __init__(self, df, output_path, output_name, batch):
    """Set up the translation/sentiment pipeline state.

    :param df: source DataFrame; jobs are derived from its row index
               (presumably one tweet per row — confirm against caller).
    :param output_path: directory for the output csv files.
    :param output_name: base name (no extension) for the output files.
    :param batch: number of rows grouped into one translation job.
    """
    # storing variables
    self.df = df
    self.filename = Path(output_path) / output_name
    # raw (unsorted, append-only) results go to "<filename>_raw.csv"
    self.raw_file = '{}_raw.csv'.format(self.filename)
    self.batch = batch
    # initialize tools
    self.translator = Translator()
    self.__initialize_senti()
    # collect jobs (batches of row indices not yet present in the raw file)
    job_list = self.__collect_jobs()
    self.total_job = len(job_list)
    # initialize queues: pending jobs in, finished translations out
    self.jobs = Queue(maxsize=self.total_job)
    for job in job_list:
        self.jobs.put(job)
    self.results = Queue(maxsize=self.total_job)
    # setup threading variables
    self.stop = threading.Event()          # cooperative shutdown signal
    self.worker_ct_lock = threading.Lock() # guards worker_ct updates
    self.worker_ct = 0  # num_of_spawned worker
def translate(text, src, dst, _max_retries=3):
    """Translate *text* from *src* to *dst*, preserving emojis and markdown links.

    Emojis are demojized before translation and re-emojized after; markdown
    links found by the module-level ``reLink`` pattern are re-inserted with
    their original targets. Best-effort: returns *text* unchanged on failure.

    :param text: source text (falsy input is returned as-is).
    :param src: source language code.
    :param dst: destination language code.
    :param _max_retries: attempts on transient ``HTTPException`` (was an
        unbounded recursion — now a bounded loop).
    """
    if not text or src == dst:
        return text
    # Build the translator once (the original created a throwaway instance first).
    translator = Translator(service_urls=['translate.google.com'])
    for _ in range(_max_retries):
        try:
            # save hidden links
            links = reLink.findall(text)
            # replace emojis with :name: placeholders so they survive translation
            str_demoji = emoji.demojize(text, delimiters=emoji_delimiters)
            # translate
            tran = translator.translate(str_demoji, src=src, dest=dst).text
            # fix emojis: restore placeholders back to emoji characters
            for match in reEmoji.findall(tran):
                tran = tran.replace(match[0], emoji.emojize(f":{match[1].lower()}:"))
            # fix hidden links: re-attach the original targets, adjusting for
            # the length change of each replacement
            offset = 0
            for i, match in enumerate(reLink.finditer(tran)):
                b, e = match.regs[0]
                insert = f"[{match[1]}]({links[i][1]})"
                tran = f"{tran[:b + offset]}{insert}{tran[e + offset:]}"
                offset += len(insert) - e + b
            return tran
        except HTTPException:
            continue  # transient network error: retry up to _max_retries times
        except Exception:
            break  # keep original best-effort semantics, but never catch SystemExit
    return text
def create_translate(nazwa_katalogu, nazwa_pliku):
    """Translate an English .vtt subtitle file into Polish and save it as "<name>(pl)".

    Each caption is split on '.', translated sentence-by-sentence, and the
    translations are substituted back into the caption text.

    :param nazwa_katalogu: directory containing the subtitle file.
    :param nazwa_pliku: subtitle file name (a trailing "(...)" part is dropped
        from the output name).
    """
    from pathlib import Path  # local import keeps this snippet self-contained

    # OS-independent path join — the original hard-coded a Windows backslash
    # (also an invalid "\{" escape inside the f-string).
    vtt = webvtt.read(str(Path(nazwa_katalogu) / nazwa_pliku))
    translator = Translator()
    list_sentence = [element_in_vtt.text for element_in_vtt in vtt]
    translated_lines = []
    for line in list_sentence:
        new_line = line
        for s_line in line.split('.'):
            if len(s_line) < 1:
                continue  # skip empty fragments produced by trailing dots
            trans = translator.translate(s_line, src='en', dest='pl')
            print(f'{trans.origin} -> {trans.text}')
            # swap the English sentence for its translation (dots stripped so
            # the surrounding punctuation of the caption is untouched)
            new_line = new_line.replace(s_line.strip(), trans.text.replace('.', ''))
        translated_lines.append(new_line)
    # write translations back onto the caption objects in order
    for index, minute in enumerate(vtt):
        minute.text = translated_lines[index]
    print(vtt)
    # drop a trailing "(...)" qualifier from the original file name
    nazwa_pliku = nazwa_pliku.split("(", maxsplit=1)[0]
    vtt.save(str(Path(nazwa_katalogu) / f'{nazwa_pliku}(pl)'))
def do_single_translate(cn_text):
    """
    Do single text translate from CN to TW
    :param cn_text: Text in CN
    :return: Text in TW
    """
    return Translator().translate(cn_text, src='zh-cn', dest='zh-tw').text
def do_translate(cn_texts):
    """
    Do translate from CN to TW
    :param cn_texts: Texts in CN
    :return: Texts in TW
    """
    translated = Translator().translate(cn_texts, src='zh-cn', dest='zh-tw')
    tw_texts = [item.text for item in translated]
    return tw_texts
async def translate(context):
    """ PagerMaid universal translator. """
    # NOTE(review): `config`, `silent`, `lang`, `clear_emojis`, `attach_log`
    # and `log` are helpers defined elsewhere in this plugin module.
    translator = Translator()
    reply = await context.get_reply_message()
    message = context.arguments
    ap_lang = config['application_language']
    # Prefer explicit command arguments; fall back to the replied-to message.
    if message:
        pass
    elif reply:
        message = reply.text
    else:
        await context.edit(lang('arg_error'))
        return
    try:
        if not silent:
            await context.edit(lang('translate_processing'))
        try:
            result = translator.translate(clear_emojis(message), dest=ap_lang)
        except:
            # Fallback to the `translate` package when googletrans fails;
            # that package spells Simplified Chinese 'zh' instead of 'zh-cn'.
            from translate import Translator as trans
            result = trans(to_lang=ap_lang.replace('zh-cn', 'zh')).translate(
                clear_emojis(message))
    except ValueError:
        await context.edit(lang('translate_ValueError'))
        return
    try:
        source_lang = result.src
        source_text = result.origin
        trans_lang = result.dest
    except AttributeError:
        # The fallback path returns a bare string without these attributes.
        await context.edit(lang('google_connection_error'))
        return
    # `result` is rebound here from the translation object to the reply string.
    result = f"**{lang('translate_hits')}**\n{lang('translate_original_lang')}: {source_lang}\n{source_text} -> {result.text}"
    if len(result) > 4096:
        # Too long for one Telegram message: upload as a file instead.
        await context.edit(lang('translate_tg_limit_uploading_file'))
        await attach_log(result, context.chat_id, "translation.txt", context.id)
        return
    await context.edit(result)
    if len(result) <= 4096:
        await log(
            f"{lang('translate_get')}: `{source_text}` \n{lang('translate_from')} {source_lang} {lang('translate_to')} {trans_lang}"
        )
    else:
        # NOTE(review): unreachable — the >4096 case returned above. It also
        # calls `translate('translate_from')` (this coroutine!) where the
        # parallel branch uses `lang(...)`; looks like a latent bug.
        await log(
            f"{lang('translate_get')}{translate('translate_from')} {source_lang} {lang('translate_to')} {trans_lang}."
        )
def transl_it(text, src, dest):
    """Translate *text* from *src* to *dest* via Google Translate.

    :param text: text to translate.
    :param src: source language code (language of the sentence).
    :param dest: destination language code.
    :return: on success ``(src_language_name, dest_language_name, translated_text)``;
        on failure ``(False, False, "error: ...")`` including the detected
        language info when available.
    """
    # Create the translator OUTSIDE the try: the original built it inside,
    # so a constructor failure made the except branch crash with NameError
    # on `translator.detect`.
    translator = Translator()
    try:
        # text is what you want to translate, src is the source language,
        # dest is the destination language
        a = translator.translate(text, src=src, dest=dest)
        # extract basic info from the output
        translated = (LANGUAGES[a.src], LANGUAGES[a.dest], a.text)
        return translated
    except Exception as ee:
        # bad input text can lead here; report the error plus what the
        # detector thinks the language is (best-effort — detect can fail too)
        try:
            confidence = translator.detect(text)
        except Exception:
            confidence = None
        translated = (False, False, "error: " + str(ee) + "\n" + str(confidence))
        return translated
def translate(word, language):
    """Translate *word* into *language* (name or code), case-insensitively.

    :param word: text to translate.
    :param language: target language code or English name, matched against
        the service's supported-language table.
    :return: "<word> in <Language> => <translation>", or an explanatory
        string when the language is not supported.
    """
    translator = Translator()
    languages = translator.glanguage()
    keys = list(languages['sl'].keys())      # language codes
    values = list(languages['sl'].values())  # language display names
    location = -1
    for i in range(len(keys)):
        if language.lower() == keys[i].lower() or language.lower() == values[i].lower():
            location = i
    if location == -1:
        return ('Language entered is not supported')
    # Use the result's .text attribute directly — the original sliced the
    # repr() of the result between "text" and ", p", which is fragile.
    translation = translator.translate(word, dest=keys[location]).text
    return (f'{word} in {values[location]} => {translation}')
def start_translation():
    """Translate the default-language resource strings into every other
    configured language and store them via add_translations_to_resource_dict.
    """
    translator = Translator()
    log.info(f"Start of translation service")
    languages_code_list = [entry["value"] for entry in language_list]
    default_language_string_list = list(resources[DEFAULT_LANGUAGE].values())
    for language in languages_code_list:
        if language == DEFAULT_LANGUAGE:
            continue  # the source language needs no translation
        start_time = time.time()
        translations = translator.translate(default_language_string_list,
                                            src=DEFAULT_LANGUAGE,
                                            dest=language)
        add_translations_to_resource_dict(translations, language)
        log.info(
            f"The {language} translation has taken {time.time()-start_time} seconds"
        )
        log.info(f"End of loading for resources for: {language}")
def test_japanese_punctuation():
    """Japanese sentence punctuation should come back as English punctuation."""
    from pygoogletranslation import Translator
    translator = Translator()
    # This is how the Google Translate web UI renders it. Sadly, with the
    # english-point replacement method the result differs, as this test shows.
    assert translator.translate("おはよう。げんきですか。").text == "Good morning. How are you."
    assert translator.translate("今日、明日").text == "Today, tomorrow"
    # assert translator.translate("「こんにちは」").text == "\"Hello\""
#-*-coding:utf-8 -*- # import requests # import execjs # import json from pygoogletranslation import Translator # 实例化翻译器,由于模块默认的服务url在国内无法使用,所以我们修改成国内可用的google翻译服务地址 translator = Translator(service_url='translate.google.cn') def googleTrans(text): try: # get url # responce result = translator.translate(text, dest='zh-CN') # r = requests.get(url) # 返回json格式的数据 # data = json.loads(r.text) print(result.text) return result.text except Exception as e: print("出错了") print(e) #使用谷歌翻译 #py的JS类 ''' class Py4Js(): def __init__(self): self.ctx = execjs.compile(""" function TL(a) {
from django.conf import settings import Levenshtein as lev import sys import openpyxl from rest_framework.parsers import MultiPartParser from django.db import IntegrityError from django.core.files import File as _File sys.path.append(os.path.join(settings.BASE_DIR, 'preprocessing_python')) from preprocessor import * import requests import ast import concurrent.futures import urllib.request from pygoogletranslation import Translator translator = Translator() from django.db import transaction import ebooklib from ebooklib import epub from bs4 import BeautifulSoup BertSimilarityModel = ClinicalBertSimilarity(device='cpu', batch_size=10) class TranslationMemoryViewSet(viewsets.ModelViewSet): queryset = TranslationMemory.objects.all().order_by('id') serializer_class = TranslationMemorySerializer def get_queryset(self):
def translator(self):
    """Return the cached Translator instance, creating it on first use."""
    if self._translator:
        return self._translator
    # lazy initialisation: build once, throttle requests with sleep=1
    self._translator = Translator(sleep=1)
    return self._translator
from pygoogletranslation import Translator

# Quick smoke test: translate a Japanese sample sentence into English.
t = Translator()
result = t.translate("これはテストです", dest="en")
print(result.text)
class Maestro:
    """Orchestrates threaded translation of a DataFrame's texts plus
    SentiStrength sentiment scoring, with resumable CSV output.

    Workers pull index batches from a job queue, translate them, and push
    results to a result queue; a single processor thread scores sentiment
    and appends rows to "<filename>_raw.csv". ``play`` wires it together
    and ``__rebuild`` sorts the raw file into the final csv.
    """

    def __init__(self, df, output_path, output_name, batch):
        """
        :param df: source DataFrame (col 0 = tweetid, last col = text —
            see __translate; confirm against caller).
        :param output_path: output directory.
        :param output_name: base output file name (no extension).
        :param batch: rows per translation job.
        """
        # storing variables
        self.df = df
        self.filename = Path(output_path) / output_name
        self.raw_file = '{}_raw.csv'.format(self.filename)
        self.batch = batch
        # initialize tools
        self.translator = Translator()
        self.__initialize_senti()
        # collect jobs
        job_list = self.__collect_jobs()
        self.total_job = len(job_list)
        # initialize queues
        self.jobs = Queue(maxsize=self.total_job)
        for job in job_list:
            self.jobs.put(job)
        self.results = Queue(maxsize=self.total_job)
        # setup threading variables
        self.stop = threading.Event()
        self.worker_ct_lock = threading.Lock()
        self.worker_ct = 0  # num_of_spawned worker

    def __initialize_senti(self):
        """Configure SentiStrength and verify it returns dual-score tuples."""
        self.senti = PySentiStr()
        self.senti.setSentiStrengthPath(
            str(Path.cwd() / 'lib' / 'SentiStrengthCom.jar'))
        self.senti.setSentiStrengthLanguageFolderPath(str(Path.cwd() / 'lang'))
        # simple test to make sure senti works
        test = self.senti.getSentiment(['You are beautiful'], 'dual')
        assert type(test) is list
        assert type(test[0]) is tuple

    def __collect_jobs(self):
        """Return batches of df row indices not yet present in the raw file,
        enabling resumption after an interrupted run."""
        try:
            out_df = pd.read_csv(self.raw_file, header=None)
            # column 1 of the raw file holds the already-processed tweetids
            processed_ser = self.df['tweetid'].isin(out_df[1])
        except FileNotFoundError:
            # fresh run: nothing processed yet
            zeros = np.zeros((len(self.df.index), ), dtype=bool)
            processed_ser = pd.Series(zeros)
        job_list = processed_ser[~processed_ser].index
        job_list = list(grouper(job_list, self.batch))
        if len(job_list) > 0:
            # grouper pads the final batch with None — strip the padding
            job_list[-1] = tuple(job for job in job_list[-1] if job is not None)
        return job_list

    def __despawn_worker(self):
        """Atomically decrement the live-worker counter."""
        with self.worker_ct_lock:
            self.worker_ct = self.worker_ct - 1

    def __translate(self, thread_num):
        """Worker loop: pull a job, translate its texts, push the result."""
        with self.worker_ct_lock:
            self.worker_ct = self.worker_ct + 1
        while not self.stop.is_set() and not self.jobs.empty():
            job = self.jobs.get()
            try:
                mini_df = self.df.loc[job, ]  # trailing comma is needed
                ids = mini_df.iloc[:, 0]
                items = mini_df.iloc[:, -1].to_numpy().tolist()
            except Exception as e:
                print('Worker #{} got pandas error: {}'.format(thread_num, e))
                break
            try:
                # single-item batches must still yield a list of translations
                if len(items) == 1:
                    translations = [self.translator.translate(items)]
                else:
                    translations = self.translator.translate(items)
            except Exception as e:
                print('Worker #{} got translation error: {}'.format(
                    thread_num, e))
                break
            self.results.put((job, ids, translations))
        self.__despawn_worker()

    def __save(self, results):
        """Append finished rows to the raw csv file."""
        with open(self.raw_file, 'a', encoding='utf-8', newline='') as csv_file:
            writer = csv.writer(csv_file,
                                delimiter=',',
                                quotechar='"',
                                quoting=csv.QUOTE_MINIMAL)
            writer.writerows(results)

    def __process(self, score='dual'):
        """Processor loop: drain results, score sentiment, persist rows,
        then rebuild the sorted output file on shutdown."""
        total_batch = int(np.ceil(len(self.df.index) / self.batch))
        pbar = tqdm(total=total_batch, initial=(total_batch - self.total_job))
        # keep draining until stop is requested AND the queue is empty
        while not self.stop.is_set() or not self.results.empty():
            time.sleep(2)
            if not self.results.empty():
                # merges all results
                job_list, id_list, translation_list = ([], [], [])
                steps = 0
                while not self.results.empty():
                    job, ids, translations = self.results.get()
                    job_list.extend(job)
                    id_list.extend(ids)
                    translation_list.extend(translations)
                    steps = steps + 1
                # analyze sentiments
                texts = [tr.text for tr in translation_list]
                try:
                    sentis = self.senti.getSentiment(texts, score)
                except Exception as e:
                    print('Process got sentistrength error:', e)
                    break
                try:
                    rows = [
                        (order, i, *senti, tr.src, text)
                        for order, i, senti, tr, text in zip(
                            job_list, id_list, sentis, translation_list,
                            texts)
                    ]
                except Exception as e:
                    print(e)
                    break
                try:
                    self.__save(rows)
                except Exception as e:
                    print('Process got on save error:', e)
                    break
                pbar.update(steps)
            time.sleep(.1)  # prevent too much loop checking
        if not self.stop.is_set():
            self.stop.set()  # force stop all threads
        print('Rebuilding...')
        self.__rebuild()
        print('Exiting...')
        pbar.close()

    def __rebuild(self):
        """Sort the raw file by job order and write the final csv."""
        try:
            sf = pd.read_csv(self.raw_file,
                             header=None,
                             names=[
                                 'order', 'tweetid', '+', '-', 'src_lang',
                                 'translation'
                             ])
            sf.sort_values('order', inplace=True)
            sf.to_csv('{}.csv'.format(self.filename), index=None)
        except FileNotFoundError:
            pass  # nothing was ever saved — nothing to rebuild
        except Exception as e:
            print(ERR_STR.format('rebuild', 'on rebuilding csv'), e)

    def play(self, n_thread=1):
        """Spawn *n_thread* translation workers plus one processor thread,
        then block until done or interrupted."""
        if n_thread < 1:
            return
        with ThreadPoolExecutor(max_workers=n_thread + 1) as executor:
            try:
                executor.map(self.__translate, range(n_thread))
                print('Spawing {} workers...'.format(n_thread))
                # busy-wait until at least one worker registered itself
                # (fixed: original used `is 0` — identity comparison with an
                # int literal is implementation-dependent)
                while self.worker_ct == 0:
                    pass  # waiting for any worker being spawned
                print('Aye, Sir!')
                executor.submit(self.__process)
                # as long as there are atleast a worker
                while self.worker_ct > 0:
                    # wait for any keyboard interrupt
                    time.sleep(.5)  # power napping for half second
                # either no job left or all worker has been despawned
                self.stop.set()
                if self.jobs.empty():
                    print('All done!')
                if self.worker_ct == 0:
                    print('All workers quit their job!')
            except KeyboardInterrupt:
                print('\nKeyboard interrupt')
            except Exception as e:
                print(ERR_STR.format('play', 'something went wrong'), e)
            finally:
                self.stop.set()
                print('Byee 👋')
from pygoogletranslation import Translator
import json
import sys
import time
import datetime
import time  # NOTE(review): duplicate import of `time` — harmless but redundant
from elasticsearch import Elasticsearch

# Command line arguments
# elastic_username = sys.argv[1:][0]
# elastic_password = sys.argv[2:][0]
# elastic_index = sys.argv[3:][0]

# Output file for the collected translations.
# NOTE(review): opened without a context manager; presumably closed later in
# this script — confirm, or wrap the writing section in `with open(...)`.
file = open("languages.txt", 'w')

translator = Translator()

# All language codes that both amazon and google support
# NOTE(review): "fa" and "tl" each appear twice in this list, so a loop over
# it processes those languages twice — confirm whether that is intended.
countryCodes = [
    "af", "sq", "am", "ar", "hy", "az", "bn", "bs", "bg", "ca", "hr", "cs",
    "da", "nl", "en", "et", "fa", "tl", "fi", "fr", "ka", "de", "el", "gu",
    "ht", "ha", "he", "hi", "hu", "is", "id", "it", "ja", "kn", "kk", "ko",
    "lv", "lt", "mk", "ms", "ml", "mt", "mn", "no", "fa", "ps", "pl", "pt",
    "ro", "ru", "sr", "si", "sk", "sl", "so", "es", "sw", "sv", "tl", "ta",
    "te", "th", "tr", "uk", "ur", "uz", "vi", "cy"
]

# One (initially empty) result bucket per language code.
result_dict = dict()
for code in countryCodes:
    result_dict[code] = {}
from pygoogletranslation import Translator
import json
import sys
import time
import datetime
import time  # NOTE(review): duplicate import of `time` — harmless but redundant
from elasticsearch import Elasticsearch

# Command line arguments
# NOTE: sys.argv[1:][0] is equivalent to sys.argv[1] (and raises IndexError
# when the argument is missing, rather than printing a usage message).
elastic_username = sys.argv[1:][0]
elastic_password = sys.argv[2:][0]
elastic_index = sys.argv[3:][0]

translator = Translator()

# All language codes that both amazon and google support
# NOTE(review): "fa" and "tl" each appear twice in this list, so the loop
# below processes those languages twice — confirm whether that is intended.
countryCodes = [
    "af", "sq", "am", "ar", "hy", "az", "bn", "bs", "bg", "ca", "hr", "cs",
    "da", "nl", "en", "et", "fa", "tl", "fi", "fr", "ka", "de", "el", "gu",
    "ht", "ha", "he", "hi", "hu", "is", "id", "it", "ja", "kn", "kk", "ko",
    "lv", "lt", "mk", "ms", "ml", "mt", "mn", "no", "fa", "ps", "pl", "pt",
    "ro", "ru", "sr", "si", "sk", "sl", "so", "es", "sw", "sv", "tl", "ta",
    "te", "th", "tr", "uk", "ur", "uz", "vi", "cy"
]

# Build one document per language; the loop body presumably continues past
# this chunk (result_dict is rebuilt fresh on each iteration).
for code in countryCodes:
    result_dict = dict()
    result_dict['language'] = code