def add_dict(self, unit, source_lang, target_lang, commit=True):
    """inserts units represented as dictionaries in database"""
    source_lang = data.normalize_code(source_lang)
    target_lang = data.normalize_code(target_lang)
    try:
        try:
            self.cursor.execute(
                "INSERT INTO sources (text, context, lang, length) VALUES(?, ?, ?, ?)",
                (unit["source"], unit["context"], source_lang, len(unit["source"])))
            sid = self.cursor.lastrowid
        except dbapi2.IntegrityError:
            # source string already exists in db, run query to find sid
            self.cursor.execute(
                "SELECT sid FROM sources WHERE text=? AND context=? and lang=?",
                (unit["source"], unit["context"], source_lang))
            sid = self.cursor.fetchone()
            (sid,) = sid
        try:
            # FIXME: get time info from translation store
            # FIXME: do we need to store the target length?
            self.cursor.execute(
                "INSERT INTO targets (sid, text, lang, time) VALUES (?, ?, ?, ?)",
                (sid, unit["target"], target_lang, int(time.time())))
        except dbapi2.IntegrityError:
            # target string already exists in db, do nothing
            pass
        if commit:
            self.connection.commit()
    except Exception:
        if commit:
            self.connection.rollback()
        raise

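# A minimal usage sketch for add_dict; `tmdb` stands in for an instance of
# the surrounding class with an open SQLite connection (the instance name is
# invented; the dict keys and schema are the ones used above). Language
# codes are normalized on insert, so "af_ZA" and "af-za" share a row.
unit = {
    "source": "File",
    "target": "Lêer",
    "context": "",
}
tmdb.add_dict(unit, source_lang="en", target_lang="af_ZA", commit=True)
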
def _match_normalized_langcode(self, langcode):
    languages_keys = self.languages.keys()
    normalized_keys = [data.normalize_code(lang) for lang in languages_keys]
    i = normalized_keys.index(data.normalize_code(langcode))
    return languages_keys[i]

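# The same normalized lookup as a standalone sketch (the languages dict here
# is invented for illustration): map each key's normalized form back to the
# original key, then look the query code up by its normalized form.
from translate.lang import data

languages = {"de_DE": "German", "pt_BR": "Brazilian Portuguese"}
normalized = {data.normalize_code(key): key for key in languages}
assert normalized[data.normalize_code("de-de")] == "de_DE"
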
def translate_unit(self, unit_source, source_langs, target_langs):
    """return TM suggestions for unit_source"""
    if isinstance(unit_source, str):
        unit_source = unicode(unit_source, "utf-8")
    if isinstance(source_langs, list):
        source_langs = [data.normalize_code(lang) for lang in source_langs]
        source_langs = ','.join(source_langs)
    else:
        source_langs = data.normalize_code(source_langs)
    if isinstance(target_langs, list):
        target_langs = [data.normalize_code(lang) for lang in target_langs]
        target_langs = ','.join(target_langs)
    else:
        target_langs = data.normalize_code(target_langs)

    minlen = min_levenshtein_length(len(unit_source), self.min_similarity)
    maxlen = max_levenshtein_length(len(unit_source), self.min_similarity,
                                    self.max_length)

    # split source into words, remove punctuation and special
    # chars, keep words that are at least 3 chars long
    unit_words = STRIP_REGEXP.sub(' ', unit_source).split()
    unit_words = filter(lambda word: len(word) > 2, unit_words)

    if self.fulltext and len(unit_words) > 3:
        logging.debug("fulltext matching")
        query = """SELECT s.text, t.text, s.context, s.lang, t.lang
                   FROM sources s
                   JOIN targets t ON s.sid = t.sid
                   JOIN fulltext f ON s.sid = f.docid
                   WHERE s.lang IN (?) AND t.lang IN (?)
                   AND s.length BETWEEN ? AND ?
                   AND fulltext MATCH ?"""
        search_str = " OR ".join(unit_words)
        self.cursor.execute(
            query, (source_langs, target_langs, minlen, maxlen, search_str))
    else:
        logging.debug("nonfulltext matching")
        query = """SELECT s.text, t.text, s.context, s.lang, t.lang
                   FROM sources s
                   JOIN targets t ON s.sid = t.sid
                   WHERE s.lang IN (?) AND t.lang IN (?)
                   AND s.length >= ? AND s.length <= ?"""
        self.cursor.execute(query,
                            (source_langs, target_langs, minlen, maxlen))

    results = []
    for row in self.cursor:
        quality = self.comparer.similarity(unit_source, row[0],
                                           self.min_similarity)
        if quality >= self.min_similarity:
            results.append({
                'source': row[0],
                'target': row[1],
                'context': row[2],
                'quality': quality,
            })
    results.sort(key=lambda match: match['quality'], reverse=True)
    results = results[:self.max_candidates]
    logging.debug("results: %s", unicode(results))
    return results

def translate_unit(self, unit_source, source_langs, target_langs):
    """return TM suggestions for unit_source"""
    if isinstance(unit_source, bytes):
        unit_source = unit_source.decode("utf-8")
    if isinstance(source_langs, list):
        source_langs = [data.normalize_code(lang) for lang in source_langs]
        source_langs = ','.join(source_langs)
    else:
        source_langs = data.normalize_code(source_langs)
    if isinstance(target_langs, list):
        target_langs = [data.normalize_code(lang) for lang in target_langs]
        target_langs = ','.join(target_langs)
    else:
        target_langs = data.normalize_code(target_langs)

    minlen = min_levenshtein_length(len(unit_source), self.min_similarity)
    maxlen = max_levenshtein_length(len(unit_source), self.min_similarity,
                                    self.max_length)

    # split source into words, remove punctuation and special
    # chars, keep words that are at least 3 chars long
    unit_words = STRIP_REGEXP.sub(' ', unit_source).split()
    unit_words = list(filter(lambda word: len(word) > 2, unit_words))

    if self.fulltext and len(unit_words) > 3:
        logging.debug("fulltext matching")
        query = """SELECT s.text, t.text, s.context, s.lang, t.lang
                   FROM sources s
                   JOIN targets t ON s.sid = t.sid
                   JOIN fulltext f ON s.sid = f.docid
                   WHERE s.lang IN (?) AND t.lang IN (?)
                   AND s.length BETWEEN ? AND ?
                   AND fulltext MATCH ?"""
        search_str = " OR ".join(unit_words)
        self.cursor.execute(
            query, (source_langs, target_langs, minlen, maxlen, search_str))
    else:
        logging.debug("nonfulltext matching")
        query = """SELECT s.text, t.text, s.context, s.lang, t.lang
                   FROM sources s
                   JOIN targets t ON s.sid = t.sid
                   WHERE s.lang IN (?) AND t.lang IN (?)
                   AND s.length >= ? AND s.length <= ?"""
        self.cursor.execute(query,
                            (source_langs, target_langs, minlen, maxlen))

    results = []
    for row in self.cursor:
        quality = self.comparer.similarity(unit_source, row[0],
                                           self.min_similarity)
        if quality >= self.min_similarity:
            results.append({
                'source': row[0],
                'target': row[1],
                'context': row[2],
                'quality': quality,
            })
    results.sort(key=lambda match: match['quality'], reverse=True)
    results = results[:self.max_candidates]
    logging.debug("results: %s", six.text_type(results))
    return results

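# A usage sketch for translate_unit; `tmdb` again stands in for a configured
# instance (min_similarity, max_length, max_candidates set). The second
# definition above is the Python 3 / six port of the first. Suggestions come
# back as dicts sorted by descending quality:
matches = tmdb.translate_unit("Open file", "en", "af")
for match in matches:
    print("%d%% %s -> %s" % (match["quality"], match["source"], match["target"]))
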
def get_language_supported(lang_code, supported):
    normalized = data.normalize_code(data.simplify_to_common(lang_code))
    if normalized in supported:
        return normalized

    # FIXME: horribly slow way of dealing with languages with @ in them
    for lang in supported.keys():
        if normalized == data.normalize_code(lang):
            return lang

    return None

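# A small illustration of both paths (the supported dict is invented). A
# country variant like af_ZA simplifies and normalizes to plain "af", so the
# direct lookup succeeds; "@" variants never survive normalization literally
# ("sr@latin" becomes "sr-latin"), which is why the function falls back to
# re-normalizing every supported key until one compares equal.
from translate.lang import data

supported = {"af": "Afrikaans", "sr@latin": "Serbian (Latin script)"}
assert get_language_supported("af_ZA", supported) == "af"
assert data.normalize_code("sr@latin") == "sr-latin"
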
def get_alt_src_langs(request, profile, translation_project):
    language = translation_project.language
    project = translation_project.project
    source_language = project.source_language

    langs = profile.alt_src_langs.exclude(id__in=(language.id, source_language.id)).filter(
        translationproject__project=project
    )

    if not profile.alt_src_langs.count():
        from pootle_language.models import Language

        accept = request.META.get("HTTP_ACCEPT_LANGUAGE", "")
        for accept_lang, unused in parse_accept_lang_header(accept):
            if accept_lang == "*":
                continue
            normalized = to_locale(data.normalize_code(data.simplify_to_common(accept_lang)))
            code = to_locale(accept_lang)
            if normalized in ("en", "en_US", source_language.code, language.code) or code in (
                "en",
                "en_US",
                source_language.code,
                language.code,
            ):
                continue
            langs = Language.objects.filter(
                code__in=(normalized, code), translationproject__project=project
            )
            if langs.count():
                break
    return langs

def get_alt_src_langs(request, user, translation_project):
    if request.user.is_anonymous:
        return

    language = translation_project.language
    project = translation_project.project
    source_language = project.source_language

    langs = list(
        user.alt_src_langs.exclude(
            id__in=(language.id, source_language.id)
        ).filter(translationproject__project=project))
    if langs:
        return langs

    accept = request.META.get('HTTP_ACCEPT_LANGUAGE', '')
    for accept_lang, __ in parse_accept_lang_header(accept):
        if accept_lang == '*':
            continue
        normalized = to_locale(
            data.normalize_code(
                data.simplify_to_common(accept_lang)))
        code = to_locale(accept_lang)
        is_source_lang = any(
            langcode in ('en', 'en_US', source_language.code, language.code)
            for langcode in [code, normalized])
        if is_source_lang:
            continue
        langs = list(
            Language.objects.filter(
                code__in=(normalized, code),
                translationproject__project=project))
        if langs:
            return langs

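# How the Accept-Language handling above composes, as a standalone sketch.
# The header value is invented; parse_accept_lang_header and to_locale are
# the Django helpers these views import.
from django.utils.translation import to_locale
from django.utils.translation.trans_real import parse_accept_lang_header
from translate.lang import data

for accept_lang, quality in parse_accept_lang_header("de-at,de;q=0.8"):
    normalized = to_locale(data.normalize_code(data.simplify_to_common(accept_lang)))
    code = to_locale(accept_lang)
    print(accept_lang, "->", normalized, code)  # e.g. "de-at" -> "de", "de_AT"
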
def gettargetlanguage(self):
    """Get the target language for this .qph file.

    :return: ISO code e.g. af, fr, pt_BR
    :rtype: String
    """
    return data.normalize_code(self.header.get('language'))

def get_alt_src_langs(request, user, translation_project):
    language = translation_project.language
    project = translation_project.project
    source_language = project.source_language

    langs = user.alt_src_langs.exclude(
        id__in=(language.id, source_language.id)).filter(
            translationproject__project=project)

    if not user.alt_src_langs.count():
        from pootle_language.models import Language
        accept = request.META.get('HTTP_ACCEPT_LANGUAGE', '')
        for accept_lang, __ in parse_accept_lang_header(accept):
            if accept_lang == '*':
                continue

            simplified = data.simplify_to_common(accept_lang)
            normalized = to_locale(data.normalize_code(simplified))
            code = to_locale(accept_lang)
            if (normalized in ('en', 'en_US', source_language.code,
                               language.code) or
                code in ('en', 'en_US', source_language.code, language.code)):
                continue

            langs = Language.objects.filter(
                code__in=(normalized, code),
                translationproject__project=project,
            )
            if langs.count():
                break

    return langs

def gettargetlanguage(self):
    """Get the target language for this .ts file.

    @return: ISO code e.g. af, fr, pt_BR
    @rtype: String
    """
    return data.normalize_code(self.header.get("language"))

def get_alt_src_langs(request, user, translation_project):
    language = translation_project.language
    project = translation_project.project
    source_language = project.source_language

    langs = user.alt_src_langs.exclude(
        id__in=(language.id, source_language.id)
    ).filter(translationproject__project=project)

    if not user.alt_src_langs.count():
        from pootle_language.models import Language
        accept = request.META.get('HTTP_ACCEPT_LANGUAGE', '')
        for accept_lang, unused in parse_accept_lang_header(accept):
            if accept_lang == '*':
                continue

            simplified = data.simplify_to_common(accept_lang)
            normalized = to_locale(data.normalize_code(simplified))
            code = to_locale(accept_lang)
            if (normalized in ('en', 'en_US', source_language.code,
                               language.code) or
                code in ('en', 'en_US', source_language.code, language.code)):
                continue

            langs = Language.objects.filter(
                code__in=(normalized, code),
                translationproject__project=project,
            )
            if langs.count():
                break

    return langs

def get_lang_from_http_header(request, supported):
    """If the user's browser sends a list of preferred languages in the
    HTTP_ACCEPT_LANGUAGE header, parse it into a list, then walk through
    that list checking each entry for a matching Pootle translation
    project. Return the first match; if nothing matches, return None."""
    accept = request.META.get('HTTP_ACCEPT_LANGUAGE', '')
    for accept_lang, unused in trans_real.parse_accept_lang_header(accept):
        if accept_lang == '*':
            return None
        normalized = data.normalize_code(
            data.simplify_to_common(accept_lang, supported))
        if normalized in ['en-us', 'en']:
            return None
        if normalized in supported:
            return normalized

        # FIXME: horribly slow way of dealing with languages with @ in them
        for lang in supported.keys():
            if normalized == data.normalize_code(lang):
                return lang
    return None

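# A hedged sketch of the header walk, using a stand-in request object
# (invented here) instead of a real Django request:
class FakeRequest(object):
    META = {'HTTP_ACCEPT_LANGUAGE': 'af-za,af;q=0.8,en;q=0.5'}

supported = {'af': 'Afrikaans'}
# 'af-za' simplifies against the supported dict and normalizes to 'af':
assert get_lang_from_http_header(FakeRequest(), supported) == 'af'
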
def getsourcelanguage(self):
    """Get the source language for this .qph file.

    We don't implement setsourcelanguage as users really shouldn't be
    altering the source language in .qph files; it should be set correctly
    by the extraction tools.

    :return: ISO code e.g. af, fr, pt_BR
    :rtype: String
    """
    lang = data.normalize_code(self.header.get('sourcelanguage', "en"))
    if lang == 'en-us':
        return 'en'
    return lang

def get_alt_src_langs(request, profile, language):
    langs = profile.alt_src_langs.exclude(id=language.id)

    if not langs.count():
        accept = request.META.get('HTTP_ACCEPT_LANGUAGE', '')
        codes = []
        for accept_lang, unused in parse_accept_lang_header(accept):
            if accept_lang == '*':
                continue
            normalized = to_locale(data.normalize_code(
                data.simplify_to_common(accept_lang)))
            if normalized in ['en_US', 'en', language.code]:
                continue
            codes.append(normalized)
        if codes:
            from pootle_language.models import Language
            langs = Language.objects.filter(code__in=codes)
    return langs

def getsourcelanguage(self):
    """Get the source language for this .ts file.

    The 'sourcelanguage' attribute was only added to the TS format in
    Qt v4.5. We return 'en' if there is no sourcelanguage set.

    We don't implement setsourcelanguage as users really shouldn't be
    altering the source language in .ts files; it should be set correctly
    by the extraction tools.

    :return: ISO code e.g. af, fr, pt_BR
    :rtype: String
    """
    lang = data.normalize_code(self.header.get('sourcelanguage', "en"))
    if lang == 'en-us':
        return 'en'
    return lang

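# The .qph and .ts language getters above differ only in which header key
# they read and in their docstrings; here is the shared normalization step
# in isolation (outputs follow from normalize_code's lowercasing and
# separator mapping):
from translate.lang import data

for raw in ("en-US", "de_DE", "pt_BR"):
    lang = data.normalize_code(raw)
    if lang == 'en-us':
        lang = 'en'  # the source-language getters collapse en-us to plain en
    print(raw, "->", lang)  # en-US -> en, de_DE -> de-de, pt_BR -> pt-br
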
def create_suggestions(self, suggestion):
    # Skip any suggestions where the suggested translation contains
    # parentheses
    if re.match(r'\(.*\)', suggestion['text']):
        return []

    units = []
    for proj in suggestion['projects']:
        # Skip fuzzy matches:
        if proj['flags'] != 0:
            continue

        source = proj['orig_phrase'].strip()
        # Skip strings that are too short
        if len(source) < MIN_TERM_LENGTH:
            continue
        # Skip any units containing parentheses
        if re.match(r'\(.*\)', source):
            continue

        unit = TranslationUnit(source)
        target = suggestion['text'].strip()

        # Skip phrases already found:
        old_unit = self.store.findunit(proj['orig_phrase'])
        if old_unit and old_unit.target == target:
            continue

        # We mostly want to work with lowercase strings, but in German (and
        # some languages with a related writing style) this would probably
        # irritate more often than help, since nouns always start with a
        # capital letter.
        target_lang_code = self.main_controller.lang_controller.target_lang.code
        if data.normalize_code(target_lang_code) not in (
                'de', 'de-de', 'lb', 'als', 'ksh', 'stq', 'vmf'):
            # Unless the string contains multiple consecutive uppercase
            # characters or uses some kind of camel case, lowercase it.
            if not is_case_sensitive(target):
                target = target.lower()
        unit.target = target
        units.append(unit)
    return units

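# The case-handling rule above in isolation: targets in languages whose
# normalized code appears in the German-style list keep their
# capitalization; everything else is lowercased unless the string itself
# looks case-sensitive. maybe_lowercase and its predicate argument are
# invented stand-ins for the inline logic and is_case_sensitive.
from translate.lang import data

GERMAN_STYLE = ('de', 'de-de', 'lb', 'als', 'ksh', 'stq', 'vmf')

def maybe_lowercase(target, target_lang_code, looks_case_sensitive):
    if data.normalize_code(target_lang_code) not in GERMAN_STYLE:
        if not looks_case_sensitive(target):
            return target.lower()
    return target

assert maybe_lowercase("Datei", "de_DE", lambda s: False) == "Datei"
assert maybe_lowercase("File", "af_ZA", lambda s: False) == "file"
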
def test_normalise_code():
    """test the normalisation of language codes"""
    assert data.normalize_code("af_ZA") == "af-za"
    assert data.normalize_code("xx@Latin") == "xx-latin"

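# A couple of extra cases implied by the two assertions above (hedged: they
# rely only on normalize_code lowercasing and mapping "_" and "@" to "-"):
def test_normalise_code_extra():
    assert data.normalize_code("pt_BR") == "pt-br"
    assert data.normalize_code("EN") == "en"
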