def translate_unit(self, unit_source, source_lang, target_lang,
                   project_style=None, min_similarity=None,
                   max_candidates=None):
    """Return TM suggestions for unit_source."""
    slang = lang_to_table(source_lang)
    if slang not in self.source_langs:
        abort(404)
    tlang = lang_to_table(target_lang)
    if slang == tlang:
        # We really don't want to serve en->en requests.
        abort(404)

    if isinstance(unit_source, bytes):
        unit_source = unicode(unit_source, "utf-8")

    checker = project_checker(project_style, source_lang)

    max_length = current_app.config.get('MAX_LENGTH', 2000)
    min_similarity = max(
        min_similarity or current_app.config.get('MIN_SIMILARITY', 70), 70)
    max_candidates = min(
        max_candidates or current_app.config.get('MAX_CANDIDATES', 5), 30)

    source_len = len(unit_source)
    minlen = min_levenshtein_length(source_len, min_similarity)
    maxlen = max_levenshtein_length(source_len, min_similarity, max_length)
    minrank = max(min_similarity / 2, 30)

    cursor = self.get_cursor(slang)
    try:
        self._translate_query(cursor, slang, tlang,
                              indexing_version(unit_source, checker),
                              minlen, maxlen, minrank)
    except postgres.psycopg2.ProgrammingError:
        # Avoid problems parsing strings like '<a "\b">'. If any of the
        # characters in the example string is not present, then no error is
        # thrown. The error is still present if any number of other letters
        # are included between any of the characters in the example string.
        cursor.connection.rollback()
        return []

    results = []
    similarity = self.comparer.similarity
    for row in cursor:
        quality = similarity(unit_source, row['source'], min_similarity)
        if quality >= min_similarity:
            result = dict(row)
            result['quality'] = quality
            results.append(result)
    results.sort(key=lambda match: match['quality'], reverse=True)
    results = results[:max_candidates]
    return results
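# The two Levenshtein length bounds used above prune the candidate pool
# before any expensive comparison runs. A minimal sketch of plausible
# definitions (hypothetical; not necessarily the project's actual
# helpers): if two strings are at least min_similarity percent similar,
# their lengths cannot differ by more than that ratio.
import math

def min_levenshtein_length(length, min_similarity):
    # Shortest candidate that could still reach min_similarity.
    return int(math.ceil(max(length * min_similarity / 100.0, 2)))

def max_levenshtein_length(length, min_similarity, max_length):
    # Longest candidate that could still reach min_similarity, capped at
    # max_length to keep comparisons affordable.
    return int(math.floor(min(length / (min_similarity / 100.0), max_length)))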
def translate_unit(self, unit_source, source_lang, target_lang,
                   project_style=None, min_similarity=None,
                   max_candidates=None):
    """Return TM suggestions for unit_source."""
    slang = lang_to_table(source_lang)
    if slang not in self.source_langs:
        abort(404)
    tlang = lang_to_table(target_lang)
    lang_config = lang_to_config(slang)
    if slang == tlang:
        # We really don't want to serve en->en requests.
        abort(404)

    if isinstance(unit_source, str):
        unit_source = unicode(unit_source, "utf-8")

    checker = project_checker(project_style, source_lang)

    max_length = current_app.config.get('MAX_LENGTH', 1000)
    min_similarity = max(
        min_similarity or current_app.config.get('MIN_SIMILARITY', 70), 30)
    max_candidates = max_candidates or current_app.config.get(
        'MAX_CANDIDATES', 5)

    source_len = len(unit_source)
    minlen = min_levenshtein_length(source_len, min_similarity)
    maxlen = max_levenshtein_length(source_len, min_similarity, max_length)
    minrank = max(min_similarity / 2, 30)

    cursor = self.get_cursor()
    query = """
SELECT * FROM (
    SELECT s.text AS source, t.text AS target,
           TS_RANK(s.vector, query, 32) * 1744.93406073519 AS rank
    FROM sources_%s s
    JOIN targets_%s t ON s.sid = t.sid,
         TO_TSQUERY(%%(lang_config)s, prepare_ortsquery(%%(search_str)s)) query
    WHERE t.lang = %%(tlang)s
      AND s.length BETWEEN %%(minlen)s AND %%(maxlen)s
      AND s.vector @@ query
) sub
WHERE rank > %%(minrank)s
ORDER BY rank DESC
""" % (slang, slang)
    cursor.execute(query, {
        'search_str': indexing_version(unit_source, checker),
        'tlang': tlang,
        'lang_config': lang_config,
        'minrank': minrank,
        'minlen': minlen,
        'maxlen': maxlen,
    })

    results = []
    similarity = self.comparer.similarity
    for row in cursor:
        quality = similarity(unit_source, row['source'], min_similarity)
        if quality >= min_similarity:
            result = dict(row)
            result['quality'] = quality
            results.append(result)
    results.sort(key=lambda match: match['quality'], reverse=True)
    results = results[:max_candidates]
    return results
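# prepare_ortsquery() in the query above is assumed to be a database-side
# helper that turns the indexed source text into an OR-ed tsquery, so a
# candidate matching any of the words gets ranked by TS_RANK. A rough,
# hypothetical Python-side equivalent:
def prepare_ortsquery_py(text):
    # 'file not found' -> 'file | not | found'
    return ' | '.join(text.split())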
def get_all_sids(self, units, source_lang, project_style):
    """Ensure that all source strings are in the database+cache."""
    all_sources = set(u['source'] for u in units)
    d = current_app.cache.get_dict(*(
        build_cache_key(k, source_lang) for k in all_sources
    ))
    # Filter out None results (keys not found).
    already_cached = set(filter(lambda x: d[x] is not None, d))
    # Unmangle the key to get a source string.
    # TODO: update for memcached
    already_cached = set(split_cache_key(k) for k in already_cached)

    uncached = tuple(all_sources - already_cached)
    if not uncached:
        # Everything is already cached.
        return

    checker = project_checker(project_style, source_lang)
    cursor = self.get_cursor()
    select_query = """SELECT text, sid FROM sources_%s
    WHERE text IN %%(list)s""" % source_lang

    to_store = set()
    already_stored = {}
    for i in range(1, 4):
        # During parallel import, another process could have INSERTed a
        # record just after we SELECTed and just before we INSERTed,
        # causing a duplicate key. So let's expect that and retry a few
        # times before we give up:
        try:
            cursor.execute(select_query, {"list": uncached})
            already_stored = dict(cursor.fetchall())
            to_store = all_sources - already_cached - set(already_stored)
            if not to_store:
                # Note that we could technically leak the savepoint
                # "before_sids" (below) if this is not the first iteration
                # of the loop. It shouldn't matter, and will be destroyed
                # when we commit anyway.
                break

            # Some source strings still need to be stored.
            insert_query = """INSERT INTO sources_%s (text, vector, length)
            VALUES(
                %%(source)s,
                TO_TSVECTOR(%%(lang_config)s, %%(indexed_source)s),
                %%(length)s
            ) RETURNING sid""" % source_lang
            lang_config = lang_to_config(source_lang)
            params = [{
                "lang_config": lang_config,
                "source": s,
                "indexed_source": indexing_version(s, checker),
                "length": len(s),
            } for s in to_store]
            # We sort to avoid deadlocks during parallel import.
            params.sort(key=lambda x: x['source'])
            cursor.execute("SAVEPOINT before_sids")
            cursor.executemany(insert_query, params)
            cursor.execute("RELEASE SAVEPOINT before_sids")
        except postgres.psycopg2.IntegrityError:
            cursor.execute("ROLLBACK TO SAVEPOINT before_sids")
        else:
            # No exception means we can break the retry loop.
            break
    else:
        raise Exception("Failed 3 times to import sources")

    if to_store:
        # Get the inserted rows back so that we have their IDs.
        cursor.execute(select_query, {"list": tuple(to_store)})
        newly_stored = dict(cursor.fetchall())
        already_stored.update(newly_stored)

    current_app.cache.set_many(
        (build_cache_key(k, source_lang), v)
        for (k, v) in already_stored.iteritems()
    )
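# build_cache_key() and split_cache_key() above mangle a source string and
# its language into a single cache key and back. A minimal sketch, assuming
# a separator that cannot appear in a language code (hypothetical helpers,
# not the project's actual implementation):
def build_cache_key(source, source_lang):
    return '%s\n%s' % (source_lang, source)

def split_cache_key(key):
    # Drop the language prefix to recover the original source string.
    return key.split('\n', 1)[1]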
def translate_unit(self, unit_source, source_lang, target_lang,
                   project_style=None, min_similarity=None,
                   max_candidates=None):
    """Return TM suggestions for unit_source."""
    slang = lang_to_table(source_lang)
    if slang not in self.source_langs:
        abort(404)
    tlang = lang_to_table(target_lang)
    if slang == tlang:
        # We really don't want to serve en->en requests.
        abort(404)
    lang_config = lang_to_config(slang)

    if isinstance(unit_source, str):
        unit_source = unicode(unit_source, "utf-8")

    checker = project_checker(project_style, source_lang)

    max_length = current_app.config.get('MAX_LENGTH', 1000)
    min_similarity = max(
        min_similarity or current_app.config.get('MIN_SIMILARITY', 70), 30)
    max_candidates = max_candidates or current_app.config.get(
        'MAX_CANDIDATES', 5)

    source_len = len(unit_source)
    minlen = min_levenshtein_length(source_len, min_similarity)
    maxlen = max_levenshtein_length(source_len, min_similarity, max_length)
    minrank = max(min_similarity / 2, 30)

    cursor = self.get_cursor()
    query = """
SELECT * FROM (
    SELECT s.text AS source, t.text AS target,
           TS_RANK(s.vector, query, 32) * 1744.93406073519 AS rank
    FROM sources_%s s
    JOIN targets_%s t ON s.sid = t.sid,
         TO_TSQUERY(%%(lang_config)s, prepare_ortsquery(%%(search_str)s)) query
    WHERE t.lang = %%(tlang)s
      AND s.length BETWEEN %%(minlen)s AND %%(maxlen)s
      AND s.vector @@ query
) sub
WHERE rank > %%(minrank)s
ORDER BY rank DESC
""" % (slang, slang)
    try:
        cursor.execute(query, {
            'search_str': indexing_version(unit_source, checker),
            'tlang': tlang,
            'lang_config': lang_config,
            'minrank': minrank,
            'minlen': minlen,
            'maxlen': maxlen,
        })
    except postgres.psycopg2.ProgrammingError:
        # Avoid problems parsing strings like '<a "\b">'. If any of the
        # characters in the example string is not present, then no error is
        # thrown. The error is still present if any number of other letters
        # are included between any of the characters in the example string.
        self.connection.rollback()
        self.pool.putconn()
        return []

    results = []
    similarity = self.comparer.similarity
    for row in cursor:
        quality = similarity(unit_source, row['source'], min_similarity)
        if quality >= min_similarity:
            result = dict(row)
            result['quality'] = quality
            results.append(result)
    results.sort(key=lambda match: match['quality'], reverse=True)
    results = results[:max_candidates]
    return results
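# The final quality check above relies on self.comparer, presumably
# something like translate-toolkit's Levenshtein comparer. A small usage
# sketch with illustrative values:
from translate.search.lshtein import LevenshteinComparer

comparer = LevenshteinComparer(max_len=1000)
# similarity() returns a 0-100 score; the third argument is a cut-off
# below which the comparer may stop comparing early.
print(comparer.similarity(u'File not found', u'File not found!', 70))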