Example #1
    def translate_unit(self,
                       unit_source,
                       source_lang,
                       target_lang,
                       project_style=None,
                       min_similarity=None,
                       max_candidates=None):
        """Return TM suggestions for unit_source."""
        slang = lang_to_table(source_lang)
        if slang not in self.source_langs:
            abort(404)

        tlang = lang_to_table(target_lang)
        if slang == tlang:
            # We really don't want to serve en->en requests.
            abort(404)

        if isinstance(unit_source, bytes):
            unit_source = unicode(unit_source, "utf-8")

        checker = project_checker(project_style, source_lang)

        max_length = current_app.config.get('MAX_LENGTH', 2000)
        min_similarity = max(
            min_similarity or current_app.config.get('MIN_SIMILARITY', 70), 70)
        max_candidates = min(
            max_candidates or current_app.config.get('MAX_CANDIDATES', 5), 30)

        source_len = len(unit_source)
        minlen = min_levenshtein_length(source_len, min_similarity)
        maxlen = max_levenshtein_length(source_len, min_similarity, max_length)

        minrank = max(min_similarity / 2, 30)

        cursor = self.get_cursor(slang)
        try:
            self._translate_query(cursor, slang, tlang,
                                  indexing_version(unit_source, checker),
                                  minlen, maxlen, minrank)
        except postgres.psycopg2.ProgrammingError:
            # Avoid problems parsing strings like '<a "\b">'. The error is
            # thrown only when every character of that example string is
            # present; drop any one of them and no error occurs, while
            # interleaving any number of other letters between them still
            # triggers it.
            cursor.connection.rollback()
            return []

        results = []
        similarity = self.comparer.similarity
        for row in cursor:
            quality = similarity(unit_source, row['source'], min_similarity)
            if quality >= min_similarity:
                result = dict(row)
                result['quality'] = quality
                results.append(result)
        results.sort(key=lambda match: match['quality'], reverse=True)
        results = results[:max_candidates]
        return results
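
For context, a minimal usage sketch of translate_unit (not from the original project): the instance name tmdb, the language pair and the overrides are assumptions, and the call is presumed to run inside a Flask request context so that current_app and abort() are bound.

# Hypothetical calling code for the method above.
matches = tmdb.translate_unit(
    u"Open the file",    # unit_source
    "en",                # source_lang
    "de",                # target_lang
    min_similarity=75,   # tighten the configured MIN_SIMILARITY default
    max_candidates=3,    # cap the number of suggestions returned
)
for match in matches:
    # Each result carries the row's 'source', 'target' and 'rank'
    # columns plus the Levenshtein-based 'quality' computed above.
    print("%(quality)s%% -> %(target)s" % match)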
Example #2
    def translate_unit(self, unit_source, source_lang, target_lang, project_style=None,
                       min_similarity=None, max_candidates=None):
        """return TM suggestions for unit_source"""
        slang = lang_to_table(source_lang)
        if slang not in self.source_langs:
            abort(404)

        tlang = lang_to_table(target_lang)
        lang_config = lang_to_config(slang)

        if slang == tlang:
            # We really don't want to serve en->en requests
            abort(404)

        if isinstance(unit_source, str):
            unit_source = unicode(unit_source, "utf-8")

        checker = project_checker(project_style, source_lang)

        max_length = current_app.config.get('MAX_LENGTH', 1000)
        min_similarity = max(min_similarity or current_app.config.get('MIN_SIMILARITY', 70), 30)
        max_candidates = max_candidates or current_app.config.get('MAX_CANDIDATES', 5)

        source_len = len(unit_source)
        minlen = min_levenshtein_length(source_len, min_similarity)
        maxlen = max_levenshtein_length(source_len, min_similarity, max_length)

        minrank = max(min_similarity / 2, 30)

        cursor = self.get_cursor()
        query = """
SELECT * from (SELECT s.text AS source, t.text AS target, TS_RANK(s.vector, query, 32) * 1744.93406073519 AS rank
    FROM sources_%s s JOIN targets_%s t ON s.sid = t.sid,
    TO_TSQUERY(%%(lang_config)s, prepare_ortsquery(%%(search_str)s)) query
    WHERE t.lang = %%(tlang)s AND s.length BETWEEN %%(minlen)s AND %%(maxlen)s
    AND s.vector @@ query) sub WHERE rank > %%(minrank)s
    ORDER BY rank DESC
""" % (slang, slang)
        cursor.execute(query, {'search_str': indexing_version(unit_source, checker),
                               'tlang': tlang, 'lang_config': lang_config,
                               'minrank': minrank, 'minlen': minlen, 'maxlen': maxlen})
        results = []
        similarity = self.comparer.similarity
        for row in cursor:
            quality = similarity(unit_source, row['source'], min_similarity)
            if quality >= min_similarity:
                result = dict(row)
                result['quality'] = quality
                results.append(result)
        results.sort(key=lambda match: match['quality'], reverse=True)
        results = results[:max_candidates]
        return results
Example #3
    def get_all_sids(self, units, source_lang, project_style):
        """Ensure that all source strings are in the database+cache."""
        all_sources = set(u['source'] for u in units)

        d = current_app.cache.get_dict(*(
            build_cache_key(k, source_lang) for k in all_sources
        ))
        # Filter out None results (keys not found).
        already_cached = set(filter(lambda x: d[x] is not None, d))
        # Unmangle the key to get a source string.
        # TODO: update for memcached
        already_cached = set(split_cache_key(k) for k in already_cached)

        uncached = tuple(all_sources - already_cached)
        if not uncached:
            # Everything is already cached.
            return

        checker = project_checker(project_style, source_lang)

        cursor = self.get_cursor()
        select_query = """SELECT text, sid FROM sources_%s WHERE
        text IN %%(list)s""" % source_lang

        to_store = set()
        already_stored = {}
        for i in range(1, 4):
            # During parallel import, another process could have INSERTed a
            # record just after we SELECTed and just before we INSERTed,
            # causing a duplicate key. So let's expect that and retry a few
            # times before we give up:
            try:
                cursor.execute(select_query, {"list": uncached})
                already_stored = dict(cursor.fetchall())

                to_store = all_sources - already_cached - set(already_stored)
                if not to_store:
                    # Note that we could technically leak the savepoint
                    # "before_sids" (below) if this is not the first iteration
                    # of the loop. It shouldn't matter, and will be destroyed
                    # when we commit anyway.
                    break

                # Some source strings still need to be stored.
                insert_query = """INSERT INTO sources_%s (text, vector, length)
                VALUES(
                    %%(source)s,
                    TO_TSVECTOR(%%(lang_config)s, %%(indexed_source)s),
                    %%(length)s
                ) RETURNING sid""" % source_lang

                lang_config = lang_to_config(source_lang)
                params = [{
                    "lang_config": lang_config,
                    "source": s,
                    "indexed_source": indexing_version(s, checker),
                    "length": len(s),
                } for s in to_store]
                # We sort to avoid deadlocks during parallel import.
                params.sort(key=lambda x: x['source'])

                cursor.execute("SAVEPOINT before_sids")
                cursor.executemany(insert_query, params)
                cursor.execute("RELEASE SAVEPOINT before_sids")
            except postgres.psycopg2.IntegrityError:
                cursor.execute("ROLLBACK TO SAVEPOINT before_sids")
            else:
                # No exception means we can break the retry loop.
                break
        else:
            raise Exception("Failed 3 times to import sources")

        if to_store:
            # get the inserted rows back so that we have their IDs
            cursor.execute(select_query, {"list": tuple(to_store)})
            newly_stored = dict(cursor.fetchall())
            already_stored.update(newly_stored)

        current_app.cache.set_many(
            (build_cache_key(k, source_lang), v)
            for (k, v) in already_stored.iteritems()
        )
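
And for get_all_sids, a hedged sketch of the calling side (assumed, not from the project): the unit dicts and the tmdb instance are illustrative, and only the 'source' key of each unit is actually read by the method.

# Hypothetical calling code: seed the sources table and cache before
# the corresponding target strings are inserted.
units = [
    {"source": u"Open the file", "target": u"Datei öffnen"},
    {"source": u"Save changes", "target": u"Änderungen speichern"},
]
tmdb.get_all_sids(units, source_lang="en", project_style=None)
# Afterwards each source string resolves to its sid via the cache,
# keyed with build_cache_key(source, "en").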
Example #4
    def translate_unit(self,
                       unit_source,
                       source_lang,
                       target_lang,
                       project_style=None,
                       min_similarity=None,
                       max_candidates=None):
        """Return TM suggestions for unit_source."""
        slang = lang_to_table(source_lang)
        if slang not in self.source_langs:
            abort(404)

        tlang = lang_to_table(target_lang)
        lang_config = lang_to_config(slang)

        if slang == tlang:
            # We really don't want to serve en->en requests.
            abort(404)

        if isinstance(unit_source, str):
            unit_source = unicode(unit_source, "utf-8")

        checker = project_checker(project_style, source_lang)

        max_length = current_app.config.get('MAX_LENGTH', 1000)
        min_similarity = max(
            min_similarity or current_app.config.get('MIN_SIMILARITY', 70), 30)
        max_candidates = max_candidates or current_app.config.get(
            'MAX_CANDIDATES', 5)

        source_len = len(unit_source)
        minlen = min_levenshtein_length(source_len, min_similarity)
        maxlen = max_levenshtein_length(source_len, min_similarity, max_length)

        minrank = max(min_similarity / 2, 30)

        cursor = self.get_cursor()
        query = """
SELECT * from (SELECT s.text AS source, t.text AS target, TS_RANK(s.vector, query, 32) * 1744.93406073519 AS rank
    FROM sources_%s s JOIN targets_%s t ON s.sid = t.sid,
    TO_TSQUERY(%%(lang_config)s, prepare_ortsquery(%%(search_str)s)) query
    WHERE t.lang = %%(tlang)s AND s.length BETWEEN %%(minlen)s AND %%(maxlen)s
    AND s.vector @@ query) sub WHERE rank > %%(minrank)s
    ORDER BY rank DESC
""" % (slang, slang)
        cursor.execute(
            query, {
                'search_str': indexing_version(unit_source, checker),
                'tlang': tlang,
                'lang_config': lang_config,
                'minrank': minrank,
                'minlen': minlen,
                'maxlen': maxlen,
            })
        results = []
        similarity = self.comparer.similarity
        for row in cursor:
            quality = similarity(unit_source, row['source'], min_similarity)
            if quality >= min_similarity:
                result = dict(row)
                result['quality'] = quality
                results.append(result)
        results.sort(key=lambda match: match['quality'], reverse=True)
        results = results[:max_candidates]
        print('amagama/tmdb.py - returning these results: %s' % str(results))
        return results
Example #5
    def get_all_sids(self, units, source_lang, project_style):
        """Ensure that all source strings are in the database+cache."""
        all_sources = set(u['source'] for u in units)

        d = current_app.cache.get_dict(*(build_cache_key(k, source_lang)
                                         for k in all_sources))
        # Filter out None results (keys not found).
        already_cached = set(filter(lambda x: d[x] is not None, d))
        # Unmangle the key to get a source string.
        # TODO: update for memcached
        already_cached = set(split_cache_key(k) for k in already_cached)

        uncached = tuple(all_sources - already_cached)
        if not uncached:
            # Everything is already cached.
            return

        checker = project_checker(project_style, source_lang)

        cursor = self.get_cursor()
        select_query = """SELECT text, sid FROM sources_%s WHERE
        text IN %%(list)s""" % source_lang

        to_store = set()
        already_stored = {}
        for i in range(1, 4):
            # During parallel import, another process could have INSERTed a
            # record just after we SELECTed and just before we INSERTed,
            # causing a duplicate key. So let's expect that and retry a few
            # times before we give up:
            try:
                cursor.execute(select_query, {"list": uncached})
                already_stored = dict(cursor.fetchall())

                to_store = all_sources - already_cached - set(already_stored)
                if not to_store:
                    # Note that we could technically leak the savepoint
                    # "before_sids" (below) if this is not the first iteration
                    # of the loop. It shouldn't matter, and will be destroyed
                    # when we commit anyway.
                    break

                # Some source strings still need to be stored.
                insert_query = """INSERT INTO sources_%s (text, vector, length)
                VALUES(
                    %%(source)s,
                    TO_TSVECTOR(%%(lang_config)s, %%(indexed_source)s),
                    %%(length)s
                ) RETURNING sid""" % source_lang

                lang_config = lang_to_config(source_lang)
                params = [{
                    "lang_config": lang_config,
                    "source": s,
                    "indexed_source": indexing_version(s, checker),
                    "length": len(s),
                } for s in to_store]
                # We sort to avoid deadlocks during parallel import.
                params.sort(key=lambda x: x['source'])

                cursor.execute("SAVEPOINT before_sids")
                cursor.executemany(insert_query, params)
                cursor.execute("RELEASE SAVEPOINT before_sids")
            except postgres.psycopg2.IntegrityError:
                cursor.execute("ROLLBACK TO SAVEPOINT before_sids")
            else:
                # No exception means we can break the retry loop.
                break
        else:
            raise Exception("Failed 3 times to import sources")

        if to_store:
            # get the inserted rows back so that we have their IDs
            cursor.execute(select_query, {"list": tuple(to_store)})
            newly_stored = dict(cursor.fetchall())
            already_stored.update(newly_stored)

        current_app.cache.set_many((build_cache_key(k, source_lang), v)
                                   for (k, v) in already_stored.iteritems())
Example #6
    def translate_unit(self,
                       unit_source,
                       source_lang,
                       target_lang,
                       project_style=None,
                       min_similarity=None,
                       max_candidates=None):
        """Return TM suggestions for unit_source."""
        slang = lang_to_table(source_lang)
        if slang not in self.source_langs:
            abort(404)

        tlang = lang_to_table(target_lang)
        if slang == tlang:
            # We really don't want to serve en->en requests.
            abort(404)

        lang_config = lang_to_config(slang)

        if isinstance(unit_source, str):
            unit_source = unicode(unit_source, "utf-8")

        checker = project_checker(project_style, source_lang)

        max_length = current_app.config.get('MAX_LENGTH', 1000)
        min_similarity = max(
            min_similarity or current_app.config.get('MIN_SIMILARITY', 70), 30)
        max_candidates = max_candidates or current_app.config.get(
            'MAX_CANDIDATES', 5)

        source_len = len(unit_source)
        minlen = min_levenshtein_length(source_len, min_similarity)
        maxlen = max_levenshtein_length(source_len, min_similarity, max_length)

        minrank = max(min_similarity / 2, 30)

        cursor = self.get_cursor()
        query = """
SELECT * from (SELECT s.text AS source, t.text AS target, TS_RANK(s.vector, query, 32) * 1744.93406073519 AS rank
    FROM sources_%s s JOIN targets_%s t ON s.sid = t.sid,
    TO_TSQUERY(%%(lang_config)s, prepare_ortsquery(%%(search_str)s)) query
    WHERE t.lang = %%(tlang)s AND s.length BETWEEN %%(minlen)s AND %%(maxlen)s
    AND s.vector @@ query) sub WHERE rank > %%(minrank)s
    ORDER BY rank DESC
""" % (slang, slang)
        try:
            cursor.execute(
                query, {
                    'search_str': indexing_version(unit_source, checker),
                    'tlang': tlang,
                    'lang_config': lang_config,
                    'minrank': minrank,
                    'minlen': minlen,
                    'maxlen': maxlen,
                })
        except postgres.psycopg2.ProgrammingError:
            # Avoid problems parsing strings like '<a "\b">'. The error is
            # thrown only when every character of that example string is
            # present; drop any one of them and no error occurs, while
            # interleaving any number of other letters between them still
            # triggers it.
            self.connection.rollback()
            self.pool.putconn()
            return []

        results = []
        similarity = self.comparer.similarity
        for row in cursor:
            quality = similarity(unit_source, row['source'], min_similarity)
            if quality >= min_similarity:
                result = dict(row)
                result['quality'] = quality
                results.append(result)
        results.sort(key=lambda match: match['quality'], reverse=True)
        results = results[:max_candidates]
        return results
Example #7
    def translate_unit(self, unit_source, source_lang, target_lang,
                       project_style=None, min_similarity=None,
                       max_candidates=None):
        """Return TM suggestions for unit_source."""
        slang = lang_to_table(source_lang)
        if slang not in self.source_langs:
            abort(404)

        tlang = lang_to_table(target_lang)
        if slang == tlang:
            # We really don't want to serve en->en requests.
            abort(404)

        lang_config = lang_to_config(slang)

        if isinstance(unit_source, str):
            unit_source = unicode(unit_source, "utf-8")

        checker = project_checker(project_style, source_lang)

        max_length = current_app.config.get('MAX_LENGTH', 1000)
        min_similarity = max(min_similarity or current_app.config.get('MIN_SIMILARITY', 70), 30)
        max_candidates = max_candidates or current_app.config.get('MAX_CANDIDATES', 5)

        source_len = len(unit_source)
        minlen = min_levenshtein_length(source_len, min_similarity)
        maxlen = max_levenshtein_length(source_len, min_similarity, max_length)

        minrank = max(min_similarity / 2, 30)

        cursor = self.get_cursor()
        query = """
SELECT * from (SELECT s.text AS source, t.text AS target, TS_RANK(s.vector, query, 32) * 1744.93406073519 AS rank
    FROM sources_%s s JOIN targets_%s t ON s.sid = t.sid,
    TO_TSQUERY(%%(lang_config)s, prepare_ortsquery(%%(search_str)s)) query
    WHERE t.lang = %%(tlang)s AND s.length BETWEEN %%(minlen)s AND %%(maxlen)s
    AND s.vector @@ query) sub WHERE rank > %%(minrank)s
    ORDER BY rank DESC
""" % (slang, slang)
        try:
            cursor.execute(query, {
                'search_str': indexing_version(unit_source, checker),
                'tlang': tlang,
                'lang_config': lang_config,
                'minrank': minrank,
                'minlen': minlen,
                'maxlen': maxlen,
            })
        except postgres.psycopg2.ProgrammingError:
            # Avoid problems parsing strings like '<a "\b">'. The error is
            # thrown only when every character of that example string is
            # present; drop any one of them and no error occurs, while
            # interleaving any number of other letters between them still
            # triggers it.
            self.connection.rollback()
            self.pool.putconn()
            return []

        results = []
        similarity = self.comparer.similarity
        for row in cursor:
            quality = similarity(unit_source, row['source'], min_similarity)
            if quality >= min_similarity:
                result = dict(row)
                result['quality'] = quality
                results.append(result)
        results.sort(key=lambda match: match['quality'], reverse=True)
        results = results[:max_candidates]
        return results