def __init__(self, db_file, max_candidates=3, min_similarity=75, max_length=1000):
    """Open (or create) the TM database at *db_file*.

    :param db_file: path to the sqlite database file
    :param max_candidates: maximum number of matches returned per lookup
    :param min_similarity: minimum Levenshtein similarity (percent)
    :param max_length: longest source string considered for comparison
    """
    self.max_candidates = max_candidates
    self.min_similarity = min_similarity
    self.max_length = max_length
    # Normalise the path to unicode; the original encoding is unknown.
    if not isinstance(db_file, unicode):
        db_file = unicode(db_file)
    self.db_file = db_file
    # Instances opening the same file share one per-thread connection map.
    self._tm_db = self._tm_dbs.setdefault(db_file, {})
    # FIXME: do we want to do any checks before we initialize the DB?
    self.init_database()
    self.fulltext = False
    self.init_fulltext()
    self.comparer = LevenshteinComparer(self.max_length)
    self.preload_db()
def _check_xliff_alttrans(self, unit):
    """Build TM matches from a unit's XLIFF <alt-trans> entries.

    Returns a list of dicts with 'source', 'target', 'quality' and
    'tmsource' keys, or an empty list when the unit has no alternatives.
    """
    if not hasattr(unit, 'getalttrans'):
        return []
    alttrans = unit.getalttrans()
    if not alttrans:
        return []
    from translate.search.lshtein import LevenshteinComparer
    lcomparer = LevenshteinComparer(max_len=1000)
    results = []
    for alt in alttrans:
        tmsource = _('This file')
        xmlelement = getattr(alt, 'xmlelement', None)
        # BUG FIX: lxml elements without children evaluate as False, so a
        # plain truth test skipped the origin lookup for childless
        # elements; test identity against None instead (matches the
        # sibling _check_alttrans implementations).
        if xmlelement is not None:
            origin = alt.xmlelement.get('origin', '')
            if origin:
                tmsource += "\n" + origin
        results.append({
            'source': alt.source,
            'target': alt.target,
            # stoppercentage 0: always compute the full similarity score.
            'quality': lcomparer.similarity(unit.source, alt.source, 0),
            'tmsource': tmsource,
        })
    return results
def _check_alttrans(self, unit):
    """Build TM matches from a unit's <alt-trans> entries.

    Alternatives scoring more than 10 points below the controller's
    minimum quality are dropped; each kept result notes its origin
    in 'tmsource'.
    """
    if not hasattr(unit, 'getalttrans'):
        return []
    alttrans = unit.getalttrans()
    if not alttrans:
        return []
    from translate.search.lshtein import LevenshteinComparer
    lcomparer = LevenshteinComparer(max_len=1000)
    results = []
    for alt in alttrans:
        # Let the comparer short-circuit slightly below the final cutoff.
        quality = lcomparer.similarity(unit.source, alt.source,
                                       self.controller.min_quality - 15)
        # let's check if it is useful, but be more lenient
        if quality < self.controller.min_quality - 10:
            continue
        tmsource = _('This file')
        xmlelement = getattr(alt, 'xmlelement', None)
        if xmlelement is not None:
            origin = alt.xmlelement.get('origin', '')
            if origin:
                if origin == "lmc":
                    # Experimental code to test lmc research. Everything
                    # in a try block, just in case.
                    try:
                        from lxml import etree
                        import os.path
                        extras = xmlelement.xpath('processing-instruction()')
                        meta = dict((pi.target, pi.text) for pi in extras)
                        tmsource = [meta.get("contact-name", ""),
                                    meta.get("category", ""),
                                    os.path.splitext(meta.get("original", ""))[0]]
                        tmsource = u"\n".join(filter(None, tmsource))
                    # BUG FIX: use py2.6+/py3-compatible `as` syntax
                    # instead of the py2-only `except Exception, e`.
                    except Exception as e:
                        import logging
                        logging.info(e)
                tmsource += "\n" + origin
        results.append({
            'source': alt.source,
            'target': alt.target,
            'quality': quality,
            'tmsource': tmsource,
        })
    # BUG FIX: the original built `results` but fell off the end,
    # implicitly returning None to callers expecting a list.
    return results
def _check_alttrans(self, unit):
    """Return TM match dicts built from the unit's <alt-trans> entries.

    Keeps only alternatives within 10 points of the controller's
    minimum quality; 'tmsource' records where each match came from.
    """
    if not hasattr(unit, 'getalttrans'):
        return []
    alttrans = unit.getalttrans()
    if not alttrans:
        return []
    from translate.search.lshtein import LevenshteinComparer
    lcomparer = LevenshteinComparer(max_len=1000)
    results = []
    for alt in alttrans:
        # Comparison may stop early once it is clearly below the cutoff.
        quality = lcomparer.similarity(unit.source, alt.source,
                                       self.controller.min_quality - 15)
        # let's check if it is useful, but be more lenient
        if quality < self.controller.min_quality - 10:
            continue
        tmsource = _('This file')
        xmlelement = getattr(alt, 'xmlelement', None)
        if xmlelement is not None:
            origin = alt.xmlelement.get('origin', '')
            if origin:
                if origin == "lmc":
                    # Experimental code to test lmc research. Everything
                    # in a try block, just in case.
                    try:
                        from lxml import etree
                        import os.path
                        extras = xmlelement.xpath('processing-instruction()')
                        meta = dict((pi.target, pi.text) for pi in extras)
                        tmsource = [meta.get("contact-name", ""),
                                    meta.get("category", ""),
                                    os.path.splitext(meta.get("original", ""))[0]]
                        tmsource = u"\n".join(filter(None, tmsource))
                    # BUG FIX: py3-compatible exception syntax
                    # (`except Exception, e` is py2-only).
                    except Exception as e:
                        import logging
                        logging.info(e)
                tmsource += "\n" + origin
        results.append({
            'source': alt.source,
            'target': alt.target,
            'quality': quality,
            'tmsource': tmsource,
        })
    # BUG FIX: original never returned the accumulated results.
    return results
def __init__(self, max_candidates=3, min_similarity=75, max_length=1000):
    """Set up the open-tran.eu suggestion client and kick off detection
    of the languages the remote service supports."""
    gobject.GObject.__init__(self)
    HTTPClient.__init__(self)
    self.max_candidates = max_candidates
    self.min_similarity = min_similarity
    self.comparer = LevenshteinComparer(max_length)
    # Used by the open-tran terminology backend.
    self.last_suggestions = []
    self._languages = set()
    self.source_lang = None
    self.target_lang = None
    # Ask the service which languages it supports; the answer arrives
    # asynchronously via the 'http-success' signal.
    self.url_getlanguages = 'http://open-tran.eu/json/supported'
    self.url_translate = 'http://%s.%s.open-tran.eu/json/suggest'
    lang_request = RESTRequest(self.url_getlanguages, id='')
    self.add(lang_request)
    lang_request.connect(
        'http-success',
        lambda req, response: self.got_languages(response))
def __init__(self, db_file, max_candidates=3, min_similarity=75, max_length=1000):
    """Open (or create) the TM database at *db_file*.

    :param db_file: path to the sqlite database file
    :param max_candidates: maximum number of matches returned per lookup
    :param min_similarity: minimum Levenshtein similarity (percent)
    :param max_length: longest source string considered for comparison
    """
    self.max_candidates = max_candidates
    self.min_similarity = min_similarity
    self.max_length = max_length
    # Coerce the path to text; the original encoding is unknown.
    if not isinstance(db_file, six.text_type):
        db_file = six.text_type(db_file)
    self.db_file = db_file
    # Instances opening the same file share one per-thread connection map.
    self._tm_db = self._tm_dbs.setdefault(db_file, {})
    # FIXME: do we want to do any checks before we initialize the DB?
    self.init_database()
    self.fulltext = False
    self.init_fulltext()
    self.comparer = LevenshteinComparer(self.max_length)
    self.preload_db()
def get_tm_results(request, unit):
    """Gets a list of TM results for the current object.

    :return: JSON string with a list of TM results.
    """
    max_len = settings.LV_MAX_LENGTH
    min_similarity = settings.LV_MIN_SIMILARITY

    results = []

    # Shortcut Levenshtein comparer, since the distance, by definition, can't
    # be less than the difference in string length
    diff_len = unit.source_length * (100 - min_similarity) / 100
    max_unit_len = unit.source_length + diff_len
    min_unit_len = unit.source_length - diff_len

    criteria = {
        'target_lang': unit.store.translation_project.language,
        'source_lang': unit.store.translation_project.project.source_language,
        'source_length__range': (min_unit_len, max_unit_len),
    }
    tmunits = TMUnit.objects.filter(**criteria).exclude(unit=unit)

    comparer = LevenshteinComparer(max_len)
    for tmunit in tmunits:
        quality = comparer.similarity(tmunit.source, unit.source,
                                      min_similarity)
        if quality >= min_similarity:
            project = tmunit.project
            profile = tmunit.submitted_by
            result = {
                'source': tmunit.source,
                'target': tmunit.target,
                'quality': quality,
                'project': {
                    'project': project.code,
                    'projectname': project.fullname,
                    'absolute_url': project.get_absolute_url(),
                    'icon': _get_project_icon(project),
                }
            }

            if profile is not None:
                submissions = Submission.objects.filter(
                    submitter=profile,
                    type=SubmissionTypes.NORMAL,
                ).distinct().count()
                suggestions = SuggestionStat.objects.filter(
                    suggester=profile,
                ).distinct().count()
                translations = submissions - suggestions  # XXX: is this correct?
                # BUG FIX: interpolate *after* translating. The original
                # passed an already-%-formatted string to _(), so the
                # gettext catalogue was queried with a substituted string
                # that can never match a translation entry.
                title = _("By %s on %s<br/><br/>%s translations<br/>"
                          "%s suggestions") % (
                    profile.user.get_full_name(), tmunit.submitted_on,
                    translations, suggestions)
                result['translator'] = {
                    'username': unicode(profile.user),
                    'title': title,
                    'absolute_url': profile.get_absolute_url(),
                    'gravatar': profile.gravatar_url(24),
                }
            results.append(result)

    return HttpResponse(jsonify(results), mimetype="application/json")
class TMDB(object):
    """Sqlite-backed translation memory database (Python 2 variant).

    Stores source/target string pairs per language pair and serves fuzzy
    matches ranked by Levenshtein similarity, optionally narrowed by an
    fts3 fulltext index when sqlite supports it.
    """

    # Maps db_file -> {thread -> (connection, cursor)}; shared by all
    # instances that open the same database file.
    _tm_dbs = {}

    def __init__(self, db_file, max_candidates=3, min_similarity=75, max_length=1000):
        # max_candidates: maximum matches returned by translate_unit().
        # min_similarity: minimum Levenshtein similarity (percent) to accept.
        # max_length: longest source string considered for comparison.
        self.max_candidates = max_candidates
        self.min_similarity = min_similarity
        self.max_length = max_length
        if not isinstance(db_file, unicode):
            db_file = unicode(db_file)  # don't know which encoding
        self.db_file = db_file
        # share connections to same database file between different instances
        if db_file not in self._tm_dbs:
            self._tm_dbs[db_file] = {}
        self._tm_db = self._tm_dbs[db_file]
        # FIXME: do we want to do any checks before we initialize the DB?
        self.init_database()
        self.fulltext = False
        self.init_fulltext()
        self.comparer = LevenshteinComparer(self.max_length)
        self.preload_db()

    def _get_connection(self, index):
        """Return this thread's connection (index 0) or cursor (index 1),
        creating the pair lazily on first use in the thread."""
        current_thread = threading.currentThread()
        if current_thread not in self._tm_db:
            connection = dbapi2.connect(self.db_file.encode('utf-8'))
            cursor = connection.cursor()
            self._tm_db[current_thread] = (connection, cursor)
        return self._tm_db[current_thread][index]

    # Thread-local accessors built on _get_connection.
    connection = property(lambda self: self._get_connection(0))
    cursor = property(lambda self: self._get_connection(1))

    def init_database(self):
        """creates database tables and indices"""
        script = """
CREATE TABLE IF NOT EXISTS sources (
    sid INTEGER PRIMARY KEY AUTOINCREMENT,
    text VARCHAR NOT NULL,
    context VARCHAR DEFAULT NULL,
    lang VARCHAR NOT NULL,
    length INTEGER NOT NULL
);
CREATE INDEX IF NOT EXISTS sources_context_idx ON sources (context);
CREATE INDEX IF NOT EXISTS sources_lang_idx ON sources (lang);
CREATE INDEX IF NOT EXISTS sources_length_idx ON sources (length);
CREATE UNIQUE INDEX IF NOT EXISTS sources_uniq_idx ON sources (text, context, lang);

CREATE TABLE IF NOT EXISTS targets (
    tid INTEGER PRIMARY KEY AUTOINCREMENT,
    sid INTEGER NOT NULL,
    text VARCHAR NOT NULL,
    lang VARCHAR NOT NULL,
    time INTEGER DEFAULT NULL,
    FOREIGN KEY (sid) references sources(sid)
);
CREATE INDEX IF NOT EXISTS targets_sid_idx ON targets (sid);
CREATE INDEX IF NOT EXISTS targets_lang_idx ON targets (lang);
CREATE INDEX IF NOT EXISTS targets_time_idx ON targets (time);
CREATE UNIQUE INDEX IF NOT EXISTS targets_uniq_idx ON targets (sid, text, lang);
"""
        try:
            self.cursor.executescript(script)
            self.connection.commit()
        except:
            # NOTE(review): bare except also traps KeyboardInterrupt and
            # SystemExit; harmless here since the error is re-raised after
            # the rollback, but `except Exception:` would be cleaner.
            self.connection.rollback()
            raise

    def init_fulltext(self):
        """detects if fts3 fulltext indexing module exists, initializes fulltext table if it does"""
        # HACKISH: no better way to detect fts3 support except trying to
        # construct a dummy table?!
        try:
            script = """
DROP TABLE IF EXISTS test_for_fts3;
CREATE VIRTUAL TABLE test_for_fts3 USING fts3;
DROP TABLE test_for_fts3;
"""
            self.cursor.executescript(script)
            logging.debug("fts3 supported")
            # for some reason CREATE VIRTUAL TABLE doesn't support IF NOT
            # EXISTS syntax check if fulltext index table exists manually
            self.cursor.execute(
                "SELECT name FROM sqlite_master WHERE name = 'fulltext'")
            if not self.cursor.fetchone():
                # create fulltext index table, and index all strings in sources
                script = """
CREATE VIRTUAL TABLE fulltext USING fts3(text);
"""
                logging.debug("fulltext table not exists, creating")
                self.cursor.executescript(script)
                logging.debug("created fulltext table")
            else:
                logging.debug("fulltext table already exists")
            # create triggers that would sync sources table with fulltext index
            script = """
INSERT INTO fulltext (rowid, text) SELECT sid, text FROM sources WHERE sid NOT IN (SELECT rowid FROM fulltext);
CREATE TRIGGER IF NOT EXISTS sources_insert_trig AFTER INSERT ON sources FOR EACH ROW
BEGIN
    INSERT INTO fulltext (docid, text) VALUES (NEW.sid, NEW.text);
END;
CREATE TRIGGER IF NOT EXISTS sources_update_trig AFTER UPDATE OF text ON sources FOR EACH ROW
BEGIN
    UPDATE fulltext SET text = NEW.text WHERE docid = NEW.sid;
END;
CREATE TRIGGER IF NOT EXISTS sources_delete_trig AFTER DELETE ON sources FOR EACH ROW
BEGIN
    DELETE FROM fulltext WHERE docid = OLD.sid;
END;
"""
            self.cursor.executescript(script)
            self.connection.commit()
            logging.debug("created fulltext triggers")
            self.fulltext = True
        except dbapi2.OperationalError as e:
            # fts3 unavailable: fall back to length-bounded matching and drop
            # any stale triggers that reference the missing fulltext table.
            self.fulltext = False
            logging.debug("failed to initialize fts3 support: " + str(e))
            script = """
DROP TRIGGER IF EXISTS sources_insert_trig;
DROP TRIGGER IF EXISTS sources_update_trig;
DROP TRIGGER IF EXISTS sources_delete_trig;
"""
            self.cursor.executescript(script)

    def preload_db(self):
        """ugly hack to force caching of sqlite db file in memory for
        improved performance"""
        if self.fulltext:
            query = """SELECT COUNT(*) FROM sources s JOIN fulltext f ON s.sid = f.docid JOIN targets t on s.sid = t.sid"""
        else:
            query = """SELECT COUNT(*) FROM sources s JOIN targets t on s.sid = t.sid"""
        self.cursor.execute(query)
        (numrows, ) = self.cursor.fetchone()
        logging.debug("tmdb has %d records" % numrows)
        return numrows

    def add_unit(self, unit, source_lang=None, target_lang=None, commit=True):
        """inserts unit in the database"""
        # TODO: is that really the best way to handle unspecified
        # source and target languages? what about conflicts between
        # unit attributes and passed arguments
        if unit.getsourcelanguage():
            source_lang = unit.getsourcelanguage()
        if unit.gettargetlanguage():
            target_lang = unit.gettargetlanguage()
        if not source_lang:
            raise LanguageError("undefined source language")
        if not target_lang:
            raise LanguageError("undefined target language")
        unitdict = {
            "source": unit.source,
            "target": unit.target,
            "context": unit.getcontext(),
        }
        self.add_dict(unitdict, source_lang, target_lang, commit)

    def add_dict(self, unit, source_lang, target_lang, commit=True):
        """inserts units represented as dictionaries in database"""
        source_lang = data.normalize_code(source_lang)
        target_lang = data.normalize_code(target_lang)
        try:
            try:
                self.cursor.execute(
                    "INSERT INTO sources (text, context, lang, length) VALUES(?, ?, ?, ?)",
                    (unit["source"], unit["context"], source_lang,
                     len(unit["source"])))
                sid = self.cursor.lastrowid
            except dbapi2.IntegrityError:
                # source string already exists in db, run query to find sid
                self.cursor.execute(
                    "SELECT sid FROM sources WHERE text=? AND context=? and lang=?",
                    (unit["source"], unit["context"], source_lang))
                sid = self.cursor.fetchone()
                (sid, ) = sid
            try:
                # FIXME: get time info from translation store
                # FIXME: do we need so store target length?
                self.cursor.execute(
                    "INSERT INTO targets (sid, text, lang, time) VALUES (?, ?, ?, ?)",
                    (sid, unit["target"], target_lang, int(time.time())))
            except dbapi2.IntegrityError:
                # target string already exists in db, do nothing
                pass
            if commit:
                self.connection.commit()
        except:
            if commit:
                self.connection.rollback()
            raise

    def add_store(self, store, source_lang, target_lang, commit=True):
        """insert all units in store in database"""
        count = 0
        for unit in store.units:
            if unit.istranslatable() and unit.istranslated():
                self.add_unit(unit, source_lang, target_lang, commit=False)
                count += 1
        if commit:
            self.connection.commit()
        return count

    def add_list(self, units, source_lang, target_lang, commit=True):
        """insert all units in list into the database, units are
        represented as dictionaries"""
        count = 0
        for unit in units:
            self.add_dict(unit, source_lang, target_lang, commit=False)
            count += 1
        if commit:
            self.connection.commit()
        return count

    def translate_unit(self, unit_source, source_langs, target_langs):
        """return TM suggestions for unit_source"""
        if isinstance(unit_source, str):
            unit_source = unicode(unit_source, "utf-8")
        if isinstance(source_langs, list):
            source_langs = [data.normalize_code(lang)
                            for lang in source_langs]
            source_langs = ','.join(source_langs)
        else:
            source_langs = data.normalize_code(source_langs)
        if isinstance(target_langs, list):
            target_langs = [data.normalize_code(lang)
                            for lang in target_langs]
            target_langs = ','.join(target_langs)
        else:
            target_langs = data.normalize_code(target_langs)
        # Length window outside which the similarity cutoff is unreachable.
        minlen = min_levenshtein_length(len(unit_source), self.min_similarity)
        maxlen = max_levenshtein_length(len(unit_source),
                                        self.min_similarity, self.max_length)
        # split source into words, remove punctuation and special
        # chars, keep words that are at least 3 chars long
        unit_words = STRIP_REGEXP.sub(' ', unit_source).split()
        unit_words = filter(lambda word: len(word) > 2, unit_words)
        # NOTE(review): `IN (?)` binds the whole comma-joined language
        # string as ONE value, so a multi-language list is compared as a
        # single literal — confirm whether list inputs are actually used.
        if self.fulltext and len(unit_words) > 3:
            logging.debug("fulltext matching")
            query = """SELECT s.text, t.text, s.context, s.lang, t.lang FROM sources s JOIN targets t ON s.sid = t.sid JOIN fulltext f ON s.sid = f.docid WHERE s.lang IN (?) AND t.lang IN (?) AND s.length BETWEEN ? AND ? AND fulltext MATCH ?"""
            search_str = " OR ".join(unit_words)
            self.cursor.execute(
                query,
                (source_langs, target_langs, minlen, maxlen, search_str))
        else:
            logging.debug("nonfulltext matching")
            query = """SELECT s.text, t.text, s.context, s.lang, t.lang FROM sources s JOIN targets t ON s.sid = t.sid WHERE s.lang IN (?) AND t.lang IN (?) AND s.length >= ? AND s.length <= ?"""
            self.cursor.execute(query,
                                (source_langs, target_langs, minlen, maxlen))
        results = []
        for row in self.cursor:
            quality = self.comparer.similarity(unit_source, row[0],
                                               self.min_similarity)
            if quality >= self.min_similarity:
                results.append({
                    'source': row[0],
                    'target': row[1],
                    'context': row[2],
                    'quality': quality,
                })
        # Best matches first, capped at max_candidates.
        results.sort(key=lambda match: match['quality'], reverse=True)
        results = results[:self.max_candidates]
        logging.debug("results: %s", unicode(results))
        return results
def similarity(self, a, b, stoppercentage=50):
    """Return the base comparer's similarity score truncated to a whole
    percentage."""
    score = LevenshteinComparer.similarity(self, a, b, stoppercentage)
    return int(score)
def __init__(self, max_len=10000):
    # Delegate to the base comparer, supplying a larger default maximum
    # comparison length (10000).
    LevenshteinComparer.__init__(self, max_len)
def comparer(self):
    """Lazily build and cache a LevenshteinComparer sized from the app's
    MAX_LENGTH setting (default 1000)."""
    try:
        return self._comparer
    except AttributeError:
        max_length = current_app.config.get('MAX_LENGTH', 1000)
        self._comparer = LevenshteinComparer(max_length)
        return self._comparer
class TMDB(object):
    """Sqlite-backed translation memory database (six-compatible variant).

    Stores source/target string pairs per language pair and serves fuzzy
    matches ranked by Levenshtein similarity, optionally narrowed by an
    fts3 fulltext index when sqlite supports it.
    """

    # Maps db_file -> {thread -> (connection, cursor)}; shared by all
    # instances that open the same database file.
    _tm_dbs = {}

    def __init__(self, db_file, max_candidates=3, min_similarity=75, max_length=1000):
        # max_candidates: maximum matches returned by translate_unit().
        # min_similarity: minimum Levenshtein similarity (percent) to accept.
        # max_length: longest source string considered for comparison.
        self.max_candidates = max_candidates
        self.min_similarity = min_similarity
        self.max_length = max_length
        if not isinstance(db_file, six.text_type):
            db_file = six.text_type(db_file)  # don't know which encoding
        self.db_file = db_file
        # share connections to same database file between different instances
        if db_file not in self._tm_dbs:
            self._tm_dbs[db_file] = {}
        self._tm_db = self._tm_dbs[db_file]
        # FIXME: do we want to do any checks before we initialize the DB?
        self.init_database()
        self.fulltext = False
        self.init_fulltext()
        self.comparer = LevenshteinComparer(self.max_length)
        self.preload_db()

    def _get_connection(self, index):
        """Return this thread's connection (index 0) or cursor (index 1),
        creating the pair lazily on first use in the thread."""
        current_thread = threading.currentThread()
        if current_thread not in self._tm_db:
            # py2 sqlite wants a byte path; py3 takes text directly.
            connection = dbapi2.connect(
                self.db_file.encode('utf-8') if six.PY2 else self.db_file)
            cursor = connection.cursor()
            self._tm_db[current_thread] = (connection, cursor)
        return self._tm_db[current_thread][index]

    # Thread-local accessors built on _get_connection.
    connection = property(lambda self: self._get_connection(0))
    cursor = property(lambda self: self._get_connection(1))

    def init_database(self):
        """creates database tables and indices"""
        script = """
CREATE TABLE IF NOT EXISTS sources (
    sid INTEGER PRIMARY KEY AUTOINCREMENT,
    text VARCHAR NOT NULL,
    context VARCHAR DEFAULT NULL,
    lang VARCHAR NOT NULL,
    length INTEGER NOT NULL
);
CREATE INDEX IF NOT EXISTS sources_context_idx ON sources (context);
CREATE INDEX IF NOT EXISTS sources_lang_idx ON sources (lang);
CREATE INDEX IF NOT EXISTS sources_length_idx ON sources (length);
CREATE UNIQUE INDEX IF NOT EXISTS sources_uniq_idx ON sources (text, context, lang);

CREATE TABLE IF NOT EXISTS targets (
    tid INTEGER PRIMARY KEY AUTOINCREMENT,
    sid INTEGER NOT NULL,
    text VARCHAR NOT NULL,
    lang VARCHAR NOT NULL,
    time INTEGER DEFAULT NULL,
    FOREIGN KEY (sid) references sources(sid)
);
CREATE INDEX IF NOT EXISTS targets_sid_idx ON targets (sid);
CREATE INDEX IF NOT EXISTS targets_lang_idx ON targets (lang);
CREATE INDEX IF NOT EXISTS targets_time_idx ON targets (time);
CREATE UNIQUE INDEX IF NOT EXISTS targets_uniq_idx ON targets (sid, text, lang);
"""
        try:
            self.cursor.executescript(script)
            self.connection.commit()
        except Exception:
            # Roll back the partial schema change, then re-raise.
            self.connection.rollback()
            raise

    def init_fulltext(self):
        """detects if fts3 fulltext indexing module exists, initializes
        fulltext table if it does"""
        # HACKISH: no better way to detect fts3 support except trying to
        # construct a dummy table?!
        try:
            script = """
DROP TABLE IF EXISTS test_for_fts3;
CREATE VIRTUAL TABLE test_for_fts3 USING fts3;
DROP TABLE test_for_fts3;
"""
            self.cursor.executescript(script)
            logging.debug("fts3 supported")
            # for some reason CREATE VIRTUAL TABLE doesn't support IF NOT
            # EXISTS syntax check if fulltext index table exists manually
            self.cursor.execute("SELECT name FROM sqlite_master WHERE name = 'fulltext'")
            if not self.cursor.fetchone():
                # create fulltext index table, and index all strings in sources
                script = """
CREATE VIRTUAL TABLE fulltext USING fts3(text);
"""
                logging.debug("fulltext table not exists, creating")
                self.cursor.executescript(script)
                logging.debug("created fulltext table")
            else:
                logging.debug("fulltext table already exists")
            # create triggers that would sync sources table with fulltext index
            script = """
INSERT INTO fulltext (rowid, text) SELECT sid, text FROM sources WHERE sid NOT IN (SELECT rowid FROM fulltext);
CREATE TRIGGER IF NOT EXISTS sources_insert_trig AFTER INSERT ON sources FOR EACH ROW
BEGIN
    INSERT INTO fulltext (docid, text) VALUES (NEW.sid, NEW.text);
END;
CREATE TRIGGER IF NOT EXISTS sources_update_trig AFTER UPDATE OF text ON sources FOR EACH ROW
BEGIN
    UPDATE fulltext SET text = NEW.text WHERE docid = NEW.sid;
END;
CREATE TRIGGER IF NOT EXISTS sources_delete_trig AFTER DELETE ON sources FOR EACH ROW
BEGIN
    DELETE FROM fulltext WHERE docid = OLD.sid;
END;
"""
            self.cursor.executescript(script)
            self.connection.commit()
            logging.debug("created fulltext triggers")
            self.fulltext = True
        except dbapi2.OperationalError as e:
            # fts3 unavailable: fall back to length-bounded matching and drop
            # any stale triggers that reference the missing fulltext table.
            self.fulltext = False
            logging.debug("failed to initialize fts3 support: " + str(e))
            script = """
DROP TRIGGER IF EXISTS sources_insert_trig;
DROP TRIGGER IF EXISTS sources_update_trig;
DROP TRIGGER IF EXISTS sources_delete_trig;
"""
            self.cursor.executescript(script)

    def preload_db(self):
        """ugly hack to force caching of sqlite db file in memory for
        improved performance
        """
        if self.fulltext:
            query = """SELECT COUNT(*) FROM sources s JOIN fulltext f ON s.sid = f.docid JOIN targets t on s.sid = t.sid"""
        else:
            query = """SELECT COUNT(*) FROM sources s JOIN targets t on s.sid = t.sid"""
        self.cursor.execute(query)
        (numrows,) = self.cursor.fetchone()
        logging.debug("tmdb has %d records" % numrows)
        return numrows

    def add_unit(self, unit, source_lang=None, target_lang=None, commit=True):
        """inserts unit in the database"""
        # TODO: is that really the best way to handle unspecified
        # source and target languages? what about conflicts between
        # unit attributes and passed arguments
        if unit.getsourcelanguage():
            source_lang = unit.getsourcelanguage()
        if unit.gettargetlanguage():
            target_lang = unit.gettargetlanguage()
        if not source_lang:
            raise LanguageError("undefined source language")
        if not target_lang:
            raise LanguageError("undefined target language")
        unitdict = {
            "source": unit.source,
            "target": unit.target,
            "context": unit.getcontext(),
        }
        self.add_dict(unitdict, source_lang, target_lang, commit)

    def add_dict(self, unit, source_lang, target_lang, commit=True):
        """inserts units represented as dictionaries in database"""
        source_lang = data.normalize_code(source_lang)
        target_lang = data.normalize_code(target_lang)
        try:
            try:
                self.cursor.execute("INSERT INTO sources (text, context, lang, length) VALUES(?, ?, ?, ?)",
                                    (unit["source"], unit["context"],
                                     source_lang, len(unit["source"])))
                sid = self.cursor.lastrowid
            except dbapi2.IntegrityError:
                # source string already exists in db, run query to find sid
                self.cursor.execute("SELECT sid FROM sources WHERE text=? AND context=? and lang=?",
                                    (unit["source"], unit["context"],
                                     source_lang))
                sid = self.cursor.fetchone()
                (sid,) = sid
            try:
                # FIXME: get time info from translation store
                # FIXME: do we need so store target length?
                self.cursor.execute("INSERT INTO targets (sid, text, lang, time) VALUES (?, ?, ?, ?)",
                                    (sid, unit["target"], target_lang,
                                     int(time.time())))
            except dbapi2.IntegrityError:
                # target string already exists in db, do nothing
                pass
            if commit:
                self.connection.commit()
        except Exception:
            if commit:
                self.connection.rollback()
            raise

    def add_store(self, store, source_lang, target_lang, commit=True):
        """insert all units in store in database"""
        count = 0
        for unit in store.units:
            if unit.istranslatable() and unit.istranslated():
                self.add_unit(unit, source_lang, target_lang, commit=False)
                count += 1
        if commit:
            self.connection.commit()
        return count

    def add_list(self, units, source_lang, target_lang, commit=True):
        """insert all units in list into the database, units are
        represented as dictionaries
        """
        count = 0
        for unit in units:
            self.add_dict(unit, source_lang, target_lang, commit=False)
            count += 1
        if commit:
            self.connection.commit()
        return count

    def translate_unit(self, unit_source, source_langs, target_langs):
        """return TM suggestions for unit_source"""
        if isinstance(unit_source, bytes):
            unit_source = unit_source.decode("utf-8")
        if isinstance(source_langs, list):
            source_langs = [data.normalize_code(lang)
                            for lang in source_langs]
            source_langs = ','.join(source_langs)
        else:
            source_langs = data.normalize_code(source_langs)
        if isinstance(target_langs, list):
            target_langs = [data.normalize_code(lang)
                            for lang in target_langs]
            target_langs = ','.join(target_langs)
        else:
            target_langs = data.normalize_code(target_langs)
        # Length window outside which the similarity cutoff is unreachable.
        minlen = min_levenshtein_length(len(unit_source), self.min_similarity)
        maxlen = max_levenshtein_length(len(unit_source),
                                        self.min_similarity, self.max_length)
        # split source into words, remove punctuation and special
        # chars, keep words that are at least 3 chars long
        unit_words = STRIP_REGEXP.sub(' ', unit_source).split()
        unit_words = list(filter(lambda word: len(word) > 2, unit_words))
        # NOTE(review): `IN (?)` binds the whole comma-joined language
        # string as ONE value, so a multi-language list is compared as a
        # single literal — confirm whether list inputs are actually used.
        if self.fulltext and len(unit_words) > 3:
            logging.debug("fulltext matching")
            query = """SELECT s.text, t.text, s.context, s.lang, t.lang FROM sources s JOIN targets t ON s.sid = t.sid JOIN fulltext f ON s.sid = f.docid WHERE s.lang IN (?) AND t.lang IN (?) AND s.length BETWEEN ? AND ? AND fulltext MATCH ?"""
            search_str = " OR ".join(unit_words)
            self.cursor.execute(query, (source_langs, target_langs,
                                        minlen, maxlen, search_str))
        else:
            logging.debug("nonfulltext matching")
            query = """SELECT s.text, t.text, s.context, s.lang, t.lang FROM sources s JOIN targets t ON s.sid = t.sid WHERE s.lang IN (?) AND t.lang IN (?) AND s.length >= ? AND s.length <= ?"""
            self.cursor.execute(query, (source_langs, target_langs,
                                        minlen, maxlen))
        results = []
        for row in self.cursor:
            quality = self.comparer.similarity(unit_source, row[0],
                                               self.min_similarity)
            if quality >= self.min_similarity:
                results.append({
                    'source': row[0],
                    'target': row[1],
                    'context': row[2],
                    'quality': quality,
                })
        # Best matches first, capped at max_candidates.
        results.sort(key=lambda match: match['quality'], reverse=True)
        results = results[:self.max_candidates]
        logging.debug("results: %s", six.text_type(results))
        return results
def similarity(self, a, b, stoppercentage=50):
    """Compute the similarity of *a* and *b* via the base comparer and
    return it as an integer percentage."""
    raw = LevenshteinComparer.similarity(self, a, b, stoppercentage)
    return int(raw)