def __init__(self, db_file, max_candidates=3, min_similarity=75, max_length=1000):
    """Open (or attach to) the translation-memory database at *db_file*.

    Connections to the same database file are shared between instances
    through the class-level ``_tm_dbs`` registry.
    """
    self.max_candidates = max_candidates
    self.min_similarity = min_similarity
    self.max_length = max_length
    # Normalise the file name to unicode so registry keys compare equal.
    if not isinstance(db_file, unicode):
        db_file = unicode(db_file)  # don't know which encoding
    self.db_file = db_file
    # Share connections to the same database file between different
    # instances: one registry slot per file, created on first use.
    self._tm_db = self._tm_dbs.setdefault(db_file, {})
    # FIXME: do we want to do any checks before we initialize the DB?
    self.init_database()
    self.fulltext = False
    self.init_fulltext()
    self.comparer = LevenshteinComparer(self.max_length)
    self.preload_db()
def _check_alttrans(self, unit):
    """Build TM results from a unit's alternative translations.

    Each usable <alt-trans> entry becomes a result dict with the keys
    ``source``, ``target``, ``quality`` and ``tmsource``.

    :return: a list of result dicts; empty if the unit has no usable
        alternative translations.
    """
    if not hasattr(unit, 'getalttrans'):
        return []
    alttrans = unit.getalttrans()
    if not alttrans:
        return []
    from translate.search.lshtein import LevenshteinComparer
    lcomparer = LevenshteinComparer(max_len=1000)

    results = []
    for alt in alttrans:
        quality = lcomparer.similarity(unit.source, alt.source,
                                       self.controller.min_quality - 15)
        # Alternatives come from the file itself, so be a bit more
        # lenient than the configured minimum quality.
        if quality < self.controller.min_quality - 10:
            continue
        tmsource = _('This file')
        xmlelement = getattr(alt, 'xmlelement', None)
        if xmlelement is not None:
            origin = xmlelement.get('origin', '')
            if origin:
                if origin == "lmc":
                    # Experimental code to test lmc research. Everything
                    # in a try block, just in case.
                    try:
                        from lxml import etree  # ensure lxml is available
                        import os.path
                        extras = xmlelement.xpath(
                            'processing-instruction()')
                        meta = dict((pi.target, pi.text) for pi in extras)
                        tmsource = [
                            meta.get("contact-name", ""),
                            meta.get("category", ""),
                            os.path.splitext(meta.get("original", ""))[0],
                        ]
                        tmsource = u"\n".join(filter(None, tmsource))
                    except Exception as e:
                        import logging
                        logging.info(e)
                tmsource += "\n" + origin
        results.append({
            'source': alt.source,
            'target': alt.target,
            'quality': quality,
            'tmsource': tmsource,
        })
    # BUG FIX: the collected results were previously never returned
    # (the function fell off the end and returned None).
    return results
def __init__(self, max_candidates=3, min_similarity=75, max_length=1000):
    """Initialise the open-tran.eu client and request its language list."""
    gobject.GObject.__init__(self)
    HTTPClient.__init__(self)
    self.max_candidates = max_candidates
    self.min_similarity = min_similarity
    self.comparer = LevenshteinComparer(max_length)
    # used by the open-tran terminology backend
    self.last_suggestions = []
    self._languages = set()
    self.source_lang = None
    self.target_lang = None
    # Detect which languages the service supports by fetching the
    # supported-language list asynchronously.
    self.url_getlanguages = 'http://open-tran.eu/json/supported'
    self.url_translate = 'http://%s.%s.open-tran.eu/json/suggest'
    lang_request = RESTRequest(self.url_getlanguages, id='')
    self.add(lang_request)
    lang_request.connect(
        'http-success',
        lambda req, response: self.got_languages(response))
def comparer(self):
    """Return the Levenshtein comparer, building and caching it on first use.

    The maximum compared length comes from the application's
    ``MAX_LENGTH`` config setting (default 1000).
    """
    try:
        return self._comparer
    except AttributeError:
        limit = current_app.config.get('MAX_LENGTH', 1000)
        self._comparer = LevenshteinComparer(limit)
        return self._comparer
def get_tm_results(request, unit):
    """Gets a list of TM results for the current object.

    Candidate units are pre-filtered by language pair and by source-string
    length (the Levenshtein distance can never be smaller than the length
    difference), then scored with the full comparer.

    :param request: the current HTTP request (unused beyond view dispatch).
    :param unit: the unit to find translation-memory matches for.
    :return: HTTP response with a JSON list of TM results.
    """
    max_len = settings.LV_MAX_LENGTH
    min_similarity = settings.LV_MIN_SIMILARITY

    results = []

    # Shortcut Levenshtein comparer, since the distance, by definition,
    # can't be less than the difference in string length.
    diff_len = unit.source_length * (100 - min_similarity) / 100
    max_unit_len = unit.source_length + diff_len
    min_unit_len = unit.source_length - diff_len

    criteria = {
        'target_lang': unit.store.translation_project.language,
        'source_lang': unit.store.translation_project.project.source_language,
        'source_length__range': (min_unit_len, max_unit_len),
    }
    tmunits = TMUnit.objects.filter(**criteria).exclude(unit=unit)

    comparer = LevenshteinComparer(max_len)
    for tmunit in tmunits:
        quality = comparer.similarity(tmunit.source, unit.source,
                                      min_similarity)
        if quality >= min_similarity:
            project = tmunit.project
            profile = tmunit.submitted_by
            result = {
                'source': tmunit.source,
                'target': tmunit.target,
                'quality': quality,
                'project': {
                    'project': project.code,
                    'projectname': project.fullname,
                    'absolute_url': project.get_absolute_url(),
                    'icon': _get_project_icon(project),
                }
            }

            if profile is not None:
                submissions = Submission.objects.filter(
                    submitter=profile,
                    type=SubmissionTypes.NORMAL,
                ).distinct().count()
                suggestions = SuggestionStat.objects.filter(
                    suggester=profile,
                ).distinct().count()
                translations = submissions - suggestions  # XXX: is this correct?
                # BUG FIX: translate the template first, THEN interpolate.
                # Formatting before calling _() made the already-formatted
                # string the msgid, so the catalog lookup could never match.
                title = _("By %s on %s<br/><br/>%s translations<br/>%s "
                          "suggestions") % (profile.user.get_full_name(),
                                            tmunit.submitted_on,
                                            translations, suggestions)
                result['translator'] = {
                    'username': unicode(profile.user),
                    'title': title,
                    'absolute_url': profile.get_absolute_url(),
                    'gravatar': profile.gravatar_url(24),
                }

            results.append(result)

    return HttpResponse(jsonify(results), mimetype="application/json")