def more_like_this(self, unit, top=5):
    '''
    Returns translated units whose source closely resembles the given unit.

    The first source plural of ``unit`` is parsed as a query against the
    'source' field of the source index. The first hit is then used to
    run a "more like this" lookup limited to ``top`` results. Checksums
    of the direct matches are removed from that set and the rest is
    resolved to translated units in the language of ``unit``, leaving
    out ``unit`` itself.

    Returns an empty queryset (``self.none()``) when the source string
    has no hit in the index.
    '''
    source_index = FULLTEXT_INDEX.source_searcher()
    text = unit.get_source_plurals()[0]
    query = qparser.QueryParser('source', SOURCE_SCHEMA).parse(text)

    with source_index as searcher:
        # Look up the unit's own source string first
        exact_hits = searcher.search(query)
        if len(exact_hits) == 0:
            return self.none()

        # Use the best hit as a seed for the similarity lookup
        similar_hits = exact_hits[0].more_like_this('source', text, top)
        matched = set(hit['checksum'] for hit in similar_hits)

        # Direct matches are not "similar", drop them again
        matched.difference_update(hit['checksum'] for hit in exact_hits)

    candidates = self.filter(
        checksum__in=matched,
        translation__language=unit.translation.language,
        translated=True
    )
    return candidates.exclude(pk=unit.id)
def search(self, query, source=True, context=True, translation=True,
           checksums=False):
    """
    Performs full text search on defined set of fields.

    query -- text handed to the private ``__search`` helper
    source -- search the 'source' field of the source index
    context -- search the 'context' field of the source index
    translation -- search the 'target' field of the target index
    checksums -- return the raw set of matches instead of a queryset

    The target index is chosen by the language of the first unit in
    this queryset. When the queryset is empty there is no language to
    pick, so the target search is skipped instead of raising
    IndexError.

    Returns a queryset filtered by ``checksum__in`` unless checksums is
    set, in which case the collected set itself is returned.
    """
    ret = set()

    # Search in source or context
    if source or context:
        index = FULLTEXT_INDEX.source_searcher(
            not appsettings.OFFLOAD_INDEXING
        )
        with index as searcher:
            if source:
                ret.update(
                    self.__search(searcher, "source", SOURCE_SCHEMA, query)
                )
            if context:
                ret.update(
                    self.__search(searcher, "context", SOURCE_SCHEMA, query)
                )

    # Search in target
    if translation:
        # Any unit will do, it is only used to find out the language
        try:
            sample = self.all()[0]
        except IndexError:
            sample = None

        if sample is not None:
            index = FULLTEXT_INDEX.target_searcher(
                sample.translation.language.code,
                not appsettings.OFFLOAD_INDEXING
            )
            with index as searcher:
                ret.update(
                    self.__search(searcher, "target", TARGET_SCHEMA, query)
                )

    if checksums:
        return ret

    return self.filter(checksum__in=ret)
def fulltext(self, query, source=True, context=True, translation=True,
             checksums=False):
    '''
    Performs full text search on defined set of fields.

    query -- text handed to the private ``__search`` helper
    source -- search the 'source' field of the source index
    context -- search the 'context' field of the source index
    translation -- search the 'target' field of the target index
    checksums -- return the raw set of matches instead of a queryset

    The target index is chosen by the language of the first unit in
    this queryset. When the queryset is empty there is no language to
    pick, so the target search is skipped instead of raising
    IndexError.

    Returns a queryset filtered by ``checksum__in`` unless checksums is
    set, in which case the collected set itself is returned.
    '''
    ret = set()

    # Search in source or context
    if source or context:
        index = FULLTEXT_INDEX.source_searcher(
            not appsettings.OFFLOAD_INDEXING
        )
        with index as searcher:
            if source:
                ret.update(
                    self.__search(searcher, 'source', SOURCE_SCHEMA, query)
                )
            if context:
                ret.update(
                    self.__search(searcher, 'context', SOURCE_SCHEMA, query)
                )

    # Search in target
    if translation:
        # Any unit will do, it is only used to find out the language
        try:
            sample = self.all()[0]
        except IndexError:
            sample = None

        if sample is not None:
            index = FULLTEXT_INDEX.target_searcher(
                sample.translation.language.code,
                not appsettings.OFFLOAD_INDEXING
            )
            with index as searcher:
                ret.update(
                    self.__search(searcher, 'target', TARGET_SCHEMA, query)
                )

    if checksums:
        return ret

    return self.filter(checksum__in=ret)
def fulltext(self, query, source=True, context=True, translation=True,
             checksums=False):
    '''
    Performs full text search on defined set of fields.

    query -- text handed to the private ``__search`` helper
    source -- search the 'source' field of the source index
    context -- search the 'context' field of the source index
    translation -- search the 'target' field of the target index
    checksums -- return the raw set of matches instead of a queryset

    The target index is chosen by the language of the first unit in
    this queryset. When the queryset is empty there is no language to
    pick, so the target search is skipped instead of raising
    IndexError.

    Returns a queryset filtered by ``checksum__in`` unless checksums is
    set, in which case the collected set itself is returned.
    '''
    ret = set()

    # Search in source or context
    if source or context:
        index = FULLTEXT_INDEX.source_searcher()
        with index as searcher:
            if source:
                ret.update(
                    self.__search(searcher, 'source', SOURCE_SCHEMA, query)
                )
            if context:
                ret.update(
                    self.__search(searcher, 'context', SOURCE_SCHEMA, query)
                )

    # Search in target
    if translation:
        # Any unit will do, it is only used to find out the language
        try:
            sample = self.all()[0]
        except IndexError:
            sample = None

        if sample is not None:
            index = FULLTEXT_INDEX.target_searcher(
                sample.translation.language.code
            )
            with index as searcher:
                ret.update(
                    self.__search(searcher, 'target', TARGET_SCHEMA, query)
                )

    if checksums:
        return ret

    return self.filter(checksum__in=ret)
def same_source(self, unit):
    '''
    Returns translated units sharing the source string of the given unit.

    The first source plural of ``unit`` is parsed as a query against the
    'source' field of the source index. Checksums of the hits are
    resolved to translated units in the language of ``unit``, leaving
    out ``unit`` itself.
    '''
    source_index = FULLTEXT_INDEX.source_searcher()
    text = unit.get_source_plurals()[0]
    query = qparser.QueryParser('source', SOURCE_SCHEMA).parse(text)

    with source_index as searcher:
        # Hits have to be read while the searcher is still open
        matched = set(hit['checksum'] for hit in searcher.search(query))

    candidates = self.filter(
        checksum__in=matched,
        translation__language=unit.translation.language,
        translated=True
    )
    return candidates.exclude(pk=unit.id)
def similar(self, unit):
    """
    Finds similar units to current unit.

    Up to 10 key terms are extracted from the unit source and terms
    listed in IGNORE_SIMILAR are dropped. The remaining terms are
    searched for in the source field, first all of them together and
    then in every combination with one term less, until at least
    appsettings.SIMILAR_MESSAGES checksums are collected or four
    combination sizes were tried (i.e. at most three terms dropped).

    Returns units from the same project and language whose checksum
    matched, excluding units with an empty target or with the same
    target as ``unit``.
    """
    # The unit itself counts towards the collected checksums
    ret = set([unit.checksum])

    index = FULLTEXT_INDEX.source_searcher(not appsettings.OFFLOAD_INDEXING)
    with index as searcher:
        # Extract up to 10 terms from the source
        key_terms = searcher.key_terms_from_text(
            "source", unit.source, numterms=10
        )
        # Each entry is indexed as kw[0] to get the term, so the ignore
        # list has to be checked against the term, not the whole entry
        # (assumes IGNORE_SIMILAR holds plain terms - TODO confirm)
        terms = [kw[0] for kw in key_terms if kw[0] not in IGNORE_SIMILAR]
        cnt = len(terms)

        # Try to find at least configured number of similar strings,
        # shrinking the term combinations one term at a time
        while (len(ret) < appsettings.SIMILAR_MESSAGES
               and cnt > 0
               and len(terms) - cnt < 4):
            for combination in itertools.combinations(terms, cnt):
                # Source only search returning raw checksums
                results = self.search(
                    " ".join(combination), True, False, False, True
                )
                ret = ret.union(results)
            cnt -= 1

    project = unit.translation.subproject.project
    return self.filter(
        translation__subproject__project=project,
        translation__language=unit.translation.language,
        checksum__in=ret
    ).exclude(target__in=["", unit.target])
def more_like_this(self, unit, top=500):
    '''
    Finds closely similar units.

    unit -- unit whose first source plural is used as the search text
    top -- maximum number of "more like this" results requested from
           the index; defaults to 500, the limit that used to be
           hard-coded here

    The source string is first searched for in the 'source' field of
    the source index and the first hit is used as a seed for the
    similarity lookup. Checksums of the direct matches are removed from
    the result, which is then resolved to translated units in the
    language of ``unit``, leaving out ``unit`` itself.

    Returns an empty queryset (``self.none()``) when the source string
    has no hit in the index.
    '''
    index = FULLTEXT_INDEX.source_searcher(
        not appsettings.OFFLOAD_INDEXING
    )
    source_string = unit.get_source_plurals()[0]
    parser = qparser.QueryParser('source', SOURCE_SCHEMA)
    parsed = parser.parse(source_string)
    checksums = set()

    with index as searcher:
        # Search for same string
        results = searcher.search(parsed)
        if len(results) == 0:
            return self.none()
        first_hit = results[0]

        # Find similar results to first one
        more_results = first_hit.more_like_this(
            'source',
            source_string,
            top
        )

        # Include all more like this results
        checksums.update(result['checksum'] for result in more_results)

        # Remove all original matches
        checksums.difference_update(
            result['checksum'] for result in results
        )

    return self.filter(
        checksum__in=checksums,
        translation__language=unit.translation.language,
        translated=True
    ).exclude(
        pk=unit.id
    )
def same_source(self, unit):
    '''
    Returns translated units sharing the source string of the given unit.

    The first source plural of ``unit`` is parsed as a query against the
    'source' field of the source index. Checksums of the hits are
    resolved to translated units in the language of ``unit``, leaving
    out ``unit`` itself.
    '''
    source_index = FULLTEXT_INDEX.source_searcher(
        not appsettings.OFFLOAD_INDEXING
    )
    text = unit.get_source_plurals()[0]
    query = qparser.QueryParser('source', SOURCE_SCHEMA).parse(text)

    with source_index as searcher:
        # Hits have to be read while the searcher is still open
        matched = set(hit['checksum'] for hit in searcher.search(query))

    candidates = self.filter(
        checksum__in=matched,
        translation__language=unit.translation.language,
        translated=True
    )
    return candidates.exclude(pk=unit.id)