def _get_tokens(self, method_name):
    """Return the number of name tokens in *method_name*.

    A leading getter/setter prefix ('get' or 'set') does not count
    toward the total, so ``getFooBar`` yields the same count as
    ``fooBar``.
    """
    token_size = len(su.tokenize(method_name))
    # str.startswith accepts a tuple of prefixes: one call instead of
    # two or-ed calls.
    if method_name.startswith(('get', 'set')):
        token_size -= 1
    return token_size
def test_tokenize(self):
    """tokenize() splits camel-case identifiers into their tokens."""
    cases = {
        'Foo': ['Foo'],
        'foo': ['foo'],
        'fooBar': ['foo', 'Bar'],
        'BarFooBaz': ['Bar', 'Foo', 'Baz'],
    }
    for name, expected in cases.items():
        self.assertEqual(expected, su.tokenize(name))
def _get_potentials_by_similarity(self, potentials, fqn_container):
    """Filter *potentials*, keeping only those whose container name is
    sufficiently similar to *fqn_container*.

    Similarity is the max of a shared-token ratio and a pairwise string
    similarity. Candidates with no token in common or a pairwise score
    below ``PAIRWISE_THRESHOLD`` are dropped outright; survivors must
    then come close enough to the best similarity observed.
    """
    (container_simple, _) = je.clean_java_name(fqn_container)
    container_tokens = [t.lower() for t in su.tokenize(container_simple)]
    container_lower = container_simple.lower()

    scored = []
    best = 0.0
    for candidate in potentials:
        (simple, _) = je.clean_java_name(get_container(candidate).fqn)
        candidate_tokens = [t.lower() for t in su.tokenize(simple)]
        token_ratio = self._get_common_token_ratio(
            container_tokens, candidate_tokens)
        pairwise = su.pairwise_simil(container_lower, simple.lower())
        # Minimum bar for this filter: at least one shared token and a
        # pairwise similarity at or above the threshold.
        if token_ratio == 0.0 or pairwise < self.PAIRWISE_THRESHOLD:
            continue
        score = max(token_ratio, pairwise)
        best = max(best, score)
        scored.append((candidate, score))

    # Keep candidates matching the best score exactly, or — when the
    # best is not already high — candidates fuzzily near the best.
    cutoff = best
    if best < self.HIGH_SIMILARITY:
        cutoff = best - self.DIFFERENCE_THRESHOLD
    return [candidate for (candidate, score) in scored if score >= cutoff]
def _get_potentials_by_similarity(self, potentials, fqn_container):
    """Filter *potentials* by name similarity to *fqn_container*.

    Each potential's container name is compared against the container
    name of *fqn_container* using the max of a common-token ratio and a
    pairwise string similarity; only potentials near the best observed
    similarity are returned.
    """
    new_potentials = []
    max_similarity = 0.0
    (container_simple, _) = je.clean_java_name(fqn_container)
    container_tokens = [token.lower() for token in su.tokenize(container_simple)]
    container_simple_lower = container_simple.lower()
    similarities = []
    for potential in potentials:
        (simple, _) = je.clean_java_name(get_container(potential).fqn)
        potential_tokens = [token.lower() for token in su.tokenize(simple)]
        simple_lower = simple.lower()
        common_token = self._get_common_token_ratio(container_tokens, potential_tokens)
        psimilarity = su.pairwise_simil(container_simple_lower, simple_lower)
        # This is the minimum required by this filter: at least one
        # common token and a pairwise similarity at or above threshold.
        if common_token == 0.0 or psimilarity < self.PAIRWISE_THRESHOLD:
            continue
        # A candidate's similarity is the better of its two measures.
        similarity = max(common_token, psimilarity)
        if similarity > max_similarity:
            max_similarity = similarity
        similarities.append((potential, similarity))
    # Only keep the elements that match the threshold
    # Or accept elements that are fuzzily near the max_similarity
    if max_similarity < self.HIGH_SIMILARITY:
        max_similarity = max_similarity - self.DIFFERENCE_THRESHOLD
    for (potential, similarity) in similarities:
        if similarity >= max_similarity:
            new_potentials.append(potential)
    return new_potentials
def compute_code_words(codebase):
    """Return the set of "code words" found in *codebase*.

    A code word is the lower-cased simple name of a type element that is
    either a multi-token (camel-case) name or a single token that is not
    an English ('en-US') dictionary word.
    """
    d = enchant.Dict('en-US')
    elements = CodeElement.objects.\
        filter(codebase=codebase).\
        filter(kind__is_type=True).\
        iterator()
    code_words = set()
    for element in elements:
        simple_name = element.simple_name
        tokens = tokenize(simple_name)
        if len(tokens) > 1:
            # A multi-token camel-case name is never a plain English word.
            code_words.add(simple_name.lower())
        else:
            simple_name = simple_name.lower()
            if not d.check(simple_name):
                code_words.add(simple_name)
    # Lazy %-style args: the message is only formatted when DEBUG
    # logging is actually enabled.
    logger.debug('Computed %s code words for codebase %s',
                 len(code_words), codebase)
    return code_words
def test_tokenize(self):
    """tokenize() splits identifiers on camel-case boundaries."""
    for name, expected in (
            ("Foo", ["Foo"]),
            ("foo", ["foo"]),
            ("fooBar", ["foo", "Bar"]),
            ("BarFooBaz", ["Bar", "Foo", "Baz"])):
        self.assertEqual(expected, su.tokenize(name))