def test_null_matches(self):
    """Check best_match on degenerate input.

    A blank query against US_STATES must still yield ``top_n`` results, while
    an empty list of choices must yield ``[]`` for both a string query and a
    ``None`` query.
    """
    blank_query_hits = matchers.best_match('', US_STATES, top_n=2)
    self.assertEqual(len(blank_query_hits), 2)

    # Nothing to choose from: the result is empty whatever the query is.
    for query in ('nothing', None):
        no_choice_hits = matchers.best_match(query, [], top_n=5)
        self.assertEqual([], no_choice_hits)
def test_case_insensitivity(self):
    """A mixed-case query must score 100 against its lower-case category."""
    top_results = matchers.best_match(
        'TeST', ['test', 'thing', 'face'], top_n=1
    )
    # Only one result was requested; each result unpacks as (name, score).
    best_name, score = top_results[0]

    self.assertEqual(best_name, 'test')
    self.assertEqual(score, 100)
def test_multiple_matches(self):
    """A misspelled state returns ``top_n`` candidates, best one first.

    Each candidate is read here as ``(name, score)``: the first must be
    'illinois' with a score above 90, and the runner-up must score below 90.
    """
    candidates = matchers.best_match('Ilinois', US_STATES, top_n=6)
    self.assertEqual(len(candidates), 6)

    best = candidates[0]
    runner_up = candidates[1]

    self.assertEqual(best[0], 'illinois')
    self.assertGreater(best[1], 90)
    self.assertLess(runner_up[1], 90)
def test_multiple_matches(self):
    """A misspelled state returns ``top_n`` candidates, best one first.

    Each candidate is read here by position: index 1 holds the matched name
    and index 2 the score (presumably index 0 is the table name -- confirm
    against matchers.best_match). The first candidate must be 'illinois' with
    a score above 90, and the runner-up must score below 90.
    """
    candidates = matchers.best_match('Ilinois', US_STATES, top_n=6)
    self.assertEqual(len(candidates), 6)

    best = candidates[0]
    runner_up = candidates[1]

    self.assertEqual(best[1], 'illinois')
    self.assertGreater(best[2], 90)
    self.assertLess(runner_up[2], 90)
def test_case_insensitivity(self):
    """Comparisons disregard case: 'TeST' is a perfect match for 'test'."""
    query = 'TeST'
    categories = ['test', 'thing', 'face']

    # top_n=1, so the first element is the single best (name, score) pair.
    winner = matchers.best_match(query, categories, top_n=1)[0]
    name, confidence = winner

    self.assertEqual(name, 'test')
    self.assertEqual(confidence, 100)
def __init__(self, raw_columns, dest_columns, previous_mapping=None, map_args=None,
             threshold=0):
    """
    Build the suggested mappings for every raw column.

    For each raw column a previous mapping is used when one is found;
    otherwise the top five fuzzy matches against ``dest_columns`` are stored.
    Duplicates are then resolved and, optionally, a confidence threshold is
    applied.

    :param raw_columns: list of str. The column names we're trying to map.
    :param dest_columns: list of str. The columns we're mapping to.
    :param previous_mapping: callable, invoked as
        ``previous_mapping(raw, *map_args)`` for each raw column. A truthy
        result is stored as that column's mapping and fuzzy matching is
        skipped for it. Ignored when it is falsy or not callable.
    :param map_args: list, extra positional arguments forwarded to
        ``previous_mapping``. ``None`` means no extra arguments.
    :param threshold: int, Minimum value of the matching confidence to allow for
        matching. Only applied when greater than 0.
    """
    self.data = {}
    for raw in raw_columns:
        attempt_best_match = True

        # We want previous mappings to be at the top of the list.
        if previous_mapping and callable(previous_mapping):
            args = map_args or []
            # Mapping will look something like this -- [u'table', u'field', 100]
            mapping = previous_mapping(raw, *args)
            if mapping:
                self.add_mappings(raw, [mapping], True)
                attempt_best_match = False

        # Only enter this flow if we haven't already selected a result.
        if attempt_best_match:
            # convert raw fields spaces into underscores because that is what is in the
            # database
            raw_test = raw.replace(' ', '_')

            # HACK: try some alternatives to the raw column in specific cases
            # (e.g. zip => postal code). Should become some global config or
            # organization specific config.
            lowered = raw_test.lower()
            if lowered in ('zip', 'zip_code'):
                raw_test = 'postal_code'
            elif lowered == 'gba':
                raw_test = 'gross_floor_area'
            elif lowered == 'building_address':
                raw_test = 'address_line_1'

            # go get the top 5 matches. format will be
            # [('PropertyState', 'building_count', 62), ...]
            matches = matchers.best_match(raw_test, dest_columns, top_n=5)
            self.add_mappings(raw, matches)

    # Give duplicate resolution at most 10 passes so duplicates that never clear
    # cannot loop forever.
    # TODO: convert this to an exception and catch it some day.
    index = 0
    while self.duplicates and index < 10:
        index += 1
        _log.debug("Index: {} with duplicates: {}".format(index, self.duplicates))
        # .items() exists on both Python 2 and 3; .iteritems() is Python 2 only.
        for k, v in self.duplicates.items():
            self.resolve_duplicate(k, v)

    if threshold > 0:
        self.apply_threshold(threshold)
def __init__(self, raw_columns, dest_columns, previous_mapping=None, map_args=None,
             default_mappings=None, threshold=0):
    """
    Work out a destination mapping for each raw column.

    :param raw_columns: list of str. The column names we're trying to map.
    :param dest_columns: list of str. The columns we're mapping to.
    :param previous_mapping: callable that looks up a previously saved mapping.
        It is invoked as ``previous_mapping(raw, *map_args)`` for every raw
        column; a truthy result is stored as that column's mapping. The
        original notes give both ``('field_1', 0.93)`` and
        ``['table', 'field', 100]`` as example results -- confirm the exact
        shape against the caller.
    :param map_args: Arguments to pass into the previous_mapping method
        (e.g. Organization ID)
    :param default_mappings: dict of mappings keyed by raw column. Used when
        the previous_mapping call finds nothing for a column. NOTE: it is only
        consulted when ``previous_mapping`` is a usable callable.
    :param threshold: int, Minimum value of the matching confidence to allow for
        matching. Only applied when greater than 0.

    Original notes describe the resulting data as
    ``{'raw_column': ('dest_column', score), ...}`` -- presumably the shape of
    ``self.data``; verify against ``add_mappings``.
    """
    # HACK: alternatives to try for specific raw column names (e.g. zip =>
    # postal code). Should become some global config or organization specific
    # config. Keys are compared against the lower-cased, underscored raw name.
    column_aliases = {
        'zip': 'postal_code',
        'zip_code': 'postal_code',
        'gba': 'gross_floor_area',
        'building_address': 'address_line_1',
        'ubi': 'jurisdiction_tax_lot_id',
    }
    use_previous = bool(previous_mapping) and callable(previous_mapping)
    extra_args = map_args or []

    self.data = {}
    for raw in raw_columns:
        if use_previous:
            # We want previous mappings to be at the top of the list.
            # Mapping will look something like this -- ['table', 'field', 100]
            saved = previous_mapping(raw, *extra_args)
            if saved:
                self.add_mappings(raw, [saved], True)
                continue
            if default_mappings and raw in default_mappings:
                self.add_mappings(raw, [default_mappings[raw]], True)
                continue

        # Nothing selected yet, so fall back to fuzzy matching. Spaces become
        # underscores because that is what is in the database.
        candidate = raw.replace(' ', '_')
        candidate = column_aliases.get(candidate.lower(), candidate)

        # go get the top 5 matches. format will be
        # [('PropertyState', 'building_count', 62), ...]
        top_matches = matchers.best_match(candidate, dest_columns, top_n=5)
        self.add_mappings(raw, top_matches)

    # Resolve duplicates in at most 10 passes.
    # TODO: convert this to an exception and catch it some day.
    passes = 0
    while self.duplicates and passes < 10:
        passes += 1
        _log.debug("Index: %s with duplicates: %s" % (passes, self.duplicates))
        for raw_key, dupes in self.duplicates.items():
            self.resolve_duplicate(raw_key, dupes)

    if threshold > 0:
        self.apply_threshold(threshold)