def test_strip_spaces(self):
    # Push the fixture address through each cleaning stage in turn and
    # check what strip_spaces returns for the fully processed address part.
    flattened = single_line(self.address)
    normalised = normalise(flattened)
    address_part = separate_postcode(normalised)[0]
    expanded = expand_abbreviations(address_part)
    filtered = remove_stopwords(expanded)
    squashed = strip_spaces(filtered)
    self.assertEqual(squashed, '3BRISLEEAVENUENORTHSHIELDS')
# Address-matching fragment (collapsed onto a single line in this view; the
# nesting described below is inferred from statement order -- TODO confirm
# against the properly indented original).
#
# 1. Candidate lookup: when `postcode` is truthy, db.addresses is queried for
#    documents with that exact 'postcode'. Otherwise `to_match` is split on
#    the '\W+' pattern, at most the first four pieces are re-joined with
#    single spaces, and db.addresses is queried with a '$regex' on the
#    'phonetic' field built as '^' + phonetic(address).
# 2. `to_match` is then reduced with expand_abbreviations -> remove_stopwords
#    -> strip_spaces.
# 3. Each candidate is converted with paf_to_lines -> single_line ->
#    normalise -> separate_postcode, element 0 of that result gets the same
#    reduction as `to_match`, and jaccard_index scores it against `to_match`.
#    A strictly higher score resets `best_match` to just that candidate; an
#    equal score appends to it, so `best_match` holds every candidate tied
#    for the top score.
# 4. Reporting: exactly one entry prints ">> BEST MATCH" followed by the
#    entry; zero entries prints ">> NO MATCH". The body of the final `else:`
#    (more than one entry) lies outside this view.
#
# NOTE(review): '\W+' is a non-raw string literal; `\W` is not a recognised
#   string escape, so r'\W+' is presumably what was intended.
# NOTE(review): `best_jaccard` starts at 0 and ties are appended, so
#   candidates scoring exactly 0 are kept until a positive score is seen --
#   confirm this is intended.
# NOTE(review): res[:min(len(res), 4)] selects the same items as res[:4].
if postcode: addresses = db.addresses.find({'postcode': postcode}) else: res = re.split('\W+', to_match) address = ' '.join(res[:min(len(res), 4)]) addresses = db.addresses.find( {'phonetic': { '$regex': '^' + phonetic(address) }}) to_match = strip_spaces(remove_stopwords(expand_abbreviations(to_match))) best_jaccard = 0 best_match = list() for address in addresses: lines = paf_to_lines(address) line = separate_postcode(normalise(single_line(lines))) line = strip_spaces(remove_stopwords(expand_abbreviations(line[0]))) idx = jaccard_index(to_match, line) if idx > best_jaccard: best_jaccard = idx best_match = list() best_match.append(address) elif idx == best_jaccard: best_match.append(address) if len(best_match) == 1: print(">> BEST MATCH") print(best_match[0]) elif len(best_match) == 0: print(">> NO MATCH") else:
def test_expand_abbreviations(self):
    # Only element 0 of the separate_postcode result is expanded; the
    # expected text spells AVENUE out in full.
    cleaned = normalise(single_line(self.address))
    parts = separate_postcode(cleaned)
    expanded = expand_abbreviations(parts[0])
    self.assertEqual(expanded, '3 THE BRISLEE AVENUE NORTH SHIELDS')
def test_separate_postcode(self):
    # The result is indexed as a pair: address text at 0, postcode at 1.
    cleaned = normalise(single_line(self.address))
    parts = separate_postcode(cleaned)
    expected_address = '3 THE BRISLEE AVE NORTH SHIELDS'
    expected_postcode = 'NE30 2SQ'
    self.assertEqual(parts[0], expected_address)
    self.assertEqual(parts[1], expected_postcode)
def test_normalise(self):
    # normalise is fed the single-line form of the fixture address.
    one_line = single_line(self.address)
    expected = '3 THE BRISLEE AVE NORTH SHIELDS NE30 2SQ'
    self.assertEqual(normalise(one_line), expected)
def test_single_line(self):
    # single_line output is compared verbatim: leading space, original
    # letter case and the comma are all part of the expected text.
    expected = ' 3 The Brislee Ave, North Shields ne30 2sq'
    flattened = single_line(self.address)
    self.assertEqual(flattened, expected)