def test(self):
    # Parse one address and compare it with its expected result.
    # `addr`, `addr_parsed_validator`, `sa` and `diff` are not defined in this
    # function -- presumably they come from an enclosing scope; verify there.
    parsed = sa.parse(addr)

    # A falsy result (None or an empty dict) means the parser gave up.
    assert parsed, 'Could not parse address "{}"'.format(addr)

    # The diff is only computed when the comparison actually fails.
    assert parsed == addr_parsed_validator, "\n".join(
        [addr, str(diff(parsed, addr_parsed_validator))]
    )
def ReadFastTxt(self, txtfile, Print=False):
    """Return the first complete address found in a text file.

    Each line shorter than 200 characters is split on ``':'`` and every
    piece is handed to ``sa.parse``. The first parse result that has at
    least five fields, contains ``number``, ``city`` and ``state``, and
    has a ``street`` of at most 34 characters is returned.

    Args:
        txtfile: Path of the text file to scan.
        Print: Unused; kept so existing callers keep working.

    Returns:
        The parsed address dict, or ``None`` when no piece of any line
        qualifies.
    """
    # The context manager closes the file on every exit path, including
    # the early return below (the original leaked the handle).
    with open(txtfile, 'r') as f:
        for line in f:
            # Long lines are skipped without being parsed.
            if len(line) >= 200:
                continue
            for subline in line.split(':'):
                addr = sa.parse(subline)
                if (addr is not None and len(addr) >= 5
                        and 'street' in addr and len(addr['street']) <= 34
                        and 'number' in addr
                        and 'city' in addr
                        and 'state' in addr):
                    return addr
    return None
def readData(input_file, prefix=None):
    """
    Turn CSV text into a dictionary of hashable records.

    `input_file` is wrapped in `StringIO`, so it is the CSV *content*, not a
    path. Every field value is passed through `preProcess`. The `Address`
    field, with `', chicago, il'` appended, is run through
    `streetaddress.parse`; `Address Parsed` records whether that succeeded
    (1) or not (0), and the `number`, `prefix`, `street` and `type` fields
    are always present (empty strings when the parser did not supply them).
    Any other fields returned by the parser are merged into the record too.

    Each record is stored as a `dedupe.core.frozendict`. The key is the
    zero-based row index, or the tuple `(prefix, index)` when `prefix` is
    truthy.

    **Note carried over from the original documentation: dedupe was said to
    require integer record ids with no gaps, starting at 0 or 1. Confirm
    against the dedupe version in use.**
    """
    records = {}

    for index, raw_row in enumerate(csv.DictReader(StringIO(input_file))):
        record = {}
        for field, value in raw_row.items():
            record[field] = preProcess(value)

        address = streetaddress.parse(record['Address'] + ', chicago, il')
        record['Address Parsed'] = 1 if address else 0

        # Start from blank components so every record has the same keys,
        # then overlay whatever the parser managed to extract.
        components = {'number': '', 'prefix': '', 'street': '', 'type': ''}
        components.update(address or {})
        record.update(components)

        record_id = (prefix, index) if prefix else index
        records[record_id] = dedupe.core.frozendict(record)

    return records
def test_all(self):
    # Regression cases for sa.parse: each call gives an input address
    # followed by the exact fields the parser must return.

    def check(address, **expected):
        # The keyword arguments form the dict that sa.parse must equal.
        assert sa.parse(address) == expected

    # --- zip only -------------------------------------------------------
    check('1005 Gravenstein Hwy 95472',
          number='1005', street='Gravenstein', zip='95472', type='Hwy')
    check('1005 Gravenstein Hwy, 95472',
          number='1005', street='Gravenstein', zip='95472', type='Hwy')
    check('1005 Gravenstein Hwy N, 95472',
          number='1005', street='Gravenstein', zip='95472', type='Hwy',
          suffix='N')
    check('1005 Gravenstein Highway North, 95472',
          number='1005', street='Gravenstein', zip='95472', type='Hwy',
          suffix='N')

    # --- city / state, prefixes, secondary units ---------------------------
    check('1005 N Gravenstein Highway, Sebastopol, CA',
          number='1005', street='Gravenstein', type='Hwy', prefix='N',
          city='Sebastopol', state='CA')
    check("1005 N Gravenstein Highway, Suite 500, Sebastopol, CA",
          number='1005', street='Gravenstein', state='CA',
          city='Sebastopol', type='Hwy', prefix='N',
          sec_unit_type='Suite', sec_unit_num='500')
    # (the original suite repeats this case; kept as-is)
    check("1005 N Gravenstein Highway, Suite 500, Sebastopol, CA",
          number='1005', street='Gravenstein', state='CA',
          city='Sebastopol', type='Hwy', prefix='N',
          sec_unit_type='Suite', sec_unit_num='500')
    check("1005 N Gravenstein Hwy Suite 500 Sebastopol, CA",
          number='1005', street='Gravenstein', state='CA',
          city='Sebastopol', type='Hwy', prefix='N',
          sec_unit_type='Suite', sec_unit_num='500')
    check("1005 N Gravenstein Highway, Sebastopol, CA, 95472",
          number='1005', street='Gravenstein', state='CA',
          city='Sebastopol', zip='95472', type='Hwy', prefix='N')
    check("1005 N Gravenstein Highway Sebastopol CA 95472",
          number='1005', street='Gravenstein', state='CA',
          city='Sebastopol', zip='95472', type='Hwy', prefix='N')

    # --- suffix versus directional city name -------------------------------
    check("1005 Gravenstein Hwy N Sebastopol CA",
          number='1005', street='Gravenstein', state='CA',
          city='Sebastopol', suffix='N', type='Hwy')
    check("1005 Gravenstein Hwy N, Sebastopol CA",
          number='1005', street='Gravenstein', state='CA',
          city='Sebastopol', suffix='N', type='Hwy')
    check("1005 Gravenstein Hwy, N Sebastopol CA",
          number='1005', street='Gravenstein', state='CA',
          city='North Sebastopol', type='Hwy')
    check("1005 Gravenstein Hwy, North Sebastopol CA",
          number='1005', street='Gravenstein', state='CA',
          city='North Sebastopol', type='Hwy')
    check("1005 Gravenstein Hwy Sebastopol CA",
          number='1005', street='Gravenstein', state='CA',
          city='Sebastopol', type='Hwy')

    # --- assorted street forms ---------------------------------------------
    check("115 Broadway San Francisco CA",
          number='115', street='Broadway', state='CA',
          city='San Francisco')
    check("7800 Mill Station Rd, Sebastopol, CA 95472",
          number='7800', street='Mill Station', state='CA',
          city='Sebastopol', zip='95472', type='Rd')
    check("7800 Mill Station Rd Sebastopol CA 95472",
          number='7800', street='Mill Station', state='CA',
          city='Sebastopol', zip='95472', type='Rd')
    check("1005 State Highway 116 Sebastopol CA 95472",
          number='1005', street='State Highway 116', state='CA',
          city='Sebastopol', zip='95472', type='Hwy')
    check("1600 Pennsylvania Ave. Washington DC",
          number='1600', street='Pennsylvania', state='DC',
          city='Washington', type='Ave')
    check("1600 Pennsylvania Avenue Washington DC",
          number='1600', street='Pennsylvania', state='DC',
          city='Washington', type='Ave')

    # --- grid-style addresses ----------------------------------------------
    check("48S 400E, Salt Lake City UT",
          type='', number='48', street='400', state='UT',
          city='Salt Lake City', suffix='E', prefix='S')
    check("550 S 400 E #3206, Salt Lake City UT 84111",
          number='550', street='400', state='UT', sec_unit_num='3206',
          zip='84111', city='Salt Lake City', suffix='E', type='',
          sec_unit_type='#', prefix='S')
    check("6641 N 2200 W Apt D304 Park City, UT 84098",
          number='6641', street='2200', state='UT', sec_unit_num='D304',
          zip='84098', city='Park City', suffix='W', type='',
          sec_unit_type='Apt', prefix='N')

    check("100 South St, Philadelphia, PA",
          number='100', street='South', state='PA',
          city='Philadelphia', type='St')
    check("100 S.E. Washington Ave, Minneapolis, MN",
          number='100', street='Washington', state='MN',
          city='Minneapolis', type='Ave', prefix='SE')
    check("3813 1/2 Some Road, Los Angeles, CA",
          number='3813', street='Some', state='CA',
          city='Los Angeles', type='Rd')

    # --- intersections -----------------------------------------------------
    check("Mission & Valencia San Francisco CA",
          type1='', type2='', street1='Mission', state='CA',
          city='San Francisco', street2='Valencia')
    check("Mission & Valencia, San Francisco CA",
          type1='', type2='', street1='Mission', state='CA',
          city='San Francisco', street2='Valencia')
    check("Mission St and Valencia St San Francisco CA",
          type1='St', type2='St', street1='Mission', state='CA',
          city='San Francisco', street2='Valencia')
    check("Mission St & Valencia St San Francisco CA",
          type1='St', type2='St', street1='Mission', state='CA',
          city='San Francisco', street2='Valencia')
    check("Mission and Valencia Sts San Francisco CA",
          type1='St', type2='St', street1='Mission', state='CA',
          city='San Francisco', street2='Valencia')
    check("Mission & Valencia Sts. San Francisco CA",
          type1='St', type2='St', street1='Mission', state='CA',
          city='San Francisco', street2='Valencia')
    check("Mission & Valencia Streets San Francisco CA",
          type1='St', type2='St', street1='Mission', state='CA',
          city='San Francisco', street2='Valencia')
    check("Mission Avenue and Valencia Street San Francisco CA",
          type1='Ave', type2='St', street1='Mission', state='CA',
          city='San Francisco', street2='Valencia')

    # lower case city direction
    check("1 First St, e San Jose CA",
          number='1', street='First', state='CA',
          city='East San Jose', type='St')
    # space in state name
    check("123 Maple Rochester, New York",
          type='', number='123', street='Maple', state='NY',
          city='Rochester')
    # zip+4 with hyphen
    check("233 S Wacker Dr 60606-6306",
          number='233', street='Wacker', zip='60606', type='Dr',
          prefix='S')
    # zip+4 without hyphen
    check("233 S Wacker Dr 606066306",
          number='233', street='Wacker', zip='60606', type='Dr',
          prefix='S')
    # unnumbered secondary unit type
    check("233 S Wacker Dr lobby 60606",
          number='233', street='Wacker', zip='60606', type='Dr',
          prefix='S', sec_unit_type='lobby')
    # surrounding punctuation
    check("(233 S Wacker Dr lobby 60606)",
          number='233', street='Wacker', zip='60606', type='Dr',
          prefix='S', sec_unit_type='lobby')
    # leading numbered secondary unit type
    check("#42 233 S Wacker Dr 60606",
          sec_unit_num='42', zip='60606', number='233', street='Wacker',
          sec_unit_type='#', type='Dr', prefix='S')
    # no space before sec_unit_num
    check("lt42 99 Some Road, Some City LA",
          sec_unit_num='42', city='Some City', number='99', street='Some',
          sec_unit_type='lt', type='Rd', state='LA')
    # numbered County Road
    check("36401 County Road 43, Eaton, CO 80615",
          city='Eaton', zip='80615', number='36401',
          street='County Road 43', type='Rd', state='CO')
    check("1234 COUNTY HWY 60E, Town, CO 12345",
          city='Town', zip='12345', number='1234',
          street='COUNTY HWY 60', suffix='E',
          type='',  # ?
          state='CO')
def ReadSlowTxt(self, txtfile, Print=False):
    """Scan a text file and return the first address that can be completed.

    The scan is a small state machine over the lines of the file (only
    lines shorter than 200 characters are examined):

    * state 0 -- split the line on ``':'`` and parse each piece with
      ``sa.parse``. A piece counts as a candidate when it has ``number``
      and a ``street`` of at most 34 characters. A candidate that also
      has ``state`` is returned immediately. A candidate with neither
      ``city`` nor ``state`` is remembered and the machine moves to
      state 1 with ``total_attempts`` retries. A candidate that has
      ``city`` but no ``state`` is remembered and the machine moves to
      state 2.
    * state 1 -- append every token of the current line that
      ``self.st.tag`` labels ``'LOCATION'`` to the remembered text and
      re-parse; return when ``number``, ``street``, ``city`` and
      ``state`` are all present, otherwise keep the extended text and
      move to state 2.
    * state 2 -- same as state 1, but ``state`` is not required.

    Each failed attempt in state 1 or 2 decrements the retry counter;
    once it drops below zero the machine falls back to state 0.

    Args:
        txtfile: Path of the text file to scan.
        Print: Unused; kept so existing callers keep working.

    Returns:
        The parsed address dict, or ``None`` if the file is exhausted
        without a match.
    """
    state = 0
    count = 0
    total_attempts = 3
    good_addr = ''

    # The context manager closes the file on every exit path, including
    # the early returns below (the original leaked the handle).
    with open(txtfile, 'r') as f:
        for line in f:
            if count < 0:
                state = 0
            if len(line) < 200:
                # Progress trace. The parenthesised single-argument form
                # prints the same text under Python 2 and Python 3.
                print(str(state) + '|' + line + '|' + good_addr)
                if state == 0:
                    for subline in line.split(':'):
                        addr = sa.parse(subline)
                        if (addr is not None
                                and 'number' in addr
                                and 'street' in addr
                                and len(addr['street']) <= 34):
                            if 'city' not in addr and 'state' not in addr:
                                good_addr = subline.replace("\r\n", "")
                                state = 1
                                count = total_attempts
                                break
                            if 'state' not in addr:
                                # NOTE(review): count is not reset on this
                                # path, so a stale negative count sends the
                                # machine straight back to state 0 on the
                                # next line -- confirm this is intended.
                                good_addr = subline.replace("\r\n", "")
                                state = 2
                            else:
                                return addr
                            break
                else:
                    tags = self.st.tag(line.split())
                    subline = good_addr
                    for tag in tags:
                        if tag[1] == 'LOCATION':
                            # NOTE(review): str + encoded bytes only
                            # concatenates under Python 2.
                            subline += ' ' + tag[0].replace("\r\n", "").encode('ascii')
                    if state == 1:
                        addr = sa.parse(subline)
                        if (addr is not None
                                and 'number' in addr
                                and 'street' in addr
                                and 'city' in addr
                                and 'state' in addr
                                and len(addr['street']) <= 34):
                            return addr
                        state = 2
                        good_addr = subline
                        count -= 1
                        # Skip the state-2 check for this line; it would
                        # re-parse the same text.
                        continue
                    if state == 2:
                        addr = sa.parse(subline)
                        if (addr is not None
                                and 'number' in addr
                                and 'street' in addr
                                and 'city' in addr
                                and len(addr['street']) <= 34):
                            return addr
                        count -= 1
                        continue
    return None
def test_all(self):
    # Regression table for sa.parse. Each entry pairs one expected result
    # with the consecutive input spellings that must all produce it.
    # Inputs are checked in the same order as the original flat list of
    # asserts, including the one repeated input.
    cases = [
        ({'number': '1005', 'street': 'Gravenstein', 'zip': '95472',
          'type': 'Hwy'},
         ['1005 Gravenstein Hwy 95472',
          '1005 Gravenstein Hwy, 95472']),
        ({'number': '1005', 'street': 'Gravenstein', 'zip': '95472',
          'type': 'Hwy', 'suffix': 'N'},
         ['1005 Gravenstein Hwy N, 95472',
          '1005 Gravenstein Highway North, 95472']),
        ({'number': '1005', 'street': 'Gravenstein', 'type': 'Hwy',
          'prefix': 'N', 'city': 'Sebastopol', 'state': 'CA'},
         ['1005 N Gravenstein Highway, Sebastopol, CA']),
        ({'number': '1005', 'street': 'Gravenstein', 'state': 'CA',
          'city': 'Sebastopol', 'type': 'Hwy', 'prefix': 'N',
          'sec_unit_type': 'Suite', 'sec_unit_num': '500'},
         ["1005 N Gravenstein Highway, Suite 500, Sebastopol, CA",
          "1005 N Gravenstein Highway, Suite 500, Sebastopol, CA",
          "1005 N Gravenstein Hwy Suite 500 Sebastopol, CA"]),
        ({'number': '1005', 'street': 'Gravenstein', 'state': 'CA',
          'city': 'Sebastopol', 'zip': '95472', 'type': 'Hwy',
          'prefix': 'N'},
         ["1005 N Gravenstein Highway, Sebastopol, CA, 95472",
          "1005 N Gravenstein Highway Sebastopol CA 95472"]),
        ({'number': '1005', 'street': 'Gravenstein', 'state': 'CA',
          'city': 'Sebastopol', 'suffix': 'N', 'type': 'Hwy'},
         ["1005 Gravenstein Hwy N Sebastopol CA",
          "1005 Gravenstein Hwy N, Sebastopol CA"]),
        ({'number': '1005', 'street': 'Gravenstein', 'state': 'CA',
          'city': 'North Sebastopol', 'type': 'Hwy'},
         ["1005 Gravenstein Hwy, N Sebastopol CA",
          "1005 Gravenstein Hwy, North Sebastopol CA"]),
        ({'number': '1005', 'street': 'Gravenstein', 'state': 'CA',
          'city': 'Sebastopol', 'type': 'Hwy'},
         ["1005 Gravenstein Hwy Sebastopol CA"]),
        ({'number': '115', 'street': 'Broadway', 'state': 'CA',
          'city': 'San Francisco'},
         ["115 Broadway San Francisco CA"]),
        ({'number': '7800', 'street': 'Mill Station', 'state': 'CA',
          'city': 'Sebastopol', 'zip': '95472', 'type': 'Rd'},
         ["7800 Mill Station Rd, Sebastopol, CA 95472",
          "7800 Mill Station Rd Sebastopol CA 95472"]),
        ({'number': '1005', 'street': 'State Highway 116', 'state': 'CA',
          'city': 'Sebastopol', 'zip': '95472', 'type': 'Hwy'},
         ["1005 State Highway 116 Sebastopol CA 95472"]),
        ({'number': '1600', 'street': 'Pennsylvania', 'state': 'DC',
          'city': 'Washington', 'type': 'Ave'},
         ["1600 Pennsylvania Ave. Washington DC",
          "1600 Pennsylvania Avenue Washington DC"]),
        ({'type': '', 'number': '48', 'street': '400', 'state': 'UT',
          'city': 'Salt Lake City', 'suffix': 'E', 'prefix': 'S'},
         ["48S 400E, Salt Lake City UT"]),
        ({'number': '550', 'street': '400', 'state': 'UT',
          'sec_unit_num': '3206', 'zip': '84111',
          'city': 'Salt Lake City', 'suffix': 'E', 'type': '',
          'sec_unit_type': '#', 'prefix': 'S'},
         ["550 S 400 E #3206, Salt Lake City UT 84111"]),
        ({'number': '6641', 'street': '2200', 'state': 'UT',
          'sec_unit_num': 'D304', 'zip': '84098', 'city': 'Park City',
          'suffix': 'W', 'type': '', 'sec_unit_type': 'Apt',
          'prefix': 'N'},
         ["6641 N 2200 W Apt D304 Park City, UT 84098"]),
        ({'number': '100', 'street': 'South', 'state': 'PA',
          'city': 'Philadelphia', 'type': 'St'},
         ["100 South St, Philadelphia, PA"]),
        ({'number': '100', 'street': 'Washington', 'state': 'MN',
          'city': 'Minneapolis', 'type': 'Ave', 'prefix': 'SE'},
         ["100 S.E. Washington Ave, Minneapolis, MN"]),
        ({'number': '3813', 'street': 'Some', 'state': 'CA',
          'city': 'Los Angeles', 'type': 'Rd'},
         ["3813 1/2 Some Road, Los Angeles, CA"]),
        # intersections without street types
        ({'type1': '', 'type2': '', 'street1': 'Mission', 'state': 'CA',
          'city': 'San Francisco', 'street2': 'Valencia'},
         ["Mission & Valencia San Francisco CA",
          "Mission & Valencia, San Francisco CA"]),
        # intersections where both streets share (or repeat) a type
        ({'type1': 'St', 'type2': 'St', 'street1': 'Mission',
          'state': 'CA', 'city': 'San Francisco', 'street2': 'Valencia'},
         ["Mission St and Valencia St San Francisco CA",
          "Mission St & Valencia St San Francisco CA",
          "Mission and Valencia Sts San Francisco CA",
          "Mission & Valencia Sts. San Francisco CA",
          "Mission & Valencia Streets San Francisco CA"]),
        ({'type1': 'Ave', 'type2': 'St', 'street1': 'Mission',
          'state': 'CA', 'city': 'San Francisco', 'street2': 'Valencia'},
         ["Mission Avenue and Valencia Street San Francisco CA"]),
        # lower case city direction
        ({'number': '1', 'street': 'First', 'state': 'CA',
          'city': 'East San Jose', 'type': 'St'},
         ["1 First St, e San Jose CA"]),
        # space in state name
        ({'type': '', 'number': '123', 'street': 'Maple', 'state': 'NY',
          'city': 'Rochester'},
         ["123 Maple Rochester, New York"]),
        # zip+4 with hyphen, then zip+4 without hyphen
        ({'number': '233', 'street': 'Wacker', 'zip': '60606',
          'type': 'Dr', 'prefix': 'S'},
         ["233 S Wacker Dr 60606-6306",
          "233 S Wacker Dr 606066306"]),
        # unnumbered secondary unit type, then surrounding punctuation
        ({'number': '233', 'street': 'Wacker', 'zip': '60606',
          'type': 'Dr', 'prefix': 'S', 'sec_unit_type': 'lobby'},
         ["233 S Wacker Dr lobby 60606",
          "(233 S Wacker Dr lobby 60606)"]),
        # leading numbered secondary unit type
        ({'sec_unit_num': '42', 'zip': '60606', 'number': '233',
          'street': 'Wacker', 'sec_unit_type': '#', 'type': 'Dr',
          'prefix': 'S'},
         ["#42 233 S Wacker Dr 60606"]),
        # no space before sec_unit_num
        ({'sec_unit_num': '42', 'city': 'Some City', 'number': '99',
          'street': 'Some', 'sec_unit_type': 'lt', 'type': 'Rd',
          'state': 'LA'},
         ["lt42 99 Some Road, Some City LA"]),
        # numbered County Road
        ({'city': 'Eaton', 'zip': '80615', 'number': '36401',
          'street': 'County Road 43', 'type': 'Rd', 'state': 'CO'},
         ["36401 County Road 43, Eaton, CO 80615"]),
        ({'city': 'Town', 'zip': '12345', 'number': '1234',
          'street': 'COUNTY HWY 60', 'suffix': 'E',
          'type': '',  # ?
          'state': 'CO'},
         ["1234 COUNTY HWY 60E, Town, CO 12345"]),
    ]

    for expected, addresses in cases:
        for address in addresses:
            assert sa.parse(address) == expected
def execute(self, search_input, user_agent):
    """Search my.appleton.org for properties matching a street address.

    The free-text input is parsed into a house number and street name,
    the site's landing page is fetched to pick up the ``__VIEWSTATE`` and
    ``__EVENTVALIDATION`` form values, the search form is posted, and the
    result rows are scraped out of the returned HTML.

    Args:
        search_input: Free-text address to look up.
        user_agent: Value sent as the ``User-Agent`` request header.

    Returns:
        ``{'result': rows}`` on success, where each row is a list built
        from the property key followed by the title-cased house number
        and street label (parts that could not be scraped are omitted);
        or ``{'error': message}`` when the input yields nothing to search
        for, the form state cannot be found, or the site cannot be
        reached.
    """
    # Google maps geolocation appends 'USA' but the address parser can't cope
    search_input = search_input.replace('USA', '')
    addr = streetaddress.parse(search_input)
    if addr is None:
        # Since we are so tightly coupled with Appleton data, let's just pacify the address parser
        addr = streetaddress.parse(search_input + ' Appleton, WI')
    if addr is None:
        # Still unparseable: report it instead of crashing on a None lookup.
        return {'error': 'Give me *SOMETHING* to search for.'}

    # .get() because some parse results (e.g. intersections) carry no
    # 'number' / 'street' keys.
    housenumber = addr.get('number', '')
    street = addr.get('street', '')

    # Handle upstream requirement of "Fifth" not "5th"
    if street and contains_digits(street):
        street = inflect.engine().number_to_words(street)

    if not housenumber and not street:
        return {'error': 'Give me *SOMETHING* to search for.'}

    try:
        vs = ev = None
        landing = urllib2.urlopen('http://my.appleton.org/')
        try:
            for line in landing:
                if "__VIEWSTATE\"" in line:
                    vs = extracttagvalues(line)
                if "__EVENTVALIDATION\"" in line:
                    ev = extracttagvalues(line)
        finally:
            landing.close()

        if vs is None or ev is None:
            # The original hit an unbound local here; fail the same way a
            # network error does so callers get an 'error' payload.
            logging.error('SEARCH FAIL! my.appleton.org up? scrape assumptions still valid?')
            return {'error': "Cannot search :( <br/>" + 'form state fields not found on my.appleton.org'}

        formvalues = {
            '__EVENTTARGET': '',
            '__EVENTARGUMENT': '',
            '__VIEWSTATE': vs,
            '__EVENTVALIDATION': ev,
            'ctl00$myappletonContent$txtStreetNumber': housenumber,
            'ctl00$myappletonContent$txtStreetName': street,
            'ctl00$myappletonContent$btnSubmit': 'Submit'}
        headers = {
            'User-Agent': user_agent,
            'Referer': 'http://my.appleton.org/default.aspx',
            'Accept': 'text/html,application/xhtml+xml,application/xml'
        }
        data = urllib.urlencode(formvalues)
        req = urllib2.Request("http://my.appleton.org/default.aspx", data, headers)
        response = urllib2.urlopen(req)
        allresults = []
        try:
            # Example of the HTML returned...
            # <a id="ctl00_myappletonContent_searchResults_ctl03_PropKey"
            #    href="Propdetail.aspx?PropKey=312039300&Num=100">312039300 </a>
            # </td><td>100</td><td>E WASHINGTON ST </td>
            for pline in response:
                if "Propdetail.aspx?PropKey=" not in pline:
                    continue
                searchresult = []
                m = re.search(r'(?<=PropKey\=).*(?=&)', pline)
                if m:
                    searchresult.append(re.split('PropKey=', m.group(0))[0])
                # The address cells sit on the line after the link. A
                # default avoids StopIteration escaping when the link is
                # on the last line of the response.
                m = re.findall(r'(?s)<td>(.*?)</td>', next(response, ''))
                if m:
                    # this removes whitespace and Title Cases the address
                    # given:   <td>1200</td><td>W WISCONSIN AVE </td>
                    # returns: ['1200', 'W Wisconsin Ave']
                    address = [' '.join(t.split()).strip().title() for t in m]
                    searchresult.append(address[0])  # Number
                    # Thank you Dan Gabrielson <*****@*****.**> and Matt Everson https://github.com/matteverson
                    # for your help at 2015 Appleton Civic Hackathon! This closes https://github.com/mikeputnam/appletonapi/issues/5
                    searchresult.append(' '.join(address[1:]).strip())
                allresults.append(searchresult)
        finally:
            response.close()
        return {'result': allresults}
    except urllib2.URLError as e:
        logging.error('SEARCH FAIL! my.appleton.org up? scrape assumptions still valid?')
        return {'error': "Cannot search :( <br/>" + str(e)}
def test(self):
    # Check a single address against its expected parse.
    # `addr`, `addr_parsed_validator`, `sa` and `diff` are looked up outside
    # this function -- presumably supplied by an enclosing scope; verify there.
    actual = sa.parse(addr)
    expected = addr_parsed_validator

    # None / empty result: the parser could not handle the input at all.
    assert actual, 'Could not parse address "{}"'.format(addr)

    # On mismatch, report the input followed by a diff of the two results.
    # The message is only evaluated when the assertion fails.
    assert actual == expected, (
        addr + "\n" + str(diff(actual, expected))
    )
def execute(self, search_input, user_agent):
    """Look up properties on my.appleton.org by street address.

    Steps: parse the free-text input into house number and street name,
    fetch the landing page to read the ``__VIEWSTATE`` and
    ``__EVENTVALIDATION`` form values, post the search form, then scrape
    the result rows from the HTML that comes back.

    Args:
        search_input: Free-text address to look up.
        user_agent: Value sent as the ``User-Agent`` request header.

    Returns:
        ``{'result': rows}`` on success, each row being a list built from
        the property key, the title-cased house number and the street
        label (parts that could not be scraped are omitted); or
        ``{'error': message}`` when there is nothing to search for, the
        form state is missing, or the site is unreachable.
    """
    # Google maps geolocation appends 'USA' but the address parser can't cope
    search_input = search_input.replace('USA', '')
    addr = streetaddress.parse(search_input)
    if addr is None:
        # Since we are so tightly coupled with Appleton data, let's just pacify the address parser
        addr = streetaddress.parse(search_input + ' Appleton, WI')
    if addr is None:
        # The parser gave up twice; indexing None would raise TypeError.
        return {'error': 'Give me *SOMETHING* to search for.'}

    # Parse results without a house number or street (e.g. intersections)
    # must not raise KeyError here.
    housenumber = addr.get('number', '')
    street = addr.get('street', '')

    # Handle upstream requirement of "Fifth" not "5th"
    if street and contains_digits(street):
        p = inflect.engine()
        street = p.number_to_words(street)

    if not housenumber and not street:
        return {'error': 'Give me *SOMETHING* to search for.'}

    try:
        vs = None
        ev = None
        landing = urllib2.urlopen('http://my.appleton.org/')
        try:
            for line in landing:
                if "__VIEWSTATE\"" in line:
                    vs = extracttagvalues(line)
                if "__EVENTVALIDATION\"" in line:
                    ev = extracttagvalues(line)
        finally:
            landing.close()

        if vs is None or ev is None:
            # Without both hidden fields the POST cannot be built; the
            # original raised an unbound-local error at this point.
            logging.error(
                'SEARCH FAIL! my.appleton.org up? scrape assumptions still valid?'
            )
            return {
                'error': "Cannot search :( <br/>" +
                'form state fields not found on my.appleton.org'
            }

        formvalues = {
            '__EVENTTARGET': '',
            '__EVENTARGUMENT': '',
            '__VIEWSTATE': vs,
            '__EVENTVALIDATION': ev,
            'ctl00$myappletonContent$txtStreetNumber': housenumber,
            'ctl00$myappletonContent$txtStreetName': street,
            'ctl00$myappletonContent$btnSubmit': 'Submit'
        }
        headers = {
            'User-Agent': user_agent,
            'Referer': 'http://my.appleton.org/default.aspx',
            'Accept': 'text/html,application/xhtml+xml,application/xml'
        }
        data = urllib.urlencode(formvalues)
        req = urllib2.Request(
            "http://my.appleton.org/default.aspx", data, headers)
        response = urllib2.urlopen(req)
        allresults = []
        try:
            # Example of the HTML returned...
            # <a id="ctl00_myappletonContent_searchResults_ctl03_PropKey"
            #    href="Propdetail.aspx?PropKey=312039300&Num=100">312039300 </a>
            # </td><td>100</td><td>E WASHINGTON ST </td>
            for pline in response:
                if "Propdetail.aspx?PropKey=" not in pline:
                    continue
                searchresult = []
                m = re.search(r'(?<=PropKey\=).*(?=&)', pline)
                if m:
                    searchresult.append(
                        re.split('PropKey=', m.group(0))[0])
                # The <td> cells follow on the next line; the default keeps
                # a truncated response from raising StopIteration.
                m = re.findall(r'(?s)<td>(.*?)</td>', next(response, ''))
                if m:
                    # this removes whitespace and Title Cases the address
                    # given:   <td>1200</td><td>W WISCONSIN AVE </td>
                    # returns: ['1200', 'W Wisconsin Ave']
                    address = [
                        ' '.join(t.split()).strip().title() for t in m
                    ]
                    searchresult.append(address[0])  # Number
                    # Thank you Dan Gabrielson <*****@*****.**> and Matt Everson https://github.com/matteverson
                    # for your help at 2015 Appleton Civic Hackathon! This closes https://github.com/mikeputnam/appletonapi/issues/5
                    searchresult.append(' '.join(address[1:]).strip())
                allresults.append(searchresult)
        finally:
            response.close()
        return {'result': allresults}
    except urllib2.URLError as e:
        logging.error(
            'SEARCH FAIL! my.appleton.org up? scrape assumptions still valid?'
        )
        return {'error': "Cannot search :( <br/>" + str(e)}