Пример #1
0
 def test(self):
     """Parse ``addr`` and compare the result with the expected parse.

     ``addr`` and ``addr_parsed_validator`` are free names supplied by the
     enclosing scope (presumably a per-address test factory -- confirm
     against the caller). The test fails when the address cannot be parsed
     at all, or when the parse differs from the validator; in the latter
     case the failure message carries the address and a diff.
     """
     parsed = sa.parse(addr)

     # An empty/None result means the parser rejected the address outright.
     assert parsed, 'Could not parse address "{}"'.format(addr)

     # The message is only built when the comparison fails.
     assert parsed == addr_parsed_validator, "\n".join(
         [addr, str(diff(parsed, addr_parsed_validator))])
Пример #2
0
 def ReadFastTxt(self, txtfile, Print=False):
     """Return the first complete address found in ``txtfile``.

     Each line shorter than 200 characters is split on ``':'`` and every
     piece is handed to ``sa.parse``. The first parse that has at least
     five fields, including ``number``, ``city``, ``state`` and a
     ``street`` of at most 34 characters, is returned.

     Args:
         txtfile: Path of the text file to scan.
         Print: Unused; kept so existing callers keep working.

     Returns:
         The parsed address, or ``None`` when no piece of any line
         qualifies.
     """
     # The context manager closes the file on every exit path, including
     # the early return from inside the loops.
     with open(txtfile, 'r') as f:
         for line in f:
             # Very long lines are not considered address candidates.
             if len(line) >= 200:
                 continue
             for subline in line.split(':'):
                 addr = sa.parse(subline)
                 if (addr is not None and len(addr) >= 5
                         and 'street' in addr
                         and len(addr['street']) <= 34
                         and 'number' in addr
                         and 'city' in addr
                         and 'state' in addr):
                     return addr
     return None
Пример #3
0
def readData(input_file, prefix=None):
    """
    Build a dictionary of records from CSV text.

    `input_file` is the CSV content itself (it is wrapped in `StringIO`
    before being read). Each row becomes a
    [frozendict](http://code.activestate.com/recipes/414283-frozen-dictionaries/)
    (hashable dictionary) of pre-processed fields, extended with an
    `Address Parsed` flag (1 or 0) and the parsed address components;
    `number`, `prefix`, `street` and `type` are always present and default
    to the empty string.

    The key of each record is the row index, or the tuple
    `(prefix, index)` when a truthy `prefix` is given.

    **Currently, dedupe depends upon records' unique ids being integers
    with no integers skipped. The smallest valued unique id must be 0 or
    1. Expect this requirement will likely be relaxed in the future.**
    """
    records = {}

    for index, raw_row in enumerate(csv.DictReader(StringIO(input_file))):
        record = {field: preProcess(value)
                  for field, value in raw_row.items()}

        # City and state are appended to the address before parsing; a
        # failed parse is treated as "no components".
        parsed = streetaddress.parse(record['Address']
                                     + ', chicago, il') or {}
        record['Address Parsed'] = 1 if parsed else 0

        # Start from blank defaults so these four keys always exist.
        components = dict(number='', prefix='', street='', type='')
        components.update(parsed)
        record.update(components)

        record_id = (prefix, index) if prefix else index
        records[record_id] = dedupe.core.frozendict(record)

    return records
Пример #4
0
    def test_all(self):
        """Check ``sa.parse`` against a table of known address parses.

        Every entry pairs an input string with the exact dictionary the
        parser is expected to return. The cases run in order and the first
        mismatch fails the test, naming the offending address and the
        parse that was actually produced.

        The original list asserted the "Suite 500" address with commas
        twice, verbatim; it is listed once here.
        """
        cases = [
            ('1005 Gravenstein Hwy 95472', {
                'number': '1005', 'street': 'Gravenstein',
                'zip': '95472', 'type': 'Hwy',
            }),
            ('1005 Gravenstein Hwy, 95472', {
                'number': '1005', 'street': 'Gravenstein',
                'zip': '95472', 'type': 'Hwy',
            }),
            ('1005 Gravenstein Hwy N, 95472', {
                'number': '1005', 'street': 'Gravenstein',
                'zip': '95472', 'type': 'Hwy', 'suffix': 'N',
            }),
            ('1005 Gravenstein Highway North, 95472', {
                'number': '1005', 'street': 'Gravenstein',
                'zip': '95472', 'type': 'Hwy', 'suffix': 'N',
            }),
            ('1005 N Gravenstein Highway, Sebastopol, CA', {
                'number': '1005', 'street': 'Gravenstein',
                'type': 'Hwy', 'prefix': 'N',
                'city': 'Sebastopol', 'state': 'CA',
            }),
            ("1005 N Gravenstein Highway, Suite 500, Sebastopol, CA", {
                'number': '1005', 'street': 'Gravenstein',
                'state': 'CA', 'city': 'Sebastopol',
                'type': 'Hwy', 'prefix': 'N',
                'sec_unit_type': 'Suite', 'sec_unit_num': '500',
            }),
            ("1005 N Gravenstein Hwy Suite 500 Sebastopol, CA", {
                'number': '1005', 'street': 'Gravenstein',
                'state': 'CA', 'city': 'Sebastopol',
                'type': 'Hwy', 'prefix': 'N',
                'sec_unit_type': 'Suite', 'sec_unit_num': '500',
            }),
            ("1005 N Gravenstein Highway, Sebastopol, CA, 95472", {
                'number': '1005', 'street': 'Gravenstein',
                'state': 'CA', 'city': 'Sebastopol',
                'zip': '95472', 'type': 'Hwy', 'prefix': 'N',
            }),
            ("1005 N Gravenstein Highway Sebastopol CA 95472", {
                'number': '1005', 'street': 'Gravenstein',
                'state': 'CA', 'city': 'Sebastopol',
                'zip': '95472', 'type': 'Hwy', 'prefix': 'N',
            }),
            ("1005 Gravenstein Hwy N Sebastopol CA", {
                'number': '1005', 'street': 'Gravenstein',
                'state': 'CA', 'city': 'Sebastopol',
                'suffix': 'N', 'type': 'Hwy',
            }),
            ("1005 Gravenstein Hwy N, Sebastopol CA", {
                'number': '1005', 'street': 'Gravenstein',
                'state': 'CA', 'city': 'Sebastopol',
                'suffix': 'N', 'type': 'Hwy',
            }),
            ("1005 Gravenstein Hwy, N Sebastopol CA", {
                'number': '1005', 'street': 'Gravenstein',
                'state': 'CA', 'city': 'North Sebastopol',
                'type': 'Hwy',
            }),
            ("1005 Gravenstein Hwy, North Sebastopol CA", {
                'number': '1005', 'street': 'Gravenstein',
                'state': 'CA', 'city': 'North Sebastopol',
                'type': 'Hwy',
            }),
            ("1005 Gravenstein Hwy Sebastopol CA", {
                'number': '1005', 'street': 'Gravenstein',
                'state': 'CA', 'city': 'Sebastopol',
                'type': 'Hwy',
            }),
            ("115 Broadway San Francisco CA", {
                'number': '115', 'street': 'Broadway',
                'state': 'CA', 'city': 'San Francisco',
            }),
            ("7800 Mill Station Rd, Sebastopol, CA 95472", {
                'number': '7800', 'street': 'Mill Station',
                'state': 'CA', 'city': 'Sebastopol',
                'zip': '95472', 'type': 'Rd',
            }),
            ("7800 Mill Station Rd Sebastopol CA 95472", {
                'number': '7800', 'street': 'Mill Station',
                'state': 'CA', 'city': 'Sebastopol',
                'zip': '95472', 'type': 'Rd',
            }),
            ("1005 State Highway 116 Sebastopol CA 95472", {
                'number': '1005', 'street': 'State Highway 116',
                'state': 'CA', 'city': 'Sebastopol',
                'zip': '95472', 'type': 'Hwy',
            }),
            ("1600 Pennsylvania Ave. Washington DC", {
                'number': '1600', 'street': 'Pennsylvania',
                'state': 'DC', 'city': 'Washington',
                'type': 'Ave',
            }),
            ("1600 Pennsylvania Avenue Washington DC", {
                'number': '1600', 'street': 'Pennsylvania',
                'state': 'DC', 'city': 'Washington',
                'type': 'Ave',
            }),
            ("48S 400E, Salt Lake City UT", {
                'type': '', 'number': '48', 'street': '400',
                'state': 'UT', 'city': 'Salt Lake City',
                'suffix': 'E', 'prefix': 'S',
            }),
            ("550 S 400 E #3206, Salt Lake City UT 84111", {
                'number': '550', 'street': '400',
                'state': 'UT', 'sec_unit_num': '3206',
                'zip': '84111', 'city': 'Salt Lake City',
                'suffix': 'E', 'type': '',
                'sec_unit_type': '#', 'prefix': 'S',
            }),
            ("6641 N 2200 W Apt D304 Park City, UT 84098", {
                'number': '6641', 'street': '2200',
                'state': 'UT', 'sec_unit_num': 'D304',
                'zip': '84098', 'city': 'Park City',
                'suffix': 'W', 'type': '',
                'sec_unit_type': 'Apt', 'prefix': 'N',
            }),
            ("100 South St, Philadelphia, PA", {
                'number': '100', 'street': 'South',
                'state': 'PA', 'city': 'Philadelphia',
                'type': 'St',
            }),
            ("100 S.E. Washington Ave, Minneapolis, MN", {
                'number': '100', 'street': 'Washington',
                'state': 'MN', 'city': 'Minneapolis',
                'type': 'Ave', 'prefix': 'SE',
            }),
            ("3813 1/2 Some Road, Los Angeles, CA", {
                'number': '3813', 'street': 'Some',
                'state': 'CA', 'city': 'Los Angeles',
                'type': 'Rd',
            }),
            ("Mission & Valencia San Francisco CA", {
                'type1': '', 'type2': '',
                'street1': 'Mission', 'street2': 'Valencia',
                'state': 'CA', 'city': 'San Francisco',
            }),
            ("Mission & Valencia, San Francisco CA", {
                'type1': '', 'type2': '',
                'street1': 'Mission', 'street2': 'Valencia',
                'state': 'CA', 'city': 'San Francisco',
            }),
            ("Mission St and Valencia St San Francisco CA", {
                'type1': 'St', 'type2': 'St',
                'street1': 'Mission', 'street2': 'Valencia',
                'state': 'CA', 'city': 'San Francisco',
            }),
            ("Mission St & Valencia St San Francisco CA", {
                'type1': 'St', 'type2': 'St',
                'street1': 'Mission', 'street2': 'Valencia',
                'state': 'CA', 'city': 'San Francisco',
            }),
            ("Mission and Valencia Sts San Francisco CA", {
                'type1': 'St', 'type2': 'St',
                'street1': 'Mission', 'street2': 'Valencia',
                'state': 'CA', 'city': 'San Francisco',
            }),
            ("Mission & Valencia Sts. San Francisco CA", {
                'type1': 'St', 'type2': 'St',
                'street1': 'Mission', 'street2': 'Valencia',
                'state': 'CA', 'city': 'San Francisco',
            }),
            ("Mission & Valencia Streets San Francisco CA", {
                'type1': 'St', 'type2': 'St',
                'street1': 'Mission', 'street2': 'Valencia',
                'state': 'CA', 'city': 'San Francisco',
            }),
            ("Mission Avenue and Valencia Street San Francisco CA", {
                'type1': 'Ave', 'type2': 'St',
                'street1': 'Mission', 'street2': 'Valencia',
                'state': 'CA', 'city': 'San Francisco',
            }),
            # lower case city direction
            ("1 First St, e San Jose CA", {
                'number': '1', 'street': 'First',
                'state': 'CA', 'city': 'East San Jose',
                'type': 'St',
            }),
            # space in state name
            ("123 Maple Rochester, New York", {
                'type': '', 'number': '123', 'street': 'Maple',
                'state': 'NY', 'city': 'Rochester',
            }),
            # zip+4 with hyphen
            ("233 S Wacker Dr 60606-6306", {
                'number': '233', 'street': 'Wacker',
                'zip': '60606', 'type': 'Dr', 'prefix': 'S',
            }),
            # zip+4 without hyphen
            ("233 S Wacker Dr 606066306", {
                'number': '233', 'street': 'Wacker',
                'zip': '60606', 'type': 'Dr', 'prefix': 'S',
            }),
            # unnumbered secondary unit type
            ("233 S Wacker Dr lobby 60606", {
                'number': '233', 'street': 'Wacker',
                'zip': '60606', 'type': 'Dr', 'prefix': 'S',
                'sec_unit_type': 'lobby',
            }),
            # surrounding punctuation
            ("(233 S Wacker Dr lobby 60606)", {
                'number': '233', 'street': 'Wacker',
                'zip': '60606', 'type': 'Dr', 'prefix': 'S',
                'sec_unit_type': 'lobby',
            }),
            # leading numbered secondary unit type
            ("#42 233 S Wacker Dr 60606", {
                'sec_unit_num': '42', 'zip': '60606',
                'number': '233', 'street': 'Wacker',
                'sec_unit_type': '#', 'type': 'Dr', 'prefix': 'S',
            }),
            # no space before sec_unit_num
            ("lt42 99 Some Road, Some City LA", {
                'sec_unit_num': '42', 'city': 'Some City',
                'number': '99', 'street': 'Some',
                'sec_unit_type': 'lt', 'type': 'Rd', 'state': 'LA',
            }),
            # numbered County Road
            ("36401 County Road 43, Eaton, CO 80615", {
                'city': 'Eaton', 'zip': '80615',
                'number': '36401', 'street': 'County Road 43',
                'type': 'Rd', 'state': 'CO',
            }),
            ("1234 COUNTY HWY 60E, Town, CO 12345", {
                'city': 'Town', 'zip': '12345',
                'number': '1234', 'street': 'COUNTY HWY 60',
                'suffix': 'E',
                'type': '',  # ?
                'state': 'CO',
            }),
        ]

        for address, expected in cases:
            parsed = sa.parse(address)
            assert parsed == expected, (
                'Unexpected parse of "{}": {!r}'.format(address, parsed))
Пример #5
0
    def ReadSlowTxt(self, txtfile, Print=False):
        """Scan ``txtfile`` for an address that may span several lines.

        Lines of 200 characters or more are ignored. Every other line is
        echoed to stdout as ``state|line|remembered text`` and then handled
        by a small state machine:

        * state 0 -- the line is split on ``':'`` and each piece is parsed
          with ``sa.parse``. The first piece with a ``number`` and a
          ``street`` of at most 34 characters decides what happens next:
          with neither ``city`` nor ``state`` the piece is remembered and
          the machine moves to state 1 with ``total_attempts`` retries;
          with a ``city`` but no ``state`` the piece is remembered and the
          machine moves to state 2; with a ``state`` the parse is returned.
        * state 1 / state 2 -- the line is tokenised and tagged with
          ``self.st.tag``; tokens labelled ``'LOCATION'`` are appended to
          the remembered text and the result is parsed again. State 1
          needs ``number``, ``street``, ``city`` and ``state``; on failure
          it keeps the extended text and drops to state 2. State 2 needs
          ``number``, ``street`` and ``city``. Each failure decrements the
          retry counter, and once it is negative the next line resets the
          machine to state 0.

        Args:
            txtfile: Path of the text file to scan.
            Print: Unused; kept so existing callers keep working.

        Returns:
            The first accepted parse, or ``None`` if the file is exhausted
            without one.
        """
        state = 0
        count = 0
        total_attempts = 3
        good_addr = ''
        # The context manager closes the file on every exit path, including
        # the early returns below.
        with open(txtfile, 'r') as f:
            for line in f:
                if count < 0:
                    state = 0
                if len(line) >= 200:
                    continue
                # Single-argument call form: same output under Python 2's
                # print statement and Python 3's print function.
                print(str(state) + '|' + line + '|' + good_addr)
                if state == 0:
                    for subline in line.split(':'):
                        addr = sa.parse(subline)
                        if (addr is not None
                                and 'number' in addr
                                and 'street' in addr
                                and len(addr['street']) <= 34):
                            if 'city' not in addr and 'state' not in addr:
                                good_addr = subline.replace("\r\n", "")
                                state = 1
                                count = total_attempts
                                break
                            if 'state' not in addr:
                                good_addr = subline.replace("\r\n", "")
                                state = 2
                            else:
                                return addr
                            # Only the first street-like piece of a line
                            # is considered.
                            break
                else:
                    # self.st.tag is expected to yield (token, label)
                    # pairs -- presumably a named-entity tagger; confirm.
                    tags = self.st.tag(line.split())
                    subline = good_addr
                    for tag in tags:
                        if tag[1] == 'LOCATION':
                            # NOTE(review): .encode('ascii') yields bytes on
                            # Python 3, which cannot be concatenated to a
                            # str -- confirm the target interpreter.
                            subline += ' ' + tag[0].replace(
                                "\r\n", "").encode('ascii')
                    if state == 1:
                        addr = sa.parse(subline)
                        if (addr is not None
                                and 'number' in addr
                                and 'street' in addr
                                and 'city' in addr
                                and 'state' in addr
                                and len(addr['street']) <= 34):
                            return addr
                        # Keep the extended text and relax the
                        # requirements for the following lines.
                        state = 2
                        good_addr = subline
                        count -= 1
                        continue
                    if state == 2:
                        addr = sa.parse(subline)
                        if (addr is not None
                                and 'number' in addr
                                and 'street' in addr
                                and 'city' in addr
                                and len(addr['street']) <= 34):
                            return addr
                        count -= 1
                        continue
        return None
Пример #6
0
    def test_all(self):
        """Check ``sa.parse`` against a table of known address parses.

        Every entry pairs an input string with the exact dictionary the
        parser is expected to return. The cases run in order and the first
        mismatch fails the test, naming the offending address and the
        parse that was actually produced.

        The assertions were previously written at the same indentation as
        the ``def`` line, so the method had no body; they now live inside
        it. The "Suite 500" address with commas was asserted twice,
        verbatim, and is listed once here.
        """
        expectations = (
            ('1005 Gravenstein Hwy 95472',
             {'number': '1005', 'street': 'Gravenstein',
              'zip': '95472', 'type': 'Hwy'}),
            ('1005 Gravenstein Hwy, 95472',
             {'number': '1005', 'street': 'Gravenstein',
              'zip': '95472', 'type': 'Hwy'}),
            ('1005 Gravenstein Hwy N, 95472',
             {'number': '1005', 'street': 'Gravenstein',
              'zip': '95472', 'type': 'Hwy', 'suffix': 'N'}),
            ('1005 Gravenstein Highway North, 95472',
             {'number': '1005', 'street': 'Gravenstein',
              'zip': '95472', 'type': 'Hwy', 'suffix': 'N'}),
            ('1005 N Gravenstein Highway, Sebastopol, CA',
             {'number': '1005', 'street': 'Gravenstein',
              'type': 'Hwy', 'prefix': 'N',
              'city': 'Sebastopol', 'state': 'CA'}),
            ("1005 N Gravenstein Highway, Suite 500, Sebastopol, CA",
             {'number': '1005', 'street': 'Gravenstein',
              'state': 'CA', 'city': 'Sebastopol',
              'type': 'Hwy', 'prefix': 'N',
              'sec_unit_type': 'Suite', 'sec_unit_num': '500'}),
            ("1005 N Gravenstein Hwy Suite 500 Sebastopol, CA",
             {'number': '1005', 'street': 'Gravenstein',
              'state': 'CA', 'city': 'Sebastopol',
              'type': 'Hwy', 'prefix': 'N',
              'sec_unit_type': 'Suite', 'sec_unit_num': '500'}),
            ("1005 N Gravenstein Highway, Sebastopol, CA, 95472",
             {'number': '1005', 'street': 'Gravenstein',
              'state': 'CA', 'city': 'Sebastopol',
              'zip': '95472', 'type': 'Hwy', 'prefix': 'N'}),
            ("1005 N Gravenstein Highway Sebastopol CA 95472",
             {'number': '1005', 'street': 'Gravenstein',
              'state': 'CA', 'city': 'Sebastopol',
              'zip': '95472', 'type': 'Hwy', 'prefix': 'N'}),
            ("1005 Gravenstein Hwy N Sebastopol CA",
             {'number': '1005', 'street': 'Gravenstein',
              'state': 'CA', 'city': 'Sebastopol',
              'suffix': 'N', 'type': 'Hwy'}),
            ("1005 Gravenstein Hwy N, Sebastopol CA",
             {'number': '1005', 'street': 'Gravenstein',
              'state': 'CA', 'city': 'Sebastopol',
              'suffix': 'N', 'type': 'Hwy'}),
            ("1005 Gravenstein Hwy, N Sebastopol CA",
             {'number': '1005', 'street': 'Gravenstein',
              'state': 'CA', 'city': 'North Sebastopol',
              'type': 'Hwy'}),
            ("1005 Gravenstein Hwy, North Sebastopol CA",
             {'number': '1005', 'street': 'Gravenstein',
              'state': 'CA', 'city': 'North Sebastopol',
              'type': 'Hwy'}),
            ("1005 Gravenstein Hwy Sebastopol CA",
             {'number': '1005', 'street': 'Gravenstein',
              'state': 'CA', 'city': 'Sebastopol',
              'type': 'Hwy'}),
            ("115 Broadway San Francisco CA",
             {'number': '115', 'street': 'Broadway',
              'state': 'CA', 'city': 'San Francisco'}),
            ("7800 Mill Station Rd, Sebastopol, CA 95472",
             {'number': '7800', 'street': 'Mill Station',
              'state': 'CA', 'city': 'Sebastopol',
              'zip': '95472', 'type': 'Rd'}),
            ("7800 Mill Station Rd Sebastopol CA 95472",
             {'number': '7800', 'street': 'Mill Station',
              'state': 'CA', 'city': 'Sebastopol',
              'zip': '95472', 'type': 'Rd'}),
            ("1005 State Highway 116 Sebastopol CA 95472",
             {'number': '1005', 'street': 'State Highway 116',
              'state': 'CA', 'city': 'Sebastopol',
              'zip': '95472', 'type': 'Hwy'}),
            ("1600 Pennsylvania Ave. Washington DC",
             {'number': '1600', 'street': 'Pennsylvania',
              'state': 'DC', 'city': 'Washington',
              'type': 'Ave'}),
            ("1600 Pennsylvania Avenue Washington DC",
             {'number': '1600', 'street': 'Pennsylvania',
              'state': 'DC', 'city': 'Washington',
              'type': 'Ave'}),
            ("48S 400E, Salt Lake City UT",
             {'type': '', 'number': '48', 'street': '400',
              'state': 'UT', 'city': 'Salt Lake City',
              'suffix': 'E', 'prefix': 'S'}),
            ("550 S 400 E #3206, Salt Lake City UT 84111",
             {'number': '550', 'street': '400',
              'state': 'UT', 'sec_unit_num': '3206',
              'zip': '84111', 'city': 'Salt Lake City',
              'suffix': 'E', 'type': '',
              'sec_unit_type': '#', 'prefix': 'S'}),
            ("6641 N 2200 W Apt D304 Park City, UT 84098",
             {'number': '6641', 'street': '2200',
              'state': 'UT', 'sec_unit_num': 'D304',
              'zip': '84098', 'city': 'Park City',
              'suffix': 'W', 'type': '',
              'sec_unit_type': 'Apt', 'prefix': 'N'}),
            ("100 South St, Philadelphia, PA",
             {'number': '100', 'street': 'South',
              'state': 'PA', 'city': 'Philadelphia',
              'type': 'St'}),
            ("100 S.E. Washington Ave, Minneapolis, MN",
             {'number': '100', 'street': 'Washington',
              'state': 'MN', 'city': 'Minneapolis',
              'type': 'Ave', 'prefix': 'SE'}),
            ("3813 1/2 Some Road, Los Angeles, CA",
             {'number': '3813', 'street': 'Some',
              'state': 'CA', 'city': 'Los Angeles',
              'type': 'Rd'}),
            ("Mission & Valencia San Francisco CA",
             {'type1': '', 'type2': '',
              'street1': 'Mission', 'street2': 'Valencia',
              'state': 'CA', 'city': 'San Francisco'}),
            ("Mission & Valencia, San Francisco CA",
             {'type1': '', 'type2': '',
              'street1': 'Mission', 'street2': 'Valencia',
              'state': 'CA', 'city': 'San Francisco'}),
            ("Mission St and Valencia St San Francisco CA",
             {'type1': 'St', 'type2': 'St',
              'street1': 'Mission', 'street2': 'Valencia',
              'state': 'CA', 'city': 'San Francisco'}),
            ("Mission St & Valencia St San Francisco CA",
             {'type1': 'St', 'type2': 'St',
              'street1': 'Mission', 'street2': 'Valencia',
              'state': 'CA', 'city': 'San Francisco'}),
            ("Mission and Valencia Sts San Francisco CA",
             {'type1': 'St', 'type2': 'St',
              'street1': 'Mission', 'street2': 'Valencia',
              'state': 'CA', 'city': 'San Francisco'}),
            ("Mission & Valencia Sts. San Francisco CA",
             {'type1': 'St', 'type2': 'St',
              'street1': 'Mission', 'street2': 'Valencia',
              'state': 'CA', 'city': 'San Francisco'}),
            ("Mission & Valencia Streets San Francisco CA",
             {'type1': 'St', 'type2': 'St',
              'street1': 'Mission', 'street2': 'Valencia',
              'state': 'CA', 'city': 'San Francisco'}),
            ("Mission Avenue and Valencia Street San Francisco CA",
             {'type1': 'Ave', 'type2': 'St',
              'street1': 'Mission', 'street2': 'Valencia',
              'state': 'CA', 'city': 'San Francisco'}),
            # lower case city direction
            ("1 First St, e San Jose CA",
             {'number': '1', 'street': 'First',
              'state': 'CA', 'city': 'East San Jose',
              'type': 'St'}),
            # space in state name
            ("123 Maple Rochester, New York",
             {'type': '', 'number': '123', 'street': 'Maple',
              'state': 'NY', 'city': 'Rochester'}),
            # zip+4 with hyphen
            ("233 S Wacker Dr 60606-6306",
             {'number': '233', 'street': 'Wacker',
              'zip': '60606', 'type': 'Dr', 'prefix': 'S'}),
            # zip+4 without hyphen
            ("233 S Wacker Dr 606066306",
             {'number': '233', 'street': 'Wacker',
              'zip': '60606', 'type': 'Dr', 'prefix': 'S'}),
            # unnumbered secondary unit type
            ("233 S Wacker Dr lobby 60606",
             {'number': '233', 'street': 'Wacker',
              'zip': '60606', 'type': 'Dr', 'prefix': 'S',
              'sec_unit_type': 'lobby'}),
            # surrounding punctuation
            ("(233 S Wacker Dr lobby 60606)",
             {'number': '233', 'street': 'Wacker',
              'zip': '60606', 'type': 'Dr', 'prefix': 'S',
              'sec_unit_type': 'lobby'}),
            # leading numbered secondary unit type
            ("#42 233 S Wacker Dr 60606",
             {'sec_unit_num': '42', 'zip': '60606',
              'number': '233', 'street': 'Wacker',
              'sec_unit_type': '#', 'type': 'Dr', 'prefix': 'S'}),
            # no space before sec_unit_num
            ("lt42 99 Some Road, Some City LA",
             {'sec_unit_num': '42', 'city': 'Some City',
              'number': '99', 'street': 'Some',
              'sec_unit_type': 'lt', 'type': 'Rd', 'state': 'LA'}),
            # numbered County Road
            ("36401 County Road 43, Eaton, CO 80615",
             {'city': 'Eaton', 'zip': '80615',
              'number': '36401', 'street': 'County Road 43',
              'type': 'Rd', 'state': 'CO'}),
            ("1234 COUNTY HWY 60E, Town, CO 12345",
             {'city': 'Town', 'zip': '12345',
              'number': '1234', 'street': 'COUNTY HWY 60',
              'suffix': 'E',
              'type': '',  # ?
              'state': 'CO'}),
        )

        for address, expected in expectations:
            parsed = sa.parse(address)
            assert parsed == expected, (
                'Unexpected parse of "{}": {!r}'.format(address, parsed))
Пример #7
0
    def execute(self, search_input, user_agent):
        """Search my.appleton.org for properties matching a street address.

        The address is parsed with ``streetaddress.parse``; the house number
        and street name are submitted to the my.appleton.org search form and
        the result rows are scraped from the returned HTML.

        Args:
            search_input: Free-form address text.
            user_agent: Value sent as the ``User-Agent`` request header.

        Returns:
            ``{'result': rows}`` on success, where each row is a list holding
            whichever of property key, house number and street label could be
            scraped for one hit; ``{'error': message}`` when there is nothing
            to search for or the site could not be queried.
        """
        # Google maps geolocation appends 'USA' but the address parser can't cope
        search_input = search_input.replace('USA', '')
        addr = streetaddress.parse(search_input)
        if addr is None:
            # Since we are so tightly coupled with Appleton data, let's just pacify the address parser
            addr = streetaddress.parse(search_input + ' Appleton, WI')
        if addr is None:
            # Still unparseable: report it instead of crashing on addr[...].
            return {'error': 'Give me *SOMETHING* to search for.'}

        # NOTE(review): parse results appear to omit components that were not
        # found, so read the keys defensively rather than indexing.
        housenumber = addr.get('number') or ''
        street = addr.get('street') or ''
        # Handle upstream requirement of "Fifth" not "5th"
        if street and contains_digits(street):
            street = inflect.engine().number_to_words(street)

        if not housenumber and not street:
            return {'error': 'Give me *SOMETHING* to search for.'}

        try:
            # The POST below sends back the __VIEWSTATE / __EVENTVALIDATION
            # values scraped from the landing page, whatever order they
            # appear in.
            vs = ev = None
            for line in urllib2.urlopen('http://my.appleton.org/'):
                if "__VIEWSTATE\"" in line:
                    vs = extracttagvalues(line)
                if "__EVENTVALIDATION\"" in line:
                    ev = extracttagvalues(line)
                if vs is not None and ev is not None:
                    break

            if vs is None or ev is None:
                # Without both fields the form cannot be submitted; previously
                # this path ended in a NameError.
                logging.error('SEARCH FAIL! my.appleton.org up? scrape assumptions still valid?')
                return {'error': "Cannot search :( <br/>search form fields not found"}

            formvalues = {
                '__EVENTTARGET': '',
                '__EVENTARGUMENT': '',
                '__VIEWSTATE': vs,
                '__EVENTVALIDATION': ev,
                'ctl00$myappletonContent$txtStreetNumber': housenumber,
                'ctl00$myappletonContent$txtStreetName': street,
                'ctl00$myappletonContent$btnSubmit': 'Submit'}
            headers = {
                'User-Agent': user_agent,
                'Referer': 'http://my.appleton.org/default.aspx',
                'Accept': 'text/html,application/xhtml+xml,application/xml'
            }
            data = urllib.urlencode(formvalues)
            req = urllib2.Request("http://my.appleton.org/default.aspx", data, headers)
            # One shared iterator: the loop and next() below advance the same
            # stream of response lines.
            result_lines = iter(urllib2.urlopen(req))
            allresults = []
            # Example of the HTML returned...
            # <a id="ctl00_myappletonContent_searchResults_ctl03_PropKey"
            # href="Propdetail.aspx?PropKey=312039300&amp;Num=100">312039300  </a>
            #                  </td><td>100</td><td>E WASHINGTON           ST  </td>
            for pline in result_lines:
                if "Propdetail.aspx?PropKey=" not in pline:
                    continue
                searchresult = []
                m = re.search(r'(?<=PropKey\=).*(?=&)', pline)
                if m:
                    searchresult.append(re.split('PropKey=', m.group(0))[0])
                # The <td> cells are read from the line after the link; the ''
                # default avoids StopIteration if the response ends here.
                cells = re.findall('(?s)<td>(.*?)</td>', next(result_lines, ''))
                if cells:
                    # this removes whitespace and Title Cases the address
                    # given: <td>1200</td><td>W WISCONSIN    AVE </td>
                    # returns: ['1200', 'W Wisconsin Ave']
                    address = [' '.join(t.split()).title() for t in cells]
                    searchresult.append(address[0])  # Number
                    # Thank you Dan Gabrielson <*****@*****.**> and Matt Everson https://github.com/matteverson
                    # for your help at 2015 Appleton Civic Hackathon! This closes https://github.com/mikeputnam/appletonapi/issues/5
                    searchresult.append(' '.join(address[1:]).strip())
                allresults.append(searchresult)

            return {'result': allresults}
        except urllib2.URLError as e:
            logging.error('SEARCH FAIL! my.appleton.org up? scrape assumptions still valid?')
            return {'error': "Cannot search :( <br/>" + str(e)}
Пример #8
0
 def test( self ):
     """Check that ``sa.parse`` turns ``addr`` into the expected result.

     ``addr`` and ``addr_parsed_validator`` are not defined in this block;
     presumably they come from an enclosing scope -- confirm at the caller.
     """
     parsed = sa.parse( addr )
     # parse() must return something truthy before the comparison is useful.
     assert parsed, 'Could not parse address "{}"'.format(addr)
     # On mismatch the message shows the raw address followed by the diff.
     assert parsed == addr_parsed_validator, "\n".join(
         ( addr, str( diff( parsed, addr_parsed_validator ) ) ) )
Пример #9
0
    def execute(self, search_input, user_agent):
        """Look up properties on my.appleton.org for a street address.

        ``search_input`` is parsed with ``streetaddress.parse``; the resulting
        house number and street name are posted to the site's search form and
        the hits are scraped out of the HTML that comes back.

        Args:
            search_input: Free-form address text.
            user_agent: Value sent as the ``User-Agent`` request header.

        Returns:
            ``{'result': rows}`` on success, each row being a list with
            whichever of property key, house number and street label could be
            scraped for one hit; ``{'error': message}`` when there is nothing
            to search for or the site could not be queried.
        """
        # Google maps geolocation appends 'USA' but the address parser can't cope
        search_input = search_input.replace('USA', '')
        addr = streetaddress.parse(search_input)
        if addr is None:
            # Since we are so tightly coupled with Appleton data, let's just pacify the address parser
            addr = streetaddress.parse(search_input + ' Appleton, WI')
        if addr is None:
            # Even the padded input did not parse; answer with an error
            # instead of failing on addr[...] below.
            return {'error': 'Give me *SOMETHING* to search for.'}

        # NOTE(review): parse results appear to leave out components that
        # were not found, so use .get() rather than indexing.
        housenumber = addr.get('number') or ''
        street = addr.get('street') or ''
        # Handle upstream requirement of "Fifth" not "5th"
        if street and contains_digits(street):
            street = inflect.engine().number_to_words(street)

        if not housenumber and not street:
            return {'error': 'Give me *SOMETHING* to search for.'}

        try:
            # Scrape the two hidden form fields that are sent back in the
            # POST; accept them in either order.
            vs = ev = None
            for line in urllib2.urlopen('http://my.appleton.org/'):
                if "__VIEWSTATE\"" in line:
                    vs = extracttagvalues(line)
                if "__EVENTVALIDATION\"" in line:
                    ev = extracttagvalues(line)
                if vs is not None and ev is not None:
                    break

            if vs is None or ev is None:
                # The form cannot be submitted without both values; this path
                # used to end in a NameError.
                logging.error(
                    'SEARCH FAIL! my.appleton.org up? scrape assumptions still valid?'
                )
                return {
                    'error':
                    "Cannot search :( <br/>search form fields not found"
                }

            formvalues = {
                '__EVENTTARGET': '',
                '__EVENTARGUMENT': '',
                '__VIEWSTATE': vs,
                '__EVENTVALIDATION': ev,
                'ctl00$myappletonContent$txtStreetNumber': housenumber,
                'ctl00$myappletonContent$txtStreetName': street,
                'ctl00$myappletonContent$btnSubmit': 'Submit'
            }
            headers = {
                'User-Agent': user_agent,
                'Referer': 'http://my.appleton.org/default.aspx',
                'Accept':
                'text/html,application/xhtml+xml,application/xml'
            }
            data = urllib.urlencode(formvalues)
            req = urllib2.Request(
                "http://my.appleton.org/default.aspx", data, headers)
            # A single iterator is shared by the loop and the next() call so
            # both walk the same sequence of response lines.
            result_lines = iter(urllib2.urlopen(req))
            allresults = []
            # Example of the HTML returned...
            # <a id="ctl00_myappletonContent_searchResults_ctl03_PropKey"
            # href="Propdetail.aspx?PropKey=312039300&amp;Num=100">312039300  </a>
            #                  </td><td>100</td><td>E WASHINGTON           ST  </td>
            for pline in result_lines:
                if "Propdetail.aspx?PropKey=" not in pline:
                    continue
                searchresult = []
                m = re.search(r'(?<=PropKey\=).*(?=&)', pline)
                if m:
                    searchresult.append(
                        re.split('PropKey=', m.group(0))[0])
                # The <td> cells are taken from the line after the link; the
                # '' default keeps a truncated response from raising
                # StopIteration.
                cells = re.findall('(?s)<td>(.*?)</td>',
                                   next(result_lines, ''))
                if cells:
                    # this removes whitespace and Title Cases the address
                    # given: <td>1200</td><td>W WISCONSIN    AVE </td>
                    # returns: ['1200', 'W Wisconsin Ave']
                    address = [
                        ' '.join(t.split()).title()
                        for t in cells
                    ]
                    searchresult.append(address[0])  #Number
                    # Thank you Dan Gabrielson <*****@*****.**> and Matt Everson https://github.com/matteverson
                    # for your help at 2015 Appleton Civic Hackathon! This closes https://github.com/mikeputnam/appletonapi/issues/5
                    searchresult.append(' '.join(address[1:]).strip())
                allresults.append(searchresult)

            return {'result': allresults}
        except urllib2.URLError as e:
            logging.error(
                'SEARCH FAIL! my.appleton.org up? scrape assumptions still valid?'
            )
            return {'error': "Cannot search :( <br/>" + str(e)}