Exemplo n.º 1
0
 def PreProcessing_records(dt):
    """Normalize the ``Address`` column of ``dt`` in place and return ``dt``.

    Processing order:

    1. Convert every address to ``str``, uppercase it, and strip trailing
       whitespace.
    2. Apply the three project formatters in sequence, stripping trailing
       whitespace after each one:
       ``USAddressFormatter_TH`` (fixes wrong number + "TH" combinations),
       ``USAddressFormatter_Abbr`` (replaces wrong abbreviations) and
       ``USAddressFormatter_Symbol`` (removes useless symbols).
    3. Run ``StreetAddressFormatter.abbrev_direction`` followed by
       ``StreetAddressFormatter.abbrev_street_avenue_etc`` on every address.
    4. Uppercase the result again.

    Args:
        dt: Table with an ``Address`` column; expected to be a pandas
            DataFrame (the code relies on ``.apply`` and the ``.str``
            accessor).

    Returns:
        The same ``dt`` object, with ``Address`` replaced by the normalized
        values.
    """
    # Nothing to normalize; also avoids dtype-dependent failures of the
    # ``.str`` accessor on an empty column.
    if len(dt) == 0:
        return dt

    address = dt['Address'].apply(lambda value: str(value).upper()).str.rstrip()

    # Element-wise project formatters, applied in the original order. Using
    # Series.apply keeps the index and avoids the array/list round-trips.
    custom_formatters = (
        USAddressFormatter_TH,
        USAddressFormatter_Abbr,
        USAddressFormatter_Symbol,
    )
    for formatter in custom_formatters:
        address = address.apply(formatter).str.rstrip()

    # The results must be written back: rebinding a loop variable while
    # iterating over the column leaves the column itself untouched.
    abbr_formatter = StreetAddressFormatter()
    address = address.apply(
        lambda value: abbr_formatter.abbrev_street_avenue_etc(
            abbr_formatter.abbrev_direction(value)
        )
    )

    # Uppercase again in case the abbreviation step changed the letter case.
    dt['Address'] = address.apply(lambda value: str(value).upper())

    return dt
    
    
    
    
    
    
    
    # NOTE(review): the enclosing ``def`` of this fragment is not visible.
    # ``opts``, ``tests`` and ``addr_parser`` are used below but defined
    # outside the visible lines. The code uses Python 2 syntax (``print``
    # statement, ``unicode``).
    #
    # Purpose: parse each input address string, log the street after each
    # formatting step, and print the parsed address as JSON.

    # NOTE(review): ``boundary_list`` is not used anywhere in the visible lines.
    boundary_list = []
    addr_formatter = StreetAddressFormatter()

    # Use the single address given in ``opts.addr`` when it is set; otherwise
    # fall back to every entry of ``tests`` with surrounding whitespace
    # stripped.
    if opts.addr:
        lst = [opts.addr]
    else:
        lst = map(str.strip, tests)

    for t in lst:
        # Skip empty entries.
        if t:
            print '"%s"' % t
            logging.info('addr_str: ' + unicode(t))
            addr = addr_parser.parse(t)

            # The formatting steps only run when the parser produced a
            # 'street_full' value. Each step feeds its output to the next and
            # the intermediate result is logged.
            if addr['street_full'] is not None:
                street = addr_formatter.append_TH_to_street(
                    addr['street_full'])
                logging.info('After append_TH_to_street: ' + street)

                street = addr_formatter.abbrev_direction(street)
                logging.info('After abbrev_direction: ' + street)

                street = addr_formatter.abbrev_street_avenue_etc(street)
                logging.info('After abbrev_street_avenue_etc: ' + street)

                # Second pass with abbrev_only_last_token=False; presumably
                # abbreviates every matching token instead of only the last
                # one -- confirm against the formatter's documentation.
                street = addr_formatter.abbrev_street_avenue_etc(
                    street, abbrev_only_last_token=False)
                logging.info('After abbrev_street_avenue_etc (aggressive): ' +
                             street)

            # The formatted ``street`` is only logged in the visible lines;
            # what gets printed is ``addr`` as returned by the parser.
            print json.dumps(addr, sort_keys=True)
Exemplo n.º 3
0
    # NOTE(review): the enclosing ``def`` of this fragment is not visible.
    # ``opts`` and ``tests`` are used below but defined outside the visible
    # lines. The code uses Python 2 syntax (``print`` statement, ``unicode``).
    #
    # Purpose: parse each input address string, log the street after each
    # formatting step, and print the parsed address as JSON.
    addr_parser = StreetAddressParser()
    addr_formatter = StreetAddressFormatter()

    # Use the single address given in ``opts.addr`` when it is set; otherwise
    # fall back to every entry of ``tests`` with surrounding whitespace
    # stripped.
    if opts.addr:
        lst = [opts.addr]
    else:
        lst = map(str.strip,tests)

    for t in lst:
        # Skip empty entries.
        if t:
            print '"%s"' % t
            logging.info('addr_str: ' + unicode(t))
            addr = addr_parser.parse(t)

            # The formatting steps only run when the parser produced a
            # 'street_full' value. Each step feeds its output to the next and
            # the intermediate result is logged.
            if addr['street_full'] is not None:
                street = addr_formatter.append_TH_to_street(addr['street_full'])
                logging.info('After append_TH_to_street: ' + street)

                street = addr_formatter.abbrev_direction(street)
                logging.info('After abbrev_direction: ' + street)

                street = addr_formatter.abbrev_street_avenue_etc(street)
                logging.info('After abbrev_street_avenue_etc: ' + street)

                # Second pass with abbrev_only_last_token=False; presumably
                # abbreviates every matching token instead of only the last
                # one -- confirm against the formatter's documentation.
                street = addr_formatter.abbrev_street_avenue_etc(street, abbrev_only_last_token=False)
                logging.info('After abbrev_street_avenue_etc (aggressive): ' + street)

            # The formatted ``street`` is only logged in the visible lines;
            # what gets printed is ``addr`` as returned by the parser.
            print json.dumps(addr, sort_keys=True)