def _apply_address_formatter(dt, formatter):
    """Run ``formatter`` over every Address value, then strip trailing spaces.

    ``Series.map`` is used instead of ``np.vectorize`` so that an empty
    column is simply a no-op; ``np.vectorize`` without ``otypes`` raises on
    size-0 input.
    """
    dt['Address'] = dt['Address'].map(formatter)
    dt['Address'] = dt['Address'].str.rstrip()


def PreProcessing_records(dt):
    """Normalize the ``Address`` column of ``dt`` and return ``dt``.

    The column is modified in place on the passed-in frame. Steps, in order:

    1. Convert every value to an upper-case string and strip trailing
       whitespace.
    2. Apply the custom formatters ``USAddressFormatter_TH`` (number + TH
       combinations), ``USAddressFormatter_Abbr`` (wrong abbreviations) and
       ``USAddressFormatter_Symbol`` (useless symbols), stripping trailing
       whitespace after each one.
    3. Abbreviate directions and street types with
       ``StreetAddressFormatter``.
    4. Upper-case the result again.

    Parameters:
        dt: object with an ``Address`` column supporting ``apply``, ``map``
            and the ``.str`` accessor (presumably a pandas DataFrame --
            confirm against callers).

    Returns:
        The same ``dt`` object, with ``Address`` rewritten.
    """
    dt['Address'] = dt['Address'].apply(lambda x: str(x).upper())  # uppercase
    dt['Address'] = dt['Address'].str.rstrip()  # remove trailing spaces

    # The three custom passes share one shape: map, then rstrip.
    custom_formatters = (
        USAddressFormatter_TH,      # correct wrong combination of number + TH
        USAddressFormatter_Abbr,    # replace wrong abbreviation
        USAddressFormatter_Symbol,  # remove useless symbols
    )
    for formatter in custom_formatters:
        _apply_address_formatter(dt, formatter)

    abbr_formatter = StreetAddressFormatter()  # load python package 'Formatter'

    def _abbreviate(address):
        address = abbr_formatter.abbrev_direction(address)
        return abbr_formatter.abbrev_street_avenue_etc(address)

    # The previous loop only rebound its loop variable, so the abbreviated
    # values were thrown away. The same calls are made here, but the results
    # are stored back into the column.
    dt['Address'] = dt['Address'].map(_abbreviate)

    dt['Address'] = dt['Address'].apply(lambda x: str(x).upper())  # uppercase again
    return dt


boundary_list = []
# Demo driver: parse each input address, log the street after every
# formatting stage, and print the parsed address as JSON.
# Python 2 code (uses ``unicode``); the single-argument ``print(...)`` form
# below behaves the same as the print statement there.
addr_formatter = StreetAddressFormatter()

# An address given through ``opts.addr`` takes precedence; otherwise use the
# whitespace-stripped entries of ``tests`` (defined outside this fragment).
lst = [opts.addr] if opts.addr else map(str.strip, tests)

for t in lst:
    if not t:
        continue  # nothing to parse for empty entries

    print('"%s"' % t)
    logging.info('addr_str: ' + unicode(t))
    addr = addr_parser.parse(t)

    # The formatted street is only logged; ``addr`` itself is left untouched.
    if addr['street_full'] is not None:
        street = addr_formatter.append_TH_to_street(addr['street_full'])
        logging.info('After append_TH_to_street: ' + street)

        street = addr_formatter.abbrev_direction(street)
        logging.info('After abbrev_direction: ' + street)

        street = addr_formatter.abbrev_street_avenue_etc(street)
        logging.info('After abbrev_street_avenue_etc: ' + street)

        street = addr_formatter.abbrev_street_avenue_etc(
            street, abbrev_only_last_token=False)
        logging.info('After abbrev_street_avenue_etc (aggressive): ' + street)

    # NOTE(review): the original indentation was lost; this dump is assumed
    # to run for every non-empty input, not only when a street was found --
    # confirm.
    print(json.dumps(addr, sort_keys=True))
# Demo driver: build a parser and a formatter, parse each input address, log
# the street after every formatting stage, and print the parsed address as
# JSON.
# Python 2 code (uses ``unicode``); the single-argument ``print(...)`` form
# below behaves the same as the print statement there.
addr_parser = StreetAddressParser()
addr_formatter = StreetAddressFormatter()

# An address given through ``opts.addr`` takes precedence; otherwise use the
# whitespace-stripped entries of ``tests`` (defined outside this fragment).
lst = [opts.addr] if opts.addr else map(str.strip, tests)

for t in lst:
    if not t:
        continue  # nothing to parse for empty entries

    print('"%s"' % t)
    logging.info('addr_str: ' + unicode(t))
    addr = addr_parser.parse(t)

    # The formatted street is only logged; ``addr`` itself is left untouched.
    if addr['street_full'] is not None:
        street = addr_formatter.append_TH_to_street(addr['street_full'])
        logging.info('After append_TH_to_street: ' + street)

        street = addr_formatter.abbrev_direction(street)
        logging.info('After abbrev_direction: ' + street)

        street = addr_formatter.abbrev_street_avenue_etc(street)
        logging.info('After abbrev_street_avenue_etc: ' + street)

        street = addr_formatter.abbrev_street_avenue_etc(
            street, abbrev_only_last_token=False)
        logging.info('After abbrev_street_avenue_etc (aggressive): ' + street)

    # NOTE(review): the original indentation was lost; this dump is assumed
    # to run for every non-empty input, not only when a street was found --
    # confirm.
    print(json.dumps(addr, sort_keys=True))