def tree_click(event):
    """Handle a click on the CSV treeview.

    Clears the expand/parse treeviews, fills the expand view with
    expand_address() variants of the selected row's address, and fills
    the parse view with parse_address() components of that address.
    """
    print('click')
    # Get the address from the selected row (column 1 holds the address).
    for item in tree_csv.selection():
        item_text = tree_csv.item(item, "values")
        print(item_text[0])
        addr = item_text[1]
    # Clear the expand treeview.
    for x in tree_expand.get_children():
        tree_expand.delete(x)
    # Clear the parse treeview.
    for x in tree_parse.get_children():
        tree_parse.delete(x)
    # Expand the address string into normalized variants.
    list_addr = expand_address(addr)
    # BUG FIX: use a distinct loop variable so `addr` keeps the originally
    # selected address — the original shadowed it, so parse_address below
    # received the last expansion instead of the selected address.
    for expanded in list_addr:
        print('type of addr: {}'.format(type(expanded)))
        addr_str = '\"' + expanded + '\"'
        tree_expand.insert('', 0, text='Expand', values=addr_str)
    # Parse the address into (value, label) tuples.
    addr_parse = parse_address(addr)
    # Convert to a label -> value dict (reverse the value-key order).
    dict_addr = dict((x[1], x[0]) for x in addr_parse)
    # Insert the parsed components into the parse treeview.
    for item in dict_addr.items():
        tree_parse.insert('', 0, values=list(item))
    return addr  # useless now
def parse():
    """Flask endpoint: parse the address in the JSON body's 'request' field.

    Returns the request body serialized as JSON with the libpostal parse
    attached under 'result'.
    """
    body = request.get_json()
    body['result'] = parse_address(body['request'])
    return json.dumps(body)
def find_house_number(street, house_number):
    """Return house_number when present, else extract one from `street`.

    Falls back to libpostal's parse of `street`; returns NaN when no
    house number can be found.
    """
    # A non-empty, non-null house_number wins outright.
    if house_number != "" and not pd.isnull(house_number):
        return house_number
    # parse_address yields (value, label) pairs; invert to label -> value.
    parsed = {label: value for value, label in parse_address(street)}
    # BUG FIX: use np.nan — the np.NaN alias was removed in NumPy 2.0.
    return parsed.get("house_number", np.nan)
def format_address(raw_address: str) -> Dict[str, str]:
    """
    Calls pypostal NLP library to parse street and housenumber data from a
    given address string: https://github.com/openvenues/pypostal

    format_address('Calle 39 No 1540') ->
        expected_address = {"street": "Calle 39", "housenumber": "No 1540"}

    Raises:
        ValueError: if raw_address is empty, or libpostal finds no road or
            house_number in it.
    """
    if not raw_address:
        raise ValueError(f"Cannot extract address data from {raw_address}")
    parsed_address = parse_address(raw_address)
    # next() with a None default avoids building throwaway lists and the
    # IndexError-as-control-flow of the original.
    house_number = next(
        (value for value, label in parsed_address if label == "house_number"),
        None,
    )
    road = next(
        (value for value, label in parsed_address if label == "road"),
        None,
    )
    if house_number is None or road is None:
        message = f"Unable to parse street and house number data from input `{raw_address}` (parsed to `{parsed_address}`)"
        raise ValueError(message)
    return {
        "street": fix_capitalization(road, raw_address),
        "housenumber": fix_capitalization(house_number, raw_address),
    }
def _parse_standardised_address(
    df: pd.DataFrame,
    target: str,
    result: str,
) -> pd.DataFrame:
    """Add a `result` column holding the libpostal parse of column `target`.

    Mutates `df` in place and returns it.
    """
    # FIX: the original copied the entire frame (`df.copy()[target]`) just to
    # read one column, and wrapped parse_address in a redundant lambda.
    df[result] = df[target].apply(parse_address)
    return df
def post(self, address: str) -> Tuple[Dict, int]:
    """Parse an address string

    Returns:
        A tuple of dict with message and results, and a status code.
        Result contains street address, city, zip
    """
    components = parse_address(address)
    return get_address(components), 200
def parse():
    """Flask endpoint: return libpostal components of the posted address.

    Reads the JSON body's 'request' field and returns a dict mapping
    component label -> value.
    """
    body = request.get_json()
    parsed = parse_address(body['request'])
    # parse_address yields (value, label); invert to label -> value.
    return {label: value for value, label in parsed}
def address_parser(user_input):
    """Extract entity text with spaCy, then parse it as an address.

    Returns a JSON response containing libpostal's (value, label) pairs
    for the concatenation of entity texts found in `user_input`.
    """
    # PERF FIX: loading a spaCy model is expensive; cache it on the function
    # so the model loads once per process instead of on every request.
    nlp = getattr(address_parser, "_nlp", None)
    if nlp is None:
        nlp = address_parser._nlp = spacy.load('nl_core_news_sm')
    doc = nlp(user_input)
    entity_texts = [entity.text for entity in doc.ents]
    text_lib = ' '.join(entity_texts)
    return jsonify(parse_address(text_lib))
def post(self):
    """Parse the 'addressText' request argument; return label -> value JSON."""
    args = addressText.parse_args()
    fullAddressText = args['addressText']
    resp = parse_address(fullAddressText)
    # parse_address yields (value, label); invert to label -> value.
    finalResponse = {label: value for value, label in resp}
    return jsonify(finalResponse)
def handler(event, context):
    """AWS Lambda handler: parse event['address'] into a label -> value dict."""
    parsed = parse_address(event['address'])
    result = {label: value for value, label in parsed}
    print("Result:")
    print(result)
    return result
def footer_text(string):
    '''
    Extract an address dictionary from footer-like text.

    Strips phone/fax/e-mail/URL/management-name noise typical of German
    document footers, then parses the remainder with libpostal.

    input: any string containing address-like data
    output: address dictionary (label -> value)
    '''
    rx = r'''(\b(Tel|Fax)\s*.*\d+|(\bE-?[mM]ail.*de)|([iI]nternet:?\s?[hH]ttps?:.*.de)|([Gg]esch[aä]ftsführer.+\S\s))'''
    # BUG FIX: re.sub's fourth positional argument is `count`, not `flags` —
    # the original silently passed the flag bitmask (40) as a replacement
    # count and applied no flags at all.
    s = re.sub(rx, '', string, flags=re.UNICODE | re.MULTILINE)
    # reversed() turns each (value, label) pair into (label, value).
    d = dict(map(reversed, parse_address(s, language='de', country='germany')))
    return d
def contains_components(self, address, components):
    """Test whether address parse contains specific components."""
    parsed = parse_address(address)
    self.assertTrue(parsed)
    # Count parsed (value, label) pairs that match the expected components.
    matched = sum(
        1 for value, label in parsed if components.get(label, None) == value
    )
    self.assertEqual(len(components), matched)
def format_parser(add):
    """Convert libpostal's (value, label) tuples into a fixed-order list.

    Returns [house_number, road, city, state, postcode, unit], using ''
    for any component libpostal did not find.
    """
    components = {label: value for value, label in parse_address(add)}
    key_list = ['house_number', 'road', 'city', 'state', 'postcode', 'unit']
    return [components.get(key, '') for key in key_list]
def model_run(dn_cmd,pdf_path:bytes) ->List: ''' input= darkent model command\ that will be fed into RecogPipe class pdf path has two effect one feeding into RecongPipe class in Darkent Command\ second feeding the image to RecogPipe method 'ocr' as argument (image path to crop) output= json dictinary that append to existing json dictionary ''' #supper_dict=defaultdict(list) supper_dict= [] for index,pg in enumerate(get_pdf(pdf_path)): #pg= wi(image=pg) cls=RcogPipe(dn_cmd,pg) j_dict=cls.ocr(pg) page={'PageNo:':str(index)} m1={**page,**j_dict} if 'supplier:' in m1.keys(): d=dict(parse_address(m1.get('supplier:')[0]))# seggregate the address. d=dict([(value, key) for key, value in d.items()])# correctng the key value pair order m1['supplier:']=d if 'footer:' in m1.keys(): ft = dict(parse_address(m1.get('footer:')[0])) ft = dict([(value, key) for key, value in ft.items()]) m1['footer:'] = ft os.rename('./predictions.jpg', './predictions' + str(index) + '.jpg') supper_dict.append(m1) # supper_dict['pageNo'].append(index) # for k,v in dict.items(): # supper_dict[k].append(v) # print('combining page data into json') #json_dic=json.dumps(supper_dict) print('Data from all PDF pages has been extracted successfully!') #pprint(supper_dict) return supper_dict
def enrich_item_with_variants(item):
    """Enrich a catalogue item in place with acronyms, address features,
    UMR-style identifiers, categories and name variants derived from its
    label (and URL, when present).
    """
    label = item['label']
    # Acronyms
    for (acro, variant) in extractAcronymsByColocation(label):
        item['variants'].add(variant)
        item['acros'].add(acro)
    # Addresses (French or foreign): features maps label -> value.
    addr = parse_address(label)
    features = dict((f, v) for (v, f) in addr)
    # BUG FIX: the original used `|` (union), which is non-empty whenever
    # REQUIRED_ADDR_FEATURES is; `&` (intersection) actually tests whether
    # the parse produced at least one required address feature.
    if len(REQUIRED_ADDR_FEATURES & features.keys()) > 0:
        item['address_as_label'] = label
    if 'city' in features:
        item['city'] = features['city']
    if 'country' in features:
        item['country'] = features['country']
    # Unité Mixte de Recherche and such things
    for (kind, regex) in UR_REGEXES_LABEL.items():
        ms = re.findall(regex, label)
        if ms:
            for m in ms:
                variant = regex_variant(kind, m)
                logging.info('Found UMR-type match: {} in label "{}"'.format(
                    variant, label))
                item['variants'].add(variant)
                item['ur_id'] = variant
    if 'url' in item:
        url = item['url']
        for (kind, regex) in UR_REGEXES_URL.items():
            ms = re.findall(regex, url)
            if ms:
                for m in ms:
                    # NOTE(review): the label loop calls regex_variant(kind, m)
                    # but this call omits `kind` — confirm regex_variant's arity.
                    variant = regex_variant(m)
                    logging.info('Found UMR-type match: {} in URL {}'.format(
                        variant, url))
                    item['variants'].add(variant)
                    item['ur_id'] = variant
    # Categorization
    for token in item['tokens']:
        cat = categorize(token)
        if cat is not None:
            item['categories'].add(cat)
    # Duplicated tokens
    # NOTE(review): label.find('') always returns 0, so this branch never
    # runs — the intended separator character was probably lost in a text
    # encoding step. TODO: restore the intended separator.
    i = label.find('')
    if i > 0:
        pre = justCase(label[:i])
        # BUG FIX: slice from i+1 onward (the original took the single char
        # label[i + 1]) and use str.startswith (was misspelled 'startswitch').
        post = justCase(label[i + 1:])
        if post.startswith(pre):
            item['variants'].add(post)
def expand_click(event):
    """Handle a click on the expand treeview: parse the chosen expansion
    and show its components in the parse treeview.
    """
    print('focus: {}'.format(tree_csv.focus()))
    # Clear the parse treeview.
    for child in tree_parse.get_children():
        tree_parse.delete(child)
    # Get the address string from the selected expand-treeview row.
    selected = tree_expand.item(tree_expand.selection(), "values")[0]
    # Parse the address into (value, label) tuples.
    parsed = parse_address(selected)
    # Convert to a label -> value dict (reverse the value-key order).
    components = {label: value for value, label in parsed}
    # Insert the parsed components into the parse treeview.
    for pair in components.items():
        tree_parse.insert('', 0, values=list(pair))
def normalize_addr(entry):
    """Attach libpostal components of entry['exp_addr'] as 'pypost_*' keys.

    Mutates `entry` in place and returns it.
    """
    addr_to_parse = entry['exp_addr']
    # Pin parsing to the known language and country for improved results.
    parsed = parse_address(addr_to_parse, language='en', country='us')
    for component_value, component_name in parsed:
        entry['pypost_' + component_name] = component_value
    return entry
def segment_address(address):
    """
    Segment the address string into its components using the libpostal
    library and store it in a dictionary format.
    """
    # Capitalize every word of each component value while building the
    # label -> value mapping.
    address_dict = {
        label: ' '.join(word.capitalize() for word in value.split())
        for value, label in parse_address(address)
    }
    # Re-join the capitalized components into one display string.
    address_dict['formatted_address'] = ' '.join(address_dict.values())
    return address_dict
def focus_to(event):
    """Select the row whose id was typed in the entry box, then expand and
    parse its address into the expand/parse treeviews.
    """
    tex = var01.get()  # node id from the entry box
    # BUG FIX: Index.contains() was deprecated and removed in modern pandas;
    # plain membership (`in`) is the supported spelling.
    if tex in df_process.index:
        tree_idx = df_process.loc[tex, 'ids']  # selected tree iid from the DataFrame
        tree_csv.selection_set(tree_idx)  # highlight the selected row
        tree_csv.see(tree_idx)  # make the selected row visible
        # Get the address (column 1 of the selected row).
        for item in tree_csv.selection():
            item_text = tree_csv.item(item, "values")
            print(item_text[0])
            addr = item_text[1]
        # Clear the expand treeview.
        for x in tree_expand.get_children():
            tree_expand.delete(x)
        # Clear the parse treeview.
        for x in tree_parse.get_children():
            tree_parse.delete(x)
        # Expand the address string into normalized variants.
        list_addr = expand_address(addr)
        # BUG FIX: use a distinct loop variable so `addr` keeps the original
        # address — the original shadowed it, so parse_address below received
        # the last expansion instead of the selected address.
        for expanded in list_addr:
            print('type of addr: {}'.format(type(expanded)))
            addr_str = '\"' + expanded + '\"'
            tree_expand.insert('', 0, text='Expand', values=addr_str)
        # Parse the address into (value, label) tuples.
        addr_parse = parse_address(addr)
        # Convert to a label -> value dict (reverse the value-key order).
        dict_addr = dict((x[1], x[0]) for x in addr_parse)
        # Insert the parsed components into the parse treeview.
        for item in dict_addr.items():
            tree_parse.insert('', 0, values=list(item))
    else:
        print('id wrong.')
        messagebox.showerror('Error', 'The ID which your enter not in the list.')
def addressParser(inputAddress):
    """Split an address into street and house number using libpostal.

    Returns a JSON string {"street": ..., "housenumber": ...} with the
    house number matched back against the original input's casing, or
    None when anything fails (the error is printed).
    """
    parsedAddress = parse_address(inputAddress)
    print(parsedAddress)
    try:
        try:
            # extract the first house_number value
            parsedHouseNumber = [element for element in parsedAddress
                                 if 'house_number' in element[1]][0][0]
        except IndexError:
            parsedHouseNumber = ''
        # BUG FIX: escape parsed values before using them as regex patterns —
        # house numbers containing metacharacters (e.g. "1(a)") would
        # otherwise be interpreted as patterns or raise re.error.
        houseNumber = re.search(
            re.escape(parsedHouseNumber), inputAddress,
            flags=re.IGNORECASE).group(0)
        # substitute houseNumber with empty in the input string,
        # ignore leading and trailing space and specific characters
        street = re.sub(
            re.escape(houseNumber), '', inputAddress, flags=re.IGNORECASE
        ).rstrip(
            '}{[]()?@$%^*<>/\\\"\'~;:-_, '
        ).lstrip(
            '}{[]()?@$%^*<>/\\\"\'~;:-_,. '
        ).replace(
            '  ', ' '  # collapse the double space left by the removal
        ).replace(
            ' ,', ','
        )
        addressDict = OrderedDict([("street", street),
                                   ("housenumber", houseNumber)])
        # return a JSON object preserving key order
        return json.dumps(addressDict, ensure_ascii=False)
    except Exception as e:
        print(e)
def getaddress(text):
    """Extract a street + house-number string from free text.

    First tries a regex for "<street> <number[-number]>" patterns and
    returns the match object when it hits; otherwise falls back to
    libpostal and returns "<road> <house_number>" (or None when no road
    is found).
    """
    match = re.search(
        r'\A(.*?)\s+(\d+[a-zA-Z]{0,1}\s{0,1}[-]{1}\s{0,1}\d*[a-zA-Z]{0,1}|\d+[a-zA-Z-]{0,1}\d*[a-zA-Z]{0,1})',
        text)
    if match is not None:
        return match
    # BUG FIX: initialise before the loop — when libpostal emits a
    # house_number before (or without) any road, UserAddress was previously
    # referenced unbound and raised UnboundLocalError.
    UserAddress = None
    address = parse_address(text)
    for value, label in address:
        if label == "road":
            UserAddress = value
        if label == "house_number":
            if value.isdigit():
                if UserAddress is not None:
                    UserAddress = UserAddress + " " + value
    return UserAddress
def parse(address: str) -> dict[str, str]:
    """Parse `address` into street and house number, preserving input case.

    Returns:
        {"street": ..., "housenumber": ...}

    Raises:
        ParseError: when `address` is not a string or libpostal finds no
            road or house number in it.
    """
    # FIX: the return annotation said `tuple[str]` but the function has
    # always returned a dict — corrected to dict[str, str].
    try:
        if not isinstance(address, str):
            raise TypeError("`address` is not a string.")
        parts = parser.parse_address(address)
        parts = {k: v.lower() for v, k in parts}
        house_number, street = parts["house_number"], parts["road"]
        # match input case: libpostal lowercases its output, so map each
        # (punctuation-stripped) input word back onto the components.
        words = address.translate(
            str.maketrans("", "", string.punctuation)).split()
        for word in words:
            lower = word.lower()
            street = street.replace(lower, word)
            house_number = house_number.replace(lower, word)
        return {"street": street, "housenumber": house_number}
    except (KeyError, TypeError, ValueError) as e:
        logger.warning(e)
        # FIX: chain the original exception for debuggability.
        raise ParseError(f"Unable to parse address: {address}") from e
def label_values(self, values, regions):
    """
    The osm names require a set of potential admin_level 8 regions. These
    could be gathered from other columns, metadata, or other parts of the
    strings within the same column.
    :param values: set of string values, e.g., from a CSV column
    :param regions: set of geonames IDs (not URLs). Only osm names within
        these regions will be considered
    :return: (per-value labels, confidence in [0, 1])
    """
    places = defaultdict(list)
    roads = defaultdict(list)
    val_count = 0.
    for idx, value in enumerate(values):
        if not value.strip():
            continue
        val_count += 1
        # Bucket each parsed component by its libpostal label class,
        # remembering which value index it came from.
        for component, label in parse_address(value):
            if label in ROAD:
                roads[component].append(idx)
            if label in PLACE:
                places[component].append(idx)
    # TODO: min number of potential roads in column
    labelled_roads = self.find_osm_names(roads, regions)
    match = 0.
    labelled_v = [''] * len(values)
    for road in labelled_roads:
        for idx in roads[road]:
            labelled_v[idx] = labelled_roads[road]
            match += 1
    confidence = match / val_count if val_count > 0 else 0.
    return labelled_v, confidence
def handle(self, *args, **options):
    """Run a blocking select()-based address-parsing server on port 50006.

    Accepts connections, reads '--end--'-framed requests, and replies with
    the JSON-encoded libpostal parse of each request.
    """
    self.stdout.write("Loaded location server", ending='\n')
    main_socks, read_socks, write_socks = socket_bind('', 50006)
    while True:
        readable, writeable, exceptions = select(read_socks, write_socks, [])
        for sockobj in readable:
            if sockobj in main_socks:
                # New connection on a listening socket.
                new_sock, address = sockobj.accept()
                print('Connect:', address, id(new_sock))
                read_socks.append(new_sock)
            else:
                try:
                    data = recv_end(sockobj)
                    if not data:
                        # Peer closed the connection; stop watching it.
                        sockobj.close()
                        read_socks.remove(sockobj)
                    else:
                        new_data = parse_address(data)
                        sockobj.sendall(
                            json.dumps(new_data).encode('utf8') +
                            '--end--'.encode('utf8'))
                except Exception as exc:
                    # BUG FIX: the bare `except: pass` also swallowed
                    # KeyboardInterrupt/SystemExit and hid real errors;
                    # keep the best-effort behaviour but stay debuggable.
                    print('Error handling client:', exc)
def get_store_name_from_text(text):
    """Return the first 'house' component libpostal finds in `text`.

    Returns None when no component is labelled 'house'.
    """
    return next(
        (item for item, kind in parse_address(text) if kind == "house"),
        None,
    )
Uses an existing solution for the demonstration. For real usage, one should
train a model using for example AddressBase data for the best performance.
One should also consider using different labels (tokens).


Requirements
------------

:requires: libpostal (https://github.com/openvenues/libpostal)


Author
------

:author: Sami Niemi ([email protected])


Version
-------

:version: 0.1
:date: 29-Sep-2016
"""
from postal.parser import parse_address


if __name__ == "__main__":
    # Demo: parse a fixed example address, then parse user-supplied input.
    print('Example, parsing 6 PROSPECT GARDENS EXETER EX4 6BA:')
    print(parse_address('6 PROSPECT GARDENS EXETER EX4 6BA'))
    # Interactive mode: read one string from stdin and show its parse.
    var = input('\nInput a string:')
    print(parse_address(var))
def post(self, address: str) -> Tuple[Dict, int]:
    """Parse an address string with libpostal without any processing
    """
    parsed = parse_address(address)
    return parsed, 200
def address_parser(row, colname):
    """Parse the address found in `row[colname]` with libpostal."""
    cell = str(row[colname])
    return parse_address(cell)
matches = datefinder.find_dates( string, index=True) # parsing dates, return date index #%% if dates are found in texts, extract dates if list(datefinder.find_dates(string, index=True)): for match in matches: date = match[0] date_str = date.strftime( '%Y-%m-%d') #set format for dates (e.g., 2011-01-01) df.loc[i]['date_iso'] = date_str ranking_Y = ranking_m = ranking_d = '1' #%% then use the remaining string to parse addresses letters = string[:match[1][ 0]] # extract address from text using date index Parsed_address = parse_address( letters ) # parsing addresses, output example: (('Broadway','Road'),('NYC','City')) parse_address_details( Parsed_address ) #split parsed addresses into road/city/country #%% if dates are not found in text, use the whole string to parse address else: letters = string Parsed_address = parse_address( letters ) # parsing addresses, output example: (('Broadway','Road'),('NYC','City')) parse_address_details( Parsed_address ) #split parsed addresses into road/city/country
def main():
    """Regularise Hennepin County moving-violation locations into a new CSV.

    NOTE(review): this is Python 2-era code ('rU' open mode, u'' literals),
    and csv.DictWriter is called with an encoding= argument the stdlib
    writer does not accept — presumably `unicodecsv` was imported as csv.
    Confirm before running under Python 3.
    """
    print_with_timestamp("Starting run.")
    inputfile = "../Hennepin County Moving Violations 2010-2015.csv"
    # Bare route numbers that should be rendered with I-/US-/SR- prefixes.
    interstates = ["494", "694", "394", "35w", "94"]
    us_highways = ["12", "169", "212"]
    state_routes = ["62", "77", "100", "101", "610"]
    directions = ["nb", "northbound", "eb", "eastbound", "sb", "southbound",
                  "wb", "westbound"]
    outputfile = inputfile[:-4] + "_regularised_addresses.csv"
    with open(outputfile, 'w') as f_out:
        with open(inputfile, 'rU') as f_in:
            reader = csv.DictReader(f_in, dialect="excel")
            writer = csv.DictWriter(f_out, reader.fieldnames + ["parsed_address", "parsed_w_city", "parsed_w_state", "parsed_w_county_and_state"], encoding="utf8")
            writer.writeheader()
            n = 0
            for row in reader:
                # Strip directional markers and normalise the various
                # intersection separators ("/", "@", " at ", " from ") to '&'.
                offloctn = row["offloctn"].lower().replace("n/b", "").replace("e/b", "").replace("s/b", "").replace("w/b", "").replace("nb ", "").replace("eb ", "").replace("sb ", "").replace("wb ", "").replace(" nb", "").replace(" eb", "").replace(" sb", "").replace(" wb", "").replace("(", "").replace(")", "").replace("/", "&").replace("@", "&").replace(" from ", "&").replace(" at ", "&").replace("&&", "&")
                addr = ""
                for loc in offloctn.split("&"):
                    if addr != "":
                        addr = addr + " & "
                    found = False
                    for interstate in interstates:
                        if loc.strip() == interstate:
                            addr += "I-" + interstate
                            found = True
                            break
                    if not found:
                        for hwy in us_highways:
                            if loc.strip() == hwy:
                                addr += "US-" + hwy
                                found = True
                                break
                    if not found:
                        for rte in state_routes:
                            if loc.strip() == rte:
                                addr += "SR-" + rte
                                found = True
                                break
                    if not found:
                        # Not a bare route number: let libpostal tokenise it,
                        # dropping the county/state/country hint tokens that
                        # were appended for parsing context.
                        newpart = ""
                        for token in parse_address(loc + ", Hennepin County, Minnesota, USA"):
                            if token[0] not in (u'hennepin county', u'minnesota', u'usa'):
                                if newpart != "":
                                    if token[1] in (u'suburb', u'city'):
                                        newpart += ", "
                                    else:
                                        newpart += " "
                                newpart += token[0]
                        addr = addr + newpart
                addr = addr.replace("cr ", "County Road").replace("co rd ", "County Road")
                row["parsed_address"] = addr
                # NOTE(review): unlike the two variants below there is no
                # ", " separator before "Minneapolis" — possibly a bug.
                row["parsed_w_city"] = addr + "Minneapolis, Minnesota, USA"
                row["parsed_w_state"] = addr + ", Minnesota, USA"
                row["parsed_w_county_and_state"] = addr + ", Hennepin County, Minnesota, USA"
                writer.writerow(row)
                n = n + 1
                if n % 10000 == 0:
                    print_with_timestamp("Wrote " + str(n) + " rows so far.")
    print_with_timestamp("Run complete.")
import csv

from postal.parser import parse_address

file = "../data/ParkverstoesseBonn2017OpenData_raw.csv"

# Print the cleaned address and its libpostal parse for the first 1000 rows.
with open(file, encoding='utf-8') as csvfile:
    reader = csv.reader(csvfile, delimiter=';', quotechar='"')
    for i, row in enumerate(reader, start=1):
        # Strip the city prefix and normalise dashes before parsing.
        address = row[2].replace("Bonn, ", "").replace("Bonn , ", "").replace(
            "-", " ").replace("gegenüber Hnr", "")
        print(address)
        print(parse_address(address))
        if i == 1000:
            break