-
Notifications
You must be signed in to change notification settings - Fork 0
/
LV_all_code.py
593 lines (481 loc) · 23.7 KB
/
LV_all_code.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
# This is all the relevant code from my .ipynb file:
# https://github.com/martin-martin/cleaning-las-vegas/blob/master/las_vegas_summary.ipynb
# Hope the format is alright and I'd be glad for many comments regarding my code! :)
# Do help me to learn to become better!
##############################################################################################
######################################### EXPLORING ##########################################
##############################################################################################
import os
import xml.etree.cElementTree as ET
import pprint
import pandas as pd
import re
# Path of the OSM XML extract that the rest of the script reads from.
las_vegas_osm = 'las-vegas_nevada.osm'
# For testing and development purposes, uncomment the line below to use the truncated extract:
#las_vegas_osm = 'LV_truncated.osm'
def count_tags(filename):
    """Count how often each element tag occurs in an XML file.

    Args:
        filename: Path (or open file object) of the XML document to scan.

    Returns:
        A dictionary mapping each tag name to its number of occurrences.
    """
    tag_dict = {}
    for _, elem in ET.iterparse(filename):
        tag_dict[elem.tag] = tag_dict.get(elem.tag, 0) + 1
        # Only the tag name is needed, so the element's attributes and children
        # are dropped right away to limit memory use on large extracts.
        elem.clear()
    return tag_dict
# checking out some basic stats
file_size = os.path.getsize(las_vegas_osm)
print('File Size in Bytes: %d' % file_size)
# Divide by a float so that the size in MB is not truncated to a whole number.
print('File Size in MB:  %.2f' % (file_size / float(2**20)))
# number of occurrences of every XML tag in the extract, shown as a pandas Series
las_vegas_osm_dict = count_tags(las_vegas_osm)
las_vegas_osm_tags = pd.Series(las_vegas_osm_dict, name='tags and their amounts')
print(las_vegas_osm_tags)
# auditing the street names and creating the dictionary I will use through the analysis
def audit_street_type(street_types, expected, street_name):
    """Record a street name under its last word, unless that last word is expected.

    The trailing run of non-whitespace characters of street_name (starting at a word
    boundary) is treated as the street type. If it is not contained in expected,
    street_name is appended to the list stored under that street type in street_types;
    the dictionary is modified in place. Names without such a trailing word are ignored.
    Returns None.
    Reference: https://www.udacity.com/course/viewer#!/c-ud032-nd/l-768058569/e-865319708/m-900198650
    """
    match = re.search(r'\b\S+\.?$', street_name, re.IGNORECASE)
    if match is None:
        return
    last_word = match.group()
    if last_word in expected:
        return
    street_types.setdefault(last_word, []).append(street_name)
def collect_way_types(filename, expected_types):
    """Searches the tag attributes in an OSM file for street names and adds them to a dictionary.

    Takes as input a file and a list of expected types,
    calls the audit_street_type() function for the value of every 'name' tag of a 'way',
    returns a dictionary with street types mapping to a list of street name occurrences.
    Reference: https://www.udacity.com/course/viewer#!/c-ud032-nd/l-768058569/e-865319708/m-900198650
    """
    street_types = {}
    # 'end' events are used because on a 'start' event the children of an element
    # are not guaranteed to be parsed yet, so 'tag' children could be missed.
    for _, elem in ET.iterparse(filename, events=('end',)):
        if elem.tag != 'way':
            continue
        for tag in elem.iter('tag'):
            if tag.attrib['k'] == 'name':
                audit_street_type(street_types, expected_types, tag.attrib['v'])
        # The way has been inspected completely; release its children.
        elem.clear()
    return street_types
# choosing to exclude the common street types
# the first pass runs without excluding anything
common_types = []
street_types = collect_way_types(las_vegas_osm, common_types)
# The threshold of 7 was chosen by checking the results on the truncated version of
# the dataset: 10 returned an empty list, 5 included 'Vegas'
# - which I believe is not a valid street name :)
threshold = 7
# every street type that occurs more often than the threshold counts as common
common_types.extend(
    street_type
    for street_type, names in street_types.items()
    if len(names) > threshold
)
# second pass, now excluding the common street types
street_types = collect_way_types(las_vegas_osm, common_types)
# a function to investigate specific elements (which I did way too much...)
def find_something(filename, regex):
    """Prints the 'way' elements whose 'name' tag matches the regex, and a link to view them online.

    Args:
        filename: Path (or open file object) of the OSM XML document to search.
        regex: Regular expression that is searched for in the serialized 'name' tag,
            i.e. in text of the form <tag k="name" v="..." />.

    Returns:
        None. Matching elements are printed; a notice is printed if nothing matched.
    """
    found_any = False
    # 'end' events guarantee that all 'tag' children of a way have been parsed;
    # on a 'start' event they may still be missing.
    for _, elem in ET.iterparse(filename, events=('end',)):
        if elem.tag != 'way':
            continue
        for tag in elem.iter('tag'):
            if tag.attrib['k'] != 'name':
                continue
            # tostring() returns ASCII-encoded bytes; decoding them lets a text
            # pattern be searched on Python 2 and Python 3 alike.
            if re.search(regex, ET.tostring(tag).decode('ascii')):
                print("Check ID online at: http://www.openstreetmap.org/way/" + elem.attrib['id'] + '\n')
                ET.dump(elem)
                found_any = True
    if not found_any:
        print("No matching Element was found.")
# After checking some elements and the street_types list I concluded that these
# street types can be safely excluded, because they represent (most probably) valid ways.
valid_ways = [
    'Aisle', 'Alley', 'Bypass', 'Channel', 'Highway', 'Interconnect', 'Loop',
    'Monorail', 'Path', 'Paths', 'Route', 'Speedway', 'Walk',
]
nature_ways = [
    'Falls', 'Forest', 'Lake', 'Shore', 'Spillway', 'Stream', 'River',
    'Thrust', 'Wash',
]
# building a new 'exclude' list (common types first, then the two lists above)
# and recalculating the street_types dict with it
exclude = list(common_types)
exclude.extend(valid_ways)
exclude.extend(nature_ways)
street_types = collect_way_types(las_vegas_osm, exclude)
##############################################################################################
########################################## CLEANING ##########################################
##############################################################################################
from pprint import pprint
import xml.etree.cElementTree as ET
import re
import codecs
############## step 1 - reduce the file size ##############
# input of this step: the original OSM extract
OSM_FILE = las_vegas_osm
# output of this step: the file that the 'way' elements are written to below
NEW_FILE = 'cleaning_1.osm'
def get_ways(osm_file, tags=('node', 'way', 'relation')):
    """Stream an OSM file and yield its 'way' elements one by one.

    Only elements whose tag is listed in tags are considered, and of those only
    the 'way' elements are yielded. The root element is cleared after each
    considered element so that already handled elements do not pile up in memory.
    Reference: https://discussions.udacity.com/t/changing-attribute-value-in-xml/44575/6
    """
    parser = ET.iterparse(osm_file, events=('start', 'end'))
    # the first event is the start of the root element
    _, root = next(parser)
    for event, elem in parser:
        if event != 'end' or elem.tag not in tags:
            continue
        if elem.tag == 'way':
            yield elem
        root.clear()
# creating a new file holding only 'way' elements
# ET.tostring(..., encoding='utf-8') returns encoded bytes, so the file is written in
# binary mode (works on Python 2 and 3 and avoids newline translation).
with open(NEW_FILE, 'wb') as output:
    output.write(b'<?xml version="1.0" encoding="UTF-8"?>\n')
    output.write(b'<osm>\n ')
    for element in get_ways(OSM_FILE):
        output.write(ET.tostring(element, encoding='utf-8'))
    output.write(b'</osm>')
# setting the input file to the previous output file
OSM_FILE = NEW_FILE
NEW_FILE = 'cleaning_2.osm'
# second name for the 'exclude' list (the same list object, not a copy)
common_and_valid_ways = exclude
def select_some_way_elems(osm_file, excluded_ways):
    """Yields way elements whose last word of the name (usually the street type) is not in a list to exclude.

    Args:
        osm_file: Path (or open file object) of the OSM XML document to read.
        excluded_ways: Collection of street types that should be left out.

    Yields:
        Every 'way' element that contains an element with k="name" whose value ends
        in a word that is not in excluded_ways. Each way is yielded at most once;
        ways without a name, or with a name that has no trailing word, are skipped.
    """
    street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)
    context = ET.iterparse(osm_file, events=('start', 'end'))
    _, root = next(context)
    for event, elem in context:
        if event != 'end' or elem.tag != 'way':
            continue
        keep = False
        for tag in elem.iter():
            # elements without a 'k' attribute (e.g. the way itself, nd) are skipped
            if tag.get('k') != 'name':
                continue
            found = street_type_re.search(tag.get('v', ''))
            if found is not None and found.group() not in excluded_ways:
                keep = True
                break
        # No try/except around the yield: a bare except would also swallow the
        # GeneratorExit that is raised when the generator is closed early.
        if keep:
            yield elem
        # release everything parsed so far, whether or not the way was selected
        root.clear()
# writing a new document that consists only of those way elements that select_some_way_elems() yields.
# ET.tostring(..., encoding='utf-8') returns encoded bytes, so the file is written in
# binary mode (works on Python 2 and 3 and avoids newline translation).
with open(NEW_FILE, 'wb') as output:
    output.write(b'<?xml version="1.0" encoding="UTF-8"?>\n')
    output.write(b'<osm>\n ')
    for element in select_some_way_elems(OSM_FILE, common_and_valid_ways):
        output.write(ET.tostring(element, encoding='utf-8'))
    output.write(b'</osm>')
############## step 2 - starting to clean elements ##############
# defining some functions to perform cleaning on the OSM elements
def modify_file(filename, function, *args):
    """Modifies a file according to the output of a function.

    Takes as input a file name, a function and its arguments.
    Runs the (cleaning) function and collects its serialized elements in a temporary
    file, so that the function can still read from filename while it is running.
    Afterwards filename is overwritten with an XML declaration, an <osm> root element
    and the collected elements.
    Reference:
    http://stackoverflow.com/questions/17646680/writing-back-into-the-same-file-after-reading-from-the-file

    Args:
        filename: Path of the file that is overwritten with the result.
        function: Callable returning an iterable of ElementTree elements.
        *args: Positional arguments passed on to function.

    Returns:
        None.
    """
    import tempfile

    # Binary mode: ET.tostring(..., encoding='utf-8') returns encoded bytes.
    temp_file = tempfile.NamedTemporaryFile(mode='w+b')
    try:
        for element in function(*args):
            temp_file.write(ET.tostring(element, encoding='utf-8'))
        temp_file.seek(0)
        with open(filename, 'wb') as f:
            f.write(b'<?xml version="1.0" encoding="UTF-8"?>\n')
            f.write(b'<osm>\n ')
            for line in temp_file:
                f.write(line)
            f.write(b'</osm>')
    finally:
        # closing the temporary file also removes it, even if function raised
        temp_file.close()
def get_id(filename, regex):
    """Returns a list of the IDs of the 'way' element(s) whose 'name' tag matches the specified regex.

    Args:
        filename: Path (or open file object) of the OSM XML document to search.
        regex: Regular expression that is searched for in the serialized 'name' tag,
            i.e. in text of the form <tag k="name" v="..." />.

    Returns:
        The 'id' attribute values of the matching ways, in document order
        (one entry per matching 'name' tag).
    """
    elem_id_list = []
    # 'end' events guarantee that all 'tag' children of a way have been parsed;
    # on a 'start' event they may still be missing.
    for _, elem in ET.iterparse(filename, events=('end',)):
        if elem.tag != 'way':
            continue
        for tag in elem.iter('tag'):
            # tostring() returns ASCII-encoded bytes; decoding them lets a text
            # pattern be searched on Python 2 and Python 3 alike.
            if tag.attrib['k'] == 'name' and re.search(regex, ET.tostring(tag).decode('ascii')):
                elem_id_list.append(elem.attrib['id'])
        # The way has been inspected completely; release its children.
        elem.clear()
    return elem_id_list
def substitute_attrib_value(osm_file, before, after, attrib_key, tags=('way', 'node', 'relation')):
    """Changes an attribute of matching tags in 'way' elements to the string defined in 'after'.

    Streams osm_file and yields every element whose tag is listed in tags. For 'way'
    elements, each child tag whose serialized form matches the regex 'before' gets its
    attribute attrib_key set to 'after'. Tags whose key contains 'tiger:' are left untouched.
    Reference:
    https://discussions.udacity.com/t/changing-attribute-value-in-xml/44575/6

    Args:
        osm_file: Path (or open file object) of the OSM XML document to read.
        before: Regular expression searched for in the serialized tag.
        after: New value for the attribute.
        attrib_key: Name of the tag attribute to set (e.g. 'v').
        tags: Element tags that are yielded.

    Yields:
        The (possibly modified) elements, in document order.
    """
    context = ET.iterparse(osm_file, events=('start', 'end'))
    _, root = next(context)
    for event, elem in context:
        if event != 'end' or elem.tag not in tags:
            continue
        if elem.tag == 'way':
            for tag in elem.iter('tag'):
                # not changing the original TIGER data
                if 'tiger:' in tag.attrib['k']:
                    continue
                # tostring() returns ASCII-encoded bytes; decoding them lets a text
                # pattern be searched on Python 2 and Python 3 alike.
                if re.search(before, ET.tostring(tag).decode('ascii')):
                    tag.set(attrib_key, after)
        yield elem
        root.clear()
def substitute_smth(osm_file, before, after, attrib_key):
    """Wrapper function: Runs substitute_attrib_value() through modify_file().

    Substitutes a 'way' tag attribute for another and writes the changes back
    into osm_file. Returns None.
    """
    # modify_file() calls substitute_attrib_value() and consumes its generator itself;
    # a separate call here would only create a generator object that is never used.
    modify_file(osm_file, substitute_attrib_value, osm_file, before, after, attrib_key)
def add_attribute(osm_file, elem_id, attrib_key, attrib_value, tags=('way', 'node', 'relation')):
    """Adds a tag Element with attribute and value to the Element specified through an ID.

    Streams osm_file and yields every element whose tag is listed in tags. An element
    whose 'id' equals elem_id gets a new child <tag k=attrib_key v=attrib_value />,
    unless a tag with exactly this key and value is already present; in that case a
    notice is printed and the element is yielded unchanged.
    Reference:
    https://discussions.udacity.com/t/changing-attribute-value-in-xml/44575/6

    Args:
        osm_file: Path (or open file object) of the OSM XML document to read.
        elem_id: Value of the 'id' attribute of the element to extend (a string).
        attrib_key: Value for the 'k' attribute of the new tag.
        attrib_value: Value for the 'v' attribute of the new tag.
        tags: Element tags that are yielded.

    Yields:
        The (possibly extended) elements, in document order.
    """
    context = ET.iterparse(osm_file, events=('start', 'end'))
    _, root = next(context)
    for event, elem in context:
        if event != 'end' or elem.tag not in tags:
            continue
        if elem.attrib['id'] == elem_id:
            already_present = any(
                tag.get('k') == attrib_key and tag.get('v') == attrib_value
                for tag in elem.iter('tag')
            )
            if already_present:
                print("The attributes %s=%s are already present in this Element." % (attrib_key, attrib_value))
            else:
                ET.SubElement(elem, 'tag', k=attrib_key, v=attrib_value)
        # The element is yielded in every case: callers write the yielded elements back
        # to the file, so skipping it here would delete it from the data.
        yield elem
        root.clear()
def add_smth(osm_file, elem_id, attrib_key, attrib_value):
    """Wrapper function: Runs add_attribute() through modify_file().

    Adds a tag with the given key and value to the element with the given ID
    and writes the changed elements back to osm_file. Returns None.
    """
    # modify_file() calls add_attribute() and consumes its generator itself;
    # a separate call here would only create a generator object that is never used.
    modify_file(osm_file, add_attribute, osm_file, elem_id, attrib_key, attrib_value)
# setting the input file to the previous output file, to write back into the same file
OSM_FILE = 'cleaning_2.osm'
# performing some automated cleaning
# every way found for a name listed under 'Estates' gets place=suburb and area=yes
for area in street_types['Estates']:
    for elem_id in get_id(OSM_FILE, area):
        for attrib_key, attrib_value in (('place', 'suburb'), ('area', 'yes')):
            add_smth(OSM_FILE, elem_id, attrib_key, attrib_value)
add_smth(OSM_FILE, '27575073', 'building', 'yes')
# pairs of (text to search for, new value of the 'v' attribute)
name_fixes = (
    ('Wonderful Day Driive', 'Wonderful Day Drive'),
    ('Wanderlust', 'Wanderlust Court'),
    ('Seven Oaks', 'Seven Oaks Way'),
    ('Padero', 'North Padero Drive'),
    ('Scottyboy', 'Scottyboy Drive'),
    ('Seashore', 'Seashore Drive'),
    ('S FLore del Sol', 'S Flore del Sol Street'),
)
for before, after in name_fixes:
    substitute_smth(OSM_FILE, before, after, 'v')
# the first name recorded under the misspelled street type 'Avenmue'
substitute_smth(OSM_FILE, street_types['Avenmue'][0], 'West Fenway Park Avenue', 'v')
############################## UPDATING CLEANING FUNCTIONS ##############################
# adapting the function with the newly learned aspects to exclude more inappropriate 'way' tags
def collect_way_types(filename, expected_types):
    """Searches the tag attributes in an OSM file for street names and adds them to a dictionary.

    Takes as input a file and a list of expected types, also excludes certain parameters that
    either represent a (foreign-language) common street name, or are 'way' tags that are no streets:
    a way is skipped if one of its tags has a key from non_street_attribs with a value other
    than 'no', or if the value of any of its tags starts with one of non_eng_street_names.
    Calls the audit_street_type() function for the 'name' tags of the remaining ways,
    returns a dictionary with street types mapping to a list of street name occurrences.
    """
    street_types = {}
    # common non-english street names that appear at the beginning of the string
    non_eng_street_names = ('Avenida', 'Via', 'Camino', 'Calle', 'Vista', 'Placida')
    # attributes that were found to define non-street ways; Elements containing them are excluded
    non_street_attribs = ('area', 'building', 'amenity', 'golf', 'railway')
    # 'end' events are used because on a 'start' event the children of an element
    # are not guaranteed to be parsed yet, so 'tag' children could be missed.
    for _, elem in ET.iterparse(filename, events=('end',)):
        if elem.tag != 'way':
            continue
        way_tags = list(elem.iter('tag'))
        excluded = any(
            (tag.attrib['k'] in non_street_attribs and tag.attrib['v'] != 'no')
            or tag.attrib['v'].startswith(non_eng_street_names)
            for tag in way_tags
        )
        if not excluded:
            for tag in way_tags:
                if tag.attrib['k'] == 'name':
                    audit_street_type(street_types, expected_types, tag.attrib['v'])
        # The way has been inspected completely; release its children.
        elem.clear()
    return street_types
# Each dictionary below maps a street type to the list of names that
# street_types holds for it.

# individually-checked elements that are valid streets,
# but do not have right away obvious valid street-type-names
all_fine = {
    street_type: street_types[street_type]
    for street_type in (
        'Access', 'Oak', 'Oasis', 'Paseo', 'Pines', 'Cottage', 'Point',
        'Portico', 'Reef', 'Sawtooth', 'Sierra', 'Solano', 'Star',
    )
}
# elements that should have a tag with 'area=yes' and 'place=suburb' added
add_area_suburb = {
    street_type: street_types[street_type]
    for street_type in ('Homestretch', 'Homes', 'Paradise', 'Somerset')
}
# elements that should have a tag with 'area=yes' and one with 'building=yes' added
add_area_building = {
    street_type: street_types[street_type]
    for street_type in ('Alex',)
}
# elements that should have a tag with 'area=yes' added
add_area = {
    street_type: street_types[street_type]
    for street_type in ('P', 'Wilderness')
}
# examples of abbreviations that come up and could be substituted
substitute = {
    street_type: street_types[street_type]
    for street_type in ('Ave', 'Hwy', 'Rd')
}
############## step 3 - some more cleaning ##############
# add area=yes
type_dict = add_area
for names in type_dict.values():
    for name in names:
        for elem_id in get_id(OSM_FILE, name):
            add_smth(OSM_FILE, elem_id, 'area', 'yes')
# add area=yes, building=yes
type_dict = add_area_building
for names in type_dict.values():
    for name in names:
        for elem_id in get_id(OSM_FILE, name):
            add_smth(OSM_FILE, elem_id, 'area', 'yes')
            add_smth(OSM_FILE, elem_id, 'building', 'yes')
# add area=yes, place=suburb
type_dict = add_area_suburb
for names in type_dict.values():
    for name in names:
        for elem_id in get_id(OSM_FILE, name):
            add_smth(OSM_FILE, elem_id, 'area', 'yes')
            add_smth(OSM_FILE, elem_id, 'place', 'suburb')
# extend street name abbreviations
import re
map_dict = {'Rd' : 'Road', 'Hwy' : 'Highway', 'Ave' : 'Avenue'}
type_dict = substitute
# matches a run of non-space characters together with the single space that follows it
street_re = re.compile(r'[^ ]+[ ]', re.IGNORECASE)
for abbreviation, names in type_dict.items():
    for old_name in names:
        # keep every part that is followed by a space, then append the written-out street type
        leading_parts = street_re.findall(old_name)
        new_name = ''.join(leading_parts) + map_dict[abbreviation]
        substitute_smth(OSM_FILE, old_name, new_name, 'v')
# add valid streets to the 'exclude' list
for key in all_fine:
    if key not in exclude:
        exclude.append(key)
# computing the 'street_types' dict anew with the updated elements, function and 'exclude' list
street_types = collect_way_types(OSM_FILE, exclude)
##############################################################################################
######################### MERGING THE CHANGES WITH THE ORIGINAL FILE #########################
##############################################################################################
# parsing the OSM XML file that holds the elements that were being cleaned
tree_changes = ET.ElementTree(file=OSM_FILE)
chang_root = tree_changes.getroot()
# all 'way' children of the root (= all elements)
changed_elems = chang_root.findall('way')
# mapping the elements' IDs to the element objects
changes_dict = dict((way.attrib['id'], way) for way in changed_elems)
def merge_changes(osm_file, changes):
    """Merges the changes applied on the street names back into the original OSM file structure.

    Args:
        osm_file: Path (or open file object) of the original OSM XML document.
        changes: Dictionary mapping the 'id' of a 'way' to the element that replaces it.

    Yields:
        Every direct child of the root element, in document order and exactly once.
        A 'way' whose 'id' is a key of changes is replaced by the element stored there.
    """
    context = ET.iterparse(osm_file, events=('start', 'end'))
    _, root = next(context)
    # nesting level below the root: 1 while inside a direct child of the root
    depth = 0
    for event, elem in context:
        if event == 'start':
            depth += 1
            continue
        depth -= 1
        if depth != 0:
            # Nested elements (e.g. nd, tag) are serialized together with their parent,
            # and the final 'end' event of the root itself (depth -1) is not an element
            # of its own either.
            continue
        if elem.tag == 'way':
            # the replacement has to be chosen on the 'end' event, where the element is yielded
            elem = changes.get(elem.attrib['id'], elem)
        yield elem
        root.clear()
ORIG_FILE = las_vegas_osm
NEW_FILE = 'LV_applied_changes.osm'
# running the merge_changes() function, creating a new file that includes the changes computed above
# ET.tostring(..., encoding='utf-8') returns encoded bytes, so the file is written in
# binary mode (works on Python 2 and 3 and avoids newline translation).
with open(NEW_FILE, 'wb') as output:
    output.write(b'<?xml version="1.0" encoding="UTF-8"?>\n')
    output.write(b'<osm>\n ')
    for element in merge_changes(ORIG_FILE, changes_dict):
        output.write(ET.tostring(element, encoding='utf-8'))
    output.write(b'</osm>')
##############################################################################################
##################################### PORTING TO MONGODB #####################################
##############################################################################################
# Code taken from Lesson 6 and adapted to my situation
import xml.etree.cElementTree as ET
import pprint
import re
import codecs
import json
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')
# element attributes that are collected in the 'created' sub-dictionary
CREATED = [ "version", "changeset", "timestamp", "user", "uid"]


def _sub_dict(node, key):
    """Return the dictionary stored in node[key]; a new one is stored if there is no dictionary yet."""
    if not isinstance(node.get(key), dict):
        node[key] = {}
    return node[key]


def shape_element(element):
    """Shapes an OSM 'node' or 'way' element into a dictionary that can be dumped as JSON.

    The result contains:
      - 'type': the element's tag, and 'visible' ('true' unless the element has its own
        'visible' attribute),
      - 'created': the attributes listed in CREATED,
      - 'pos': [lat, lon] as floats, if both attributes are present and numeric,
      - every other attribute under its own name,
      - 'node_refs': the 'ref' values of the 'nd' elements (for ways only),
      - one entry per child tag: keys without a colon are stored directly, 'addr:<x>' keys
        go into the 'address' dictionary, other '<main>:<x>' keys go into node[<main>] if
        that already is a dictionary and into node[<main> + 'dict'] otherwise. Keys that
        contain two colon-terminated words in a row are ignored.
    Reference: https://www.udacity.com/course/viewer#!/c-ud032-nd/l-768058569/e-865240067/m-863660253

    Returns:
        The dictionary, or None for elements other than 'node' and 'way'.
    """
    if element.tag not in ('node', 'way'):
        return None
    node = {'created': {}, 'visible': 'true', 'type': element.tag}
    for key, value in element.attrib.items():
        if key in CREATED:
            node['created'][key] = value
        elif key not in ('lat', 'lon'):
            node[key] = value
    try:
        node['pos'] = [float(element.attrib['lat']), float(element.attrib['lon'])]
    except (KeyError, ValueError):
        # no (usable) coordinates on this element: leave 'pos' out
        pass
    if element.tag == 'way':
        node['node_refs'] = [nd.attrib['ref'] for nd in element.iter('nd')]
    # creating the additional dicts
    for child in element:
        if child.tag != 'tag':
            continue
        attrib_key = child.attrib['k']
        attrib_value = child.attrib['v']
        if re.search(r'(\w+:){2}', attrib_key):
            continue
        if ':' not in attrib_key:
            node[attrib_key] = attrib_value
            continue
        # Split at the first colon; unlike a regex split this also handles
        # one-character parts and never leaves secondary_key undefined.
        main_key, _, secondary_key = attrib_key.partition(':')
        if not main_key or not secondary_key:
            continue
        if main_key == 'addr':
            target = _sub_dict(node, 'address')
        elif isinstance(node.get(main_key), dict):
            target = node[main_key]
        else:
            ### NOTE: The main key might already exist as a plain key one level up.
            ### The sub-keys are therefore stored under '<main key>dict', and that
            ### dictionary is reused so that several sub-keys can be collected in it.
            target = _sub_dict(node, main_key + 'dict')
        target[secondary_key] = attrib_value
    return node
def process_map(file_in, pretty = False):
    """Takes an OSM file as input, restructures the Elements to JSON objects and writes a new .json file.

    Args:
        file_in: Path of the OSM XML file. The output is written to file_in + '.json'.
        pretty: If true, every JSON object is written with an indentation of 2
            instead of on a single line.

    Returns:
        A list with the dictionaries returned by shape_element() for all
        'node' and 'way' elements, in document order.
    """
    file_out = "{0}.json".format(file_in)
    indent = 2 if pretty else None
    data = []
    with codecs.open(file_out, "w") as fo:
        for _, element in ET.iterparse(file_in):
            el = shape_element(element)
            if el:
                data.append(el)
                fo.write(json.dumps(el, indent=indent) + "\n")
                # The element has been shaped into el; release its attributes
                # and children so the parsed tree does not keep growing.
                element.clear()
    return data
# calling the function to create the .json file
# NOTE(review): this converts the original extract 'las-vegas_nevada.osm', not the merged
# output 'LV_applied_changes.osm' that is written above - confirm that this is intended.
json_struct = process_map('las-vegas_nevada.osm')