def iter_osm_notes(feed_limit=25, interval=60, parse_timestamps=True):
    """Poll the global OSM Notes RSS feed and yield note changes.

    Args:
        feed_limit: number of feed items to request per poll.
        interval: seconds to sleep between polls.
        parse_timestamps: forwarded to get_note(); when True, timestamps
            on the fetched notes are parsed rather than kept as strings.

    Yields:
        (action, note) tuples in change order (oldest to newest), where
        action is one of 'create', 'comment' or 'close', followed by a
        model.Finished sentinel after each poll.
    """
    last_seen_guid = None

    while True:
        u = requests.get(
            'https://www.openstreetmap.org/api/0.6/notes/feed',
            params=dict(limit=feed_limit),
        )
        u.raise_for_status()
        tree = etree.fromstring(u.content)

        items = tree.xpath('/rss/channel/item')
        # The feed is newest-to-oldest, so the first item is the newest.
        newest_guid = items[0].xpath('link')[0].text if items else None

        new_notes = []
        for note_item in items:
            # Note that (at least for now) the link and guid are the same
            # in the feed.
            guid = note_item.xpath('link')[0].text

            if guid == last_seen_guid:
                # Everything from here on was yielded on a previous poll.
                break

            if last_seen_guid is None:
                # The first time through we want the first item to be the
                # "last seen" because the RSS feed is newest-to-oldest.
                last_seen_guid = guid
                continue

            title = note_item.xpath('title')[0].text
            if title.startswith('new note ('):
                action = 'create'
            elif title.startswith('new comment ('):
                action = 'comment'
            elif title.startswith('closed note ('):
                action = 'close'
            else:
                # BUG FIX: an unrecognized title (e.g. a reopened note)
                # previously left `action` unbound or stale; skip it.
                continue

            note_id = int(guid.split('/')[-1].split('#c')[0])
            new_notes.append((action, get_note(note_id, parse_timestamps)))

        # BUG FIX: advance the high-water mark to this poll's newest item.
        # Previously last_seen_guid was set only once, so later polls
        # re-yielded every note newer than the very first poll's head.
        if newest_guid is not None:
            last_seen_guid = newest_guid

        # We yield the reversed list because we want to yield in change
        # order (i.e. "oldest to most current").
        for note in reversed(new_notes):
            yield note

        yield model.Finished(None, None)

        time.sleep(interval)
def iter_changeset_stream(start_sqn=None,
                          base_url='https://planet.openstreetmap.org/replication/changesets',
                          expected_interval=60,
                          parse_timestamps=True,
                          state_dir=None):
    """Start processing an OSM changeset stream and yield one
    (action, primitive) tuple at a time to the caller.

    Args:
        start_sqn: sequence number to start from; when None it is read
            from state_dir (if given) or from the server's current state.
        base_url: root URL of the changeset replication directory.
        expected_interval: unused here; kept for signature parity with
            iter_osm_stream().
        parse_timestamps: when True, created_at/closed_at are parsed into
            datetimes; otherwise the raw attribute strings are kept.
        state_dir: optional directory whose state.yaml persists the
            sequence number between runs.

    Yields:
        model.Changeset objects, followed by a model.Finished sentinel
        after each replication file.
    """
    # This is a lot like the other osm_stream except there's no
    # state file for each of the diffs, so just push ahead until
    # we run into a 404.

    # If the user specifies a state_dir, read the state from the statefile there
    if state_dir:
        if not os.path.exists(state_dir):
            raise Exception('Specified state_dir "%s" doesn\'t exist.' % state_dir)

        if os.path.exists('%s/state.yaml' % state_dir):
            with open('%s/state.yaml' % state_dir, 'r') as f:
                state = readState(f, ': ')
                start_sqn = state['sequence']

    # If no start_sqn, assume to start from the most recent changeset file
    if not start_sqn:
        u = requests.get('%s/state.yaml' % base_url)
        u.raise_for_status()
        state = readState(u.text, ': ')
        sequenceNumber = int(state['sequence'])
    else:
        sequenceNumber = int(start_sqn)

    interval_fudge = 0.0

    while True:
        sqnStr = str(sequenceNumber).zfill(9)
        url = '%s/%s/%s/%s.osm.gz' % (base_url, sqnStr[0:3], sqnStr[3:6], sqnStr[6:9])

        delay = 1.0
        while True:
            content = requests.get(url)

            if content.status_code == 404:
                # Not published yet; back off (capped) and retry.
                time.sleep(delay)
                delay = min(delay * 2, 13)
                interval_fudge += delay
                continue

            # BUG FIX: previously any non-404 error response (500, 503, ...)
            # was handed to the gzip decoder and crashed confusingly.
            content.raise_for_status()

            content = io.BytesIO(content.content)
            gzipper = gzip.GzipFile(fileobj=content)
            interval_fudge -= (interval_fudge / 2.0)
            break

        obj = None
        for event, elem in etree.iterparse(gzipper, events=('start', 'end')):
            if event == 'start':
                if elem.tag == 'changeset':
                    obj = model.Changeset(
                        int(elem.attrib['id']),
                        isoToDatetime(elem.attrib.get('created_at')) if parse_timestamps else elem.attrib.get('created_at'),
                        isoToDatetime(elem.attrib.get('closed_at')) if parse_timestamps else elem.attrib.get('closed_at'),
                        maybeBool(elem.attrib['open']),
                        maybeFloat(elem.get('min_lat')),
                        maybeFloat(elem.get('max_lat')),
                        maybeFloat(elem.get('min_lon')),
                        maybeFloat(elem.get('max_lon')),
                        elem.attrib.get('user'),
                        maybeInt(elem.attrib.get('uid')),
                        [])
                # Guard against a stray <tag> outside a <changeset>, which
                # previously dereferenced None.
                elif elem.tag == 'tag' and obj is not None:
                    obj.tags.append(
                        model.Tag(elem.attrib['k'], elem.attrib['v']))
            elif event == 'end':
                if elem.tag == 'changeset':
                    yield obj
                    obj = None
                    # Free the parsed element to keep iterparse memory
                    # bounded over a long-running stream.
                    elem.clear()

        yield model.Finished(sequenceNumber, None)

        sequenceNumber += 1

        if state_dir:
            with open('%s/state.yaml' % state_dir, 'w') as f:
                # BUG FIX: trailing newline so the state file is a
                # well-formed single-line document.
                f.write('sequence: %d\n' % sequenceNumber)
def iter_osm_stream(start_sqn=None,
                    base_url='https://planet.openstreetmap.org/replication/minute',
                    expected_interval=60,
                    parse_timestamps=True,
                    state_dir=None):
    """Start processing an OSM diff stream and yield one changeset at a
    time to the caller.

    Args:
        start_sqn: sequence number to start from; when None it is read
            from state_dir (if given) or from the server's current state.
        base_url: root URL of the replication directory (minute/hour/day).
        expected_interval: seconds expected between published diffs; used
            to decide how long to sleep before polling again.
        parse_timestamps: forwarded to iter_osm_change_file().
        state_dir: optional directory whose state.txt persists the
            replication state between runs.

    Yields:
        (action, primitive) tuples from each diff file, followed by a
        (None, model.Finished) sentinel after each diff.
    """
    # If the user specifies a state_dir, read the state from the statefile there
    if state_dir:
        if not os.path.exists(state_dir):
            raise Exception('Specified state_dir "%s" doesn\'t exist.' % state_dir)

        if os.path.exists('%s/state.txt' % state_dir):
            with open('%s/state.txt' % state_dir) as f:
                state = readState(f)
                start_sqn = state['sequenceNumber']

    # If no start_sqn, assume to start from the most recent diff
    if not start_sqn:
        u = requests.get('%s/state.txt' % base_url)
        # BUG FIX: previously an HTTP error body was parsed as a state file.
        u.raise_for_status()
        state = readState(u.text)
    else:
        sqnStr = str(start_sqn).zfill(9)
        u = requests.get('%s/%s/%s/%s.state.txt' % (base_url, sqnStr[0:3], sqnStr[3:6], sqnStr[6:9]))
        u.raise_for_status()
        state = readState(u.text)

    interval_fudge = 0.0

    while True:
        # BUG FIX: wrap in str() — readState may hand back a non-string
        # sequence number, and .zfill only exists on str.
        sqnStr = str(state['sequenceNumber']).zfill(9)
        url = '%s/%s/%s/%s.osc.gz' % (base_url, sqnStr[0:3], sqnStr[3:6], sqnStr[6:9])
        content = requests.get(url)
        # BUG FIX: fail loudly instead of gunzipping an error page.
        content.raise_for_status()
        content = io.BytesIO(content.content)
        gzipper = gzip.GzipFile(fileobj=content)

        for a in iter_osm_change_file(gzipper, parse_timestamps):
            yield a

        # After parsing the OSC, check to see how much time is remaining
        stateTs = datetime.datetime.strptime(state['timestamp'], "%Y-%m-%dT%H:%M:%SZ")
        yield (None, model.Finished(state['sequenceNumber'], stateTs))

        nextTs = stateTs + datetime.timedelta(seconds=expected_interval + interval_fudge)
        if datetime.datetime.utcnow() < nextTs:
            timeToSleep = (nextTs - datetime.datetime.utcnow()).total_seconds()
        else:
            timeToSleep = 0.0
        time.sleep(timeToSleep)

        # Then try to fetch the next state file
        sqnStr = str(int(state['sequenceNumber']) + 1).zfill(9)
        url = '%s/%s/%s/%s.state.txt' % (base_url, sqnStr[0:3], sqnStr[3:6], sqnStr[6:9])

        delay = 1.0
        while True:
            u = requests.get(url)

            if u.status_code == 404:
                # Next diff isn't published yet; back off (capped) and retry.
                time.sleep(delay)
                delay = min(delay * 2, 13)
                interval_fudge += delay
                continue

            # BUG FIX: a non-404 error (500/503) previously broke out of the
            # retry loop and its body was parsed as a state file.
            u.raise_for_status()

            interval_fudge -= (interval_fudge / 2.0)
            break

        if state_dir:
            with open('%s/state.txt' % state_dir, 'w') as f:
                f.write(u.text)

        # Parse the fetched text directly instead of re-reading the file
        # we just wrote (same content, one less filesystem round-trip).
        state = readState(u.text)