Exemplo n.º 1
0
def iter_osm_notes(feed_limit=25, interval=60, parse_timestamps=True):
    """Poll the global OSM Notes RSS feed and yield new note activity.

    The feed is fetched in an endless loop. The first poll that returns at
    least one entry only records the newest entry as the baseline; every
    later poll reports the entries that appeared since the previous poll.

    Args:
        feed_limit: Sent to the feed as its ``limit`` query parameter.
        interval: Seconds to sleep between two polls.
        parse_timestamps: Passed through unchanged to ``get_note``.

    Yields:
        ``(action, note)`` tuples in oldest-to-newest order, where ``action``
        is ``'create'``, ``'comment'``, ``'close'`` or ``None`` (title not
        recognised) and ``note`` is whatever
        ``get_note(note_id, parse_timestamps)`` returns. After every poll a
        bare ``model.Finished(None, None)`` marker is yielded.

    Raises:
        Whatever ``raise_for_status()`` raises when the feed request fails.
    """

    # Title prefixes used by the feed, mapped to the action we report.
    title_prefix_actions = (
        ('new note (', 'create'),
        ('new comment (', 'comment'),
        ('closed note (', 'close'),
    )

    last_seen_guid = None
    while True:
        u = requests.get(
            'https://www.openstreetmap.org/api/0.6/notes/feed',
            params=dict(limit=feed_limit),
        )
        u.raise_for_status()

        tree = etree.fromstring(u.content)

        newest_guid = None
        new_notes = []
        for note_item in tree.xpath('/rss/channel/item'):
            # Note that (at least for now) the link and guid are the same in the feed.
            guid = note_item.xpath('link')[0].text

            # The RSS feed is newest-to-oldest, so the first item of every
            # poll is the one to remember as "last seen" for the next poll.
            if newest_guid is None:
                newest_guid = guid

            # Stop at the entry we already reported. On the very first poll
            # there is no baseline yet, so nothing counts as new.
            if last_seen_guid is None or guid == last_seen_guid:
                break

            # Decide the action per item; an unrecognised title must not
            # inherit the action of the previous item.
            title = note_item.xpath('title')[0].text or ''
            action = None
            for prefix, prefix_action in title_prefix_actions:
                if title.startswith(prefix):
                    action = prefix_action
                    break

            note_id = int(guid.split('/')[-1].split('#c')[0])
            new_notes.append((action, get_note(note_id, parse_timestamps)))

        # Advance the marker every poll; otherwise the same entries would be
        # reported again on each subsequent poll.
        if newest_guid is not None:
            last_seen_guid = newest_guid

        # We yield the reversed list because we want to yield in change order
        # (i.e. "oldest to most current")
        for note in reversed(new_notes):
            yield note

        yield model.Finished(None, None)

        time.sleep(interval)
Exemplo n.º 2
0
def iter_changeset_stream(
        start_sqn=None,
        base_url='https://planet.openstreetmap.org/replication/changesets',
        expected_interval=60,
        parse_timestamps=True,
        state_dir=None):
    """Follow an OSM changeset replication stream and yield its changesets.

    Replication files are downloaded one sequence number after another, in
    an endless loop. A 404 for the next file is treated as "not published
    yet" and retried with a growing delay (capped at 13 seconds).

    Args:
        start_sqn: Sequence number to start from. When falsy, the current
            sequence is read from ``<base_url>/state.yaml``.
        base_url: Root URL of the changeset replication directory.
        expected_interval: Accepted for interface compatibility; not used
            by this function.
        parse_timestamps: When true, ``created_at`` / ``closed_at`` are
            passed through ``isoToDatetime``; otherwise the raw attribute
            values are used.
        state_dir: Optional existing directory. If it contains a
            ``state.yaml`` file, its ``sequence`` value overrides
            ``start_sqn``; after each processed file the next sequence
            number is written back to that file.

    Yields:
        One ``model.Changeset`` per ``<changeset>`` element, followed by a
        ``model.Finished(sequenceNumber, None)`` marker after each file.

    Raises:
        Exception: If ``state_dir`` is given but does not exist.
        Whatever ``raise_for_status()`` raises for a failed request other
        than the retried 404.
    """

    # This is a lot like the other osm_stream except there's no
    # state file for each of the diffs, so just push ahead until
    # we run into a 404.

    # If the user specifies a state_dir, read the state from the statefile there
    if state_dir:
        if not os.path.exists(state_dir):
            raise Exception('Specified state_dir "%s" doesn\'t exist.' %
                            state_dir)

        if os.path.exists('%s/state.yaml' % state_dir):
            with open('%s/state.yaml' % state_dir, 'r') as f:
                state = readState(f, ': ')
                start_sqn = state['sequence']

    # If no start_sqn, assume to start from the most recent changeset file
    if not start_sqn:
        u = requests.get('%s/state.yaml' % base_url)
        u.raise_for_status()
        state = readState(u.text, ': ')
        sequenceNumber = int(state['sequence'])
    else:
        sequenceNumber = int(start_sqn)

    while True:
        # The zero-padded sequence number is split into three path segments.
        sqnStr = str(sequenceNumber).zfill(9)
        url = '%s/%s/%s/%s.osm.gz' % (base_url, sqnStr[0:3], sqnStr[3:6],
                                      sqnStr[6:9])

        delay = 1.0
        while True:
            response = requests.get(url)

            if response.status_code == 404:
                # Not published yet: wait and retry with a longer delay.
                time.sleep(delay)
                delay = min(delay * 2, 13)
                continue

            # Any other error status must not be fed to the gzip decoder.
            response.raise_for_status()
            break

        gzipper = gzip.GzipFile(fileobj=io.BytesIO(response.content))

        obj = None
        for event, elem in etree.iterparse(gzipper, events=('start', 'end')):
            if event == 'start':
                if elem.tag == 'changeset':
                    obj = model.Changeset(
                        int(elem.attrib['id']),
                        isoToDatetime(elem.attrib.get('created_at'))
                        if parse_timestamps else elem.attrib.get('created_at'),
                        isoToDatetime(elem.attrib.get('closed_at'))
                        if parse_timestamps else elem.attrib.get('closed_at'),
                        maybeBool(elem.attrib['open']),
                        maybeFloat(elem.get('min_lat')),
                        maybeFloat(elem.get('max_lat')),
                        maybeFloat(elem.get('min_lon')),
                        maybeFloat(elem.get('max_lon')),
                        elem.attrib.get('user'),
                        maybeInt(elem.attrib.get('uid')), [])
                elif elem.tag == 'tag':
                    obj.tags.append(
                        model.Tag(elem.attrib['k'], elem.attrib['v']))
            elif event == 'end':
                if elem.tag == 'changeset':
                    # The changeset is complete once its closing tag is seen.
                    yield obj
                    obj = None

        yield model.Finished(sequenceNumber, None)

        sequenceNumber += 1

        # Persist the NEXT sequence number so a restart resumes after the
        # file that was just processed.
        if state_dir:
            with open('%s/state.yaml' % state_dir, 'w') as f:
                f.write('sequence: %d' % sequenceNumber)
Exemplo n.º 3
0
def iter_osm_stream(
        start_sqn=None,
        base_url='https://planet.openstreetmap.org/replication/minute',
        expected_interval=60,
        parse_timestamps=True,
        state_dir=None):
    """Follow an OSM diff replication stream and yield its contents.

    Each iteration downloads the ``.osc.gz`` diff named by the current state,
    yields its contents, sleeps until the next diff is expected, and then
    polls for the next state file. A 404 for that state file is retried with
    a growing delay (capped at 13 seconds).

    Args:
        start_sqn: Sequence number to start from. When falsy, the newest
            state is read from ``<base_url>/state.txt``.
        base_url: Root URL of the replication directory.
        expected_interval: Expected number of seconds between two diffs;
            used to decide how long to sleep before polling for the next one.
        parse_timestamps: Passed through to ``iter_osm_change_file``.
        state_dir: Optional existing directory. If it contains a
            ``state.txt`` file, its ``sequenceNumber`` overrides
            ``start_sqn``; every newly fetched state file is written there.

    Yields:
        Every item produced by ``iter_osm_change_file`` for the diff
        (presumably ``(action, primitive)`` tuples -- confirm against that
        helper), followed by ``(None, model.Finished(sequenceNumber,
        timestamp))`` after each diff.

    Raises:
        Exception: If ``state_dir`` is given but does not exist.
        Whatever ``raise_for_status()`` raises for a failed request other
        than the retried 404.
    """

    # If the user specifies a state_dir, read the state from the statefile there
    if state_dir:
        if not os.path.exists(state_dir):
            raise Exception('Specified state_dir "%s" doesn\'t exist.' %
                            state_dir)

        if os.path.exists('%s/state.txt' % state_dir):
            with open('%s/state.txt' % state_dir) as f:
                state = readState(f)
                start_sqn = state['sequenceNumber']

    # If no start_sqn, assume to start from the most recent diff
    if not start_sqn:
        u = requests.get('%s/state.txt' % base_url)
    else:
        sqnStr = str(start_sqn).zfill(9)
        u = requests.get('%s/%s/%s/%s.state.txt' %
                         (base_url, sqnStr[0:3], sqnStr[3:6], sqnStr[6:9]))
    # Fail here rather than parsing an error page as a state file.
    u.raise_for_status()
    state = readState(u.text)

    # Extra seconds added to the expected interval; grows while we have to
    # wait for a state file and is halved each time one is fetched.
    interval_fudge = 0.0

    while True:
        sqnStr = state['sequenceNumber'].zfill(9)
        url = '%s/%s/%s/%s.osc.gz' % (base_url, sqnStr[0:3], sqnStr[3:6],
                                      sqnStr[6:9])
        content = requests.get(url)
        # An error body must not be fed to the gzip decoder.
        content.raise_for_status()
        gzipper = gzip.GzipFile(fileobj=io.BytesIO(content.content))

        for a in iter_osm_change_file(gzipper, parse_timestamps):
            yield a

        # After parsing the OSC, check to see how much time is remaining
        stateTs = datetime.datetime.strptime(state['timestamp'],
                                             "%Y-%m-%dT%H:%M:%SZ")
        yield (None, model.Finished(state['sequenceNumber'], stateTs))

        nextTs = stateTs + datetime.timedelta(seconds=expected_interval +
                                              interval_fudge)
        # Read the clock once and clamp at zero: time.sleep() rejects
        # negative values.
        remaining = (nextTs - datetime.datetime.utcnow()).total_seconds()
        time.sleep(max(remaining, 0.0))

        # Then try to fetch the next state file
        sqnStr = str(int(state['sequenceNumber']) + 1).zfill(9)
        url = '%s/%s/%s/%s.state.txt' % (base_url, sqnStr[0:3], sqnStr[3:6],
                                         sqnStr[6:9])
        delay = 1.0
        while True:
            u = requests.get(url)

            if u.status_code == 404:
                # Not published yet: wait and retry with a longer delay.
                time.sleep(delay)
                delay = min(delay * 2, 13)
                interval_fudge += delay
                continue

            # Any other error status is a real failure, not "not yet".
            u.raise_for_status()
            interval_fudge -= (interval_fudge / 2.0)
            break

        if state_dir:
            with open('%s/state.txt' % state_dir, 'w') as f:
                f.write(u.text)
            with open('%s/state.txt' % state_dir, 'r') as f:
                state = readState(f)
        else:
            state = readState(u.text)