def lease(self, key): """Attempts to acquire and lease the :class:`models.Webmentions` entity. Returns True on success, False or None otherwise. TODO: unify with :meth:`complete()` Args: key: :class:`ndb.Key` """ self.entity = key.get() if self.entity is None: self.fail('no entity!') elif self.entity.status == 'complete': # let this task return 200 and finish logging.warning('duplicate task already propagated this') elif (self.entity.status == 'processing' and util.now_fn() < self.entity.leased_until): self.fail('duplicate task is currently processing!') else: assert self.entity.status in ('new', 'processing', 'error'), self.entity.status self.entity.status = 'processing' self.entity.leased_until = util.now_fn() + self.LEASE_LENGTH self.entity.put() return True
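A minimal, self-contained sketch of the lease state machine above, using an injectable clock the same way util.now_fn is used; FakeEntity, try_lease, and the LEASE_LENGTH value here are illustrative stand-ins, not Bridgy's real models.

import datetime

LEASE_LENGTH = datetime.timedelta(minutes=2)  # illustrative value
now_fn = datetime.datetime.now                # swapped out in tests

class FakeEntity:
  """Stand-in for a Webmentions entity: just status and leased_until."""
  def __init__(self, status='new', leased_until=None):
    self.status = status
    self.leased_until = leased_until

def try_lease(entity):
  """Returns True if we acquired the lease, False otherwise."""
  if entity.status == 'complete':
    return False  # already propagated; nothing to do
  if entity.status == 'processing' and now_fn() < entity.leased_until:
    return False  # another task holds a live lease
  entity.status = 'processing'
  entity.leased_until = now_fn() + LEASE_LENGTH
  return True

e = FakeEntity()
assert try_lease(e)      # first worker wins
assert not try_lease(e)  # second worker sees the live lease and backs off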
def lease(self, key): """Attempts to acquire and lease the :class:`models.Webmentions` entity. Returns True on success, False or None otherwise. TODO: unify with :meth:`complete()` Args: key: :class:`ndb.Key` """ self.entity = key.get() if self.entity is None: self.fail("no entity!") elif self.entity.status == "complete": # let this task return 200 and finish logging.warning("duplicate task already propagated this") elif self.entity.status == "processing" and util.now_fn() < self.entity.leased_until: self.fail("duplicate task is currently processing!") else: assert self.entity.status in ("new", "processing", "error"), self.entity.status self.entity.status = "processing" self.entity.leased_until = util.now_fn() + self.LEASE_LENGTH self.entity.put() return True
def setup_instagram(self, batch_size=None, weekday=0): if batch_size: self.mox.stubs.Set(cron.UpdateInstagramPictures, 'BATCH', batch_size) self.mox.StubOutWithMock(util, 'now_fn') # 2017-01-02 is a Monday, which datetime.weekday() returns 0 for util.now_fn().AndReturn(datetime.datetime(2017, 1, 2 + weekday))
def lease(self, key): """Attempts to acquire and lease the Webmentions entity. Returns True on success, False or None otherwise. TODO: unify with complete() Args: key: ndb.Key """ self.entity = key.get() if self.entity is None: self.fail('no entity!') elif self.entity.status == 'complete': # let this task return 200 and finish logging.warning('duplicate task already propagated this') elif (self.entity.status == 'processing' and util.now_fn() < self.entity.leased_until): self.fail('duplicate task is currently processing!') else: assert self.entity.status in ('new', 'processing', 'error') self.entity.status = 'processing' self.entity.leased_until = util.now_fn() + self.LEASE_LENGTH self.entity.put() return True
def setup_instagram(self, batch_size=None, weekday=0): self.mox.stubs.Set(appengine_config, 'INSTAGRAM_SESSIONID_COOKIE', None) if batch_size: self.mox.stubs.Set(cron.UpdateInstagramPictures, 'BATCH', batch_size) self.mox.StubOutWithMock(util, 'now_fn') # 2017-01-02 is a Monday, which datetime.weekday() returns 0 for util.now_fn().AndReturn(datetime.datetime(2017, 1, 2 + weekday))
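These tests stub util.now_fn rather than patching datetime itself; a rough sketch of why that indirection makes time-dependent code easy to test, with a toy util class standing in for Bridgy's util module.

import datetime

class util:                       # toy stand-in for the real util module
  now_fn = datetime.datetime.now  # production code always calls util.now_fn()

def hours_until(deadline):
  return (deadline - util.now_fn()).total_seconds() / 3600

# in a test, freeze time by replacing the callable
util.now_fn = lambda: datetime.datetime(2017, 1, 2)
assert hours_until(datetime.datetime(2017, 1, 3)) == 24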
def replace_poll_tasks(): """Finds sources missing their poll tasks and adds new ones.""" queries = [ cls.query(Source.features == 'listen', Source.status == 'enabled', Source.last_poll_attempt < util.now_fn() - timedelta(days=2)) for cls in models.sources.values() if cls.AUTO_POLL ] for source in itertools.chain(*queries): age = util.now_fn() - source.last_poll_attempt logger.info( f'{source.bridgy_url()} last polled {age} ago. Adding new poll task.' ) util.add_poll_task(source) return ''
def discover(): source = util.load_source() # validate URL, find silo post url = request.form['url'] domain = util.domain_from_link(url) path = urllib.parse.urlparse(url).path msg = 'Discovering now. Refresh in a minute to see the results!' gr_source = source.gr_source if domain == gr_source.DOMAIN: post_id = gr_source.post_id(url) if post_id: type = 'event' if path.startswith('/events/') else None util.add_discover_task(source, post_id, type=type) else: msg = f"Sorry, that doesn't look like a {gr_source.NAME} post URL." elif util.domain_or_parent_in(domain, source.domains): synd_links = original_post_discovery.process_entry( source, url, {}, False, []) if synd_links: for link in synd_links: util.add_discover_task(source, gr_source.post_id(link)) source.updates = {'last_syndication_url': util.now_fn()} models.Source.put_updates(source) else: msg = f'Failed to fetch {util.pretty_link(url)} or find a {gr_source.NAME} syndication link.' else: msg = f'Please enter a URL on either your web site or {gr_source.NAME}.' flash(msg) return redirect(source.bridgy_url())
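The second branch above relies on util.domain_or_parent_in to decide whether the URL belongs to one of the user's registered domains; a rough approximation of that check, for illustration only (the real helper may normalize and handle more cases).

def domain_or_parent_in(domain, domains):
  """True if domain equals, or is a subdomain of, any entry in domains."""
  return any(domain == d or domain.endswith('.' + d) for d in domains)

assert domain_or_parent_in('si.te', ['si.te'])
assert domain_or_parent_in('blog.si.te', ['si.te'])
assert not domain_or_parent_in('notsi.te', ['si.te'])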
def post(self): source = self.load_source() # validate URL, find silo post url = util.get_required_param(self, 'url') domain = util.domain_from_link(url) path = urlparse.urlparse(url).path msg = 'Discovering now. Refresh in a minute to see the results!' if domain == source.GR_CLASS.DOMAIN: post_id = source.GR_CLASS.post_id(url) if post_id: type = 'event' if path.startswith('/events/') else None util.add_discover_task(source, post_id, type=type) else: msg = "Sorry, that doesn't look like a %s post URL." % source.GR_CLASS.NAME elif util.domain_or_parent_in(domain, source.domains): synd_links = original_post_discovery.process_entry(source, url, {}, False, []) if synd_links: for link in synd_links: util.add_discover_task(source, source.GR_CLASS.post_id(link)) source.updates = {'last_syndication_url': util.now_fn()} models.Source.put_updates(source) else: msg = 'Failed to fetch %s or find a %s syndication link.' % ( util.pretty_link(url), source.GR_CLASS.NAME) else: msg = 'Please enter a URL on either your web site or %s.' % source.GR_CLASS.NAME self.messages.add(msg) self.redirect(source.bridgy_url(self))
def check_params(params): req = params['app_engine_http_request'] if not check_queue(req['relative_uri']): return False # convert model objects and keys to url-safe key strings for comparison for name, val in kwargs.items(): if isinstance(val, ndb.Model): kwargs[name] = val.key.urlsafe().decode() elif isinstance(val, ndb.Key): kwargs[name] = val.urlsafe().decode() got = set(urllib.parse.parse_qsl(req['body'].decode())) expected = set(kwargs.items()) if got != expected: # print('expect_task: expected %s, got %s' % (expected, got)) return False if eta_seconds is not None: got = params['schedule_time'].seconds - util.to_utc_timestamp( util.now_fn()) delta = eta_seconds * .2 + 10 if not (got + delta >= eta_seconds >= got - delta): # print('expect_task: expected schedule_time %r, got %r' % (eta_seconds, got)) return False return True
def test_discover_url_site_post_syndication_links(self): self.expect_requests_get('http://si.te/123', """ <div class="h-entry"> foo <a class="u-syndication" href="http://fa.ke/222"></a> <a class="u-syndication" href="http://other/silo"></a> <a class="u-syndication" href="http://fa.ke/post/444"></a> </div>""") self.mox.ReplayAll() self.assertEqual(0, SyndicatedPost.query().count()) self.check_discover('http://si.te/123', 'Discovering now. Refresh in a minute to see the results!') self.assertItemsEqual([ {'https://fa.ke/222': 'http://si.te/123'}, {'https://fa.ke/post/444': 'http://si.te/123'}, ], [{sp.syndication: sp.original} for sp in models.SyndicatedPost.query()]) tasks = self.taskqueue_stub.GetTasks('discover') key = self.source.key.urlsafe() self.assertEqual([ {'source_key': key, 'post_id': '222'}, {'source_key': key, 'post_id': '444'}, ], [testutil.get_task_params(task) for task in tasks]) now = util.now_fn() source = self.source.key.get() self.assertEqual(now, source.last_syndication_url)
def post(self): source = self.load_source() # validate URL, find silo post url = util.get_required_param(self, 'url') domain = util.domain_from_link(url) path = urllib.parse.urlparse(url).path msg = 'Discovering now. Refresh in a minute to see the results!' if domain == source.GR_CLASS.DOMAIN: post_id = source.GR_CLASS.post_id(url) if post_id: type = 'event' if path.startswith('/events/') else None util.add_discover_task(source, post_id, type=type) else: msg = "Sorry, that doesn't look like a %s post URL." % source.GR_CLASS.NAME elif util.domain_or_parent_in(domain, source.domains): synd_links = original_post_discovery.process_entry( source, url, {}, False, []) if synd_links: for link in synd_links: util.add_discover_task(source, source.GR_CLASS.post_id(link)) source.updates = {'last_syndication_url': util.now_fn()} models.Source.put_updates(source) else: msg = 'Failed to fetch %s or find a %s syndication link.' % ( util.pretty_link(url), source.GR_CLASS.NAME) else: msg = 'Please enter a URL on either your web site or %s.' % source.GR_CLASS.NAME self.messages.add(msg) self.redirect(source.bridgy_url(self))
def refetch(source): """Refetch the author's URLs and look for new or updated syndication links that might not have been there the first time we looked. Args: source: models.Source subclass. Changes to property values (e.g. domains, domain_urls, last_syndication_url) are stored in source.updates; they should be updated transactionally later. Return: a dict of syndicated_url to a list of new models.SyndicatedPosts """ if not source.updates: source.updates = {} logging.debug('attempting to refetch h-feed for %s', source.label()) results = {} for url in _get_author_urls(source): results.update(_process_author(source, url, refetch=True)) now = util.now_fn() logging.debug('updating source last_hfeed_fetch %s', now) source.updates['last_hfeed_fetch'] = now return results
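refetch() never writes the source directly; it records changes in source.updates for a later transactional put. A generic sketch of that buffered-updates pattern, where Record and apply_updates are illustrative and not models.Source.put_updates itself.

class Record:
  def __init__(self):
    self.last_hfeed_fetch = None
    self.updates = {}

def apply_updates(record):
  """Copy buffered changes onto the entity. In Bridgy this happens inside a
  datastore transaction so concurrent pollers don't clobber each other."""
  for name, value in record.updates.items():
    setattr(record, name, value)
  record.updates = {}
  return record

r = Record()
r.updates['last_hfeed_fetch'] = '2017-01-02T00:00:00'
apply_updates(r)
assert r.last_hfeed_fetch == '2017-01-02T00:00:00'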
def test_replace_poll_tasks(self): now = util.now_fn() # a bunch of sources, one needs a new poll task five_min_ago = now - datetime.timedelta(minutes=5) day_and_half_ago = now - datetime.timedelta(hours=36) month_ago = now - datetime.timedelta(days=30) defaults = { 'features': ['listen'], 'last_webmention_sent': day_and_half_ago, } self.clear_datastore() sources = [ # doesn't need a new poll task FakeSource.new(last_poll_attempt=now, **defaults).put(), FakeSource.new(last_poll_attempt=five_min_ago, **defaults).put(), FakeSource.new(status='disabled', **defaults).put(), FakeSource.new(status='disabled', **defaults).put(), # need a new poll task FakeSource.new(status='enabled', **defaults).put(), # not signed up for listen FakeSource.new(last_webmention_sent=day_and_half_ago).put(), # never sent a webmention, past grace period. last polled is older than 2x # fast poll, but within 2x slow poll. FakeSource.new(features=['listen'], created=month_ago, last_poll_attempt=day_and_half_ago).put(), ] self.expect_task('poll', source_key=sources[4], last_polled='1970-01-01-00-00-00') self.mox.ReplayAll() resp = self.client.get('/cron/replace_poll_tasks') self.assertEqual(200, resp.status_code)
def test_discover_url_site_post_syndication_links(self): self.expect_requests_get( 'http://si.te/123', """ <div class="h-entry"> foo <a class="u-syndication" href="http://fa.ke/222"></a> <a class="u-syndication" href="http://other/silo"></a> <a class="u-syndication" href="http://fa.ke/post/444"></a> </div>""") self.expect_task('discover', source_key=self.source, post_id='222') self.expect_task('discover', source_key=self.source, post_id='444') self.mox.ReplayAll() self.assertEqual(0, SyndicatedPost.query().count()) self.check_discover( 'http://si.te/123', 'Discovering now. Refresh in a minute to see the results!') self.assertCountEqual([ { 'https://fa.ke/222': 'http://si.te/123' }, { 'https://fa.ke/post/444': 'http://si.te/123' }, ], [{ sp.syndication: sp.original } for sp in models.SyndicatedPost.query()]) now = util.now_fn() source = self.source.key.get() self.assertEqual(now, source.last_syndication_url)
def test_discover_url_site_post_last_feed_syndication_url(self): now = util.now_fn() self.source.last_feed_syndication_url = now self.source.put() self.expect_requests_get( 'http://si.te/123', """ <div class="h-entry"> <a class="u-syndication" href="http://fa.ke/222"></a> </div>""") self.mox.ReplayAll() self.check_discover( 'http://si.te/123', 'Discovering now. Refresh in a minute to see the results!') tasks = self.taskqueue_stub.GetTasks('discover') key = self.source.key.urlsafe() self.assertEqual([{ 'source_key': key, 'post_id': '222' }], [testutil.get_task_params(task) for task in tasks]) source = self.source.key.get() self.assertEqual(now, source.last_syndication_url)
def _posse_post_discovery(source, activity, syndication_url, fetch_hfeed):
  """Performs the actual meat of the posse-post-discovery.

  Args:
    source: models.Source subclass
    activity: activity dict
    syndication_url: url of the syndicated copy for which we are trying to
      find an original
    fetch_hfeed: boolean, whether or not to fetch and parse the author's feed
      if we don't have a previously stored relationship

  Return:
    sequence of string original post urls, possibly empty
  """
  logging.info('starting posse post discovery with syndicated %s',
               syndication_url)

  relationships = SyndicatedPost.query(
    SyndicatedPost.syndication == syndication_url,
    ancestor=source.key).fetch()

  if not relationships and fetch_hfeed:
    # a syndicated post we haven't seen before! fetch the author's URLs to see
    # if we can find it.
    #
    # TODO: Consider using the actor's url, with get_author_urls() as the
    # fallback in the future to support content from non-Bridgy users.
    results = {}
    for url in _get_author_urls(source):
      results.update(_process_author(source, url))
    relationships = results.get(syndication_url, [])

    now = util.now_fn()
    logging.debug('updating source last_hfeed_fetch %s', now)
    source.updates['last_hfeed_fetch'] = now

  if not relationships:
    # No relationships were found. Remember that we've seen this
    # syndicated post to avoid reprocessing it every time
    logging.debug('posse post discovery found no relationship for %s',
                  syndication_url)
    if fetch_hfeed:
      SyndicatedPost.insert_syndication_blank(source, syndication_url)

  originals = [r.original for r in relationships if r.original]
  if originals:
    logging.debug('posse post discovery found relationship(s) %s -> %s',
                  syndication_url, originals)
  return originals
def source_query(self): now = util.now_fn() since_sun = (now.weekday() * datetime.timedelta(days=1) + (now - now.replace(hour=0, minute=0, second=0))) batch = float(Instagram.query().count()) / self.BATCH offset = batch * float(since_sun.total_seconds()) / self.FREQUENCY.total_seconds() return Instagram.query().fetch(offset=int(math.floor(offset)), limit=int(math.ceil(batch)))
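The offset arithmetic above sweeps through all Instagram sources once per week, where FREQUENCY is the interval between cron runs and BATCH is the number of runs in a weekly sweep. A worked example under assumed values (1000 sources, a daily run), just to show which slice a given run fetches.

import datetime, math

FREQUENCY = datetime.timedelta(days=1)  # assumed: cron runs once a day
BATCH = 7                               # assumed: 7 runs per weekly sweep
total = 1000                            # assumed number of Instagram sources

def window(now):
  """(offset, limit) of the slice of sources this run should fetch."""
  since_sun = (now.weekday() * datetime.timedelta(days=1) +
               (now - now.replace(hour=0, minute=0, second=0, microsecond=0)))
  batch = float(total) / BATCH          # ~143 sources per run
  offset = batch * since_sun.total_seconds() / FREQUENCY.total_seconds()
  return int(math.floor(offset)), int(math.ceil(batch))

# Wednesday noon is 2.5 FREQUENCY intervals into the week, so this run starts
# 2.5 batches in. By the end of the week the offset approaches 1000.
print(window(datetime.datetime(2017, 1, 4, 12)))  # (357, 143)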
def post(self, *path_args): logging.debug('Params: %s', self.request.params) key = self.request.params['source_key'] source = ndb.Key(urlsafe=key).get() if not source or source.status == 'disabled' or 'listen' not in source.features: logging.error('Source not found or disabled. Dropping task.') return logging.info('Source: %s %s, %s', source.label(), source.key.string_id(), source.bridgy_url(self)) last_polled = self.request.params['last_polled'] if last_polled != source.last_polled.strftime(util.POLL_TASK_DATETIME_FORMAT): logging.warning('duplicate poll task! deferring to the other task.') return logging.info('Last poll: %s', self._last_poll_url(source)) # mark this source as polling source.updates = { 'poll_status': 'polling', 'last_poll_attempt': util.now_fn(), 'rate_limited': False, } source = models.Source.put_updates(source) source.updates = {} try: self.poll(source) except Exception, e: source.updates['poll_status'] = 'error' code, body = util.interpret_http_exception(e) if code == '401' or isinstance(e, models.DisableSource): # the user deauthorized the bridgy app, so disable this source. # let the task complete successfully so that it's not retried. logging.warning('Disabling source due to: %s' % e, exc_info=True) source.updates.update({ 'status': 'disabled', 'poll_status': 'ok', }) body = '%s\nLast poll: %s' % (source.bridgy_url(self), self._last_poll_url(source)) if source.is_beta_user(): util.email_me(subject='Bridgy: disabled %s' % source.label(), body=body) elif code in util.HTTP_RATE_LIMIT_CODES: logging.info('Rate limited. Marking as error and finishing. %s', e) source.updates['rate_limited'] = True elif ((code and int(code) / 100 == 5) or (code == '400' and isinstance(source, flickr.Flickr)) or util.is_connection_failure(e)): logging.error('API call failed. Marking as error and finishing. %s: %s\n%s', code, body, e) self.abort(util.ERROR_HTTP_RETURN_CODE) else: raise
def post(self, *path_args): logging.debug('Params: %s', self.request.params) key = self.request.params['source_key'] source = ndb.Key(urlsafe=key).get() if not source or source.status == 'disabled' or 'listen' not in source.features: logging.error('Source not found or disabled. Dropping task.') return logging.info('Source: %s %s, %s', source.label(), source.key.string_id(), source.bridgy_url(self)) last_polled = self.request.params['last_polled'] if last_polled != source.last_polled.strftime(util.POLL_TASK_DATETIME_FORMAT): logging.warning('duplicate poll task! deferring to the other task.') return logging.info('Last poll: %s', self._last_poll_url(source)) # mark this source as polling source.updates = { 'poll_status': 'polling', 'last_poll_attempt': util.now_fn(), 'rate_limited': False, } source = models.Source.put_updates(source) source.updates = {} try: self.poll(source) except Exception, e: source.updates['poll_status'] = 'error' code, body = util.interpret_http_exception(e) if code in source.DISABLE_HTTP_CODES or isinstance(e, models.DisableSource): # the user deauthorized the bridgy app, so disable this source. # let the task complete successfully so that it's not retried. logging.warning('Disabling source due to: %s' % e, exc_info=True) source.updates.update({ 'status': 'disabled', 'poll_status': 'ok', }) body = '%s\nLast poll: %s' % (source.bridgy_url(self), self._last_poll_url(source)) if source.is_beta_user(): util.email_me(subject='Bridgy: disabled %s' % source.label(), body=body) elif code in source.RATE_LIMIT_HTTP_CODES: logging.info('Rate limited. Marking as error and finishing. %s', e) source.updates['rate_limited'] = True elif ((code and int(code) / 100 == 5) or (code == '400' and isinstance(source, flickr.Flickr)) or util.is_connection_failure(e)): logging.error('API call failed. Marking as error and finishing. %s: %s\n%s', code, body, e) self.abort(util.ERROR_HTTP_RETURN_CODE) else: raise
def post(self, *path_args): self.request.headers['Content-Type'] = 'application/x-www-form-urlencoded' logging.debug('Params: %s', list(self.request.params.items())) key = self.request.params['source_key'] source = self.source = ndb.Key(urlsafe=key).get() if not source or source.status == 'disabled' or 'listen' not in source.features: logging.error('Source not found or disabled. Dropping task.') return logging.info('Source: %s %s, %s', source.label(), source.key.string_id(), source.bridgy_url(self)) last_polled = self.request.params['last_polled'] if last_polled != source.last_polled.strftime(util.POLL_TASK_DATETIME_FORMAT): logging.warning('duplicate poll task! deferring to the other task.') return logging.info('Last poll: %s', self._last_poll_url(source)) # mark this source as polling source.updates = { 'poll_status': 'polling', 'last_poll_attempt': util.now_fn(), 'rate_limited': False, } source = models.Source.put_updates(source) source.updates = {} try: self.poll(source) except Exception as e: source.updates['poll_status'] = 'error' code, body = util.interpret_http_exception(e) if code in source.DISABLE_HTTP_CODES or isinstance(e, models.DisableSource): # the user deauthorized the bridgy app, so disable this source. # let the task complete successfully so that it's not retried. logging.warning('Disabling source due to: %s' % e, stack_info=True) source.updates.update({ 'status': 'disabled', 'poll_status': 'ok', }) body = '%s\nLast poll: %s' % (source.bridgy_url(self), self._last_poll_url(source)) elif code in source.RATE_LIMIT_HTTP_CODES: logging.info('Rate limited. Marking as error and finishing. %s', e) source.updates['rate_limited'] = True else: raise finally: source = models.Source.put_updates(source) util.add_poll_task(source) # feeble attempt to avoid hitting the instance memory limit source = None gc.collect()
def post(self, *path_args): logging.debug('Params: %s', self.request.params) key = self.request.params['source_key'] source = ndb.Key(urlsafe=key).get() if not source or source.status == 'disabled' or 'listen' not in source.features: logging.error('Source not found or disabled. Dropping task.') return logging.info('Source: %s %s, %s', source.label(), source.key.string_id(), source.bridgy_url(self)) last_polled = self.request.params['last_polled'] if last_polled != source.last_polled.strftime( util.POLL_TASK_DATETIME_FORMAT): logging.warning( 'duplicate poll task! deferring to the other task.') return logging.info('Last poll: %s/log?start_time=%s&key=%s', self.request.host_url, calendar.timegm(source.last_poll_attempt.utctimetuple()), source.key.urlsafe()) # mark this source as polling source.updates = { 'poll_status': 'polling', 'last_poll_attempt': util.now_fn(), } source = models.Source.put_updates(source) source.updates = {} try: self.poll(source) except models.DisableSource: # the user deauthorized the bridgy app, so disable this source. # let the task complete successfully so that it's not retried. source.updates['status'] = 'disabled' logging.warning('Disabling source!') except: source.updates['poll_status'] = 'error' raise finally: source = models.Source.put_updates(source) # add new poll task. randomize task ETA to within +/- 20% to try to spread # out tasks and prevent thundering herds. task_countdown = source.poll_period().total_seconds() * random.uniform( .8, 1.2) util.add_poll_task(source, countdown=task_countdown) # feeble attempt to avoid hitting the instance memory limit source = None gc.collect()
def should_refetch(self): """Returns True if we should run OPD refetch on this source now.""" now = util.now_fn() if self.last_hfeed_refetch == REFETCH_HFEED_TRIGGER: return True elif not self.last_syndication_url: return False period = (self.FAST_REFETCH if self.last_syndication_url > now - timedelta(days=14) else self.SLOW_REFETCH) return self.last_poll_attempt >= self.last_hfeed_refetch + period
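A worked example of the refetch schedule above, written as a standalone function with illustrative FAST_REFETCH and SLOW_REFETCH values; the real constants live on the Source class.

import datetime

FAST_REFETCH = datetime.timedelta(hours=6)  # assumed illustrative values
SLOW_REFETCH = datetime.timedelta(days=2)

def should_refetch(now, last_syndication_url, last_poll_attempt,
                   last_hfeed_refetch):
  if not last_syndication_url:
    return False  # author has never published a rel=syndication link
  # authors who syndicated within the last two weeks get the fast schedule
  period = (FAST_REFETCH
            if last_syndication_url > now - datetime.timedelta(days=14)
            else SLOW_REFETCH)
  return last_poll_attempt >= last_hfeed_refetch + period

now = datetime.datetime(2017, 1, 2)
day = datetime.timedelta(days=1)
# syndicated yesterday, last refetched 12h before this poll: refetch again
print(should_refetch(now, now - day, now, now - datetime.timedelta(hours=12)))  # True
# never syndicated: never refetch
print(should_refetch(now, None, now, now - 10 * day))                           # False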
def lease(self, key): """Attempts to acquire and lease the :class:`models.Webmentions` entity. Also loads and sets `g.source`, and returns False if the source doesn't exist or is disabled. TODO: unify with :meth:`complete()` Args: key: :class:`ndb.Key` Returns: True on success, False or None otherwise """ self.entity = key.get() if self.entity is None: return self.fail('no entity!') elif self.entity.status == 'complete': # let this task return 200 and finish logger.warning('duplicate task already propagated this') return elif (self.entity.status == 'processing' and util.now_fn() < self.entity.leased_until): return self.fail('duplicate task is currently processing!') g.source = self.entity.source.get() if not g.source or g.source.status == 'disabled': logger.error('Source not found or disabled. Dropping task.') return False logger.info( f'Source: {g.source.label()} {g.source.key_id()}, {g.source.bridgy_url()}' ) assert self.entity.status in ('new', 'processing', 'error'), self.entity.status self.entity.status = 'processing' self.entity.leased_until = util.now_fn() + self.LEASE_LENGTH self.entity.put() return True
def post(self, *path_args): logging.debug('Params: %s', self.request.params) key = self.request.params['source_key'] source = ndb.Key(urlsafe=key).get() if not source or source.status == 'disabled' or 'listen' not in source.features: logging.error('Source not found or disabled. Dropping task.') return logging.info('Source: %s %s, %s', source.label(), source.key.string_id(), source.bridgy_url(self)) last_polled = self.request.params['last_polled'] if last_polled != source.last_polled.strftime(util.POLL_TASK_DATETIME_FORMAT): logging.warning('duplicate poll task! deferring to the other task.') return logging.info('Last poll: %s/log?start_time=%s&key=%s', self.request.host_url, calendar.timegm(source.last_poll_attempt.utctimetuple()), source.key.urlsafe()) # mark this source as polling source.updates = { 'poll_status': 'polling', 'last_poll_attempt': util.now_fn(), } source = models.Source.put_updates(source) source.updates = {} try: self.poll(source) except models.DisableSource: # the user deauthorized the bridgy app, so disable this source. # let the task complete successfully so that it's not retried. source.updates['status'] = 'disabled' logging.warning('Disabling source!') except: source.updates['poll_status'] = 'error' raise finally: source = models.Source.put_updates(source) # add new poll task. randomize task ETA to within +/- 20% to try to spread # out tasks and prevent thundering herds. task_countdown = source.poll_period().total_seconds() * random.uniform(.8, 1.2) util.add_poll_task(source, countdown=task_countdown) # feeble attempt to avoid hitting the instance memory limit source = None gc.collect()
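The ±20% jitter on the next poll task's countdown above is there to keep sources from re-synchronizing into thundering herds; a small sketch of just that computation, assuming a 30-minute poll period.

import datetime, random

def next_poll_countdown(poll_period, rng=random):
  """Seconds until the next poll, jittered to +/-20% of the poll period."""
  return poll_period.total_seconds() * rng.uniform(.8, 1.2)

period = datetime.timedelta(minutes=30)  # assumed FAST_POLL-style period
countdown = next_poll_countdown(period)
assert 0.8 * 1800 <= countdown <= 1.2 * 1800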
def post(self, *path_args): logging.debug("Params: %s", self.request.params) key = self.request.params["source_key"] source = ndb.Key(urlsafe=key).get() if not source or source.status == "disabled" or "listen" not in source.features: logging.error("Source not found or disabled. Dropping task.") return logging.info("Source: %s %s, %s", source.label(), source.key.string_id(), source.bridgy_url(self)) last_polled = self.request.params["last_polled"] if last_polled != source.last_polled.strftime(util.POLL_TASK_DATETIME_FORMAT): logging.warning("duplicate poll task! deferring to the other task.") return logging.info( "Last poll: %s/log?start_time=%s&key=%s", self.request.host_url, calendar.timegm(source.last_poll_attempt.utctimetuple()), source.key.urlsafe(), ) # mark this source as polling source.updates = {"poll_status": "polling", "last_poll_attempt": util.now_fn()} source = models.Source.put_updates(source) source.updates = {} try: self.poll(source) except Exception, e: source.updates["poll_status"] = "error" code, body = util.interpret_http_exception(e) if code == "401" or isinstance(e, models.DisableSource): # the user deauthorized the bridgy app, so disable this source. # let the task complete successfully so that it's not retried. logging.warning("Disabling source due to: %s" % e, exc_info=True) source.updates.update({"status": "disabled", "poll_status": "ok"}) elif code in util.HTTP_RATE_LIMIT_CODES: logging.warning("Rate limited. Marking as error and finishing. %s", e) source.updates["rate_limited"] = True elif (code and int(code) / 100 == 5) or util.is_connection_failure(e): logging.error("API call failed. Marking as error and finishing. %s: %s\n%s", code, body, e) self.abort(ERROR_HTTP_RETURN_CODE) else: raise
def record_source_webmention(self, mention): """Sets this source's last_webmention_sent and maybe webmention_endpoint. Args: mention: :class:`webmentiontools.send.WebmentionSend` """ self.source = self.source.key.get() logging.info('Setting last_webmention_sent') self.source.last_webmention_sent = util.now_fn() if (mention.receiver_endpoint != self.source.webmention_endpoint and util.domain_from_link(mention.target_url) in self.source.domains): logging.info('Also setting webmention_endpoint to %s (discovered in %s; was %s)', mention.receiver_endpoint, mention.target_url, self.source.webmention_endpoint) self.source.webmention_endpoint = mention.receiver_endpoint self.source.put()
def test_poll_period(self): source = FakeSource.new() source.put() self.assertEqual(source.FAST_POLL, source.poll_period()) source.created = datetime(2000, 1, 1, tzinfo=timezone.utc) self.assertEqual(source.SLOW_POLL, source.poll_period()) now = util.now_fn() source.last_webmention_sent = now - timedelta(days=8) self.assertEqual(source.FAST_POLL * 10, source.poll_period()) source.last_webmention_sent = now self.assertEqual(source.FAST_POLL, source.poll_period()) source.rate_limited = True self.assertEqual(source.RATE_LIMITED_POLL, source.poll_period())
def record_source_webmention(self, mention): """Sets this source's last_webmention_sent and maybe webmention_endpoint. Args: mention: webmentiontools.send.WebmentionSend """ self.source = self.source.key.get() logging.info('Setting last_webmention_sent') self.source.last_webmention_sent = util.now_fn() if (mention.receiver_endpoint != self.source.webmention_endpoint and util.domain_from_link(mention.target_url) in self.source.domains): logging.info('Also setting webmention_endpoint to %s (discovered in %s; was %s)', mention.receiver_endpoint, mention.target_url, self.source.webmention_endpoint) self.source.webmention_endpoint = mention.receiver_endpoint self.source.put()
def record_source_webmention(self, endpoint, target): """Sets this source's last_webmention_sent and maybe webmention_endpoint. Args: endpoint: str, URL target: str, URL """ g.source = g.source.key.get() logger.info('Setting last_webmention_sent') g.source.last_webmention_sent = util.now_fn() if (endpoint != g.source.webmention_endpoint and util.domain_from_link(target) in g.source.domains): logger.info( f'Also setting webmention_endpoint to {endpoint} (discovered in {target}; was {g.source.webmention_endpoint})' ) g.source.webmention_endpoint = endpoint g.source.put()
def record_source_webmention(self, endpoint, target): """Sets this source's last_webmention_sent and maybe webmention_endpoint. Args: endpoint: str, URL target: str, URL """ self.source = self.source.key.get() logging.info('Setting last_webmention_sent') self.source.last_webmention_sent = util.now_fn() if (endpoint != self.source.webmention_endpoint and util.domain_from_link(target) in self.source.domains): logging.info( 'Also setting webmention_endpoint to %s (discovered in %s; was %s)', endpoint, target, self.source.webmention_endpoint) self.source.webmention_endpoint = endpoint self.source.put()
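All of these record_source_webmention variants only overwrite webmention_endpoint when the discovered endpoint came from a target on the user's own domain, so endpoints found on third-party pages don't clobber it. A small sketch of just that rule; domain_from_link here is a simplified stand-in for the real helper.

from urllib.parse import urlparse

def domain_from_link(url):
  return urlparse(url).netloc  # simplified; the real helper normalizes more

def maybe_update_endpoint(current_endpoint, discovered_endpoint, target, domains):
  """Returns the webmention endpoint to store for the source."""
  if (discovered_endpoint != current_endpoint
      and domain_from_link(target) in domains):
    return discovered_endpoint
  return current_endpoint

# a target on the user's own site updates the stored endpoint...
print(maybe_update_endpoint(None, 'https://si.te/wm', 'https://si.te/post', ['si.te']))
# ...but a third-party target does not
print(maybe_update_endpoint(None, 'https://other/wm', 'https://other/post', ['si.te']))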
def poll_period(self): """Returns the poll frequency for this source, as a :class:`datetime.timedelta`. Defaults to ~15m, depending on silo. If we've never sent a webmention for this source, or the last one we sent was over a month ago, we drop them down to ~1d after a week long grace period. """ now = util.now_fn() if self.rate_limited: return self.RATE_LIMITED_POLL elif now < self.created + self.FAST_POLL_GRACE_PERIOD: return self.FAST_POLL elif not self.last_webmention_sent: return self.SLOW_POLL elif self.last_webmention_sent > now - timedelta(days=7): return self.FAST_POLL elif self.last_webmention_sent > now - timedelta(days=30): return self.FAST_POLL * 10 else: return self.SLOW_POLL
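A worked example of the tiers in poll_period(), using illustrative constants; the real FAST_POLL and SLOW_POLL values depend on the silo.

import datetime

FAST_POLL = datetime.timedelta(minutes=30)  # assumed illustrative values
SLOW_POLL = datetime.timedelta(days=1)
RATE_LIMITED_POLL = SLOW_POLL
FAST_POLL_GRACE_PERIOD = datetime.timedelta(days=7)

def poll_period(now, created, last_webmention_sent, rate_limited=False):
  if rate_limited:
    return RATE_LIMITED_POLL
  elif now < created + FAST_POLL_GRACE_PERIOD:
    return FAST_POLL           # new source: always fast
  elif not last_webmention_sent:
    return SLOW_POLL           # never sent a webmention
  elif last_webmention_sent > now - datetime.timedelta(days=7):
    return FAST_POLL           # active in the last week
  elif last_webmention_sent > now - datetime.timedelta(days=30):
    return FAST_POLL * 10      # active in the last month
  else:
    return SLOW_POLL

now = datetime.datetime(2017, 1, 2)
month_ago = now - datetime.timedelta(days=30)
print(poll_period(now, month_ago, now - datetime.timedelta(days=2)))   # 0:30:00
print(poll_period(now, month_ago, now - datetime.timedelta(days=10)))  # 5:00:00
print(poll_period(now, month_ago, None))                               # 1 day, 0:00:00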
def test_discover_url_site_post_last_feed_syndication_url(self): now = util.now_fn() self.source.last_feed_syndication_url = now self.source.put() self.expect_requests_get( 'http://si.te/123', """ <div class="h-entry"> <a class="u-syndication" href="http://fa.ke/222"></a> </div>""") self.expect_task('discover', source_key=self.source, post_id='222') self.mox.ReplayAll() self.check_discover( 'http://si.te/123', 'Discovering now. Refresh in a minute to see the results!') source = self.source.key.get() self.assertEqual(now, source.last_syndication_url)
def test_discover_url_site_post_last_feed_syndication_url(self): now = util.now_fn() self.source.last_feed_syndication_url = now self.source.put() self.expect_requests_get('http://si.te/123', """ <div class="h-entry"> <a class="u-syndication" href="http://fa.ke/222"></a> </div>""") self.mox.ReplayAll() self.check_discover('http://si.te/123', 'Discovering now. Refresh in a minute to see the results!') tasks = self.taskqueue_stub.GetTasks('discover') key = self.source.key.urlsafe() self.assertEqual([{'source_key': key, 'post_id': '222'}], [testutil.get_task_params(task) for task in tasks]) source = self.source.key.get() self.assertEqual(now, source.last_syndication_url)
def responses(): """Find the most recently attempted responses and blog posts with error URLs.""" entities = [] for cls in (Response,): # BlogPost for e in cls.query().order(-cls.updated): if (len(entities) >= NUM_ENTITIES or e.updated < util.now_fn() - datetime.timedelta(hours=1)): break elif (not e.error and not e.unsent) or e.status == 'complete': continue e.links = [util.pretty_link(u, new_tab=True) for u in e.error + e.failed] if e.key.kind() == 'Response': e.response = json_loads(e.response_json) e.activities = [json_loads(a) for a in e.activities_json] else: e.response = {'content': '[BlogPost]'} e.activities = [{'url': e.key.id()}] entities.append(e) return render_template('admin_responses.html', responses=entities, logs=logs)
def check_task(task): if not task.parent.endswith('/' + queue): # These can help for debugging, but can also be misleading, since many # tests insert multiple tasks, so check_task() runs on all of them (due # to InAnyOrder() below) until it finds one that matches. # print("expect_task: %s doesn't end with /%s!" % (task.parent, queue)) return False req = task.task.app_engine_http_request if not req.relative_uri.endswith('/' + queue): # print("expect_task: relative_uri %s doesn't end with /%s!" % ( # req.relative_uri, queue)) return False # convert model objects and keys to url-safe key strings for comparison for name, val in kwargs.items(): if isinstance(val, ndb.Model): kwargs[name] = val.key.urlsafe().decode() elif isinstance(val, ndb.Key): kwargs[name] = val.urlsafe().decode() got = set(urllib.parse.parse_qsl(req.body.decode())) expected = set(kwargs.items()) if got != expected: # print('expect_task: expected %s, got %s' % (expected, got)) return False if eta_seconds is not None: got = (util.to_utc_timestamp(task.task.schedule_time) - util.to_utc_timestamp(util.now_fn())) delta = eta_seconds * .2 + 10 if not (got + delta >= eta_seconds >= got - delta): # print('expect_task: expected schedule_time %r, got %r' % (eta_seconds, got)) return False return True
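The schedule_time check in these test helpers allows 20% of the expected ETA plus 10 seconds of slack; a tiny worked example of that tolerance.

def eta_matches(expected_eta_seconds, got_seconds):
  """True if the observed task ETA is within +/-(20% + 10s) of the expected one."""
  delta = expected_eta_seconds * .2 + 10
  return got_seconds + delta >= expected_eta_seconds >= got_seconds - delta

assert eta_matches(1800, 1500)      # 1800s expected, 370s of slack allowed
assert not eta_matches(1800, 1400)  # 400s off is outside the window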
def _process_author(source, author_url, refetch=False, store_blanks=True):
  """Fetch the author's domain URL, and look for syndicated posts.

  Args:
    source: a subclass of :class:`models.Source`
    author_url: the author's homepage URL
    refetch: boolean, whether to refetch and process entries we've seen before
    store_blanks: boolean, whether we should store blank
      :class:`models.SyndicatedPost`\ s when we don't find a relationship

  Return:
    a dict of syndicated_url to a list of new :class:`models.SyndicatedPost`\ s
  """
  # for now use whether the url is a valid webmention target
  # as a proxy for whether it's worth searching it.
  author_url, _, ok = util.get_webmention_target(author_url)
  if not ok:
    return {}

  try:
    logging.debug('fetching author url %s', author_url)
    author_resp = util.requests_get(author_url)
    # TODO for error codes that indicate a temporary error, should we make
    # a certain number of retries before giving up forever?
    author_resp.raise_for_status()
    author_dom = util.beautifulsoup_parse(author_resp.text)
  except AssertionError:
    raise  # for unit tests
  except BaseException:
    # TODO limit allowed failures, cache the author's h-feed url
    # or the # of times we've failed to fetch it
    logging.info('Could not fetch author url %s', author_url, exc_info=True)
    return {}

  feeditems = _find_feed_items(author_url, author_dom)

  # look for all other feed urls using rel='feed', type='text/html'
  feed_urls = set()
  for rel_feed_node in (author_dom.find_all('link', rel='feed')
                        + author_dom.find_all('a', rel='feed')):
    feed_url = rel_feed_node.get('href')
    if not feed_url:
      continue
    feed_url = urlparse.urljoin(author_url, feed_url)
    feed_type = rel_feed_node.get('type')
    if feed_type and feed_type != 'text/html':
      feed_ok = False
    else:
      # double check that it's text/html, not too big, etc
      feed_url, _, feed_ok = util.get_webmention_target(feed_url)

    if feed_url == author_url:
      logging.debug('author url is the feed url, ignoring')
    elif not feed_ok:
      logging.debug('skipping feed of type %s', feed_type)
    else:
      feed_urls.add(feed_url)

  for feed_url in feed_urls:
    try:
      logging.debug("fetching author's rel-feed %s", feed_url)
      feed_resp = util.requests_get(feed_url)
      feed_resp.raise_for_status()
      logging.debug("author's rel-feed fetched successfully %s", feed_url)
      feeditems = _merge_hfeeds(feeditems,
                                _find_feed_items(feed_url, feed_resp.text))

      domain = util.domain_from_link(feed_url)
      if source.updates is not None and domain not in source.domains:
        domains = source.updates.setdefault('domains', source.domains)
        if domain not in domains:
          logging.info('rel-feed found new domain %s! adding to source', domain)
          domains.append(domain)

    except AssertionError:
      raise  # reraise assertions for unit tests
    except BaseException:
      logging.info('Could not fetch h-feed url %s.', feed_url, exc_info=True)

  # sort by dt-updated/dt-published
  def updated_or_published(item):
    props = microformats2.first_props(item.get('properties'))
    return props.get('updated') or props.get('published')

  feeditems.sort(key=updated_or_published, reverse=True)

  permalink_to_entry = collections.OrderedDict()
  for child in feeditems:
    if 'h-entry' in child['type']:
      permalinks = child['properties'].get('url', [])
      if not permalinks:
        logging.debug('ignoring h-entry with no u-url!')

      for permalink in permalinks:
        if isinstance(permalink, basestring):
          permalink_to_entry[permalink] = child
        else:
          logging.warn('unexpected non-string "url" property: %s', permalink)

    max = (MAX_PERMALINK_FETCHES_BETA if source.is_beta_user()
           else MAX_PERMALINK_FETCHES)
    if len(permalink_to_entry) >= max:
      logging.info('Hit cap of %d permalinks. Stopping.', max)
      break

  # query all preexisting permalinks at once, instead of once per link
  permalinks_list = list(permalink_to_entry.keys())
  # fetch the maximum allowed entries (currently 30) at a time
  preexisting_list = itertools.chain.from_iterable(
    SyndicatedPost.query(
      SyndicatedPost.original.IN(permalinks_list[i:i + MAX_ALLOWABLE_QUERIES]),
      ancestor=source.key)
    for i in xrange(0, len(permalinks_list), MAX_ALLOWABLE_QUERIES))
  preexisting = {}
  for r in preexisting_list:
    preexisting.setdefault(r.original, []).append(r)

  results = {}
  for permalink, entry in permalink_to_entry.iteritems():
    logging.debug('processing permalink: %s', permalink)
    new_results = process_entry(
      source, permalink, entry, refetch, preexisting.get(permalink, []),
      store_blanks=store_blanks)
    for key, value in new_results.iteritems():
      results.setdefault(key, []).extend(value)

  if source.updates is not None and results:
    # keep track of the last time we've seen rel=syndication urls for
    # this author. this helps us decide whether to refetch periodically
    # and look for updates.
    # Source will be saved at the end of each round of polling
    source.updates['last_syndication_url'] = util.now_fn()

  return results
def process_entry(source, permalink, feed_entry, refetch, preexisting, store_blanks=True): """Fetch and process an h-entry and save a new :class:`models.SyndicatedPost`. Args: source: permalink: url of the unprocessed post feed_entry: the h-feed version of the h-entry dict, often contains a partial version of the h-entry at the permalink refetch: boolean, whether to refetch and process entries we've seen before preexisting: list of previously discovered :class:`models.SyndicatedPost`\ s for this permalink store_blanks: boolean, whether we should store blank :class:`models.SyndicatedPost`\ s when we don't find a relationship Returns: a dict from syndicated url to a list of new :class:`models.SyndicatedPost`\ s """ # if the post has already been processed, do not add to the results # since this method only returns *newly* discovered relationships. if preexisting: # if we're refetching and this one is blank, do not return. # if there is a blank entry, it should be the one and only entry, # but go ahead and check 'all' of them to be safe. if not refetch: return {} synds = [s.syndication for s in preexisting if s.syndication] if synds: logging.debug('previously found relationship(s) for original %s: %s', permalink, synds) # first try with the h-entry from the h-feed. if we find the syndication url # we're looking for, we don't have to fetch the permalink permalink, _, type_ok = util.get_webmention_target(permalink) usynd = feed_entry.get('properties', {}).get('syndication', []) if usynd: logging.debug('u-syndication links on the h-feed h-entry: %s', usynd) results = _process_syndication_urls(source, permalink, set( url for url in usynd if isinstance(url, basestring)), preexisting) success = True if results: source.updates['last_feed_syndication_url'] = util.now_fn() elif not source.last_feed_syndication_url or not feed_entry: # fetch the full permalink page if we think it might have more details parsed = None try: logging.debug('fetching post permalink %s', permalink) if type_ok: resp = util.requests_get(permalink) resp.raise_for_status() parsed = util.mf2py_parse(resp.text, permalink) except AssertionError: raise # for unit tests except BaseException: # TODO limit the number of allowed failures logging.info('Could not fetch permalink %s', permalink, exc_info=True) success = False if parsed: syndication_urls = set() relsynd = parsed.get('rels').get('syndication', []) if relsynd: logging.debug('rel-syndication links: %s', relsynd) syndication_urls.update(url for url in relsynd if isinstance(url, basestring)) # there should only be one h-entry on a permalink page, but # we'll check all of them just in case. 
for hentry in (item for item in parsed['items'] if 'h-entry' in item['type']): usynd = hentry.get('properties', {}).get('syndication', []) if usynd: logging.debug('u-syndication links: %s', usynd) syndication_urls.update(url for url in usynd if isinstance(url, basestring)) results = _process_syndication_urls( source, permalink, syndication_urls, preexisting) # detect and delete SyndicatedPosts that were removed from the site if success: result_syndposts = itertools.chain(*results.values()) for syndpost in list(preexisting): if syndpost.syndication and syndpost not in result_syndposts: logging.info('deleting relationship that disappeared: %s', syndpost) syndpost.key.delete() preexisting.remove(syndpost) if not results: logging.debug('no syndication links from %s to current source %s.', permalink, source.label()) results = {} if store_blanks and not preexisting: # remember that this post doesn't have syndication links for this # particular source logging.debug('saving empty relationship so that %s will not be ' 'searched again', permalink) SyndicatedPost.insert_original_blank(source, permalink) # only return results that are not in the preexisting list new_results = {} for syndurl, syndposts_for_url in results.iteritems(): for syndpost in syndposts_for_url: if syndpost not in preexisting: new_results.setdefault(syndurl, []).append(syndpost) if new_results: logging.debug('discovered relationships %s', new_results) return new_results
def process_entry(source, permalink, feed_entry, refetch, preexisting, store_blanks=True): """Fetch and process an h-entry and save a new :class:`models.SyndicatedPost`. Args: source: permalink: url of the unprocessed post feed_entry: the h-feed version of the h-entry dict, often contains a partial version of the h-entry at the permalink refetch: boolean, whether to refetch and process entries we've seen before preexisting: list of previously discovered :class:`models.SyndicatedPost`\ s for this permalink store_blanks: boolean, whether we should store blank :class:`models.SyndicatedPost`\ s when we don't find a relationship Returns: a dict from syndicated url to a list of new :class:`models.SyndicatedPost`\ s """ # if the post has already been processed, do not add to the results # since this method only returns *newly* discovered relationships. if preexisting: # if we're refetching and this one is blank, do not return. # if there is a blank entry, it should be the one and only entry, # but go ahead and check 'all' of them to be safe. if not refetch: return {} synds = [s.syndication for s in preexisting if s.syndication] if synds: logger.debug( f'previously found relationship(s) for original {permalink}: {synds}' ) # first try with the h-entry from the h-feed. if we find the syndication url # we're looking for, we don't have to fetch the permalink permalink, _, type_ok = util.get_webmention_target(permalink) usynd = feed_entry.get('properties', {}).get('syndication', []) usynd_urls = {url for url in usynd if isinstance(url, str)} if usynd_urls: logger.debug( f'u-syndication links on the h-feed h-entry: {usynd_urls}') results = _process_syndication_urls(source, permalink, usynd_urls, preexisting) success = True if results: source.updates['last_feed_syndication_url'] = util.now_fn() elif not source.last_feed_syndication_url or not feed_entry: # fetch the full permalink page if we think it might have more details mf2 = None try: if type_ok: logger.debug(f'fetching post permalink {permalink}') mf2 = util.fetch_mf2(permalink) except AssertionError: raise # for unit tests except BaseException: # TODO limit the number of allowed failures logger.info(f'Could not fetch permalink {permalink}', exc_info=True) success = False if mf2: syndication_urls = set() relsynd = mf2['rels'].get('syndication', []) if relsynd: logger.debug(f'rel-syndication links: {relsynd}') syndication_urls.update(url for url in relsynd if isinstance(url, str)) # there should only be one h-entry on a permalink page, but # we'll check all of them just in case. for hentry in (item for item in mf2['items'] if 'h-entry' in item['type']): usynd = hentry.get('properties', {}).get('syndication', []) if usynd: logger.debug(f'u-syndication links: {usynd}') syndication_urls.update(url for url in usynd if isinstance(url, str)) results = _process_syndication_urls(source, permalink, syndication_urls, preexisting) # detect and delete SyndicatedPosts that were removed from the site if success: result_syndposts = list(itertools.chain(*results.values())) for syndpost in preexisting: if syndpost.syndication and syndpost not in result_syndposts: logger.info( f'deleting relationship that disappeared: {syndpost}') syndpost.key.delete() preexisting.remove(syndpost) if not results: logger.debug( f'no syndication links from {permalink} to current source {source.label()}.' 
) results = {} if store_blanks and not preexisting: # remember that this post doesn't have syndication links for this # particular source logger.debug( f'saving empty relationship so that {permalink} will not be searched again' ) SyndicatedPost.insert_original_blank(source, permalink) # only return results that are not in the preexisting list new_results = {} for syndurl, syndposts_for_url in results.items(): for syndpost in syndposts_for_url: if syndpost not in preexisting: new_results.setdefault(syndurl, []).append(syndpost) if new_results: logger.debug(f'discovered relationships {new_results}') return new_results
def _process_author(source, author_url, refetch=False, store_blanks=True):
  """Fetch the author's domain URL, and look for syndicated posts.

  Args:
    source: a subclass of :class:`models.Source`
    author_url: the author's homepage URL
    refetch: boolean, whether to refetch and process entries we've seen before
    store_blanks: boolean, whether we should store blank
      :class:`models.SyndicatedPost`\ s when we don't find a relationship

  Return:
    a dict of syndicated_url to a list of new :class:`models.SyndicatedPost`\ s
  """
  # for now use whether the url is a valid webmention target
  # as a proxy for whether it's worth searching it.
  author_url, _, ok = util.get_webmention_target(author_url)
  if not ok:
    return {}

  logger.debug(f'fetching author url {author_url}')
  try:
    author_mf2 = util.fetch_mf2(author_url)
  except AssertionError:
    raise  # for unit tests
  except BaseException:
    # TODO limit allowed failures, cache the author's h-feed url
    # or the # of times we've failed to fetch it
    logger.info(f'Could not fetch author url {author_url}', exc_info=True)
    return {}

  feeditems = _find_feed_items(author_mf2)

  # try rel=feeds and rel=alternates
  feed_urls = set()
  candidates = (author_mf2['rels'].get('feed', []) +
                [a.get('url') for a in author_mf2.get('alternates', [])
                 if a.get('type') == MF2_HTML_MIME_TYPE])
  for feed_url in candidates:
    # check that it's html, not too big, etc
    feed_url, _, feed_ok = util.get_webmention_target(feed_url)
    if feed_url == author_url:
      logger.debug('author url is the feed url, ignoring')
    elif not feed_ok:
      logger.debug("skipping feed since it's not HTML or otherwise bad")
    else:
      feed_urls.add(feed_url)

  for feed_url in feed_urls:
    try:
      logger.debug(f"fetching author's rel-feed {feed_url}")
      feed_mf2 = util.fetch_mf2(feed_url)
      feeditems = _merge_hfeeds(feeditems, _find_feed_items(feed_mf2))
      domain = util.domain_from_link(feed_url)
      if source.updates is not None and domain not in source.domains:
        domains = source.updates.setdefault('domains', source.domains)
        if domain not in domains:
          logger.info(f'rel-feed found new domain {domain}! adding to source')
          domains.append(domain)
    except AssertionError:
      raise  # reraise assertions for unit tests
    except BaseException:
      logger.info(f'Could not fetch h-feed url {feed_url}.', exc_info=True)

  # sort by dt-updated/dt-published
  def updated_or_published(item):
    props = microformats2.first_props(item.get('properties'))
    return props.get('updated') or props.get('published') or ''

  feeditems.sort(key=updated_or_published, reverse=True)

  permalink_to_entry = collections.OrderedDict()
  for child in feeditems:
    if 'h-entry' in child['type']:
      permalinks = child['properties'].get('url', [])
      if not permalinks:
        logger.debug('ignoring h-entry with no u-url!')

      for permalink in permalinks:
        if isinstance(permalink, str):
          permalink_to_entry[permalink] = child
        else:
          logger.warning(f'unexpected non-string "url" property: {permalink}')

    max = (MAX_PERMALINK_FETCHES_BETA if source.is_beta_user()
           else MAX_PERMALINK_FETCHES)
    if len(permalink_to_entry) >= max:
      logger.info(f'Hit cap of {max} permalinks. Stopping.')
      break

  # query all preexisting permalinks at once, instead of once per link
  permalinks_list = list(permalink_to_entry.keys())
  # fetch the maximum allowed entries (currently 30) at a time
  preexisting_list = itertools.chain.from_iterable(
    SyndicatedPost.query(
      SyndicatedPost.original.IN(permalinks_list[i:i + MAX_ALLOWABLE_QUERIES]),
      ancestor=source.key)
    for i in range(0, len(permalinks_list), MAX_ALLOWABLE_QUERIES))
  preexisting = {}
  for r in preexisting_list:
    preexisting.setdefault(r.original, []).append(r)

  results = {}
  for permalink, entry in permalink_to_entry.items():
    logger.debug(f'processing permalink: {permalink}')
    new_results = process_entry(
      source, permalink, entry, refetch, preexisting.get(permalink, []),
      store_blanks=store_blanks)
    for key, value in new_results.items():
      results.setdefault(key, []).extend(value)

  if source.updates is not None and results:
    # keep track of the last time we've seen rel=syndication urls for
    # this author. this helps us decide whether to refetch periodically
    # and look for updates.
    # Source will be saved at the end of each round of polling
    source.updates['last_syndication_url'] = util.now_fn()

  return results
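_process_author batches its preexisting-SyndicatedPost lookups because a datastore IN filter only accepts a limited number of values per query; a generic sketch of that chunking pattern, where run_query is a hypothetical stand-in for the real ndb call.

import itertools

MAX_ALLOWABLE_QUERIES = 30  # the cap on IN filter values per query

def chunked_in_queries(values, run_query):
  """Runs run_query on successive 30-value slices and chains the results."""
  return itertools.chain.from_iterable(
    run_query(values[i:i + MAX_ALLOWABLE_QUERIES])
    for i in range(0, len(values), MAX_ALLOWABLE_QUERIES))

# toy run_query that just reports its slice size; with ndb it would be
# SyndicatedPost.query(SyndicatedPost.original.IN(chunk), ancestor=...)
permalinks = [f'http://si.te/{i}' for i in range(70)]
print(list(chunked_in_queries(permalinks, lambda chunk: [len(chunk)])))  # [30, 30, 10]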
def template_vars(self): vars = super(UserHandler, self).template_vars() vars.update({ 'source': self.source, 'EPOCH': util.EPOCH, 'REFETCH_HFEED_TRIGGER': models.REFETCH_HFEED_TRIGGER, 'RECENT_PRIVATE_POSTS_THRESHOLD': RECENT_PRIVATE_POSTS_THRESHOLD, }) if not self.source: return vars if isinstance(self.source, instagram.Instagram): auth = self.source.auth_entity vars['indieauth_me'] = ( auth.id if isinstance(auth, indieauth.IndieAuth) else self.source.domain_urls[0] if self.source.domain_urls else None) # Blog webmention promos if 'webmention' not in self.source.features: if self.source.SHORT_NAME in ('blogger', 'medium', 'tumblr', 'wordpress'): vars[self.source.SHORT_NAME + '_promo'] = True else: for domain in self.source.domains: if ('.blogspot.' in domain and # Blogger uses country TLDs not Blogger.query(Blogger.domains == domain).get()): vars['blogger_promo'] = True elif (domain.endswith('tumblr.com') and not Tumblr.query(Tumblr.domains == domain).get()): vars['tumblr_promo'] = True elif (domain.endswith('wordpress.com') and not WordPress.query(WordPress.domains == domain).get()): vars['wordpress_promo'] = True # Responses if 'listen' in self.source.features: vars['responses'] = [] query = Response.query().filter(Response.source == self.source.key) # if there's a paging param (responses_before or responses_after), update # query with it def get_paging_param(param): val = self.request.get(param) try: return util.parse_iso8601(val) if val else None except: msg = "Couldn't parse %s %r as ISO8601" % (param, val) logging.exception(msg) self.abort(400, msg) before = get_paging_param('responses_before') after = get_paging_param('responses_after') if before and after: self.abort(400, "can't handle both responses_before and responses_after") elif after: query = query.filter(Response.updated > after).order(Response.updated) elif before: query = query.filter(Response.updated < before).order(-Response.updated) else: query = query.order(-Response.updated) query_iter = query.iter() for i, r in enumerate(query_iter): r.response = json.loads(r.response_json) r.activities = [json.loads(a) for a in r.activities_json] if (not self.source.is_activity_public(r.response) or not all(self.source.is_activity_public(a) for a in r.activities)): continue elif r.type == 'post': r.activities = [] r.actor = r.response.get('author') or r.response.get('actor', {}) for a in r.activities + [r.response]: if not a.get('content'): a['content'] = a.get('object', {}).get('content') if not r.response.get('content'): phrases = { 'like': 'liked this', 'repost': 'reposted this', 'rsvp-yes': 'is attending', 'rsvp-no': 'is not attending', 'rsvp-maybe': 'might attend', 'rsvp-interested': 'is interested', 'invite': 'is invited', } r.response['content'] = '%s %s.' 
% ( r.actor.get('displayName') or '', phrases.get(r.type) or phrases.get(r.response.get('verb'))) # convert image URL to https if we're serving over SSL image_url = r.actor.setdefault('image', {}).get('url') if image_url: r.actor['image']['url'] = util.update_scheme(image_url, self) # generate original post links r.links = self.process_webmention_links(r) r.original_links = [util.pretty_link(url, new_tab=True) for url in r.original_posts] vars['responses'].append(r) if len(vars['responses']) >= 10 or i > 200: break vars['responses'].sort(key=lambda r: r.updated, reverse=True) # calculate new paging param(s) new_after = ( before if before else vars['responses'][0].updated if vars['responses'] and query_iter.probably_has_next() and (before or after) else None) if new_after: vars['responses_after_link'] = ('?responses_after=%s#responses' % new_after.isoformat()) new_before = ( after if after else vars['responses'][-1].updated if vars['responses'] and query_iter.probably_has_next() else None) if new_before: vars['responses_before_link'] = ('?responses_before=%s#responses' % new_before.isoformat()) vars['next_poll'] = max( self.source.last_poll_attempt + self.source.poll_period(), # lower bound is 1 minute from now util.now_fn() + datetime.timedelta(seconds=90)) # Publishes if 'publish' in self.source.features: publishes = Publish.query().filter(Publish.source == self.source.key)\ .order(-Publish.updated)\ .fetch(10) for p in publishes: p.pretty_page = util.pretty_link( p.key.parent().id().decode('utf-8'), attrs={'class': 'original-post u-url u-name'}, new_tab=True) vars['publishes'] = publishes if 'webmention' in self.source.features: # Blog posts blogposts = BlogPost.query().filter(BlogPost.source == self.source.key)\ .order(-BlogPost.created)\ .fetch(10) for b in blogposts: b.links = self.process_webmention_links(b) try: text = b.feed_item.get('title') except ValueError: text = None b.pretty_url = util.pretty_link( b.key.id(), text=text, attrs={'class': 'original-post u-url u-name'}, max_length=40, new_tab=True) # Blog webmentions webmentions = BlogWebmention.query()\ .filter(BlogWebmention.source == self.source.key)\ .order(-BlogWebmention.updated)\ .fetch(10) for w in webmentions: w.pretty_source = util.pretty_link( w.source_url(), attrs={'class': 'original-post'}, new_tab=True) try: target_is_source = (urlparse.urlparse(w.target_url()).netloc in self.source.domains) except BaseException: target_is_source = False w.pretty_target = util.pretty_link( w.target_url(), attrs={'class': 'original-post'}, new_tab=True, keep_host=target_is_source) vars.update({'blogposts': blogposts, 'webmentions': webmentions}) return vars
def poll(self, source): """Actually runs the poll. Stores property names and values to update in source.updates. """ if source.last_activities_etag or source.last_activity_id: logging.debug('Using ETag %s, last activity id %s', source.last_activities_etag, source.last_activity_id) # # Step 1: fetch activities: # * posts by the user # * search all posts for the user's domain URLs to find links # cache = util.CacheDict() if source.last_activities_cache_json: cache.update(json.loads(source.last_activities_cache_json)) # search for links first so that the user's activities and responses # override them if they overlap links = source.search_for_links() # this user's own activities (and user mentions) resp = source.get_activities_response( fetch_replies=True, fetch_likes=True, fetch_shares=True, fetch_mentions=True, count=50, etag=source.last_activities_etag, min_id=source.last_activity_id, cache=cache) etag = resp.get('etag') # used later user_activities = resp.get('items', []) # these map ids to AS objects responses = {a['id']: a for a in links} activities = {a['id']: a for a in links + user_activities} # extract silo activity ids, update last_activity_id silo_activity_ids = set() last_activity_id = source.last_activity_id for id, activity in activities.items(): # maybe replace stored last activity id parsed = util.parse_tag_uri(id) if parsed: id = parsed[1] silo_activity_ids.add(id) try: # try numeric comparison first greater = int(id) > int(last_activity_id) except (TypeError, ValueError): greater = id > last_activity_id if greater: last_activity_id = id if last_activity_id and last_activity_id != source.last_activity_id: source.updates['last_activity_id'] = last_activity_id # trim cache to just the returned activity ids, so that it doesn't grow # without bound. (WARNING: depends on get_activities_response()'s cache key # format, e.g. 'PREFIX ACTIVITY_ID'!) source.updates['last_activities_cache_json'] = json.dumps( {k: v for k, v in cache.items() if k.split()[-1] in silo_activity_ids}) self.backfeed(source, responses, activities=activities) source.updates.update({'last_polled': source.last_poll_attempt, 'poll_status': 'ok'}) if etag and etag != source.last_activities_etag: source.updates['last_activities_etag'] = etag # # Possibly refetch updated syndication urls. # # if the author has added syndication urls since the first time # original_post_discovery ran, we'll miss them. this cleanup task will # periodically check for updated urls. only kicks in if the author has # *ever* published a rel=syndication url if source.should_refetch(): logging.info('refetching h-feed for source %s', source.label()) relationships = original_post_discovery.refetch(source) now = util.now_fn() source.updates['last_hfeed_refetch'] = now if relationships: logging.info('refetch h-feed found new rel=syndication relationships: %s', relationships) try: self.repropagate_old_responses(source, relationships) except BaseException, e: if (isinstance(e, (datastore_errors.BadRequestError, datastore_errors.Timeout)) or util.is_connection_failure(e)): logging.info('Timeout while repropagating responses.', exc_info=True) else: raise
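poll() decides whether a new silo id supersedes the stored last_activity_id by trying a numeric comparison first and falling back to string order; a small sketch of just that comparison.

def newer_activity_id(candidate, current):
  """True if candidate should replace current as last_activity_id."""
  try:
    return int(candidate) > int(current)  # numeric ids compare numerically
  except (TypeError, ValueError):
    return candidate > current            # otherwise fall back to string order

print(newer_activity_id('100', '99'))   # True: numeric comparison, 100 > 99
print(newer_activity_id('abc', 'abd'))  # False: string comparison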
def poll(self, source): """Actually runs the poll. Stores property names and values to update in source.updates. """ if source.last_activities_etag or source.last_activity_id: logging.debug("Using ETag %s, last activity id %s", source.last_activities_etag, source.last_activity_id) # # Step 1: fetch activities: # * posts by the user # * search all posts for the user's domain URLs to find links # cache = util.CacheDict() if source.last_activities_cache_json: cache.update(json.loads(source.last_activities_cache_json)) # search for links first so that the user's activities and responses # override them if they overlap links = source.search_for_links() # this user's own activities (and user mentions) resp = source.get_activities_response( fetch_replies=True, fetch_likes=True, fetch_shares=True, fetch_mentions=True, count=50, etag=source.last_activities_etag, min_id=source.last_activity_id, cache=cache, ) etag = resp.get("etag") # used later user_activities = resp.get("items", []) # these map ids to AS objects responses = {a["id"]: a for a in links} activities = {a["id"]: a for a in links + user_activities} # extract silo activity ids, update last_activity_id silo_activity_ids = set() last_activity_id = source.last_activity_id for id, activity in activities.items(): # maybe replace stored last activity id parsed = util.parse_tag_uri(id) if parsed: id = parsed[1] silo_activity_ids.add(id) try: # try numeric comparison first greater = int(id) > int(last_activity_id) except (TypeError, ValueError): greater = id > last_activity_id if greater: last_activity_id = id if last_activity_id and last_activity_id != source.last_activity_id: source.updates["last_activity_id"] = last_activity_id # trim cache to just the returned activity ids, so that it doesn't grow # without bound. (WARNING: depends on get_activities_response()'s cache key # format, e.g. 'PREFIX ACTIVITY_ID'!) source.updates["last_activities_cache_json"] = json.dumps( {k: v for k, v in cache.items() if k.split()[-1] in silo_activity_ids} ) # Cache to make sure we only fetch the author's h-feed(s) the # first time we see it fetched_hfeeds = set() # narrow down to just public activities public = {} private = {} for id, activity in activities.items(): (public if source.is_activity_public(activity) else private)[id] = activity logging.info("Found %d public activities: %s", len(public), public.keys()) logging.info("Found %d private activities: %s", len(private), private.keys()) last_public_post = (source.last_public_post or util.EPOCH).isoformat() public_published = util.trim_nulls([a.get("published") for a in public.values()]) if public_published: max_published = max(public_published) if max_published > last_public_post: last_public_post = max_published source.updates["last_public_post"] = util.as_utc(util.parse_iso8601(max_published)) source.updates["recent_private_posts"] = len( [a for a in private.values() if a.get("published", util.EPOCH_ISO) > last_public_post] ) # # Step 2: extract responses, store their activities in response['activities'] # # WARNING: this creates circular references in link posts found by search # queries in step 1, since they are their own activity. We use # prune_activity() and prune_response() in step 4 to remove these before # serializing to JSON. 
# for id, activity in public.items(): obj = activity.get("object") or activity # handle user mentions user_id = source.user_tag_id() if obj.get("author", {}).get("id") != user_id: for tag in obj.get("tags", []): urls = tag.get("urls") if tag.get("objectType") == "person" and tag.get("id") == user_id and urls: activity["originals"], activity["mentions"] = original_post_discovery.discover( source, activity, fetch_hfeed=True, include_redirect_sources=False, already_fetched_hfeeds=fetched_hfeeds, ) activity["mentions"].update(u.get("value") for u in urls) responses[id] = activity break # handle quote mentions for att in obj.get("attachments", []): if ( att.get("objectType") in ("note", "article") and att.get("author", {}).get("id") == source.user_tag_id() ): # now that we've confirmed that one exists, OPD will dig # into the actual attachments if "originals" not in activity or "mentions" not in activity: activity["originals"], activity["mentions"] = original_post_discovery.discover( source, activity, fetch_hfeed=True, include_redirect_sources=False, already_fetched_hfeeds=fetched_hfeeds, ) responses[id] = activity break # extract replies, likes, reactions, reposts, and rsvps replies = obj.get("replies", {}).get("items", []) tags = obj.get("tags", []) likes = [t for t in tags if Response.get_type(t) == "like"] reactions = [t for t in tags if Response.get_type(t) == "react"] reposts = [t for t in tags if Response.get_type(t) == "repost"] rsvps = Source.get_rsvps_from_event(obj) # coalesce responses. drop any without ids for resp in replies + likes + reactions + reposts + rsvps: id = resp.get("id") if not id: logging.error("Skipping response without id: %s", json.dumps(resp, indent=2)) continue resp.setdefault("activities", []).append(activity) # when we find two responses with the same id, the earlier one may have # come from a link post or user mention, and this one is probably better # since it probably came from the user's activity, so prefer this one. # background: https://github.com/snarfed/bridgy/issues/533 existing = responses.get(id) if existing: if source.gr_source.activity_changed(resp, existing, log=True): logging.warning("Got two different versions of same response!\n%s\n%s", existing, resp) resp["activities"].extend(existing.get("activities", [])) responses[id] = resp # # Step 3: filter out responses we've already seen # # seen responses (JSON objects) for each source are stored in its entity. unchanged_responses = [] if source.seen_responses_cache_json: for seen in json.loads(source.seen_responses_cache_json): id = seen["id"] resp = responses.get(id) if resp and not source.gr_source.activity_changed(seen, resp, log=True): unchanged_responses.append(seen) del responses[id] # # Step 4: store new responses and enqueue propagate tasks # pruned_responses = [] for id, resp in responses.items(): resp_type = Response.get_type(resp) activities = resp.pop("activities", []) if not activities and resp_type == "post": activities = [resp] too_long = set() urls_to_activity = {} for i, activity in enumerate(activities): # we'll usually have multiple responses for the same activity, and the # objects in resp['activities'] are shared, so cache each activity's # discovered webmention targets inside its object. 
if "originals" not in activity or "mentions" not in activity: activity["originals"], activity["mentions"] = original_post_discovery.discover( source, activity, fetch_hfeed=True, include_redirect_sources=False, already_fetched_hfeeds=fetched_hfeeds, ) targets = original_post_discovery.targets_for_response( resp, originals=activity["originals"], mentions=activity["mentions"] ) if targets: logging.info( "%s has %d webmention target(s): %s", activity.get("url"), len(targets), " ".join(targets) ) for t in targets: if len(t) <= _MAX_STRING_LENGTH: urls_to_activity[t] = i else: logging.warning("Giving up on target URL over %s chars! %s", _MAX_STRING_LENGTH, t) too_long.add(t[: _MAX_STRING_LENGTH - 4] + "...") # store/update response entity. the prune_*() calls are important to # remove circular references in link responses, which are their own # activities. details in the step 2 comment above. pruned_response = util.prune_response(resp) pruned_responses.append(pruned_response) resp_entity = Response( id=id, source=source.key, activities_json=[json.dumps(util.prune_activity(a, source)) for a in activities], response_json=json.dumps(pruned_response), type=resp_type, unsent=list(urls_to_activity.keys()), failed=list(too_long), original_posts=resp.get("originals", []), ) if urls_to_activity and len(activities) > 1: resp_entity.urls_to_activity = json.dumps(urls_to_activity) resp_entity.get_or_save(source) # update cache if pruned_responses: source.updates["seen_responses_cache_json"] = json.dumps(pruned_responses + unchanged_responses) source.updates.update({"last_polled": source.last_poll_attempt, "poll_status": "ok"}) if etag and etag != source.last_activities_etag: source.updates["last_activities_etag"] = etag # # Step 5. possibly refetch updated syndication urls # # if the author has added syndication urls since the first time # original_post_discovery ran, we'll miss them. this cleanup task will # periodically check for updated urls. only kicks in if the author has # *ever* published a rel=syndication url if source.should_refetch(): logging.info("refetching h-feed for source %s", source.label()) relationships = original_post_discovery.refetch(source) now = util.now_fn() source.updates["last_hfeed_refetch"] = now if relationships: logging.info("refetch h-feed found new rel=syndication relationships: %s", relationships) try: self.repropagate_old_responses(source, relationships) except BaseException, e: if isinstance( e, (datastore_errors.BadRequestError, datastore_errors.Timeout) ) or util.is_connection_failure(e): logging.info("Timeout while repropagating responses.", exc_info=True) else: raise
def template_vars(self): vars = super(UserHandler, self).template_vars() vars.update({ 'source': self.source, 'EPOCH': util.EPOCH, 'REFETCH_HFEED_TRIGGER': models.REFETCH_HFEED_TRIGGER, 'RECENT_PRIVATE_POSTS_THRESHOLD': RECENT_PRIVATE_POSTS_THRESHOLD, }) if not self.source: return vars if isinstance(self.source, instagram.Instagram): auth = self.source.auth_entity vars['indieauth_me'] = ( auth.id if isinstance(auth, indieauth.IndieAuth) else self.source.domain_urls[0] if self.source.domain_urls else None) # Blog webmention promos if 'webmention' not in self.source.features: if self.source.SHORT_NAME in ('blogger', 'tumblr', 'wordpress'): vars[self.source.SHORT_NAME + '_promo'] = True else: for domain in self.source.domains: if ('.blogspot.' in domain and # Blogger uses country TLDs not Blogger.query(Blogger.domains == domain).get()): vars['blogger_promo'] = True elif (domain.endswith('tumblr.com') and not Tumblr.query(Tumblr.domains == domain).get()): vars['tumblr_promo'] = True elif (domain.endswith('wordpress.com') and not WordPress.query(WordPress.domains == domain).get()): vars['wordpress_promo'] = True # Responses if 'listen' in self.source.features: vars['responses'] = [] query = Response.query().filter(Response.source == self.source.key) # if there's a paging param (responses_before or responses_after), update # query with it def get_paging_param(param): val = self.request.get(param) try: return util.parse_iso8601(val) if val else None except: msg = "Couldn't parse %s %r as ISO8601" % (param, val) logging.exception(msg) self.abort(400, msg) before = get_paging_param('responses_before') after = get_paging_param('responses_after') if before and after: self.abort(400, "can't handle both responses_before and responses_after") elif after: query = query.filter(Response.updated > after).order(Response.updated) elif before: query = query.filter(Response.updated < before).order(-Response.updated) else: query = query.order(-Response.updated) query_iter = query.iter() for i, r in enumerate(query_iter): r.response = json.loads(r.response_json) r.activities = [json.loads(a) for a in r.activities_json] if (not self.source.is_activity_public(r.response) or not all(self.source.is_activity_public(a) for a in r.activities)): continue elif r.type == 'post': r.activities = [] r.actor = r.response.get('author') or r.response.get('actor', {}) for a in r.activities + [r.response]: if not a.get('content'): a['content'] = a.get('object', {}).get('content') if not r.response.get('content'): phrases = { 'like': 'liked this', 'repost': 'reposted this', 'rsvp-yes': 'is attending', 'rsvp-no': 'is not attending', 'rsvp-maybe': 'might attend', 'rsvp-interested': 'is interested', 'invite': 'is invited', } r.response['content'] = '%s %s.' 
% ( r.actor.get('displayName') or '', phrases.get(r.type) or phrases.get(r.response.get('verb'))) # convert image URL to https if we're serving over SSL image_url = r.actor.setdefault('image', {}).get('url') if image_url: r.actor['image']['url'] = util.update_scheme(image_url, self) # generate original post links r.links = self.process_webmention_links(r) r.original_links = [util.pretty_link(url, new_tab=True) for url in r.original_posts] vars['responses'].append(r) if len(vars['responses']) >= 10 or i > 200: break vars['responses'].sort(key=lambda r: r.updated, reverse=True) # calculate new paging param(s) new_after = ( before if before else vars['responses'][0].updated if vars['responses'] and query_iter.probably_has_next() and (before or after) else None) if new_after: vars['responses_after_link'] = ('?responses_after=%s#responses' % new_after.isoformat()) new_before = ( after if after else vars['responses'][-1].updated if vars['responses'] and query_iter.probably_has_next() else None) if new_before: vars['responses_before_link'] = ('?responses_before=%s#responses' % new_before.isoformat()) vars['next_poll'] = max( self.source.last_poll_attempt + self.source.poll_period(), # lower bound is 1 minute from now util.now_fn() + datetime.timedelta(seconds=90)) # Publishes if 'publish' in self.source.features: publishes = Publish.query().filter(Publish.source == self.source.key)\ .order(-Publish.updated)\ .fetch(10) for p in publishes: p.pretty_page = util.pretty_link( p.key.parent().id(), attrs={'class': 'original-post u-url u-name'}, new_tab=True) vars['publishes'] = publishes if 'webmention' in self.source.features: # Blog posts blogposts = BlogPost.query().filter(BlogPost.source == self.source.key)\ .order(-BlogPost.created)\ .fetch(10) for b in blogposts: b.links = self.process_webmention_links(b) try: text = b.feed_item.get('title') except ValueError: text = None b.pretty_url = util.pretty_link( b.key.id(), text=text, attrs={'class': 'original-post u-url u-name'}, max_length=40, new_tab=True) # Blog webmentions webmentions = BlogWebmention.query()\ .filter(BlogWebmention.source == self.source.key)\ .order(-BlogWebmention.updated)\ .fetch(10) for w in webmentions: w.pretty_source = util.pretty_link( w.source_url(), attrs={'class': 'original-post'}, new_tab=True) try: target_is_source = (urlparse.urlparse(w.target_url()).netloc in self.source.domains) except BaseException: target_is_source = False w.pretty_target = util.pretty_link( w.target_url(), attrs={'class': 'original-post'}, new_tab=True, keep_host=target_is_source) vars.update({'blogposts': blogposts, 'webmentions': webmentions}) return vars
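The responses_before / responses_after parameters flip both the filter and the sort direction of the Response query. A sketch of that branching against a generic ndb-style query; model stands in for the Response class, and only one of the two parameters may be supplied:

def apply_response_paging(query, model, before=None, after=None):
    # paging forward (after) sorts ascending; paging backward (before) and
    # the default view sort descending
    if before and after:
        raise ValueError("can't handle both responses_before and responses_after")
    elif after:
        return query.filter(model.updated > after).order(model.updated)
    elif before:
        return query.filter(model.updated < before).order(-model.updated)
    return query.order(-model.updated)

# e.g. apply_response_paging(Response.query(), Response, after=some_datetime)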
def poll(self, source): """Actually runs the poll. Stores property names and values to update in source.updates. """ if source.last_activities_etag or source.last_activity_id: logging.debug('Using ETag %s, last activity id %s', source.last_activities_etag, source.last_activity_id) # # Step 1: fetch activities: # * posts by the user # * search all posts for the user's domain URLs to find links # cache = util.CacheDict() if source.last_activities_cache_json: cache.update(json_loads(source.last_activities_cache_json)) # search for links first so that the user's activities and responses # override them if they overlap links = source.search_for_links() # this user's own activities (and user mentions) resp = source.get_activities_response(fetch_replies=True, fetch_likes=True, fetch_shares=True, fetch_mentions=True, count=50, etag=source.last_activities_etag, min_id=source.last_activity_id, cache=cache) etag = resp.get('etag') # used later user_activities = resp.get('items', []) # these map ids to AS objects responses = {a['id']: a for a in links} activities = {a['id']: a for a in links + user_activities} # extract silo activity ids, update last_activity_id silo_activity_ids = set() last_activity_id = source.last_activity_id for id, activity in activities.items(): # maybe replace stored last activity id parsed = util.parse_tag_uri(id) if parsed: id = parsed[1] silo_activity_ids.add(id) try: # try numeric comparison first greater = int(id) > int(last_activity_id) except (TypeError, ValueError): greater = str(id) > str(last_activity_id) if greater: last_activity_id = id if last_activity_id and last_activity_id != source.last_activity_id: source.updates['last_activity_id'] = last_activity_id # trim cache to just the returned activity ids, so that it doesn't grow # without bound. (WARNING: depends on get_activities_response()'s cache key # format, e.g. 'PREFIX ACTIVITY_ID'!) source.updates['last_activities_cache_json'] = json_dumps({ k: v for k, v in cache.items() if k.split()[-1] in silo_activity_ids }) self.backfeed(source, responses, activities=activities) source.updates.update({ 'last_polled': source.last_poll_attempt, 'poll_status': 'ok' }) if etag and etag != source.last_activities_etag: source.updates['last_activities_etag'] = etag # # Possibly refetch updated syndication urls. # # if the author has added syndication urls since the first time # original_post_discovery ran, we'll miss them. this cleanup task will # periodically check for updated urls. only kicks in if the author has # *ever* published a rel=syndication url if source.should_refetch(): logging.info('refetching h-feed for source %s', source.label()) relationships = original_post_discovery.refetch(source) now = util.now_fn() source.updates['last_hfeed_refetch'] = now if relationships: logging.info( 'refetch h-feed found new rel=syndication relationships: %s', relationships) try: self.repropagate_old_responses(source, relationships) except BaseException as e: if ('BadRequestError' in str(e.__class__) or 'Timeout' in str(e.__class__) or util.is_connection_failure(e)): logging.info('Timeout while repropagating responses.', stack_info=True) else: raise else: logging.info( 'skipping refetch h-feed. last-syndication-url %s, last-refetch %s', source.last_syndication_url, source.last_hfeed_refetch)
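The refetch branch is gated on source.should_refetch(), which the comments describe as only kicking in once the author has ever published a rel=syndication URL, rate limited by last_hfeed_refetch. The following is only an illustrative sketch of that kind of gate, not Bridgy's actual implementation; REFETCH_PERIOD is an assumed constant:

from datetime import datetime, timedelta

REFETCH_PERIOD = timedelta(hours=2)  # assumed value, not Bridgy's real constant

def should_refetch(last_syndication_url, last_hfeed_refetch, now_fn=datetime.utcnow):
    if not last_syndication_url:   # author never published rel=syndication
        return False
    if not last_hfeed_refetch:     # never refetched yet
        return True
    return now_fn() - last_hfeed_refetch > REFETCH_PERIOD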
def _process_author(source, author_url, refetch=False, store_blanks=True): """Fetch the author's domain URL, and look for syndicated posts. Args: source: a subclass of models.Source author_url: the author's homepage URL refetch: boolean, whether to refetch and process entries we've seen before store_blanks: boolean, whether we should store blank SyndicatedPosts when we don't find a relationship Return: a dict of syndicated_url to a list of new models.SyndicatedPost """ # for now use whether the url is a valid webmention target # as a proxy for whether it's worth searching it. # TODO skip sites we know don't have microformats2 markup author_url, _, ok = util.get_webmention_target(author_url) if not ok: return {} try: logging.debug('fetching author url %s', author_url) author_resp = util.requests_get(author_url) # TODO for error codes that indicate a temporary error, should we make # a certain number of retries before giving up forever? author_resp.raise_for_status() author_dom = BeautifulSoup(author_resp.text) except AssertionError: raise # for unit tests except BaseException: # TODO limit allowed failures, cache the author's h-feed url # or the # of times we've failed to fetch it logging.warning('Could not fetch author url %s', author_url, exc_info=True) return {} feeditems = _find_feed_items(author_url, author_dom) # look for all other feed urls using rel='feed', type='text/html' feed_urls = set() for rel_feed_node in (author_dom.find_all('link', rel='feed') + author_dom.find_all('a', rel='feed')): feed_url = rel_feed_node.get('href') if not feed_url: continue feed_url = urlparse.urljoin(author_url, feed_url) feed_type = rel_feed_node.get('type') if not feed_type: # type is not specified, use this to confirm that it's text/html feed_url, _, feed_type_ok = util.get_webmention_target(feed_url) else: feed_type_ok = feed_type == 'text/html' if feed_url == author_url: logging.debug('author url is the feed url, ignoring') elif not feed_type_ok: logging.debug('skipping feed of type %s', feed_type) else: feed_urls.add(feed_url) for feed_url in feed_urls: try: logging.debug("fetching author's rel-feed %s", feed_url) feed_resp = util.requests_get(feed_url) feed_resp.raise_for_status() logging.debug("author's rel-feed fetched successfully %s", feed_url) feeditems = _merge_hfeeds(feeditems, _find_feed_items(feed_url, feed_resp.text)) domain = util.domain_from_link(feed_url) if source.updates is not None and domain not in source.domains: domains = source.updates.setdefault('domains', source.domains) if domain not in domains: logging.info('rel-feed found new domain %s! adding to source', domain) domains.append(domain) except AssertionError: raise # reraise assertions for unit tests except BaseException: logging.warning('Could not fetch h-feed url %s.', feed_url, exc_info=True) permalink_to_entry = {} for child in feeditems: if 'h-entry' in child['type']: # TODO maybe limit to first ~30 entries? 
(do that here rather than, # below because we want the *first* n entries) for permalink in child['properties'].get('url', []): if isinstance(permalink, basestring): permalink_to_entry[permalink] = child else: logging.warn('unexpected non-string "url" property: %s', permalink) # query all preexisting permalinks at once, instead of once per link permalinks_list = list(permalink_to_entry.keys()) # fetch the maximum allowed entries (currently 30) at a time preexisting_list = itertools.chain.from_iterable( SyndicatedPost.query( SyndicatedPost.original.IN(permalinks_list[i:i + MAX_ALLOWABLE_QUERIES]), ancestor=source.key) for i in xrange(0, len(permalinks_list), MAX_ALLOWABLE_QUERIES)) preexisting = {} for r in preexisting_list: preexisting.setdefault(r.original, []).append(r) results = {} for permalink, entry in permalink_to_entry.iteritems(): logging.debug('processing permalink: %s', permalink) new_results = _process_entry( source, permalink, entry, refetch, preexisting.get(permalink, []), store_blanks=store_blanks) for key, value in new_results.iteritems(): results.setdefault(key, []).extend(value) if source.updates is not None and results: # keep track of the last time we've seen rel=syndication urls for # this author. this helps us decide whether to refetch periodically # and look for updates. # Source will be saved at the end of each round of polling now = util.now_fn() logging.debug('updating source last_syndication_url %s', now) source.updates['last_syndication_url'] = now return results
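_process_author batches its SyndicatedPost lookups because a single IN() filter is capped at MAX_ALLOWABLE_QUERIES values (30, per the comment), so the permalinks are queried in slices and the results chained back together. A generic sketch of that chunking; query_fn is a hypothetical callable that runs one IN() query over a slice of permalinks:

import itertools

MAX_ALLOWABLE_QUERIES = 30  # cap on values per IN() filter

def query_in_batches(values, query_fn, batch=MAX_ALLOWABLE_QUERIES):
    # run one query per slice of at most `batch` values, then chain results
    return itertools.chain.from_iterable(
        query_fn(values[i:i + batch]) for i in range(0, len(values), batch))

# results = query_in_batches(permalinks, lambda chunk: SyndicatedPost.query(
#     SyndicatedPost.original.IN(chunk), ancestor=source.key))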
class Poll(webapp2.RequestHandler): """Task handler that fetches and processes new responses from a single source. Request parameters: source_key: string key of source entity last_polled: timestamp, YYYY-MM-DD-HH-MM-SS Inserts a propagate task for each response that hasn't been seen before. """ def post(self, *path_args): logging.debug('Params: %s', self.request.params) key = self.request.params['source_key'] source = ndb.Key(urlsafe=key).get() if not source or source.status == 'disabled' or 'listen' not in source.features: logging.error('Source not found or disabled. Dropping task.') return logging.info('Source: %s %s, %s', source.label(), source.key.string_id(), source.bridgy_url(self)) last_polled = self.request.params['last_polled'] if last_polled != source.last_polled.strftime( util.POLL_TASK_DATETIME_FORMAT): logging.warning( 'duplicate poll task! deferring to the other task.') return logging.info('Last poll: %s/log?start_time=%s&key=%s', self.request.host_url, calendar.timegm(source.last_poll_attempt.utctimetuple()), source.key.urlsafe()) # mark this source as polling source.updates = { 'poll_status': 'polling', 'last_poll_attempt': util.now_fn(), } source = models.Source.put_updates(source) source.updates = {} try: self.poll(source) except models.DisableSource: # the user deauthorized the bridgy app, so disable this source. # let the task complete successfully so that it's not retried. source.updates['status'] = 'disabled' logging.warning('Disabling source!') except: source.updates['poll_status'] = 'error' raise finally: source = models.Source.put_updates(source) # add new poll task. randomize task ETA to within +/- 20% to try to spread # out tasks and prevent thundering herds. task_countdown = source.poll_period().total_seconds() * random.uniform( .8, 1.2) util.add_poll_task(source, countdown=task_countdown) # feeble attempt to avoid hitting the instance memory limit source = None gc.collect() def poll(self, source): """Actually runs the poll. Stores property names and values to update in source.updates. """ if source.last_activities_etag or source.last_activity_id: logging.debug('Using ETag %s, last activity id %s', source.last_activities_etag, source.last_activity_id) # # Step 1: fetch activities: # * posts by the user # * search all posts for the user's domain URLs to find links # cache = util.CacheDict() if source.last_activities_cache_json: cache.update(json.loads(source.last_activities_cache_json)) try: # search for links first so that the user's activities and responses # override them if they overlap links = source.search_for_links() # this user's own activities (and user mentions) resp = source.get_activities_response( fetch_replies=True, fetch_likes=True, fetch_shares=True, fetch_mentions=True, count=50, etag=source.last_activities_etag, min_id=source.last_activity_id, cache=cache) etag = resp.get('etag') # used later user_activities = resp.get('items', []) # these map ids to AS objects responses = {a['id']: a for a in links} activities = {a['id']: a for a in links + user_activities} except Exception, e: code, body = util.interpret_http_exception(e) if code == '401': msg = 'Unauthorized error: %s' % e logging.warning(msg, exc_info=True) source.updates['poll_status'] = 'ok' raise models.DisableSource(msg) elif code in util.HTTP_RATE_LIMIT_CODES: logging.warning( 'Rate limited. Marking as error and finishing. 
%s', e) source.updates.update({ 'poll_status': 'error', 'rate_limited': True }) return elif (code and int(code) / 100 == 5) or util.is_connection_failure(e): logging.error( 'API call failed. Marking as error and finishing. %s: %s\n%s', code, body, e) self.abort(ERROR_HTTP_RETURN_CODE) else: raise # extract silo activity ids, update last_activity_id silo_activity_ids = set() last_activity_id = source.last_activity_id for id, activity in activities.items(): # maybe replace stored last activity id parsed = util.parse_tag_uri(id) if parsed: id = parsed[1] silo_activity_ids.add(id) try: # try numeric comparison first greater = int(id) > int(last_activity_id) except (TypeError, ValueError): greater = id > last_activity_id if greater: last_activity_id = id if last_activity_id and last_activity_id != source.last_activity_id: source.updates['last_activity_id'] = last_activity_id # trim cache to just the returned activity ids, so that it doesn't grow # without bound. (WARNING: depends on get_activities_response()'s cache key # format, e.g. 'PREFIX ACTIVITY_ID'!) source.updates['last_activities_cache_json'] = json.dumps({ k: v for k, v in cache.items() if k.split()[-1] in silo_activity_ids }) # Cache to make sure we only fetch the author's h-feed(s) the # first time we see it fetched_hfeeds = set() # narrow down to just public activities public = {} private = {} for id, activity in activities.items(): (public if source.is_activity_public(activity) else private)[id] = activity logging.info('Found %d public activities: %s', len(public), public.keys()) logging.info('Found %d private activities: %s', len(private), private.keys()) last_public_post = (source.last_public_post or util.EPOCH).isoformat() public_published = util.trim_nulls( [a.get('published') for a in public.values()]) if public_published: max_published = max(public_published) if max_published > last_public_post: last_public_post = max_published source.updates['last_public_post'] = \ util.as_utc(util.parse_iso8601(max_published)) source.updates['recent_private_posts'] = \ len([a for a in private.values() if a.get('published', util.EPOCH_ISO) > last_public_post]) # # Step 2: extract responses, store their activities in response['activities'] # # WARNING: this creates circular references in link posts found by search # queries in step 1, since they are their own activity. We use # prune_activity() and prune_response() in step 4 to remove these before # serializing to JSON. 
# for id, activity in public.items(): obj = activity.get('object') or activity # handle user mentions user_id = source.user_tag_id() if obj.get('author', {}).get('id') != user_id: for tag in obj.get('tags', []): urls = tag.get('urls') if tag.get('objectType') == 'person' and tag.get( 'id') == user_id and urls: activity['originals'], activity['mentions'] = \ original_post_discovery.discover( source, activity, fetch_hfeed=True, include_redirect_sources=False, already_fetched_hfeeds=fetched_hfeeds) activity['mentions'].update( u.get('value') for u in urls) responses[id] = activity break # handle quote mentions for att in obj.get('attachments', []): if (att.get('objectType') in ('note', 'article') and att.get( 'author', {}).get('id') == source.user_tag_id()): # now that we've confirmed that one exists, OPD will dig # into the actual attachments if 'originals' not in activity or 'mentions' not in activity: activity['originals'], activity['mentions'] = \ original_post_discovery.discover( source, activity, fetch_hfeed=True, include_redirect_sources=False, already_fetched_hfeeds=fetched_hfeeds) responses[id] = activity break # extract replies, likes, reactions, reposts, and rsvps replies = obj.get('replies', {}).get('items', []) tags = obj.get('tags', []) likes = [t for t in tags if Response.get_type(t) == 'like'] reactions = [t for t in tags if Response.get_type(t) == 'react'] reposts = [t for t in tags if Response.get_type(t) == 'repost'] rsvps = Source.get_rsvps_from_event(obj) # coalesce responses. drop any without ids for resp in replies + likes + reactions + reposts + rsvps: id = resp.get('id') if not id: logging.error('Skipping response without id: %s', json.dumps(resp, indent=2)) continue resp.setdefault('activities', []).append(activity) # when we find two responses with the same id, the earlier one may have # come from a link post or user mention, and this one is probably better # since it probably came from the user's activity, so prefer this one. # background: https://github.com/snarfed/bridgy/issues/533 existing = responses.get(id) if existing: if source.gr_source.activity_changed(resp, existing, log=True): logging.warning( 'Got two different versions of same response!\n%s\n%s', existing, resp) resp['activities'].extend(existing.get('activities', [])) responses[id] = resp # # Step 3: filter out responses we've already seen # # seen responses (JSON objects) for each source are stored in its entity. unchanged_responses = [] if source.seen_responses_cache_json: for seen in json.loads(source.seen_responses_cache_json): id = seen['id'] resp = responses.get(id) if resp and not source.gr_source.activity_changed( seen, resp, log=True): unchanged_responses.append(seen) del responses[id] # # Step 4: store new responses and enqueue propagate tasks # pruned_responses = [] for id, resp in responses.items(): resp_type = Response.get_type(resp) activities = resp.pop('activities', []) if not activities and resp_type == 'post': activities = [resp] too_long = set() urls_to_activity = {} for i, activity in enumerate(activities): # we'll usually have multiple responses for the same activity, and the # objects in resp['activities'] are shared, so cache each activity's # discovered webmention targets inside its object. 
if 'originals' not in activity or 'mentions' not in activity: activity['originals'], activity['mentions'] = \ original_post_discovery.discover( source, activity, fetch_hfeed=True, include_redirect_sources=False, already_fetched_hfeeds=fetched_hfeeds) targets = original_post_discovery.targets_for_response( resp, originals=activity['originals'], mentions=activity['mentions']) if targets: logging.info('%s has %d webmention target(s): %s', activity.get('url'), len(targets), ' '.join(targets)) for t in targets: if len(t) <= _MAX_STRING_LENGTH: urls_to_activity[t] = i else: logging.warning( 'Giving up on target URL over %s chars! %s', _MAX_STRING_LENGTH, t) too_long.add(t[:_MAX_STRING_LENGTH - 4] + '...') # store/update response entity. the prune_*() calls are important to # remove circular references in link responses, which are their own # activities. details in the step 2 comment above. pruned_response = util.prune_response(resp) pruned_responses.append(pruned_response) resp_entity = Response(id=id, source=source.key, activities_json=[ json.dumps( util.prune_activity(a, source)) for a in activities ], response_json=json.dumps(pruned_response), type=resp_type, unsent=list(urls_to_activity.keys()), failed=list(too_long), original_posts=resp.get('originals', [])) if urls_to_activity and len(activities) > 1: resp_entity.urls_to_activity = json.dumps(urls_to_activity) resp_entity.get_or_save(source) # update cache if pruned_responses: source.updates['seen_responses_cache_json'] = json.dumps( pruned_responses + unchanged_responses) source.updates.update({ 'last_polled': source.last_poll_attempt, 'poll_status': 'ok' }) if etag and etag != source.last_activities_etag: source.updates['last_activities_etag'] = etag # # Step 5. possibly refetch updated syndication urls # # if the author has added syndication urls since the first time # original_post_discovery ran, we'll miss them. this cleanup task will # periodically check for updated urls. only kicks in if the author has # *ever* published a rel=syndication url if source.should_refetch(): logging.info('refetching h-feed for source %s', source.label()) relationships = original_post_discovery.refetch(source) now = util.now_fn() source.updates['last_hfeed_refetch'] = now if relationships: logging.info( 'refetch h-feed found new rel=syndication relationships: %s', relationships) try: self.repropagate_old_responses(source, relationships) except BaseException, e: if (isinstance(e, (datastore_errors.BadRequestError, datastore_errors.Timeout)) or util.is_connection_failure(e)): logging.info('Timeout while repropagating responses.', exc_info=True) else: raise
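After each poll the handler re-enqueues itself with a countdown of the poll period scattered by +/-20%, so sources added around the same time drift apart instead of polling in lockstep. A minimal sketch of that jitter, assuming poll_period is a timedelta:

import random
from datetime import timedelta

def jittered_countdown(poll_period, spread=0.2):
    # scatter the countdown by +/- spread so polls don't stay synchronized
    return poll_period.total_seconds() * random.uniform(1 - spread, 1 + spread)

# jittered_countdown(timedelta(minutes=30)) falls somewhere in [1440, 2160] seconds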