Exemplo n.º 1
0
 def test_webmention_endpoint_cache_key(self):
     """Check cache keys are 'W <scheme> <domain>', plus ' /' for bare domains."""
     for expected, url in (
         ('W http foo.com', 'http://foo.com/x'),
         ('W https foo.com', 'https://foo.com/x/y'),
         ('W http foo.com /', 'http://foo.com'),
         ('W http foo.com /', 'http://foo.com/'),
     ):
         got = util.webmention_endpoint_cache_key(url)
         # assertEquals is a long-deprecated alias; assertEqual is the
         # supported unittest name.
         self.assertEqual(expected, got, (url, got))
Exemplo n.º 2
0
 def test_webmention_endpoint_cache_key(self):
   """Check cache keys are 'W <scheme> <domain>', plus ' /' for bare domains."""
   for expected, url in (
       ('W http foo.com', 'http://foo.com/x'),
       ('W https foo.com', 'https://foo.com/x/y'),
       ('W http foo.com /', 'http://foo.com'),
       ('W http foo.com /', 'http://foo.com/'),
   ):
     got = util.webmention_endpoint_cache_key(url)
     # assertEquals is a long-deprecated alias; assertEqual is the
     # supported unittest name.
     self.assertEqual(expected, got, (url, got))
Exemplo n.º 3
0
    def restart(self):
        """Moves status and targets to 'new' and adds a propagate task.

        Merges every previously tried target (sent/error/failed/skipped)
        back into unsent, clears their cached webmention endpoints, then
        saves the entity and enqueues the task transactionally.
        """
        self.status = 'new'
        self.unsent = util.dedupe_urls(self.unsent + self.sent + self.error +
                                       self.failed + self.skipped)
        # Four *distinct* empty lists. The previous chained assignment
        # (a = b = c = d = []) bound one shared list object to all four
        # properties, so a later append to one would mutate them all.
        self.sent, self.error, self.failed, self.skipped = [], [], [], []

        # clear any cached webmention endpoints
        memcache.delete_multi(
            util.webmention_endpoint_cache_key(url) for url in self.unsent)

        @ndb.transactional
        def finish():
            self.put()
            self.add_task(transactional=True)

        finish()
Exemplo n.º 4
0
  def restart(self):
    """Moves status and targets to 'new' and adds a propagate task.

    Merges every previously tried target (sent/error/failed/skipped) back
    into unsent, evicts their cached webmention endpoints, then saves the
    entity and enqueues a new propagate task.
    """
    self.status = 'new'
    self.unsent = util.dedupe_urls(self.unsent + self.sent + self.error +
                                   self.failed + self.skipped)
    # Four *distinct* empty lists. The previous chained assignment
    # (a = b = c = d = []) bound one shared list object to all four
    # properties, so a later append to one would mutate them all.
    self.sent, self.error, self.failed, self.skipped = [], [], [], []

    # clear any cached webmention endpoints
    with util.webmention_endpoint_cache_lock:
      for url in self.unsent:
        util.webmention_endpoint_cache.pop(util.webmention_endpoint_cache_key(url), None)

    # this datastore put and task add should be transactional, but Cloud Tasks
    # doesn't support that :(
    # https://cloud.google.com/appengine/docs/standard/python/taskqueue/push/migrating-push-queues#features-not-available
    self.put()
    self.add_task()
Exemplo n.º 5
0
  def restart(self):
    """Moves status and targets to 'new' and adds a propagate task.

    Merges every previously tried target (sent/error/failed/skipped) back
    into unsent, clears their cached webmention endpoints, then saves the
    entity and enqueues the task transactionally.
    """
    self.status = 'new'
    self.unsent = util.dedupe_urls(self.unsent + self.sent + self.error +
                                   self.failed + self.skipped)
    # Four *distinct* empty lists. The previous chained assignment
    # (a = b = c = d = []) bound one shared list object to all four
    # properties, so a later append to one would mutate them all.
    self.sent, self.error, self.failed, self.skipped = [], [], [], []

    # clear any cached webmention endpoints
    memcache.delete_multi(util.webmention_endpoint_cache_key(url)
                          for url in self.unsent)

    @ndb.transactional
    def finish():
      self.put()
      self.add_task(transactional=True)

    finish()
Exemplo n.º 6
0
  def post(self):
    """Manually retries a Response or BlogPost entity.

    Expects a 'key' request parameter holding the entity's urlsafe ndb key.
    Resets all previously tried targets to unsent, re-runs original post
    discovery for Responses, re-enqueues the propagate task, then redirects
    to 'redirect_to' (or the source's Bridgy page).
    """
    entity = ndb.Key(urlsafe=util.get_required_param(self, 'key')).get()
    if not entity:
      self.abort(400, 'key not found')

    # start all target URLs over
    if entity.status == 'complete':
      entity.status = 'new'

    targets = set(entity.unsent + entity.sent + entity.skipped + entity.error +
                  entity.failed)
    # Four *distinct* empty lists: the previous chained assignment bound one
    # shared list object to all four properties, so a later append to one
    # would mutate them all.
    entity.sent, entity.skipped, entity.error, entity.failed = [], [], [], []

    # run OPD to pick up any new SyndicatedPosts. note that we don't refetch
    # their h-feed, so if they've added a syndication URL since we last crawled,
    # retry won't make us pick it up. background in #524.
    if entity.key.kind() == 'Response':
      source = entity.source.get()
      for activity in [json.loads(a) for a in entity.activities_json]:
        originals, mentions = original_post_discovery.discover(
          source, activity, fetch_hfeed=False, include_redirect_sources=False)
        targets |= original_post_discovery.targets_for_response(
          json.loads(entity.response_json), originals=originals, mentions=mentions)

    # sorted() both converts the set into the list that ndb repeated
    # properties expect and makes the stored order deterministic.
    entity.unsent = sorted(targets)
    entity.put()

    # clear any cached webmention endpoints
    memcache.delete_multi(util.webmention_endpoint_cache_key(url) for url in targets)

    if entity.key.kind() == 'Response':
      util.add_propagate_task(entity)
    elif entity.key.kind() == 'BlogPost':
      util.add_propagate_blogpost_task(entity)
    else:
      # interpolate with % here: abort() doesn't apply printf-style args,
      # so the placeholder was previously never filled in.
      self.abort(400, 'Unexpected key kind %s' % entity.key.kind())

    self.messages.add('Retrying. Refresh in a minute to see the results!')
    self.redirect(self.request.get('redirect_to').encode('utf-8') or
                  entity.source.get().bridgy_url(self))
Exemplo n.º 7
0
  def post(self):
    """Manually retries a Response or BlogPost entity.

    Expects a 'key' request parameter holding the entity's urlsafe ndb key.
    Resets all previously tried targets to unsent, re-runs original post
    discovery for Responses, re-enqueues the propagate task, then redirects
    to 'redirect_to' (or the source's Bridgy page).
    """
    entity = ndb.Key(urlsafe=util.get_required_param(self, 'key')).get()
    if not entity:
      self.abort(400, 'key not found')

    # start all target URLs over
    if entity.status == 'complete':
      entity.status = 'new'

    targets = set(entity.unsent + entity.sent + entity.skipped + entity.error +
                  entity.failed)
    # Four *distinct* empty lists: the previous chained assignment bound one
    # shared list object to all four properties, so a later append to one
    # would mutate them all.
    entity.sent, entity.skipped, entity.error, entity.failed = [], [], [], []

    # run OPD to pick up any new SyndicatedPosts. note that we don't refetch
    # their h-feed, so if they've added a syndication URL since we last crawled,
    # retry won't make us pick it up. background in #524.
    if entity.key.kind() == 'Response':
      source = entity.source.get()
      for activity in [json.loads(a) for a in entity.activities_json]:
        originals, mentions = original_post_discovery.discover(
          source, activity, fetch_hfeed=False, include_redirect_sources=False)
        targets |= original_post_discovery.targets_for_response(
          json.loads(entity.response_json), originals=originals, mentions=mentions)

    # sorted() both converts the set into the list that ndb repeated
    # properties expect and makes the stored order deterministic.
    entity.unsent = sorted(targets)
    entity.put()

    # clear any cached webmention endpoints
    memcache.delete_multi(util.webmention_endpoint_cache_key(url) for url in targets)

    if entity.key.kind() == 'Response':
      util.add_propagate_task(entity)
    elif entity.key.kind() == 'BlogPost':
      util.add_propagate_blogpost_task(entity)
    else:
      # interpolate with % here: abort() doesn't apply printf-style args,
      # so the placeholder was previously never filled in.
      self.abort(400, 'Unexpected key kind %s' % entity.key.kind())

    self.messages.add('Retrying. Refresh in a minute to see the results!')
    self.redirect(self.request.get('redirect_to').encode('utf-8') or
                  entity.source.get().bridgy_url(self))
Exemplo n.º 8
0
  def restart(self):
    """Moves status and targets to 'new' and adds a propagate task.

    Merges every previously tried target (sent/error/failed/skipped) back
    into unsent, evicts their cached webmention endpoints, then saves the
    entity and enqueues a new propagate task.
    """
    self.status = 'new'
    self.unsent = util.dedupe_urls(self.unsent + self.sent + self.error +
                                   self.failed + self.skipped)
    # Four *distinct* empty lists. The previous chained assignment
    # (a = b = c = d = []) bound one shared list object to all four
    # properties, so a later append to one would mutate them all.
    self.sent, self.error, self.failed, self.skipped = [], [], [], []

    # clear any cached webmention endpoints
    with util.webmention_endpoint_cache_lock:
      for url in self.unsent:
        util.webmention_endpoint_cache.pop(util.webmention_endpoint_cache_key(url), None)

    # this datastore put and task add should be transactional, but Cloud Tasks
    # doesn't support that :(
    # https://cloud.google.com/appengine/docs/standard/python/taskqueue/push/migrating-push-queues#features-not-available
    # https://github.com/googleapis/python-tasks/issues/26
    #
    # The new "bundled services" bridge for the old App Engine APIs still
    # supports them, but only because that's literally on the old backends,
    # which seems like a dead end.
    # https://groups.google.com/g/google-appengine/c/22BKInlWty0/m/05ObNEdsAgAJ
    self.put()
    self.add_task()
Exemplo n.º 9
0
  def do_send_webmentions(self):
    """Sends a webmention for each pending target URL on self.entity.

    Re-validates unsent plus previously errored/failed targets, then tries
    each one, using memcache to remember per-domain endpoint discovery
    results. Each target ends up appended to entity.sent, entity.skipped,
    entity.failed, or entity.error depending on the outcome.
    """
    # retry previously errored and failed targets along with unsent ones
    urls = self.entity.unsent + self.entity.error + self.entity.failed
    unsent = set()
    self.entity.error = []
    self.entity.failed = []

    for orig_url in urls:
      # recheck the url here since the checks may have failed during the poll
      # or streaming add.
      url, domain, ok = util.get_webmention_target(orig_url)
      if ok:
        if len(url) <= _MAX_STRING_LENGTH:
          unsent.add(url)
        else:
          # URL too long to store; give up on it permanently
          logging.warning('Giving up on target URL over %s chars! %s',
                          _MAX_STRING_LENGTH, url)
          self.entity.failed.append(orig_url)
    # sorted() gives a deterministic send order
    self.entity.unsent = sorted(unsent)

    while self.entity.unsent:
      target = self.entity.unsent.pop(0)
      source_url = self.source_url(target)
      logging.info('Webmention from %s to %s', source_url, target)

      # see if we've cached webmention discovery for this domain. the cache
      # value is a string URL endpoint if discovery succeeded, a
      # WebmentionSend error dict if it failed (semi-)permanently, or None.
      cache_key = util.webmention_endpoint_cache_key(target)
      cached = memcache.get(cache_key)
      if cached:
        logging.info('Using cached webmention endpoint %r: %s', cache_key, cached)

      # send! and handle response or error
      error = None
      if isinstance(cached, dict):
        # a cached error dict means discovery failed (semi-)permanently;
        # skip sending and fall through to the error handling below
        error = cached
      else:
        # cached is either a discovered endpoint URL or None (discover fresh)
        mention = send.WebmentionSend(source_url, target, endpoint=cached)
        logging.info('Sending...')
        try:
          if not mention.send(timeout=999, headers=util.USER_AGENT_HEADER):
            error = mention.error
        except BaseException, e:
          logging.warning('', exc_info=True)
          # NOTE(review): getattr without a default raises AttributeError if
          # 'error' was never set -- confirm WebmentionSend always defines it.
          error = getattr(mention, 'error')
          if not error:
            # synthesize an error dict; 499 marks DNS failures as bad targets
            error = ({'code': 'BAD_TARGET_URL', 'http_status': 499}
                     if 'DNS lookup failed for URL:' in str(e)
                     else {'code': 'EXCEPTION'})

      if not cached:
        # cache the discovered endpoint, or the error if it's one of the
        # (semi-)permanent kinds, so future sends to this domain skip discovery
        val = (error if error and error['code'] in ('NO_ENDPOINT', 'BAD_TARGET_URL')
               else mention.receiver_endpoint)
        memcache.set(cache_key, val, time=WEBMENTION_DISCOVERY_CACHE_TIME)

      if error is None:
        logging.info('Sent! %s', mention.response)
        self.record_source_webmention(mention)
        self.entity.sent.append(target)
      else:
        code = error['code']
        status = error.get('http_status', 0)
        if (code == 'NO_ENDPOINT' or
            (code == 'BAD_TARGET_URL' and status == 204)):  # 204 is No Content
          logging.info('Giving up this target. %s', error)
          self.entity.skipped.append(target)
        elif status // 100 == 4:
          # Give up on 4XX errors; we don't expect later retries to succeed.
          logging.info('Giving up this target. %s', error)
          self.entity.failed.append(target)
        else:
          # anything else is treated as transient: record for a later retry
          self.fail('Error sending to endpoint: %s' % error)
          self.entity.error.append(target)

      # defensive: drop any duplicate of this target still queued
      if target in self.entity.unsent:
        self.entity.unsent.remove(target)
Exemplo n.º 10
0
    def do_send_webmentions(self):
        """Sends a webmention for each pending target URL on self.entity.

        Re-validates unsent plus previously errored/failed targets, then
        tries each one, using the in-process
        util.webmention_endpoint_cache (guarded by its lock) to remember
        per-domain endpoint discovery results. Each target ends up in
        entity.sent, entity.skipped, entity.failed, or entity.error, and
        the task is released as 'error' or completed accordingly.
        """
        # retry previously errored and failed targets along with unsent ones
        urls = self.entity.unsent + self.entity.error + self.entity.failed
        unsent = set()
        self.entity.error = []
        self.entity.failed = []

        for orig_url in urls:
            # recheck the url here since the checks may have failed during the poll
            # or streaming add.
            url, domain, ok = util.get_webmention_target(orig_url)
            if ok:
                if len(url) <= _MAX_STRING_LENGTH:
                    unsent.add(url)
                else:
                    # URL too long to store; give up on it permanently
                    logging.info('Giving up on target URL over %s chars! %s',
                                 _MAX_STRING_LENGTH, url)
                    self.entity.failed.append(orig_url)
        # sorted() gives a deterministic send order
        self.entity.unsent = sorted(unsent)

        while self.entity.unsent:
            target = self.entity.unsent.pop(0)
            source_url = self.source_url(target)
            logging.info('Webmention from %s to %s', source_url, target)

            # see if we've cached webmention discovery for this domain. the cache
            # value is a string URL endpoint if discovery succeeded, a
            # WebmentionSend error dict if it failed (semi-)permanently, or None.
            cache_key = util.webmention_endpoint_cache_key(target)
            cached = util.webmention_endpoint_cache.get(cache_key)
            if cached:
                logging.info('Using cached webmention endpoint %r: %s',
                             cache_key, cached)

            # send! and handle response or error
            error = None
            if isinstance(cached, dict):
                # a cached error dict means discovery failed (semi-)permanently;
                # skip sending and fall through to the error handling below
                error = cached
            else:
                # cached is either a discovered endpoint URL or None
                mention = send.WebmentionSend(source_url,
                                              target,
                                              endpoint=cached)
                headers = util.request_headers(source=self.source)
                logging.info('Sending...')
                try:
                    if not mention.send(timeout=999, headers=headers):
                        error = mention.error
                except BaseException as e:
                    logging.info('', stack_info=True)
                    # NOTE(review): getattr without a default raises
                    # AttributeError if 'error' was never set -- confirm
                    # WebmentionSend always defines it.
                    error = getattr(mention, 'error')
                    if not error:
                        # synthesize an error dict; 499 marks DNS failures
                        error = ({
                            'code': 'BAD_TARGET_URL',
                            'http_status': 499
                        } if 'DNS lookup failed for URL:' in str(e) else {
                            'code': 'EXCEPTION'
                        })

            error_code = error['code'] if error else None
            # cache the outcome unless it came from the cache already or is a
            # bad-target error (which shouldn't poison the whole domain)
            if error_code != 'BAD_TARGET_URL' and not cached:
                val = error if error_code == 'NO_ENDPOINT' else mention.receiver_endpoint
                with util.webmention_endpoint_cache_lock:
                    util.webmention_endpoint_cache[cache_key] = val

            if error is None:
                logging.info('Sent! %s', mention.response)
                self.record_source_webmention(mention)
                self.entity.sent.append(target)
            else:
                status = error.get('http_status', 0)
                if (error_code == 'NO_ENDPOINT'
                        or (error_code == 'BAD_TARGET_URL'
                            and status == 204)):  # No Content
                    logging.info('Giving up this target. %s', error)
                    self.entity.skipped.append(target)
                elif status // 100 == 4:
                    # Give up on 4XX errors; we don't expect later retries to succeed.
                    logging.info('Giving up this target. %s', error)
                    self.entity.failed.append(target)
                else:
                    # anything else is treated as transient: retry later
                    self.fail('Error sending to endpoint: %s' % error,
                              level=logging.INFO)
                    self.entity.error.append(target)

            # defensive: drop any duplicate of this target still queued
            if target in self.entity.unsent:
                self.entity.unsent.remove(target)

        if self.entity.error:
            # at least one transient failure: release the task for retry
            logging.info('Propagate task failed')
            self.release('error')
        else:
            self.complete()
Exemplo n.º 11
0
    def do_send_webmentions(self):
        """Sends a webmention for each pending target URL on self.entity.

        Re-validates unsent plus previously errored/failed targets, then
        tries each one, using memcache to remember per-domain endpoint
        discovery results. Each target ends up in entity.sent,
        entity.skipped, entity.failed, or entity.error.
        """
        # retry previously errored and failed targets along with unsent ones
        urls = self.entity.unsent + self.entity.error + self.entity.failed
        unsent = set()
        self.entity.error = []
        self.entity.failed = []

        for orig_url in urls:
            # recheck the url here since the checks may have failed during the poll
            # or streaming add.
            url, domain, ok = util.get_webmention_target(orig_url)
            if ok:
                if len(url) <= _MAX_STRING_LENGTH:
                    unsent.add(url)
                else:
                    # URL too long to store; give up on it permanently
                    logging.warning("Giving up on target URL over %s chars! %s", _MAX_STRING_LENGTH, url)
                    self.entity.failed.append(orig_url)
        # sorted() gives a deterministic send order
        self.entity.unsent = sorted(unsent)

        while self.entity.unsent:
            target = self.entity.unsent.pop(0)
            source_url = self.source_url(target)
            logging.info("Webmention from %s to %s", source_url, target)

            # see if we've cached webmention discovery for this domain. the cache
            # value is a string URL endpoint if discovery succeeded, a
            # WebmentionSend error dict if it failed (semi-)permanently, or None.
            cache_key = util.webmention_endpoint_cache_key(target)
            cached = memcache.get(cache_key)
            if cached:
                logging.info("Using cached webmention endpoint %r: %s", cache_key, cached)

            # send! and handle response or error
            error = None
            if isinstance(cached, dict):
                # a cached error dict means discovery failed (semi-)permanently;
                # skip sending and fall through to the error handling below
                error = cached
            else:
                # cached is either a discovered endpoint URL or None
                mention = send.WebmentionSend(source_url, target, endpoint=cached)
                logging.info("Sending...")
                try:
                    if not mention.send(timeout=999, headers=util.REQUEST_HEADERS):
                        error = mention.error
                except BaseException, e:
                    logging.warning("", exc_info=True)
                    # NOTE(review): getattr without a default raises
                    # AttributeError if 'error' was never set -- confirm
                    # WebmentionSend always defines it.
                    error = getattr(mention, "error")
                    if not error:
                        # synthesize an error dict; 499 marks DNS failures
                        error = (
                            {"code": "BAD_TARGET_URL", "http_status": 499}
                            if "DNS lookup failed for URL:" in str(e)
                            else {"code": "EXCEPTION"}
                        )

            error_code = error["code"] if error else None
            # cache the outcome unless it came from the cache already or is a
            # bad-target error (which shouldn't poison the whole domain)
            if error_code != "BAD_TARGET_URL" and not cached:
                val = error if error_code == "NO_ENDPOINT" else mention.receiver_endpoint
                memcache.set(cache_key, val, time=WEBMENTION_DISCOVERY_CACHE_TIME)

            if error is None:
                logging.info("Sent! %s", mention.response)
                self.record_source_webmention(mention)
                self.entity.sent.append(target)
            else:
                status = error.get("http_status", 0)
                if error_code == "NO_ENDPOINT" or (error_code == "BAD_TARGET_URL" and status == 204):  # No Content
                    logging.info("Giving up this target. %s", error)
                    self.entity.skipped.append(target)
                elif status // 100 == 4:
                    # Give up on 4XX errors; we don't expect later retries to succeed.
                    logging.info("Giving up this target. %s", error)
                    self.entity.failed.append(target)
                else:
                    # anything else is treated as transient: retry later
                    self.fail("Error sending to endpoint: %s" % error)
                    self.entity.error.append(target)

            # defensive: drop any duplicate of this target still queued
            if target in self.entity.unsent:
                self.entity.unsent.remove(target)
Exemplo n.º 12
0
    def do_send_webmentions(self):
        """Sends a webmention for each pending target URL on self.entity.

        Re-validates unsent plus previously errored/failed targets, then
        discovers each target's endpoint via the webmention library (with
        results memoized in util.webmention_endpoint_cache under its lock)
        and sends. Each target ends up in entity.sent, entity.skipped,
        entity.failed, or entity.error, and the task is released as
        'error' or completed accordingly.
        """
        # retry previously errored and failed targets along with unsent ones
        urls = self.entity.unsent + self.entity.error + self.entity.failed
        unsent = set()
        self.entity.error = []
        self.entity.failed = []

        for orig_url in urls:
            # recheck the url here since the checks may have failed during the poll
            # or streaming add.
            url, domain, ok = util.get_webmention_target(orig_url)
            if ok:
                if len(url) <= _MAX_STRING_LENGTH:
                    unsent.add(url)
                else:
                    # URL too long to store; give up on it permanently
                    logging.warning("Giving up on target URL over %s chars! %s", _MAX_STRING_LENGTH, url)
                    self.entity.failed.append(orig_url)
        # sorted() gives a deterministic send order
        self.entity.unsent = sorted(unsent)

        while self.entity.unsent:
            target = self.entity.unsent.pop(0)
            source_url = self.source_url(target)
            logging.info("Webmention from %s to %s", source_url, target)

            # see if we've cached webmention discovery for this domain. the cache
            # value is a string URL endpoint if discovery succeeded, NO_ENDPOINT if
            # no endpoint was found.
            cache_key = util.webmention_endpoint_cache_key(target)
            endpoint = util.webmention_endpoint_cache.get(cache_key)
            if endpoint:
                logging.info('Using cached webmention endpoint %r: %s',
                             cache_key, endpoint)

            # send! and handle response or error
            try:
                resp = None
                headers = util.request_headers(source=self.source)
                if not endpoint:
                    # discover fresh and memoize, using NO_ENDPOINT as the
                    # sentinel for "looked, found nothing"
                    endpoint, resp = webmention.discover(target,
                                                         headers=headers)
                    with util.webmention_endpoint_cache_lock:
                        util.webmention_endpoint_cache[
                            cache_key] = endpoint or NO_ENDPOINT

                if endpoint and endpoint != NO_ENDPOINT:
                    logging.info('Sending...')
                    resp = webmention.send(endpoint,
                                           source_url,
                                           target,
                                           timeout=999,
                                           headers=headers)
                    logging.info('Sent! %s', resp)
                    self.record_source_webmention(endpoint, target)
                    self.entity.sent.append(target)
                else:
                    # no endpoint for this domain: skip, don't retry
                    logging.info('Giving up this target.')
                    self.entity.skipped.append(target)

            except ValueError:
                # NOTE(review): presumably raised by the webmention library on
                # malformed URLs -- confirm against its docs
                logging.info('Bad URL; giving up this target.')
                self.entity.skipped.append(target)

            except BaseException as e:
                logging.info('', exc_info=True)
                # Give up on 4XX and DNS errors; we don't expect retries to succeed.
                code, _ = util.interpret_http_exception(e)
                if (code and
                        code.startswith('4')) or 'DNS lookup failed' in str(e):
                    logging.info('Giving up this target.')
                    self.entity.failed.append(target)
                else:
                    # anything else is treated as transient: retry later
                    self.fail(f'Error sending to endpoint: {resp}',
                              level=logging.INFO)
                    self.entity.error.append(target)

            # defensive: drop any duplicate of this target still queued
            if target in self.entity.unsent:
                self.entity.unsent.remove(target)

        if self.entity.error:
            # at least one transient failure: release the task for retry
            logging.info('Propagate task failed')
            self.release('error')
        else:
            self.complete()