예제 #1
0
def test_one_matching_url():
    """Check h-card selection when the page URL only shows up in `url` values.

    With a single h-card listing the page URL, that card is expected back.
    Once a second card lists the same URL, no card is expected.
    """
    page_url = 'http://foo.com/bar'
    unrelated = {
        'type': ['h-card'],
        'properties': {
            'url': ['http://tilde.club/~foobar'],
            'name': ['Bad'],
        },
    }
    matching = {
        'type': ['h-card'],
        'properties': {
            'url': [page_url, 'http://tilde.club/~foobar'],
            'name': ['Good'],
        },
    }
    parsed = {'rels': {}, 'items': [unrelated, matching]}

    result = mf2util.representative_hcard(parsed, page_url)
    assert result
    assert result['properties']['name'][0] == 'Good'

    # a second card listing the page URL: the lookup should now come back empty
    competing = {
        'type': ['h-card'],
        'properties': {
            'url': [page_url, 'http://flickr.com/photos/foobar'],
            'name': ['Too Many Cooks'],
        },
    }
    parsed['items'].append(competing)
    result = mf2util.representative_hcard(parsed, page_url)
    assert not result
예제 #2
0
def test_url_matches_uid():
    """Check that a `uid` equal to the page URL singles out one h-card.

    Both cards list the same urls; only the second one carries a uid. After
    that uid is deleted, no card is expected back.
    """
    page_url = 'http://foo.com/bar'
    other_url = 'http://tilde.club/~foobar'
    without_uid = {
        'type': ['h-card'],
        'properties': {
            'url': [page_url, other_url],
            'name': ['Bad'],
        },
    }
    with_uid = {
        'type': ['h-card'],
        'properties': {
            'url': [page_url, other_url],
            'uid': [page_url],
            'name': ['Good'],
        },
    }
    parsed = {'rels': {}, 'items': [without_uid, with_uid]}

    result = mf2util.representative_hcard(parsed, page_url)
    assert result
    assert result['properties']['name'][0] == 'Good'

    # removing the uid should prevent us from finding the h-card
    del with_uid['properties']['uid']
    result = mf2util.representative_hcard(parsed, page_url)
    assert not result
예제 #3
0
def test_one_matching_url():
    """Exercise the "exactly one card lists the page URL" case.

    First two cards are parsed, of which only 'Good' lists the page URL; it
    is expected back. Then a third card listing the page URL is added and the
    lookup is expected to return nothing.
    """
    def make_card(name, *urls):
        # minimal h-card item with just url and name properties
        return {
            'type': ['h-card'],
            'properties': {
                'url': list(urls),
                'name': [name],
            },
        }

    target = 'http://foo.com/bar'
    document = {
        'rels': {},
        'items': [
            make_card('Bad', 'http://tilde.club/~foobar'),
            make_card('Good', target, 'http://tilde.club/~foobar'),
        ],
    }

    card = mf2util.representative_hcard(document, target)
    assert card
    assert card['properties']['name'][0] == 'Good'

    document['items'].append(
        make_card('Too Many Cooks', target, 'http://flickr.com/photos/foobar'))
    card = mf2util.representative_hcard(document, target)
    assert not card
예제 #4
0
def test_url_matches_uid():
    """Exercise the "url and uid both equal the page URL" case.

    Two cards share identical urls; the one that also has a matching uid is
    expected back. Without the uid, nothing is expected back.
    """
    target = 'http://foo.com/bar'
    shared_urls = (target, 'http://tilde.club/~foobar')

    items = [
        {
            'type': ['h-card'],
            'properties': {
                'url': list(shared_urls),
                'name': ['Bad'],
            },
        },
        {
            'type': ['h-card'],
            'properties': {
                'url': list(shared_urls),
                'uid': [target],
                'name': ['Good'],
            },
        },
    ]
    document = {'rels': {}, 'items': items}

    card = mf2util.representative_hcard(document, target)
    assert card
    assert card['properties']['name'][0] == 'Good'

    # removing the uid should prevent us from finding the h-card
    items[1]['properties'].pop('uid')
    card = mf2util.representative_hcard(document, target)
    assert not card
예제 #5
0
def test_url_matches_rel_me():
    """Check that a card whose url equals a rel-me link is picked.

    Neither card lists the page URL itself; the 'Good' card lists the
    about.me URL that the page's rel-me points at.
    """
    # rel-me points to identity hosted on about.me
    rel_me_url = 'http://about.me/foobar'
    unrelated = {
        'type': ['h-card'],
        'properties': {
            'url': ['http://tilde.club/~foobar'],
            'name': ['Bad'],
        },
    }
    linked = {
        'type': ['h-card'],
        'properties': {
            'url': [rel_me_url, 'http://tilde.club/~foobar'],
            'name': ['Good'],
        },
    }
    parsed = {
        'rels': {'me': [rel_me_url]},
        'items': [unrelated, linked],
    }

    result = mf2util.representative_hcard(parsed, 'http://foo.com/bar')
    assert result
    assert result['properties']['name'][0] == 'Good'
예제 #6
0
def test_nested_hcard():
    """Check that an h-card nested in an h-entry's children can be picked.

    The top-level card has no uid; the card nested under the h-entry has a
    uid equal to the page URL and is the one expected back.
    """
    page_url = 'http://foo.com/bar'
    other_url = 'http://tilde.club/~foobar'
    top_level_card = {
        'type': ['h-card'],
        'properties': {
            'url': [page_url, other_url],
            'name': ['Bad'],
        },
    }
    nested_card = {
        'type': ['h-card'],
        'properties': {
            'url': [page_url, other_url],
            'uid': [page_url],
            'name': ['Good'],
        },
    }
    entry = {'type': ['h-entry'], 'children': [nested_card]}
    parsed = {'rels': {}, 'items': [top_level_card, entry]}

    result = mf2util.representative_hcard(parsed, page_url)
    assert result
    assert result['properties']['name'][0] == 'Good'
예제 #7
0
    def get(self, domain):
        """Write the AS2 actor JSON for `domain` to the response.

        Fetches http://<domain>/, parses its mf2, picks the representative
        h-card, converts it via AS1 to AS2, and adds inbox, outbox, following
        and followers URLs under appengine_config.HOST_URL.

        Args:
          domain: string, used as the host of the fetched URL and as the path
            prefix of the generated endpoint URLs
        """
        home_page = 'http://%s/' % domain
        resp = common.requests_get(home_page)
        mf2 = mf2py.parse(resp.text, url=resp.url, img_with_alt=True)
        # logging.info('Parsed mf2 for %s: %s', resp.url, json.dumps(mf2, indent=2))

        hcard = mf2util.representative_hcard(mf2, resp.url)
        logging.info('Representative h-card: %s', json.dumps(hcard, indent=2))
        if not hcard:
            # presumably common.error aborts the request here -- TODO confirm
            common.error(
                self,
                "Couldn't find a representative h-card "
                '(http://microformats.org/wiki/representative-hcard-parsing) on %s'
                % resp.url)

        key = MagicKey.get_or_create(domain)
        as1_actor = microformats2.json_to_object(hcard)
        actor = common.postprocess_as2(as2.from_as1(as1_actor), key=key)

        endpoint_base = '%s/%s' % (appengine_config.HOST_URL, domain)
        actor.update({
            'inbox': endpoint_base + '/inbox',
            'outbox': endpoint_base + '/outbox',
            'following': endpoint_base + '/following',
            'followers': endpoint_base + '/followers',
        })
        body = json.dumps(actor, indent=2)
        logging.info('Returning: %s', body)

        self.response.headers.update({
            'Content-Type': common.CONTENT_TYPE_AS2,
            'Access-Control-Allow-Origin': '*',
        })
        self.response.write(body)
예제 #8
0
def build_user_json(me, resp=None):
    """Build the user_json dict: "me", rel-me links, and an h-card.

    Args:
      me: string, URL of the user
      resp: :class:`requests.Response` (optional), re-use response if it's
        already been fetched

    Returns:
      dict, with 'me', the URL for this person; 'h-card', the representative
      h-card for this page; 'rel-me', a list of rel-me URLs found at this
      page. If the fetch did not return a 2xx status, only 'me' is present.
    """
    user_json = {'me': me}

    # Compare against None explicitly: a requests.Response is falsy whenever
    # its status is 4xx/5xx, so `resp or ...` would discard a caller-supplied
    # error response and fetch the page a second time.
    if resp is None:
        resp = util.requests_get(me)

    if resp.status_code // 100 != 2:
        logging.warning('could not fetch user url "%s". got response code: %d',
                        me, resp.status_code)
        return user_json

    mf2 = util.parse_mf2(resp, resp.url)
    user_json['rel-me'] = mf2['rels'].get('me')
    user_json['h-card'] = mf2util.representative_hcard(mf2, me)
    logging.debug('built user-json %r', user_json)
    return util.trim_nulls(user_json)
예제 #9
0
def build_user_json(me, resp=None):
  """Build the user_json dict: "me", rel-me links, and an h-card.

  Args:
    me: string, URL of the user
    resp: requests.Response (optional), re-use response if it's already been
      fetched

  Returns:
    dict, with 'me', the URL for this person; 'h-card', the representative
    h-card for this page; 'rel-me', a list of rel-me URLs found at this page.
    If the fetch did not return a 2xx status, only 'me' is present.
  """
  user_json = {'me': me}

  # Compare against None explicitly: a requests.Response is falsy whenever its
  # status is 4xx/5xx, so `resp or ...` would discard a caller-supplied error
  # response and fetch the page a second time.
  if resp is None:
    resp = util.requests_get(me)

  if resp.status_code // 100 != 2:
    logging.warning(
      'could not fetch user url "%s". got response code: %d',
      me, resp.status_code)
    return user_json

  # Requests doesn't look at the HTML body to find <meta charset>
  # tags, so if the character encoding isn't given in a header, then
  # we pass on the raw bytes and let BS4 deal with it.
  if 'charset' in resp.headers.get('content-type', ''):
    doc = resp.text
  else:
    doc = resp.content
  parsed = mf2py.parse(doc=doc, url=me)

  user_json['rel-me'] = parsed.get('rels', {}).get('me')
  user_json['h-card'] = mf2util.representative_hcard(parsed, me)
  logging.debug('built user-json %r', user_json)
  return util.trim_nulls(user_json)
예제 #10
0
    def fetch(self):
        """Fetch self.url, parse its microformats, and store what was found.

        Sets self.name and self.nicknames from the representative h-card,
        falling back to the first h-card on the page. Adds rel-me links that
        Pseudonym.identify_url recognises to self.pseudonyms, updates
        self.timestamp, and calls self.save().

        Returns:
          None. If the page can't be fetched or parsed, returns early without
          touching any attribute and without saving.
        """
        # fetch the website and parse for microformats
        try:
            parser = mf2py.Parser(url=self.url)
        except Exception:
            # Best effort: any fetch/parse failure means "nothing to record".
            # `Exception` rather than a bare except, so KeyboardInterrupt and
            # SystemExit still propagate.
            return None

        # identify the representative h-card
        parsed = parser.to_dict()
        hcard = mf2util.representative_hcard(parsed, self.url)

        if not hcard:
            # no representative card: fall back to the first h-card, if any
            hcards = parser.to_dict(filter_by_type='h-card')
            if hcards:
                hcard = hcards[0]

        if hcard:
            props = hcard['properties']
            # `or [None]` also covers an empty name list, not just a missing key
            self.name = (props.get('name') or [None])[0]
            self.nicknames = props.get('nickname', None)

        # identify rel-me links as pseudonyms
        for url in parsed['rels'].get('me', []):
            match = Pseudonym.identify_url(url, self)
            # keep the first match per target; later duplicates are ignored
            if match and match.target not in self.pseudonyms:
                self.pseudonyms[match.target] = match

        # remember the last time I fetched
        self.timestamp = time.time()

        # save to the database
        self.save()
예제 #11
0
def actor(domain):
    """Serves /[DOMAIN], fetches its mf2, converts to AS Actor, and serves it.

    Returns:
      (dict, dict) tuple: the AS2 actor object and the response headers
    """
    if domain.rsplit('.', 1)[-1] in common.TLD_BLOCKLIST:
        # presumably error() aborts the request -- TODO confirm
        error('', status=404)

    mf2 = util.fetch_mf2(f'http://{domain}/',
                         gateway=True,
                         headers=common.HEADERS)
    page_url = mf2['url']

    hcard = mf2util.representative_hcard(mf2, page_url)
    logging.info(f'Representative h-card: {json_dumps(hcard, indent=2)}')
    if not hcard:
        error(
            f"Couldn't find a representative h-card (http://microformats.org/wiki/representative-hcard-parsing) on {mf2['url']}"
        )

    key = MagicKey.get_or_create(domain)
    as1_actor = microformats2.json_to_object(hcard)
    obj = common.postprocess_as2(as2.from_as1(as1_actor), key=key)

    endpoint_base = f'{request.host_url}{domain}'
    obj.update({
        'preferredUsername': domain,
        'inbox': f'{endpoint_base}/inbox',
        'outbox': f'{endpoint_base}/outbox',
        'following': f'{endpoint_base}/following',
        'followers': f'{endpoint_base}/followers',
    })
    logging.info(f'Returning: {json_dumps(obj, indent=2)}')

    response_headers = {
        'Content-Type': common.CONTENT_TYPE_AS2,
        'Access-Control-Allow-Origin': '*',
    }
    return (obj, response_headers)
예제 #12
0
def test_url_matches_rel_me():
    """Exercise the "card url equals a rel-me link" case.

    The page URL appears in neither card; the 'Good' card lists the about.me
    URL that rels['me'] contains and is the one expected back.
    """
    def make_card(name, *urls):
        # minimal h-card item with just url and name properties
        return {
            'type': ['h-card'],
            'properties': {
                'url': list(urls),
                'name': [name],
            },
        }

    # rel-me points to identity hosted on about.me
    identity = 'http://about.me/foobar'
    document = {
        'rels': {
            'me': [identity],
        },
        'items': [
            make_card('Bad', 'http://tilde.club/~foobar'),
            make_card('Good', identity, 'http://tilde.club/~foobar'),
        ],
    }

    card = mf2util.representative_hcard(document, 'http://foo.com/bar')
    assert card
    assert card['properties']['name'][0] == 'Good'
    def get(self, domain):
        """Write the AS2 actor JSON for `domain` to the response.

        Fetches the mf2 of http://<domain>/, picks the representative h-card,
        converts it via AS1 to AS2, and adds inbox, outbox, following and
        followers URLs under the request's host URL.

        Args:
          domain: string; its last dot-separated label is checked against
            common.TLD_BLOCKLIST
        """
        if domain.rsplit('.', 1)[-1] in common.TLD_BLOCKLIST:
            # presumably self.error aborts the request -- TODO confirm
            self.error('', status=404)

        mf2 = util.fetch_mf2('http://%s/' % domain, gateway=True,
                             headers=common.HEADERS)
        # logging.info('Parsed mf2 for %s: %s', resp.url, json_dumps(mf2, indent=2))
        page_url = mf2['url']

        hcard = mf2util.representative_hcard(mf2, page_url)
        logging.info('Representative h-card: %s', json_dumps(hcard, indent=2))
        if not hcard:
            self.error(
                "Couldn't find a representative h-card "
                '(http://microformats.org/wiki/representative-hcard-parsing) on %s'
                % mf2['url'])

        key = MagicKey.get_or_create(domain)
        as1_actor = microformats2.json_to_object(hcard)
        actor = self.postprocess_as2(as2.from_as1(as1_actor), key=key)

        host_url = self.request.host_url
        actor.update({
            'inbox': '%s/%s/inbox' % (host_url, domain),
            'outbox': '%s/%s/outbox' % (host_url, domain),
            'following': '%s/%s/following' % (host_url, domain),
            'followers': '%s/%s/followers' % (host_url, domain),
        })
        logging.info('Returning: %s', json_dumps(actor, indent=2))

        self.response.headers.update({
            'Content-Type': common.CONTENT_TYPE_AS2,
            'Access-Control-Allow-Origin': '*',
        })
        self.response.write(json_dumps(actor, indent=2))
예제 #14
0
def test_hcard_as_a_property():
    """h-card is the p-author of the primary h-feed

    The only h-card in the document is the `author` property of an h-feed,
    and its url equals the page URL; it is the card expected back.
    """
    page_url = 'http://foo.com/bar'
    author_card = {
        'type': ['h-card'],
        'properties': {
            'name': ['Elliot Alderson'],
            'url': [page_url],
        },
    }
    feed = {
        'type': ['h-feed'],
        'properties': {
            'author': [author_card],
        },
    }
    parsed = {'rels': {}, 'items': [feed]}

    result = mf2util.representative_hcard(parsed, page_url)
    assert result
    assert result['properties']['name'][0] == 'Elliot Alderson'
예제 #15
0
def test_nested_hcard():
    """Exercise lookup of an h-card that sits in an h-entry's `children`.

    A top-level card without uid and a nested card with a uid equal to the
    page URL share the same urls; the nested 'Good' card is expected back.
    """
    target = 'http://foo.com/bar'
    shared_urls = (target, 'http://tilde.club/~foobar')

    document = {
        'rels': {},
        'items': [
            {
                'type': ['h-card'],
                'properties': {
                    'url': list(shared_urls),
                    'name': ['Bad'],
                },
            },
            {
                'type': ['h-entry'],
                'children': [{
                    'type': ['h-card'],
                    'properties': {
                        'url': list(shared_urls),
                        'uid': [target],
                        'name': ['Good'],
                    },
                }],
            },
        ],
    }

    card = mf2util.representative_hcard(document, target)
    assert card
    assert card['properties']['name'][0] == 'Good'
예제 #16
0
def login_callback():
    """Handle the redirect back from the authorization endpoint.

    Posts the received code to the auth endpoint stored in the session. On
    any failure it flashes a message and redirects to the `state` URL (or
    the index view). On success it runs try_micropub_config, makes sure an
    'indieauth' Credential exists for the returned `me`, and hands off to
    do_login with the author name from the representative h-card, if any.
    """
    current_app.logger.debug('callback fields: %s', request.args)

    state = request.args.get('state')
    next_url = state if state else url_for('views.index')
    # TODO rediscover these endpoints based on 'me'. Assuming
    # they are the same is not totally safe.
    auth_url, token_url, micropub_url = session['endpoints']

    if not auth_url:
        flash('Login failed: No authorization URL in session')
        return redirect(next_url)

    code = request.args.get('code')
    client_id = get_settings().site_url
    redirect_uri = url_for('.login_callback', _external=True)

    current_app.logger.debug('callback with auth endpoint %s', auth_url)
    verification = {
        'code': code,
        'client_id': client_id,
        'redirect_uri': redirect_uri,
        'state': state,
    }
    auth_resp = requests.post(auth_url, data=verification)

    # the endpoint answers with a form-encoded body; values come back as lists
    fields = urllib.parse.parse_qs(auth_resp.text)
    if auth_resp.status_code != 200:
        current_app.logger.debug('call to auth endpoint failed %s', auth_resp)
        failure = 'Login failed {}: {}'.format(
            fields.get('error'), fields.get('error_description'))
        flash(failure)
        return redirect(next_url)

    current_app.logger.debug('verify response %s', auth_resp.text)
    if 'me' not in fields:
        current_app.logger.debug('Verify response missing required "me" field')
        flash('Verify response missing required "me" field {}'.format(
            auth_resp.text))
        return redirect(next_url)

    me = fields['me'][0]
    scopes = fields.get('scope')

    try_micropub_config(token_url, micropub_url, scopes, code, me,
                        redirect_uri, client_id, state)

    cred = Credential.query.get(('indieauth', me))
    if not cred:
        cred = Credential(type='indieauth', value=me, display=me)
        db.session.add(cred)
        db.session.commit()

    # offer to associate credential with existing user or create a new user
    parsed = mf2py.parse(url=me)
    hcard = mf2util.representative_hcard(parsed, me)
    author = mf2util.parse_author(hcard) if hcard else hcard
    author_name = author.get('name') if author else author

    return do_login(cred, author_name, next_url)
예제 #17
0
def test_hcard_as_a_property():
    """h-card is the p-author of the primary h-feed

    The document holds one h-feed whose `author` property is an h-card with
    the page URL as its url; that card is expected back.
    """
    target = 'http://foo.com/bar'
    elliot = {
        'type': ['h-card'],
        'properties': {
            'name': ['Elliot Alderson'],
            'url': [target],
        },
    }
    document = {
        'rels': {},
        'items': [
            {'type': ['h-feed'], 'properties': {'author': [elliot]}},
        ],
    }

    card = mf2util.representative_hcard(document, target)
    assert card
    assert card['properties']['name'][0] == 'Elliot Alderson'
예제 #18
0
    def template_vars(self, domain=None, url=None):
        """Build the WebFinger data for `domain` from its representative h-card.

        Tries `url` (if given), then that URL's home page, then
        http://<domain>/, and uses the first page that has a representative
        h-card.

        Args:
          domain: string; its last dot-separated label is checked against
            NON_TLDS
          url: string (optional), page to try before the domain's home page

        Returns:
          dict with 'subject', 'aliases', 'magic_keys' and 'links', passed
          through util.trim_nulls
        """
        logging.debug(f'Headers: {list(request.headers.items())}')

        # NOTE(review): domain defaults to None but is dereferenced right
        # away, so callers evidently always pass it -- confirm.
        if domain.split('.')[-1] in NON_TLDS:
            error(f"{domain} doesn't look like a domain", status=404)

        # find representative h-card. try url, then url's home page, then domain
        urls = [f'http://{domain}/']
        if url:
            urls = [url, urllib.parse.urljoin(url, '/')] + urls

        for candidate in urls:
            resp = common.requests_get(candidate)
            parsed = util.parse_html(resp)
            mf2 = util.parse_mf2(parsed, url=resp.url)
            # logging.debug(f'Parsed mf2 for {resp.url}: {json_dumps(mf2, indent=2)}')
            hcard = mf2util.representative_hcard(mf2, resp.url)
            if hcard:
                logging.info(
                    f'Representative h-card: {json_dumps(hcard, indent=2)}')
                break
        else:
            # no candidate had an h-card; presumably error() aborts the
            # request -- TODO confirm
            error(
                f"didn't find a representative h-card (http://microformats.org/wiki/representative-hcard-parsing) on {resp.url}"
            )

        logging.info(f'Generating WebFinger data for {domain}')
        key = models.MagicKey.get_or_create(domain)
        props = hcard.get('properties', {})
        urls = util.dedupe_urls(props.get('url', []) + [resp.url])
        canonical_url = urls[0]

        # default username is the domain; an acct: URL on the h-card for this
        # same domain overrides it
        acct = f'{domain}@{domain}'
        for alias in urls:
            if alias.startswith('acct:'):
                urluser, urldomain = util.parse_acct_uri(alias)
                if urldomain == domain:
                    acct = f'{urluser}@{domain}'
                    logging.info(f'Found custom username: acct:{acct}')
                    break

        # discover atom feed, if any. .get() so that a <link> without an href
        # falls through to the granary fallback instead of raising KeyError.
        atom_link = parsed.find('link',
                                rel='alternate',
                                type=common.CONTENT_TYPE_ATOM)
        if atom_link and atom_link.get('href'):
            atom = urllib.parse.urljoin(resp.url, atom_link['href'])
        else:
            atom = 'https://granary.io/url?' + urllib.parse.urlencode(
                {
                    'input': 'html',
                    'output': 'atom',
                    'url': resp.url,
                    'hub': resp.url,
                })

        # discover PuSH, if any. start from the default hub and only replace
        # it when a Link header actually advertises one, so that a later
        # non-hub link can't clobber a hub found earlier.
        hub = 'https://bridgy-fed.superfeedr.com/'
        for link in resp.headers.get('Link', '').split(','):
            match = common.LINK_HEADER_RE.match(link)
            if match and match.group(2) == 'hub':
                hub = match.group(1)

        profile_links = [{
            'rel': 'http://webfinger.net/rel/profile-page',
            'type': 'text/html',
            'href': alias,
        } for alias in urls if alias.startswith("http")]

        avatar_links = [{
            'rel': 'http://webfinger.net/rel/avatar',
            'href': get_text(photo),
        } for photo in props.get('photo', [])]

        # generate webfinger content
        data = util.trim_nulls({
            'subject': 'acct:' + acct,
            'aliases': urls,
            'magic_keys': [{
                'value': key.href()
            }],
            'links': profile_links + avatar_links + [
                {
                    'rel': 'canonical_uri',
                    'type': 'text/html',
                    'href': canonical_url,
                },

                # ActivityPub
                {
                    'rel': 'self',
                    'type': common.CONTENT_TYPE_AS2,
                    # WARNING: in python 2 sometimes request.host_url lost port,
                    # http://localhost:8080 would become just http://localhost. no
                    # clue how or why. pay attention here if that happens again.
                    'href': f'{request.host_url}{domain}',
                },
                {
                    'rel': 'inbox',
                    'type': common.CONTENT_TYPE_AS2,
                    'href': f'{request.host_url}{domain}/inbox',
                },

                # OStatus
                {
                    'rel': 'http://schemas.google.com/g/2010#updates-from',
                    'type': common.CONTENT_TYPE_ATOM,
                    'href': atom,
                },
                {
                    'rel': 'hub',
                    'href': hub,
                },
                {
                    'rel': 'magic-public-key',
                    'href': key.href(),
                },
                {
                    'rel': 'salmon',
                    'href': f'{request.host_url}{domain}/salmon',
                }
            ]
        })
        logging.info(f'Returning WebFinger data: {json_dumps(data, indent=2)}')
        return data
예제 #19
0
    def template_vars(self, domain, url=None):
        assert domain

        if domain.split('.')[-1] in NON_TLDS:
            common.error(self,
                         "%s doesn't look like a domain" % domain,
                         status=404)

        # find representative h-card. try url, then url's home page, then domain
        urls = ['http://%s/' % domain]
        if url:
            urls = [url, urlparse.urljoin(url, '/')] + urls

        for candidate in urls:
            resp = common.requests_get(candidate)
            parsed = common.beautifulsoup_parse(resp.content,
                                                from_encoding=resp.encoding)
            mf2 = mf2py.parse(parsed, url=resp.url, img_with_alt=True)
            # logging.debug('Parsed mf2 for %s: %s', resp.url, json.dumps(mf2, indent=2))
            hcard = mf2util.representative_hcard(mf2, resp.url)
            if hcard:
                logging.info('Representative h-card: %s',
                             json.dumps(hcard, indent=2))
                break
        else:
            common.error(
                self, """\
Couldn't find a representative h-card (http://microformats.org/wiki/representative-hcard-parsing) on %s"""
                % resp.url)

        logging.info('Generating WebFinger data for %s', domain)
        key = models.MagicKey.get_or_create(domain)
        props = hcard.get('properties', {})
        urls = util.dedupe_urls(props.get('url', []) + [resp.url])
        canonical_url = urls[0]

        acct = '%s@%s' % (domain, domain)
        for url in urls:
            if url.startswith('acct:'):
                urluser, urldomain = util.parse_acct_uri(url)
                if urldomain == domain:
                    acct = '%s@%s' % (urluser, domain)
                    logging.info('Found custom username: acct:%s', acct)
                    break

        # discover atom feed, if any
        atom = parsed.find('link',
                           rel='alternate',
                           type=common.CONTENT_TYPE_ATOM)
        if atom and atom['href']:
            atom = urlparse.urljoin(resp.url, atom['href'])
        else:
            atom = 'https://granary.io/url?' + urllib.urlencode(
                {
                    'input': 'html',
                    'output': 'atom',
                    'url': resp.url,
                    'hub': resp.url,
                })

        # discover PuSH, if any
        for link in resp.headers.get('Link', '').split(','):
            match = common.LINK_HEADER_RE.match(link)
            if match and match.group(2) == 'hub':
                hub = match.group(1)
            else:
                hub = 'https://bridgy-fed.superfeedr.com/'

        # generate webfinger content
        data = util.trim_nulls({
            'subject':
            'acct:' + acct,
            'aliases':
            urls,
            'magic_keys': [{
                'value': key.href()
            }],
            'links':
            sum(([{
                'rel': 'http://webfinger.net/rel/profile-page',
                'type': 'text/html',
                'href': url,
            }] for url in urls if url.startswith("http")), []) + [{
                'rel': 'http://webfinger.net/rel/avatar',
                'href': url,
            } for url in props.get('photo', [])] + [
                {
                    'rel': 'canonical_uri',
                    'type': 'text/html',
                    'href': canonical_url,
                },

                # ActivityPub
                {
                    'rel': 'self',
                    'type': 'application/activity+json',
                    # use HOST_URL instead of e.g. request.host_url because it
                    # sometimes lost port, e.g. http://localhost:8080 would become
                    # just http://localhost. no clue how or why.
                    'href': '%s/%s' % (appengine_config.HOST_URL, domain),
                },
                {
                    'rel': 'inbox',
                    'type': 'application/activity+json',
                    'href': '%s/%s/inbox' %
                    (appengine_config.HOST_URL, domain),
                },

                # OStatus
                {
                    'rel': 'http://schemas.google.com/g/2010#updates-from',
                    'type': common.CONTENT_TYPE_ATOM,
                    'href': atom,
                },
                {
                    'rel': 'hub',
                    'href': hub,
                },
                {
                    'rel': 'magic-public-key',
                    'href': key.href(),
                },
                {
                    'rel': 'salmon',
                    'href': '%s/%s/salmon' %
                    (appengine_config.HOST_URL, domain),
                }
            ]
        })
        logging.info('Returning WebFinger data: %s', json.dumps(data,
                                                                indent=2))
        return data
예제 #20
0
def generate(domain):
    """Fetch a domain's home page and collect profile metadata from it.

    URLs, names, descriptions and pictures are gathered, in this order, from
    the mf2 representative h-card, HTML head/meta tags, Open Graph tags and
    Twitter card tags.

    Args:
      domain: string, fetched as 'http://' + domain

    Returns:
      dict with 'domain', 'fetch_time', 'urls', 'names', 'descriptions',
      'pictures', 'hcard', 'rel_mes', 'mf2' and 'html'; or None if the fetch
      raised or returned an error status (the error is printed to stderr)
    """
    # NOTE(review): verify=False turns off TLS certificate checks --
    # presumably deliberate for crawling arbitrary sites; confirm.
    try:
        resp = requests.get('http://' + domain, timeout=60, verify=False)
        resp.raise_for_status()
    except Exception as err:
        print(str(err), file=sys.stderr)
        return None

    fetch_time = datetime.datetime.now()
    soup = bs4.BeautifulSoup(resp.text, 'lxml')

    # extract these from:
    # * mf2 representative h-card
    # * HTML head and meta tags
    # * Open Graph tags
    # * Twitter card tags
    # * Clearbit's Enrichment and Logo APIs
    urls, names = FieldSet(), FieldSet()
    descriptions, pictures = FieldSet(), FieldSet()

    mf2 = mf2py.parse(url=resp.url, doc=soup)
    hcard = mf2util.representative_hcard(mf2, resp.url)
    if hcard:
        hcard_sources = (
            (names, 'name'),
            (urls, 'url'),
            (pictures, 'photo'),
            (descriptions, 'note'),
            (descriptions, 'label'),
            (descriptions, 'description'),
        )
        for fields, prop in hcard_sources:
            fields.update(get_texts(hcard, prop))

    # HTML head/meta tags
    rels = mf2.get('rels', {})
    urls.update(rels.get('canonical', []))
    names.add(soup.title)
    descriptions.add_metas(soup, attrs={'name': 'description'})
    pictures.update(rels.get('icon', []))

    # Open Graph tags, http://ogp.me/
    urls.add_metas(soup, property='og:url')
    descriptions.add_metas(soup, property='og:description')
    names.add_metas(soup, property=('og:title', 'og:site_name'))
    pictures.add_metas(
        soup, property=('og:image', 'og:image:url', 'og:image:secure_url'))

    # Twitter card tags, https://dev.twitter.com/cards/overview
    twitter_sources = (
        (urls, 'twitter:url'),
        (names, 'twitter:title'),
        (descriptions, 'twitter:description'),
        (pictures, 'twitter:image'),
    )
    for fields, meta_name in twitter_sources:
        fields.add_metas(soup, attrs={'name': meta_name})

    # Clearbit:
    # https://dashboard.clearbit.com/docs#enrichment-api
    # https://logo.clearbit.com/snarfed.org
    # https://person.clearbit.com/v2/combined/find?domain=snarfed.org
    #   (needs account and oauth token)

    # fall back to values derived from the domain when nothing was found
    if not urls:
        urls = [u'http://{}/'.format(domain)]
    if not names:
        names = [domain]

    return {
        'domain': domain,
        'fetch_time': fetch_time.isoformat('T'),
        'urls': list(urls),
        'names': list(names),
        'descriptions': list(descriptions),
        'pictures': list(pictures),
        'hcard': json.dumps(hcard, sort_keys=True),
        'rel_mes': rels.get('me', []),
        'mf2': json.dumps(mf2, sort_keys=True),
        'html': resp.text,
    }