Exemplo n.º 1
0
def html_to_atom(html, url=None, fetch_author=False, reader=True):
  """Converts microformats2 HTML to an Atom feed.

  Args:
    html: unicode string
    url: string URL html came from, optional
    fetch_author: boolean, whether to make HTTP request to fetch rel-author link
    reader: boolean, whether the output will be rendered in a feed reader.
      Currently just includes location if True, not otherwise.

  Returns:
    unicode string with Atom XML
  """
  if fetch_author:
    assert url, 'fetch_author=True requires url!'

  parsed = util.parse_mf2(html, url=url)
  actor = microformats2.find_author(parsed, fetch_mf2_func=util.fetch_mf2)

  return activities_to_atom(
    microformats2.html_to_activities(html, url, actor),
    actor,
    title=microformats2.get_title(parsed),
    xml_base=util.base_url(url),
    host_url=url,
    reader=reader)
Exemplo n.º 2
0
def html_to_activities(html, url=None, actor=None, id=None):
  """Converts a microformats2 HTML h-feed to ActivityStreams activities.

  Args:
    html: unicode string HTML or :class:`requests.Response`
    url: optional string URL that HTML came from
    actor: optional author AS actor object for all activities. usually comes
      from a rel="author" link.
    id: string, optional id of specific element to extract and parse. defaults
      to the whole page.

  Returns:
    list of ActivityStreams activity dicts
  """
  parsed = util.parse_mf2(html, url=url, id=id)
  hfeed = mf2util.find_first_entry(parsed, ['h-feed'])
  items = hfeed.get('children', []) if hfeed else parsed.get('items', [])

  activities = []
  for item in items:
    types = item.get('type', [])
    if 'h-entry' in types or 'h-event' in types or 'h-cite' in types:
      obj = json_to_object(item, actor=actor)
      obj['content_is_html'] = True
      activities.append({'object': obj})

  return activities
Exemplo n.º 3
0
    def post(self):
        logging.info('Params: %s', list(self.request.params.items()))

        # fetch source page
        source = util.get_required_param(self, 'source')
        source_resp = common.requests_get(source)
        self.source_url = source_resp.url or source
        self.source_domain = urllib.parse.urlparse(self.source_url).netloc.split(':')[0]
        self.source_mf2 = util.parse_mf2(source_resp)

        # logging.debug('Parsed mf2 for %s: %s', source_resp.url, json_dumps(self.source_mf2 indent=2))

        # check for backlink to bridgy fed (for webmention spec and to confirm
        # source's intent to federate to mastodon)
        if (self.request.host_url not in source_resp.text and
            urllib.parse.quote(self.request.host_url, safe='') not in source_resp.text):
            self.error("Couldn't find link to %s" % self.request.host_url)

        # convert source page to ActivityStreams
        entry = mf2util.find_first_entry(self.source_mf2, ['h-entry'])
        if not entry:
            self.error('No microformats2 found on %s' % self.source_url)

        logging.info('First entry: %s', json_dumps(entry, indent=2))
        # make sure it has url, since we use that for AS2 id, which is required
        # for ActivityPub.
        props = entry.setdefault('properties', {})
        if not props.get('url'):
            props['url'] = [self.source_url]

        self.source_obj = microformats2.json_to_object(entry, fetch_mf2=True)
        logging.info('Converted to AS1: %s', json_dumps(self.source_obj, indent=2))

        self.try_activitypub() or self.try_salmon()
Exemplo n.º 4
0
    def dispatch_request(self):
        logging.info(f'Params: {list(request.form.items())}')

        # fetch source page
        source = flask_util.get_required_param('source')
        source_resp = common.requests_get(source)
        self.source_url = source_resp.url or source
        self.source_domain = urllib.parse.urlparse(
            self.source_url).netloc.split(':')[0]
        self.source_mf2 = util.parse_mf2(source_resp)

        # logging.debug(f'Parsed mf2 for {source_resp.url} : {json_dumps(self.source_mf2 indent=2)}')

        # check for backlink to bridgy fed (for webmention spec and to confirm
        # source's intent to federate to mastodon)
        if (request.host_url not in source_resp.text and urllib.parse.quote(
                request.host_url, safe='') not in source_resp.text):
            error("Couldn't find link to {request.host_url}")

        # convert source page to ActivityStreams
        entry = mf2util.find_first_entry(self.source_mf2, ['h-entry'])
        if not entry:
            error(f'No microformats2 found on {self.source_url}')

        logging.info(f'First entry: {json_dumps(entry, indent=2)}')
        # make sure it has url, since we use that for AS2 id, which is required
        # for ActivityPub.
        props = entry.setdefault('properties', {})
        if not props.get('url'):
            props['url'] = [self.source_url]

        self.source_obj = microformats2.json_to_object(entry, fetch_mf2=True)
        logging.info(
            f'Converted to AS1: {json_dumps(self.source_obj, indent=2)}')

        for method in self.try_activitypub, self.try_salmon:
            ret = method()
            if ret:
                return ret

        return ''
Exemplo n.º 5
0
    def get(self):
        input = util.get_required_param(self, 'input')
        if input not in INPUTS:
            raise exc.HTTPBadRequest('Invalid input: %s, expected one of %r' %
                                     (input, INPUTS))

        orig_url = util.get_required_param(self, 'url')
        fragment = urllib.parse.urlparse(orig_url).fragment
        if fragment and input != 'html':
            raise exc.HTTPBadRequest(
                'URL fragments only supported with input=html.')

        resp = util.requests_get(orig_url, gateway=True)
        final_url = resp.url

        # decode data
        if input in ('activitystreams', 'as1', 'as2', 'mf2-json', 'json-mf2',
                     'jsonfeed'):
            try:
                body_json = json_loads(resp.text)
                body_items = (body_json if isinstance(body_json, list) else
                              body_json.get('items') or [body_json])
            except (TypeError, ValueError):
                raise exc.HTTPBadRequest('Could not decode %s as JSON' %
                                         final_url)

        mf2 = None
        if input == 'html':
            mf2 = util.parse_mf2(resp, id=fragment)
            if id and not mf2:
                raise exc.HTTPBadRequest(
                    'Got fragment %s but no element found with that id.' %
                    fragment)
        elif input in ('mf2-json', 'json-mf2'):
            mf2 = body_json
            if not hasattr(mf2, 'get'):
                raise exc.HTTPBadRequest(
                    'Expected microformats2 JSON input to be dict, got %s' %
                    mf2.__class__.__name__)
            mf2.setdefault('rels', {})  # mf2util expects rels

        actor = None
        title = None
        hfeed = None
        if mf2:

            def fetch_mf2_func(url):
                if util.domain_or_parent_in(
                        urllib.parse.urlparse(url).netloc, SILO_DOMAINS):
                    return {
                        'items': [{
                            'type': ['h-card'],
                            'properties': {
                                'url': [url]
                            }
                        }]
                    }
                return util.fetch_mf2(url, gateway=True)

            try:
                actor = microformats2.find_author(
                    mf2, fetch_mf2_func=fetch_mf2_func)
                title = microformats2.get_title(mf2)
                hfeed = mf2util.find_first_entry(mf2, ['h-feed'])
            except (KeyError, ValueError) as e:
                raise exc.HTTPBadRequest('Could not parse %s as %s: %s' %
                                         (final_url, input, e))

        try:
            if input in ('as1', 'activitystreams'):
                activities = body_items
            elif input == 'as2':
                activities = [as2.to_as1(obj) for obj in body_items]
            elif input == 'atom':
                try:
                    activities = atom.atom_to_activities(resp.text)
                except ElementTree.ParseError as e:
                    raise exc.HTTPBadRequest('Could not parse %s as XML: %s' %
                                             (final_url, e))
                except ValueError as e:
                    raise exc.HTTPBadRequest('Could not parse %s as Atom: %s' %
                                             (final_url, e))
            elif input == 'html':
                activities = microformats2.html_to_activities(resp,
                                                              url=final_url,
                                                              id=fragment,
                                                              actor=actor)
            elif input in ('mf2-json', 'json-mf2'):
                activities = [
                    microformats2.json_to_object(item, actor=actor)
                    for item in mf2.get('items', [])
                ]
            elif input == 'jsonfeed':
                activities, actor = jsonfeed.jsonfeed_to_activities(body_json)
        except ValueError as e:
            logging.warning('parsing input failed', stack_info=True)
            self.abort(
                400,
                'Could not parse %s as %s: %s' % (final_url, input, str(e)))

        self.write_response(
            source.Source.make_activities_base_response(activities),
            url=final_url,
            actor=actor,
            title=title,
            hfeed=hfeed)
Exemplo n.º 6
0
    def setUp(self):
        super(WebmentionTest, self).setUp()
        self.key = MagicKey.get_or_create('a')

        self.orig_html_as2 = requests_response("""\
<html>
<meta>
<link href='http://orig/atom' rel='alternate' type='application/atom+xml'>
<link href='http://orig/as2' rel='alternate' type='application/activity+json'>
</meta>
</html>
""",
                                               url='http://orig/post',
                                               content_type=CONTENT_TYPE_HTML)
        self.orig_html_atom = requests_response("""\
<html>
<meta>
<link href='http://orig/atom' rel='alternate' type='application/atom+xml'>
</meta>
</html>
""",
                                                url='http://orig/post',
                                                content_type=CONTENT_TYPE_HTML)
        self.orig_atom = requests_response("""\
<?xml version="1.0"?>
<entry xmlns="http://www.w3.org/2005/Atom">
  <id>tag:fed.brid.gy,2017-08-22:orig-post</id>
  <link rel="salmon" href="http://orig/salmon"/>
  <content type="html">baz ☕ baj</content>
</entry>
""",
                                           content_type=CONTENT_TYPE_ATOM)
        self.orig_as2_data = {
            '@context': ['https://www.w3.org/ns/activitystreams'],
            'type': 'Article',
            'id': 'tag:orig,2017:as2',
            'content': 'Lots of ☕ words...',
            'actor': {
                'url': 'http://orig/author'
            },
            'to': ['http://orig/recipient'],
            'cc': ['http://orig/bystander', AS2_PUBLIC_AUDIENCE],
        }
        self.orig_as2 = requests_response(self.orig_as2_data,
                                          url='http://orig/as2',
                                          content_type=CONTENT_TYPE_AS2 +
                                          '; charset=utf-8')

        self.reply_html = """\
<html>
<body>
<div class="h-entry">
<a class="u-url" href="http://a/reply"></a>
<p class="e-content p-name">
<a class="u-in-reply-to" href="http://not/fediverse"></a>
<a class="u-in-reply-to" href="http://orig/post">foo ☕ bar</a>
<a href="http://localhost/"></a>
</p>
<a class="p-author h-card" href="http://orig">Ms. ☕ Baz</a>
</div>
</body>
</html>
"""
        self.reply = requests_response(self.reply_html,
                                       content_type=CONTENT_TYPE_HTML)
        self.reply_mf2 = util.parse_mf2(self.reply_html, url='http://a/reply')

        self.repost_html = REPOST_HTML
        self.repost = requests_response(self.repost_html,
                                        content_type=CONTENT_TYPE_HTML)
        self.repost_mf2 = util.parse_mf2(self.repost_html,
                                         url='http://a/repost')
        self.repost_as2 = REPOST_AS2

        self.like_html = """\
<html>
<body class="h-entry">
<a class="u-url" href="http://a/like"></a>
<a class="u-like-of" href="http://orig/post"></a>
<!--<a class="u-like-of p-name" href="http://orig/post">liked!</a>-->
<a class="p-author h-card" href="http://orig">Ms. ☕ Baz</a>
<a href="http://localhost/"></a>
</body>
</html>
"""
        self.like = requests_response(self.like_html,
                                      content_type=CONTENT_TYPE_HTML)
        self.like_mf2 = util.parse_mf2(self.like_html, url='http://a/like')

        self.actor = requests_response(
            {
                'objectType': 'person',
                'displayName': 'Mrs. ☕ Foo',
                'url': 'https://foo.com/about-me',
                'inbox': 'https://foo.com/inbox',
            },
            content_type=CONTENT_TYPE_AS2)

        self.as2_create = {
            '@context': 'https://www.w3.org/ns/activitystreams',
            'type': 'Create',
            'object': {
                '@context':
                'https://www.w3.org/ns/activitystreams',
                'type':
                'Note',
                'id':
                'http://localhost/r/http://a/reply',
                'url':
                'http://localhost/r/http://a/reply',
                'name':
                'foo ☕ bar',
                'content':
                """\
<a class="u-in-reply-to" href="http://not/fediverse"></a>
<a class="u-in-reply-to" href="http://orig/post">foo ☕ bar</a>
<a href="http://localhost/"></a>""",
                'inReplyTo':
                'tag:orig,2017:as2',
                'cc': [
                    AS2_PUBLIC_AUDIENCE,
                    'http://orig/author',
                    'http://orig/recipient',
                    'http://orig/bystander',
                ],
                'attributedTo': [{
                    'type': 'Person',
                    'id': 'http://localhost/orig',
                    'url': 'http://localhost/r/http://orig',
                    'preferredUsername': '******',
                    'name': 'Ms. ☕ Baz',
                }],
                'tag': [{
                    'type': 'Mention',
                    'href': 'http://orig/author',
                }],
            },
        }
        self.as2_update = copy.deepcopy(self.as2_create)
        self.as2_update['type'] = 'Update'

        self.follow_html = """\
<html>
<body class="h-entry">
<a class="u-url" href="http://a/follow"></a>
<a class="u-follow-of" href="http://followee"></a>
<a class="p-author h-card" href="https://orig">Ms. ☕ Baz</a>
<a href="http://localhost/"></a>
</body>
</html>
"""
        self.follow = requests_response(self.follow_html,
                                        content_type=CONTENT_TYPE_HTML)
        self.follow_mf2 = util.parse_mf2(self.follow_html,
                                         url='http://a/follow')
        self.follow_as2 = {
            '@context': 'https://www.w3.org/ns/activitystreams',
            'type': 'Follow',
            'id': 'http://localhost/r/http://a/follow',
            'url': 'http://localhost/r/http://a/follow',
            'object': 'http://followee',
            'actor': {
                'id': 'http://localhost/orig',
                'name': 'Ms. ☕ Baz',
                'preferredUsername': '******',
                'type': 'Person',
                'url': 'http://localhost/r/https://orig',
            },
            'cc': ['https://www.w3.org/ns/activitystreams#Public'],
        }

        self.create_html = """\
<html>
<body class="h-entry">
<a class="u-url" href="http://orig/post"></a>
<p class="e-content p-name">hello i am a post</p>
<a class="p-author h-card" href="https://orig">Ms. ☕ Baz</a>
<a href="http://localhost/"></a>
</body>
</html>
"""
        self.create = requests_response(self.create_html,
                                        content_type=CONTENT_TYPE_HTML)
        self.create_mf2 = util.parse_mf2(self.create_html,
                                         url='http://a/create')
        self.create_as2 = {
            '@context': 'https://www.w3.org/ns/activitystreams',
            'type': 'Create',
            'object': {
                '@context':
                'https://www.w3.org/ns/activitystreams',
                'type':
                'Note',
                'id':
                'http://localhost/r/http://orig/post',
                'url':
                'http://localhost/r/http://orig/post',
                'name':
                'hello i am a post',
                'content':
                'hello i am a post',
                'attributedTo': [{
                    'type': 'Person',
                    'id': 'http://localhost/orig',
                    'url': 'http://localhost/r/https://orig',
                    'name': 'Ms. ☕ Baz',
                    'preferredUsername': '******',
                }],
                'cc': ['https://www.w3.org/ns/activitystreams#Public'],
            },
        }

        self.not_fediverse = requests_response("""\
<html>
<body>foo</body>
</html>
""",
                                               url='http://not/fediverse',
                                               content_type=CONTENT_TYPE_HTML)
        self.activitypub_gets = [
            self.reply, self.not_fediverse, self.orig_as2, self.actor
        ]
Exemplo n.º 7
0
    def template_vars(self, domain, url=None):
        assert domain

        if domain.split('.')[-1] in NON_TLDS:
            self.error("%s doesn't look like a domain" % domain, status=404)

        # find representative h-card. try url, then url's home page, then domain
        urls = ['http://%s/' % domain]
        if url:
            urls = [url, urllib.parse.urljoin(url, '/')] + urls

        for candidate in urls:
            resp = common.requests_get(candidate)
            parsed = util.parse_html(resp)
            mf2 = util.parse_mf2(parsed, url=resp.url)
            # logging.debug('Parsed mf2 for %s: %s', resp.url, json_dumps(mf2, indent=2))
            hcard = mf2util.representative_hcard(mf2, resp.url)
            if hcard:
                logging.info('Representative h-card: %s',
                             json_dumps(hcard, indent=2))
                break
        else:
            self.error("""\
Couldn't find a representative h-card (http://microformats.org/wiki/representative-hcard-parsing) on %s"""
                       % resp.url)

        logging.info('Generating WebFinger data for %s', domain)
        key = models.MagicKey.get_or_create(domain)
        props = hcard.get('properties', {})
        urls = util.dedupe_urls(props.get('url', []) + [resp.url])
        canonical_url = urls[0]

        acct = '%s@%s' % (domain, domain)
        for url in urls:
            if url.startswith('acct:'):
                urluser, urldomain = util.parse_acct_uri(url)
                if urldomain == domain:
                    acct = '%s@%s' % (urluser, domain)
                    logging.info('Found custom username: acct:%s', acct)
                    break

        # discover atom feed, if any
        atom = parsed.find('link',
                           rel='alternate',
                           type=common.CONTENT_TYPE_ATOM)
        if atom and atom['href']:
            atom = urllib.parse.urljoin(resp.url, atom['href'])
        else:
            atom = 'https://granary.io/url?' + urllib.parse.urlencode(
                {
                    'input': 'html',
                    'output': 'atom',
                    'url': resp.url,
                    'hub': resp.url,
                })

        # discover PuSH, if any
        for link in resp.headers.get('Link', '').split(','):
            match = common.LINK_HEADER_RE.match(link)
            if match and match.group(2) == 'hub':
                hub = match.group(1)
            else:
                hub = 'https://bridgy-fed.superfeedr.com/'

        # generate webfinger content
        data = util.trim_nulls({
            'subject':
            'acct:' + acct,
            'aliases':
            urls,
            'magic_keys': [{
                'value': key.href()
            }],
            'links':
            sum(([{
                'rel': 'http://webfinger.net/rel/profile-page',
                'type': 'text/html',
                'href': url,
            }] for url in urls if url.startswith("http")), []) + [{
                'rel': 'http://webfinger.net/rel/avatar',
                'href': url,
            } for url in props.get('photo', [])] + [
                {
                    'rel': 'canonical_uri',
                    'type': 'text/html',
                    'href': canonical_url,
                },

                # ActivityPub
                {
                    'rel': 'self',
                    'type': common.CONTENT_TYPE_AS2,
                    # WARNING: in python 2 sometimes request.host_url lost port,
                    # http://localhost:8080 would become just http://localhost. no
                    # clue how or why. pay attention here if that happens again.
                    'href': '%s/%s' % (self.request.host_url, domain),
                },
                {
                    'rel': 'inbox',
                    'type': common.CONTENT_TYPE_AS2,
                    'href': '%s/%s/inbox' % (self.request.host_url, domain),
                },

                # OStatus
                {
                    'rel': 'http://schemas.google.com/g/2010#updates-from',
                    'type': common.CONTENT_TYPE_ATOM,
                    'href': atom,
                },
                {
                    'rel': 'hub',
                    'href': hub,
                },
                {
                    'rel': 'magic-public-key',
                    'href': key.href(),
                },
                {
                    'rel': 'salmon',
                    'href': '%s/%s/salmon' % (self.request.host_url, domain),
                }
            ]
        })
        logging.info('Returning WebFinger data: %s', json_dumps(data,
                                                                indent=2))
        return data