def user_to_actor(self, user):
  """Converts a user or page to an actor.

  Args:
    user: dict, a decoded JSON Facebook user or page

  Returns:
    an ActivityStreams actor dict, ready to be JSON-encoded
  """
  if not user:
    return {}

  id = user.get('id')
  username = user.get('username')
  handle = username or id
  if not handle:
    return {}

  # facebook implements this as a 302 redirect
  actor = {
    # FB only returns the type field if you fetch the object with ?metadata=1
    # https://developers.facebook.com/docs/graph-api/using-graph-api/v2.2#introspection
    'objectType': 'page' if user.get('type') == 'page' else 'person',
    'displayName': user.get('name') or username,
    'id': self.tag_uri(handle),
    'updated': util.maybe_iso8601_to_rfc3339(user.get('updated_time')),
    'username': username,
    'description': user.get('bio') or user.get('description'),
    'summary': user.get('about'),
  }

  # numeric_id is our own custom field that always has the source's numeric
  # user id, if available.
  if util.is_int(id):
    actor.update({
      'numeric_id': id,
      'image': {
        'url': 'https://graph.facebook.com/v2.2/%s/picture?type=large' % id,
      },
    })

  # extract web site links. extract_links uniquifies and preserves order
  urls = util.extract_links(user.get('website'))
  if not urls:
    urls = util.extract_links(user.get('link')) or [self.user_url(handle)]
  actor['url'] = urls[0]
  if len(urls) > 1:
    actor['urls'] = [{'value': u} for u in urls]

  location = user.get('location')
  if location:
    actor['location'] = {'id': location.get('id'),
                         'displayName': location.get('name')}

  return util.trim_nulls(actor)
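
# Usage sketch (illustrative, not from the original module). The import and
# constructor below assume granary's Facebook source class; the field values
# are invented, not real API output.
from granary import facebook

fb = facebook.Facebook()
actor = fb.user_to_actor({
  'id': '212038',
  'username': 'alice',
  'name': 'Alice',
  'website': 'https://alice.example.com',
})
# the numeric id yields 'numeric_id' and a Graph API picture URL, and the
# website link wins over the profile URL:
#   actor['url'] == 'https://alice.example.com'
#   actor['image']['url'].startswith('https://graph.facebook.com/v2.2/212038/')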
def original_post_discovery(activity):
  """Discovers original post links and stores them as tags, in place.

  This is a variation on http://indiewebcamp.com/original-post-discovery .
  It differs in that it finds multiple candidate links instead of one, and it
  doesn't bother looking for MF2 (etc) markup because the silos don't let you
  input it.

  Args:
    activity: activity dict
  """
  obj = activity.get('object') or activity
  content = obj.get('content', '').strip()

  # Permashortcitations are short references to canonical copies of a given
  # (usually syndicated) post, of the form (DOMAIN PATH). Details:
  # http://indiewebcamp.com/permashortcitation
  pscs = set(match.expand(r'http://\1/\2') for match in
             Source._PERMASHORTCITATION_RE.finditer(content))

  attachments = set(a.get('url') for a in obj.get('attachments', [])
                    if a.get('objectType') == 'article')

  urls = util.trim_nulls(util.extract_links(content) | attachments | pscs)
  obj.setdefault('tags', []).extend({'objectType': 'article', 'url': u}
                                    for u in urls)

  return activity
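
# Illustration (added; invented activity) of the in-place tagging, assuming
# the permashortcitation regex matches the (DOMAIN PATH) form shown here.
activity = {'object': {
  'content': 'originally posted at https://alice.example.com/post '
             'and also (alice.example.com p123)',
}}
original_post_discovery(activity)
# the explicit link and the permashortcitation both become article tags on
# the object (order not guaranteed, since candidates are collected in a set):
#   {'objectType': 'article', 'url': 'https://alice.example.com/post'}
#   {'objectType': 'article', 'url': 'http://alice.example.com/p123'}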
def user_to_actor(self, user):
  """Converts a GitHub user to an ActivityStreams actor.

  Handles both v4 GraphQL and v3 REST API user objects.

  https://developer.github.com/v4/object/user/
  https://developer.github.com/v3/users/

  Args:
    user: dict, decoded JSON GitHub user

  Returns:
    an ActivityStreams actor dict, ready to be JSON-encoded
  """
  actor = self._to_object(user)
  if not actor:
    return actor

  username = user.get('login')
  desc = user.get('bio') or user.get('description')
  actor.update({
    # TODO: orgs, bots
    'objectType': 'person',
    'displayName': user.get('name') or username,
    'username': username,
    'email': user.get('email'),
    'description': desc,
    'summary': desc,
    'image': {'url': user.get('avatarUrl') or user.get('avatar_url') or
                     user.get('url')},
    'location': {'displayName': user.get('location')},
  })

  # extract web site links. extract_links uniquifies and preserves order
  urls = sum((util.extract_links(user.get(field)) for field in (
    'html_url',    # REST
    'url',         # both
    'websiteUrl',  # GraphQL
    'blog',        # REST
    'bio',         # both
  )), [])
  urls = [u for u in urls if util.domain_from_link(u) != 'api.github.com']
  if urls:
    actor['url'] = urls[0]
    if len(urls) > 1:
      actor['urls'] = [{'value': u} for u in urls]

  return self.postprocess_object(actor)
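
# Sample conversion (illustrative; field values invented, shapes follow the
# REST API). Assumes `gh` is an instance of the class defining user_to_actor
# above and that its _to_object accepts this dict.
actor = gh.user_to_actor({
  'login': 'octocat',
  'name': 'The Octocat',
  'html_url': 'https://github.com/octocat',
  'url': 'https://api.github.com/users/octocat',  # dropped by the domain filter
  'blog': 'https://example.com',
})
# actor['url'] == 'https://github.com/octocat'; actor['urls'] also includes
# the blog, while the api.github.com URL is filtered out.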
def praw_to_object(self, thing, type):
  """Converts a praw object to an object.

  Currently only returns public content.

  Note that this will make external API calls to lazily load some attrs.

  Args:
    thing: a praw object, Submission or Comment
    type: string to denote whether to get submission or comment content

  Returns:
    an ActivityStreams object dict, ready to be JSON-encoded
  """
  id = getattr(thing, 'id', None)
  if not id:
    return {}

  published = util.maybe_timestamp_to_iso8601(getattr(thing, 'created_utc', None))

  obj = {
    'id': self.tag_uri(id),
    'published': published,
    'to': [{
      'objectType': 'group',
      'alias': '@public',
    }],
  }

  user = getattr(thing, 'author', None)
  if user:
    obj['author'] = self.praw_to_actor(user)
    username = obj['author'].get('username')

  obj['url'] = self.BASE_URL + thing.permalink

  if type == 'submission':
    obj['content'] = getattr(thing, 'title', None)
    obj['objectType'] = 'note'
    obj['tags'] = [{
      'objectType': 'article',
      'url': t,
      'displayName': t,
    } for t in util.extract_links(getattr(thing, 'selftext', None))]
  elif type == 'comment':
    obj['content'] = getattr(thing, 'body_html', None)
    obj['objectType'] = 'comment'
    reply_to = thing.parent()
    if reply_to:
      obj['inReplyTo'] = [{
        'id': self.tag_uri(getattr(reply_to, 'id', None)),
        'url': self.BASE_URL + getattr(reply_to, 'permalink', None),
      }]

  return self.postprocess_object(obj)
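
# Usage sketch (illustrative). praw.Reddit's keyword arguments below are the
# real ones; the credentials, the post id, and the `source` converter
# instance are placeholders.
import praw

reddit = praw.Reddit(client_id='...', client_secret='...',
                     user_agent='granary example')
submission = reddit.submission(id='39zje0')  # hypothetical post id
obj = source.praw_to_object(submission, 'submission')
# accessing submission.title etc. inside the conversion is what triggers
# praw's lazy network fetch that the docstring warns about.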
def praw_to_object(self, thing, type):
  """Converts a praw object to an object.

  Currently only returns public content.

  Args:
    thing: a praw object, Submission or Comment
    type: string to denote whether to get submission or comment content

  Returns:
    an ActivityStreams object dict, ready to be JSON-encoded
  """
  id = thing.id
  if not id:
    return {}

  published = util.maybe_timestamp_to_iso8601(thing.created_utc)

  obj = {
    'id': self.tag_uri(id),
    'published': published,
    'to': [{
      'objectType': 'group',
      'alias': '@public',
    }],
  }

  user = thing.author
  if user:
    obj['author'] = self.praw_to_actor(user)
    username = obj['author'].get('username')

  obj['url'] = self.BASE_URL + thing.permalink

  if type == 'submission':
    obj['content'] = thing.title
    obj['objectType'] = 'note'
    obj['tags'] = [{
      'objectType': 'article',
      'url': t,
      'displayName': t,
    } for t in util.extract_links(thing.selftext)]
  elif type == 'comment':
    obj['content'] = thing.body
    obj['objectType'] = 'comment'
    reply_to = thing.parent()
    if reply_to:
      obj['inReplyTo'] = [{
        'id': self.tag_uri(reply_to.id),
        'url': self.BASE_URL + reply_to.permalink,
      }]

  return self.postprocess_object(obj)
def user_to_actor(self, user):
  """Converts a dict user to an actor.

  Args:
    user: JSON user

  Returns:
    an ActivityStreams actor dict, ready to be JSON-encoded
  """
  username = user.get('name')
  if not username:
    return {}

  # trying my best to grab all the urls from the profile description
  description = ''
  subreddit = user.get('subreddit')
  if subreddit:
    user_url = self.BASE_URL + subreddit.get('url')
    urls = [user_url]
    description = subreddit.get('public_description')
    profile_urls = util.extract_links(description)
    urls += util.trim_nulls(profile_urls)
  else:
    urls = [self.BASE_URL + '/user/' + username]

  image = user.get('icon_img')

  return util.trim_nulls({
    'objectType': 'person',
    'displayName': username,
    'image': {'url': image},
    'id': self.tag_uri(username),
    # numeric_id is our own custom field that always has the source's numeric
    # user id, if available.
    'numeric_id': user.get('id'),
    'published': util.maybe_timestamp_to_iso8601(user.get('created_utc')),
    'url': urls[0],
    'urls': [{'value': u} for u in urls] if len(urls) > 1 else None,
    'username': username,
    'description': description,
  })
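
# Illustration (invented values): a redditor whose profile is exposed as a
# "subreddit" object. `source` stands for the converter instance (assumed).
user = {
  'name': 'alice',
  'id': 'abc123',
  'created_utc': 1576879425,
  'icon_img': 'https://styles.redditmedia.com/alice.png',
  'subreddit': {
    'url': '/user/alice/',
    'public_description': 'I write at https://alice.example.com',
  },
}
actor = source.user_to_actor(user)
# actor['url'] is the reddit profile URL; actor['urls'] also carries the
# https://alice.example.com link pulled out of the description.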
def user_to_actor(self, user):
  """Converts a user to an actor.

  Args:
    user: JSON object from the Instagram API

  Returns:
    an ActivityStreams actor dict, ready to be JSON-encoded
  """
  if not user:
    return {}

  id = user.get('id')
  username = user.get('username')
  actor = {
    'id': self.tag_uri(id or username),
    'username': username,
    'objectType': 'person',
  }
  if not id or not username:
    return actor

  urls = [self.user_url(username)] + sum(
    (util.extract_links(user.get(field)) for field in ('website', 'bio')), [])
  actor.update({
    'url': urls[0],
    'urls': [{'value': u} for u in urls] if len(urls) > 1 else None,
  })

  private = user.get('is_private')
  if private is not None:
    actor['to'] = [{
      'objectType': 'group',
      'alias': '@private' if private else '@public',
    }]

  pic_url = user.get('profile_picture') or user.get('profile_pic_url') or ''
  actor.update({
    'displayName': user.get('full_name') or username,
    'image': {'url': pic_url.replace('\/', '/')},
    'description': user.get('bio'),
  })

  return util.trim_nulls(actor)
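
# Illustration (invented values) of the privacy mapping and URL ordering.
# `ig` stands for the converter instance (assumed).
actor = ig.user_to_actor({
  'id': '420',
  'username': 'alice',
  'full_name': 'Alice',
  'bio': 'photos. more at https://alice.example.com',
  'is_private': False,
})
# actor['to'] == [{'objectType': 'group', 'alias': '@public'}], and
# actor['urls'] lists the profile URL first, then the bio link.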
def user_to_actor(self, account):
  """Converts a Mastodon account to an AS1 actor.

  Args:
    account: dict, Mastodon account

  Returns:
    dict, AS1 actor
  """
  domain = self.DOMAIN
  username = account.get('username')

  # parse acct. it's just username for local accounts but fully qualified
  # address for remote accounts, eg [email protected].
  acct = account.get('acct') or ''
  split = acct.split('@')
  if len(split) in (2, 3):
    acct_username, acct_domain = split[-2:]
    if acct_domain:
      domain = acct_domain
    if not username:
      username = acct_username
    elif acct_username and username != acct_username:
      raise ValueError('username %s and acct %s conflict!' % (username, acct))

  if not username:
    return {}

  url = account.get('url')
  # mastodon's 'Web site' fields are HTML links, so extract their URLs
  web_sites = sum((util.extract_links(f.get('value'))
                   for f in (account.get('fields') or [])), [])

  # account.created_at is string ISO8601 in Mastodon, int timestamp in Pixelfed
  published = account.get('created_at')
  if util.is_int(published) or util.is_float(published):
    published = util.maybe_timestamp_to_iso8601(published)

  return util.trim_nulls({
    'objectType': 'person',
    'id': util.tag_uri(domain, username),
    'numeric_id': account.get('id'),
    'username': username,
    'displayName': account.get('display_name') or acct or username,
    'url': url,
    'urls': [{'value': u} for u in [url] + web_sites],
    'image': {'url': account.get('avatar')},
    'published': published,
    'description': account.get('note'),
  })
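
# Illustration (invented values): a remote account, where acct carries the
# domain that ends up in the tag URI. `m` is the converter instance (assumed).
actor = m.user_to_actor({
  'id': '123',
  'username': 'alice',
  'acct': 'alice@remote.example',
  'display_name': 'Alice',
  'url': 'https://remote.example/@alice',
  'created_at': '2017-04-03T00:00:00Z',
  'fields': [{'name': 'Web site',
              'value': '<a href="https://alice.example.com">alice.example.com</a>'}],
})
# actor['id'] is tagged with remote.example (from acct), not m.DOMAIN, and
# actor['urls'] holds the profile URL plus the extracted web site link.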
def user_to_actor(cls, user):
  """Converts a GitHub user to an ActivityStreams actor.

  Handles both v4 GraphQL and v3 REST API user objects.

  https://developer.github.com/v4/object/user/
  https://developer.github.com/v3/users/

  Args:
    user: dict, decoded JSON GitHub user

  Returns:
    an ActivityStreams actor dict, ready to be JSON-encoded
  """
  actor = cls._to_object(user)
  if not actor:
    return actor

  username = user.get('login')
  desc = user.get('bio') or user.get('description')
  actor.update({
    # TODO: orgs, bots
    'objectType': 'person',
    'displayName': user.get('name') or username,
    'username': username,
    'email': user.get('email'),
    'description': desc,
    'summary': desc,
    'image': {'url': user.get('avatarUrl') or user.get('avatar_url') or
                     user.get('url')},
    'location': {'displayName': user.get('location')},
  })

  # extract web site links. extract_links uniquifies and preserves order
  urls = sum((util.extract_links(user.get(field)) for field in (
    'html_url',    # REST
    'url',         # both
    'websiteUrl',  # GraphQL
    'blog',        # REST
    'bio',         # both
  )), [])
  urls = [u for u in urls if util.domain_from_link(u) != 'api.github.com']
  if urls:
    actor['url'] = urls[0]
    if len(urls) > 1:
      actor['urls'] = [{'value': u} for u in urls]

  return cls.postprocess_object(actor)
def original_post_discovery(activity):
  """Discovers original post links and stores them as tags, in place.

  This is a variation on http://indiewebcamp.com/original-post-discovery .
  It differs in that it finds multiple candidate links instead of one, and it
  doesn't bother looking for MF2 (etc) markup because the silos don't let you
  input it.

  Args:
    activity: activity dict
  """
  obj = activity.get('object') or activity
  content = obj.get('content', '').strip()

  def article_urls(field):
    return set(util.trim_nulls(a.get('url') for a in obj.get(field, [])
                               if a.get('objectType') == 'article'))

  attachments = article_urls('attachments')
  tags = article_urls('tags')
  urls = attachments | set(util.extract_links(content))

  # Permashortcitations are short references to canonical copies of a given
  # (usually syndicated) post, of the form (DOMAIN PATH). Details:
  # http://indiewebcamp.com/permashortcitation
  #
  # We consider them an explicit original post link, so we store them in
  # upstreamDuplicates to signal that.
  # http://activitystrea.ms/specs/json/1.0/#id-comparison
  for match in Source._PERMASHORTCITATION_RE.finditer(content):
    http = match.expand(r'http://\1/\2')
    https = match.expand(r'https://\1/\2')
    uds = obj.setdefault('upstreamDuplicates', [])
    if (http not in uds and https not in uds
        # heuristic: ellipsized URLs are probably incomplete, so omit them.
        and not http.endswith('...') and not http.endswith(u'…')):
      uds.append(http)

  obj.setdefault('tags', []).extend(
    {'objectType': 'article', 'url': u} for u in urls
    # same heuristic from above
    if not u.endswith('...') and not u.endswith(u'…'))

  return activity
def original_post_discovery(activity):
  """Discovers original post links and stores them as tags, in place.

  This is a variation on http://indiewebcamp.com/original-post-discovery .
  It differs in that it finds multiple candidate links instead of one, and it
  doesn't bother looking for MF2 (etc) markup because the silos don't let you
  input it.

  Args:
    activity: activity dict
  """
  obj = activity.get('object') or activity
  content = obj.get('content', '').strip()

  def article_urls(field):
    return set(util.trim_nulls(a.get('url') for a in obj.get(field, [])
                               if a.get('objectType') == 'article'))

  attachments = article_urls('attachments')
  tags = article_urls('tags')
  urls = attachments | set(util.extract_links(content))

  # Permashortcitations are short references to canonical copies of a given
  # (usually syndicated) post, of the form (DOMAIN PATH). Details:
  # http://indiewebcamp.com/permashortcitation
  #
  # We consider them an explicit original post link, so we store them in
  # upstreamDuplicates to signal that.
  # http://activitystrea.ms/specs/json/1.0/#id-comparison
  for match in Source._PERMASHORTCITATION_RE.finditer(content):
    http = match.expand(r'http://\1/\2')
    https = match.expand(r'https://\1/\2')
    uds = obj.setdefault('upstreamDuplicates', [])
    if http not in uds and https not in uds:
      uds.append(http)

  obj.setdefault('tags', []).extend(
    {'objectType': 'article', 'url': u} for u in urls
    # heuristic: ellipsized URLs are probably incomplete, so omit them.
    if not u.endswith('...') and not u.endswith(u'…'))

  return activity
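
# Illustration (invented values) of where each kind of link ends up in this
# revision, assuming the permashortcitation regex matches the form shown.
activity = {'object': {
  'content': 'new post! https://alice.example.com/full (alice.example.com p123)',
}}
original_post_discovery(activity)
obj = activity['object']
# the explicit link becomes an article tag:
#   obj['tags'] -> [{'objectType': 'article', 'url': 'https://alice.example.com/full'}]
# the (DOMAIN PATH) permashortcitation is treated as an explicit original post
# reference and lands in obj['upstreamDuplicates']:
#   ['http://alice.example.com/p123']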
def user_to_actor(self, user):
  """Converts a user to an actor.

  Args:
    user: JSON object from the Instagram API

  Returns:
    an ActivityStreams actor dict, ready to be JSON-encoded
  """
  if not user:
    return {}

  id = user.get('id')
  username = user.get('username')
  actor = {
    'id': self.tag_uri(id or username),
    'username': username,
    'objectType': 'person',
  }
  if not id or not username:
    return actor

  urls = sum((util.extract_links(user.get(field))
              for field in ('website', 'bio')), [])
  if urls:
    actor['url'] = urls[0]
    if len(urls) > 1:
      actor['urls'] = [{'value': u} for u in urls]
  else:
    actor['url'] = self.user_url(username)

  actor.update({
    'displayName': user.get('full_name') or username,
    'image': {'url': user.get('profile_picture')},
    'description': user.get('bio'),
  })

  return util.trim_nulls(actor)
def user_to_actor(self, user):
  """Converts a user to an actor.

  Args:
    user: JSON object from the Instagram API

  Returns:
    an ActivityStreams actor dict, ready to be JSON-encoded
  """
  if not user:
    return {}

  id = user.get('id')
  username = user.get('username')
  actor = {
    'id': self.tag_uri(id or username),
    'username': username,
  }
  if not id or not username:
    return actor

  urls = sum((util.extract_links(user.get(field))
              for field in ('website', 'bio')), [])
  if urls:
    actor['url'] = urls[0]
    if len(urls) > 1:
      actor['urls'] = [{'value': u} for u in urls]
  else:
    actor['url'] = self.user_url(username)

  actor.update({
    'objectType': 'person',
    'displayName': user.get('full_name') or username,
    'image': {'url': user.get('profile_picture')},
    'description': user.get('bio'),
  })

  return util.trim_nulls(actor)
def original_post_discovery(activity, domains=None, cache=None,
                            include_redirect_sources=True, **kwargs):
  """Discovers original post links.

  This is a variation on http://indiewebcamp.com/original-post-discovery .
  It differs in that it finds multiple candidate links instead of one, and it
  doesn't bother looking for MF2 (etc) markup because the silos don't let you
  input it. More background:
  https://github.com/snarfed/bridgy/issues/51#issuecomment-136018857

  Original post candidates come from the upstreamDuplicates, attachments, and
  tags fields, as well as links and permashortlinks/permashortcitations in the
  text content.

  Args:
    activity: activity dict
    domains: optional sequence of domains. If provided, only links to these
      domains will be considered original and stored in upstreamDuplicates.
      (Permashortcitations are exempt.)
    cache: optional, a cache object for storing resolved URL redirects. Passed
      to follow_redirects().
    include_redirect_sources: boolean, whether to include URLs that redirect as
      well as their final destination URLs
    kwargs: passed to requests.head() when following redirects

  Returns:
    ([string original post URLs], [string mention URLs]) tuple
  """
  obj = activity.get('object') or activity
  content = obj.get('content', '').strip()

  # find all candidate URLs
  tags = [t.get('url') for t in obj.get('attachments', []) + obj.get('tags', [])
          if t.get('objectType') in ('article', 'mention', None)]
  candidates = tags + util.extract_links(content) + obj.get('upstreamDuplicates', [])

  # Permashortcitations (http://indiewebcamp.com/permashortcitation) are short
  # references to canonical copies of a given (usually syndicated) post, of
  # the form (DOMAIN PATH). We consider them an explicit original post link.
  candidates += [match.expand(r'http://\1/\2') for match in
                 Source._PERMASHORTCITATION_RE.finditer(content)]

  candidates = set(filter(None,
    (util.clean_url(url) for url in candidates
     # heuristic: ellipsized URLs are probably incomplete, so omit them.
     if url and not url.endswith('...') and not url.endswith(u'…'))))

  # check for redirects and add their final URLs
  redirects = {}  # maps final URL to original URL for redirects
  for url in list(candidates):
    resolved = util.follow_redirects(url, cache=cache, **kwargs)
    if (resolved.url != url and
        resolved.headers.get('content-type', '').startswith('text/html')):
      redirects[resolved.url] = url
      candidates.add(resolved.url)

  # use domains to determine which URLs are original post links vs mentions
  originals = set()
  mentions = set()
  for url in util.dedupe_urls(candidates):
    if url in redirects.values():
      # this is a redirected original URL. postpone and handle it when we hit
      # its final URL so that we know the final domain.
      continue
    domain = util.domain_from_link(url)
    which = (originals if not domains or util.domain_or_parent_in(domain, domains)
             else mentions)
    which.add(url)
    redirected_from = redirects.get(url)
    if redirected_from and include_redirect_sources:
      which.add(redirected_from)

  logging.info('Original post discovery found original posts %s, mentions %s',
               originals, mentions)
  return originals, mentions
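
# Illustration (invented values). Note that each candidate URL is resolved
# via util.follow_redirects, so this version hits the network unless cached.
activity = {'object': {
  'content': 'wrote this up at https://alice.example.com/post, '
             'cc https://other.example/thing',
}}
originals, mentions = original_post_discovery(
  activity, domains=['alice.example.com'])
# links whose domain is in (or under) `domains` are originals, the rest are
# mentions:
#   originals -> {'https://alice.example.com/post'}
#   mentions  -> {'https://other.example/thing'}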
def _create(self, obj, preview=None, include_link=False):
  """Creates or previews creating a tweet, reply tweet, retweet, or favorite.

  https://dev.twitter.com/docs/api/1.1/post/statuses/update
  https://dev.twitter.com/docs/api/1.1/post/statuses/retweet/:id
  https://dev.twitter.com/docs/api/1.1/post/favorites/create

  Args:
    obj: ActivityStreams object
    preview: boolean
    include_link: boolean

  Returns:
    a CreationResult

    If preview is True, the content will be a unicode string HTML snippet.
    If False, it will be a dict with 'id' and 'url' keys for the newly
    created Twitter object.
  """
  # TODO: validation, error handling
  assert preview in (False, True)
  type = obj.get('objectType')
  verb = obj.get('verb')
  base_id, base_url = self.base_object(obj)

  content = self._content_for_create(obj)
  if not content:
    if type == 'activity':
      content = verb
    else:
      return source.creation_result(
        abort=False,  # keep looking for things to publish,
        error_plain='No content text found.',
        error_html='No content text found.')

  is_reply = type == 'comment' or 'inReplyTo' in obj
  if is_reply and base_url:
    # extract username from in-reply-to URL so we can @-mention it, if it's
    # not already @-mentioned, since Twitter requires that to make our new
    # tweet a reply.
    # https://dev.twitter.com/docs/api/1.1/post/statuses/update#api-param-in_reply_to_status_id
    # TODO: this doesn't handle an in-reply-to username that's a prefix of
    # another username already mentioned, e.g. in reply to @foo when content
    # includes @foobar.
    parsed = urlparse.urlparse(base_url)
    parts = parsed.path.split('/')
    if len(parts) < 2 or not parts[1]:
      raise ValueError('Could not determine author of in-reply-to URL %s' % base_url)
    mention = '@' + parts[1]
    if mention not in content:
      content = mention + ' ' + content

    # the embed URL in the preview can't start with mobile. or www., so just
    # hard-code it to twitter.com. index #1 is netloc.
    parsed = list(parsed)
    parsed[1] = self.DOMAIN
    base_url = urlparse.urlunparse(parsed)

  # need a base_url with the tweet id for the embed HTML below. do this
  # *after* checking the real base_url for in-reply-to author username.
  if base_id and not base_url:
    base_url = 'https://twitter.com/-/statuses/' + base_id

  # truncate and ellipsize content if it's over the character count. URLs will
  # be t.co-wrapped, so include that when counting.
  links = set(util.extract_links(content))
  max = MAX_TWEET_LENGTH
  include_url = obj.get('url') if include_link else None
  if include_url:
    max -= TCO_LENGTH + 3

  length = 0
  tokens = content.split()
  for i, token in enumerate(tokens):
    # extract_links() strips trailing slashes from URLs, so do the same here
    # so we can compare.
    as_url = token[:-1] if token.endswith('/') else token
    length += (TCO_LENGTH if as_url in links else len(token))
    if i > 0:
      length += 1  # space between tokens
    if length > max:
      break
  else:
    i = len(tokens)

  # normalize whitespace
  # TODO: user opt in to preserve original whitespace (newlines, etc)
  content = ' '.join(tokens[:i])
  if i < len(tokens):
    content += u'…'
  if include_url:
    content += ' (%s)' % include_url
  # linkify defaults to Twitter's link shortening behavior
  preview_content = util.linkify(content, pretty=True)

  if is_reply:
    if not base_url:
      return source.creation_result(
        abort=True,
        error_plain='Could not find a tweet to reply to.',
        error_html='Could not find a tweet to <a href="http://indiewebcamp.com/reply">reply to</a>. '
        'Check that your post has an <a href="http://indiewebcamp.com/comment">in-reply-to</a> '
        'link to a Twitter URL or to an original post that publishes a '
        '<a href="http://indiewebcamp.com/rel-syndication">rel-syndication</a> link to Twitter.')
    if preview:
      return source.creation_result(
        'will <span class="verb">@-reply</span>:<br /><br />\n<em>%s</em>\n'
        '<br /><br />...to <a href="%s">this tweet</a>:\n%s' %
        (preview_content, base_url, EMBED_TWEET % base_url))
    else:
      content = unicode(content).encode('utf-8')
      data = urllib.urlencode({'status': content,
                               'in_reply_to_status_id': base_id})
      resp = json.loads(self.urlopen(API_POST_TWEET_URL, data=data).read())
      resp['type'] = 'comment'

  elif type == 'activity' and verb == 'like':
    if not base_url:
      return source.creation_result(
        abort=True,
        error_plain='Could not find a tweet to like.',
        error_html='Could not find a tweet to <a href="http://indiewebcamp.com/favorite">favorite</a>. '
        'Check that your post has a like-of link to a Twitter URL or to an original post that publishes a '
        '<a href="http://indiewebcamp.com/rel-syndication">rel-syndication</a> link to Twitter.')
    if preview:
      return source.creation_result(
        'will <span class="verb">favorite</span> <a href="%s">this tweet</a>:\n%s' %
        (base_url, EMBED_TWEET % base_url))
    else:
      data = urllib.urlencode({'id': base_id})
      self.urlopen(API_POST_FAVORITE_URL, data=data).read()
      resp = {'type': 'like'}

  elif type == 'activity' and verb == 'share':
    if not base_url:
      return source.creation_result(
        abort=True,
        error_plain='Could not find a tweet to retweet.',
        error_html='Could not find a tweet to <a href="http://indiewebcamp.com/repost">retweet</a>. '
        'Check that your post has a repost-of link to a Twitter URL or to an original post that publishes a '
        '<a href="http://indiewebcamp.com/rel-syndication">rel-syndication</a> link to Twitter.')
    if preview:
      return source.creation_result(
        'will <span class="verb">retweet</span> <a href="%s">this tweet</a>:\n%s' %
        (base_url, EMBED_TWEET % base_url))
    else:
      data = urllib.urlencode({'id': base_id})
      resp = json.loads(self.urlopen(API_POST_RETWEET_URL % base_id,
                                     data=data).read())
      resp['type'] = 'repost'

  elif type in ('note', 'article') and obj.get('image'):
    image_url = obj.get('image').get('url')
    if preview:
      return source.creation_result(
        'will <span class="verb">tweet</span> with photo:<br /><br />'
        '<em>%s</em><br /><img src="%s"/><br />' % (preview_content, image_url))
    else:
      content = unicode(content).encode('utf-8')
      data = {'status': content}
      files = {'media[]': urllib2.urlopen(image_url)}
      headers = twitter_auth.auth_header(API_POST_MEDIA_URL,
                                         self.access_token_key,
                                         self.access_token_secret, 'POST')
      resp = json.loads(requests.post(API_POST_MEDIA_URL, data=data,
                                      files=files, headers=headers,
                                      timeout=HTTP_TIMEOUT).text)
      resp['type'] = 'post'

  elif type in ('note', 'article'):
    if preview:
      return source.creation_result(
        'will <span class="verb">tweet</span>:<br /><br />'
        '<em>%s</em><br />' % preview_content)
    else:
      content = unicode(content).encode('utf-8')
      data = urllib.urlencode({'status': content})
      resp = json.loads(self.urlopen(API_POST_TWEET_URL, data=data).read())
      resp['type'] = 'post'

  elif (verb and verb.startswith('rsvp-')) or verb == 'invite':
    return source.creation_result(
      abort=True,
      error_plain='Cannot publish RSVPs to Twitter.',
      error_html='This looks like an <a href="http://indiewebcamp.com/rsvp">RSVP</a>. '
      'Publishing events or RSVPs to Twitter is not supported.')

  else:
    return source.creation_result(
      abort=False,
      error_plain='Cannot publish type=%s, verb=%s to Twitter' % (type, verb),
      error_html='Cannot publish type=%s, verb=%s to Twitter' % (type, verb))

  id_str = resp.get('id_str')
  if id_str:
    resp.update({'id': id_str, 'url': self.tweet_url(resp)})
  elif 'url' not in resp:
    resp['url'] = base_url

  return source.creation_result(resp)
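
# Usage sketch (illustrative). `tw` stands for an authenticated instance of
# the class defining _create above; calling the private method directly is
# just for demonstration, since public wrappers normally front it.
reply = {
  'objectType': 'comment',
  'content': 'great post!',
  'inReplyTo': [{'url': 'https://twitter.com/alice/status/123'}],
}
preview = tw._create(reply, preview=True)   # HTML: "will @-reply ... @alice"
result = tw._create(reply, preview=False)   # posts '@alice great post!' and
                                            # returns a CreationResult whose
                                            # content carries 'id' and 'url'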
def original_post_discovery(activity, domains=None, cache=None,
                            include_redirect_sources=True, **kwargs):
    """Discovers original post links.

    This is a variation on http://indiewebcamp.com/original-post-discovery .
    It differs in that it finds multiple candidate links instead of one, and it
    doesn't bother looking for MF2 (etc) markup because the silos don't let you
    input it. More background:
    https://github.com/snarfed/bridgy/issues/51#issuecomment-136018857

    Original post candidates come from the upstreamDuplicates, attachments, and
    tags fields, as well as links and permashortlinks/permashortcitations in
    the text content.

    Args:
      activity: activity dict
      domains: optional sequence of domains. If provided, only links to these
        domains will be considered original and stored in upstreamDuplicates.
        (Permashortcitations are exempt.)
      cache: optional, a cache object for storing resolved URL redirects.
        Passed to follow_redirects().
      include_redirect_sources: boolean, whether to include URLs that redirect
        as well as their final destination URLs
      kwargs: passed to requests.head() when following redirects

    Returns:
      ([string original post URLs], [string mention URLs]) tuple
    """
    obj = activity.get("object") or activity
    content = obj.get("content", "").strip()

    # find all candidate URLs
    tags = [
        t.get("url")
        for t in obj.get("attachments", []) + obj.get("tags", [])
        if t.get("objectType") in ("article", "mention", None)
    ]
    candidates = tags + util.extract_links(content) + obj.get("upstreamDuplicates", [])

    # Permashortcitations (http://indiewebcamp.com/permashortcitation) are
    # short references to canonical copies of a given (usually syndicated)
    # post, of the form (DOMAIN PATH). We consider them an explicit original
    # post link.
    candidates += [
        match.expand(r"http://\1/\2")
        for match in Source._PERMASHORTCITATION_RE.finditer(content)
    ]

    candidates = set(
        filter(
            None,
            (
                util.clean_url(url)
                for url in candidates
                # heuristic: ellipsized URLs are probably incomplete, so omit them.
                if url and not url.endswith("...") and not url.endswith(u"…")
            ),
        )
    )

    # check for redirects and add their final URLs
    redirects = {}  # maps final URL to original URL for redirects
    for url in list(candidates):
        resolved = follow_redirects(url, cache=cache, **kwargs)
        if resolved.url != url and resolved.headers.get("content-type", "").startswith("text/html"):
            redirects[resolved.url] = url
            candidates.add(resolved.url)

    # use domains to determine which URLs are original post links vs mentions
    originals = set()
    mentions = set()
    for url in util.dedupe_urls(candidates):
        if url in redirects.values():
            # this is a redirected original URL. postpone and handle it when we
            # hit its final URL so that we know the final domain.
            continue
        which = originals if not domains or util.domain_from_link(url) in domains else mentions
        which.add(url)
        redirected_from = redirects.get(url)
        if redirected_from and include_redirect_sources:
            which.add(redirected_from)

    logging.info("Original post discovery found original posts %s, mentions %s", originals, mentions)
    return originals, mentions
def _create(self, obj, preview=None, include_link=False):
  """Creates or previews creating a tweet, reply tweet, retweet, or favorite.

  https://dev.twitter.com/docs/api/1.1/post/statuses/update
  https://dev.twitter.com/docs/api/1.1/post/statuses/retweet/:id
  https://dev.twitter.com/docs/api/1.1/post/favorites/create

  Args:
    obj: ActivityStreams object
    preview: boolean
    include_link: boolean

  Returns:
    If preview is True, a string HTML snippet. If False, a dict with 'id' and
    'url' keys for the newly created Twitter object.
  """
  # TODO: validation, error handling
  assert preview in (False, True)
  type = obj.get('objectType')
  verb = obj.get('verb')
  base_id, base_url = self.base_object(obj)
  content = obj.get('content', '').strip()

  is_reply = (type == 'comment' or 'inReplyTo' in obj) and base_url
  if is_reply:
    # extract username from in-reply-to URL so we can @-mention it, if it's
    # not already @-mentioned, since Twitter requires that to make our new
    # tweet a reply.
    # https://dev.twitter.com/docs/api/1.1/post/statuses/update#api-param-in_reply_to_status_id
    # TODO: this doesn't handle an in-reply-to username that's a prefix of
    # another username already mentioned, e.g. in reply to @foo when content
    # includes @foobar.
    parsed = urlparse.urlparse(base_url)
    parts = parsed.path.split('/')
    if len(parts) < 2 or not parts[1]:
      raise ValueError('Could not determine author of in-reply-to URL %s' % base_url)
    mention = '@' + parts[1]
    if mention not in content:
      content = mention + ' ' + content

    # the embed URL in the preview can't start with mobile. or www., so just
    # hard-code it to twitter.com. index #1 is netloc.
    parsed = list(parsed)
    parsed[1] = self.DOMAIN
    base_url = urlparse.urlunparse(parsed)

  # need a base_url with the tweet id for the embed HTML below. do this
  # *after* checking the real base_url for in-reply-to author username.
  if base_id and not base_url:
    base_url = 'https://twitter.com/-/statuses/' + base_id

  # truncate and ellipsize content if it's over the character count. URLs will
  # be t.co-wrapped, so include that when counting.
  links = set(util.extract_links(content))
  max = MAX_TWEET_LENGTH
  include_url = obj.get('url') if include_link else None
  if include_url:
    max -= TCO_LENGTH + 3

  length = 0
  tokens = content.split()
  for i, token in enumerate(tokens):
    # extract_links() strips trailing slashes from URLs, so do the same here
    # so we can compare.
    as_url = token[:-1] if token.endswith('/') else token
    length += (TCO_LENGTH if as_url in links else len(token))
    if i > 0:
      length += 1  # space between tokens
    if length > max:
      break
  else:
    i = len(tokens)

  # normalize whitespace
  # TODO: user opt in to preserve original whitespace (newlines, etc)
  content = ' '.join(tokens[:i])
  if i < len(tokens):
    content += u'…'
  if include_url:
    content += ' (%s)' % include_url
  content = unicode(content).encode('utf-8')
  # linkify defaults to Twitter's link shortening behavior
  preview_content = util.linkify(content, pretty=True)

  if is_reply:
    if preview:
      return ('will <span class="verb">@-reply</span>:<br /><br />\n<em>%s</em>\n'
              '<br /><br />...to <a href="%s">this tweet</a>:\n%s' %
              (preview_content, base_url, EMBED_TWEET % base_url))
    else:
      data = urllib.urlencode({'status': content,
                               'in_reply_to_status_id': base_id})
      resp = json.loads(self.urlopen(API_POST_TWEET_URL, data=data).read())
      resp['type'] = 'comment'
  elif type == 'activity' and verb == 'like':
    if preview:
      return ('will <span class="verb">favorite</span> <a href="%s">this tweet</a>:\n%s' %
              (base_url, EMBED_TWEET % base_url))
    else:
      data = urllib.urlencode({'id': base_id})
      self.urlopen(API_POST_FAVORITE_URL, data=data).read()
      resp = {'type': 'like'}
  elif type == 'activity' and verb == 'share':
    if preview:
      return ('will <span class="verb">retweet</span> <a href="%s">this tweet</a>:\n%s' %
              (base_url, EMBED_TWEET % base_url))
    else:
      data = urllib.urlencode({'id': base_id})
      resp = json.loads(self.urlopen(API_POST_RETWEET_URL % base_id,
                                     data=data).read())
      resp['type'] = 'repost'
  elif type in ('note', 'article', 'comment'):
    if preview:
      return ('will <span class="verb">tweet</span>:<br /><br />'
              '<em>%s</em><br />' % preview_content)
    else:
      data = urllib.urlencode({'status': content})
      resp = json.loads(self.urlopen(API_POST_TWEET_URL, data=data).read())
      resp['type'] = 'post'
  else:
    raise NotImplementedError()

  id_str = resp.get('id_str')
  if id_str:
    resp.update({'id': id_str, 'url': self.tweet_url(resp)})
  elif 'url' not in resp:
    resp['url'] = base_url

  return resp