def fetch_mf2(self, url):
  """Fetches a URL and extracts its mf2 data.

  Side effects: sets self.entity.html on success, calls self.error() on
  errors.

  Args:
    url: string

  Returns:
    (requests.Response, mf2 data dict) on success, None on failure
  """
  try:
    fetched = util.requests_get(url)
    fetched.raise_for_status()
  # NOTE(review): BaseException also catches KeyboardInterrupt/SystemExit;
  # presumably deliberate here so any fetch failure becomes a user-visible
  # error — confirm before narrowing.
  except BaseException as e:
    util.interpret_http_exception(e)  # log exception
    return self.error('Could not fetch source URL %s' % url)

  if self.entity:
    self.entity.html = fetched.text

  # .text is decoded unicode string, .content is raw bytes. if the HTTP
  # headers didn't specify a charset, pass raw bytes to BeautifulSoup so it
  # can look for a <meta> tag with a charset and decode.
  text = (fetched.text if 'charset' in fetched.headers.get('content-type', '')
          else fetched.content)
  doc = BeautifulSoup(text)

  # special case tumblr's markup: div#content > div.post > div.copy
  # convert to mf2.
  contents = doc.find_all(id='content')
  if contents:
    post = contents[0].find_next(class_='post')
    if post:
      post['class'] = 'h-entry'
      copy = post.find_next(class_='copy')
      if copy:
        copy['class'] = 'e-content'
      photo = post.find_next(class_='photo-wrapper')
      if photo:
        img = photo.find_next('img')
        if img:
          img['class'] = 'u-photo'
      # re-serialize just the rewritten post so the parser below only sees it
      doc = unicode(post)

  # parse microformats, convert to ActivityStreams
  data = parser.Parser(doc=doc, url=fetched.url).to_dict()
  logging.debug('Parsed microformats2: %s', json.dumps(data, indent=2))
  items = data.get('items', [])
  if not items or not items[0]:
    return self.error('No microformats2 data found in ' + fetched.url,
                      data=data, html="""
No <a href="http://microformats.org/get-started">microformats</a> or
<a href="http://microformats.org/wiki/microformats2">microformats2</a> found in
<a href="%s">%s</a>! See <a href="http://indiewebify.me/">indiewebify.me</a>
for details (skip to level 2, <em>Publishing on the IndieWeb</em>).
""" % (fetched.url, util.pretty_link(fetched.url)))

  return fetched, data
def get(self, type, source_short_name, string_id, *ids):
  """Handler entry point: looks up the source, validates the request, fetches
  the item, and passes identifiable API HTTP errors through to the client.

  Args:
    type: string, item type (used only for the log label here)
    source_short_name: string, key into SOURCES
    string_id: string, source entity id
    *ids: strings, item id path components, validated against self.VALID_ID
  """
  source_cls = SOURCES.get(source_short_name)
  if not source_cls:
    self.abort(400, "Source type '%s' not found. Known sources: %s" %
               (source_short_name, SOURCES))

  self.source = source_cls.get_by_id(string_id)
  if not self.source:
    self.abort(400, "%s %s not found" % (source_short_name, string_id))

  format = self.request.get("format", "html")
  if format not in ("html", "json"):
    self.abort(400, "Invalid format %s, expected html or json" % format)

  for id in ids:
    if not self.VALID_ID.match(id):
      self.abort(404, "Invalid id %s" % id)

  label = "%s:%s %s %s" % (source_short_name, string_id, type, ids)
  logging.info("Fetching %s", label)
  try:
    obj = self.get_item(*ids)
  except Exception, e:
    # pass through all API HTTP errors if we can identify them
    code, body = util.interpret_http_exception(e)
    if code:
      self.response.status_int = int(code)
      self.response.headers["Content-Type"] = "text/plain"
      self.response.write("%s error:\n%s" % (self.source.AS_CLASS.NAME, body))
      return
    else:
      raise
def get(self):
  """Cron handler: refreshes profile pictures for all Twitter sources.

  Looks up users in batches of TWITTER_USERS_PER_LOOKUP via the Twitter user
  lookup API, updates each matching source's picture, and invalidates the
  cached /users page if any picture changed.
  """
  sources = {source.key.id(): source for source in Twitter.query()}
  if not sources:
    return

  # just auth as me or the first user. TODO: use app-only auth instead.
  auther = sources.get('schnarfed') or list(sources.values())[0]
  usernames = list(sources.keys())
  users = []
  for i in range(0, len(usernames), TWITTER_USERS_PER_LOOKUP):
    username_batch = usernames[i:i + TWITTER_USERS_PER_LOOKUP]
    url = TWITTER_API_USER_LOOKUP % ','.join(username_batch)
    try:
      users += auther.gr_source.urlopen(url)
    except Exception as e:
      code, body = util.interpret_http_exception(e)
      if not (code == '404' and len(username_batch) == 1):
        # 404 for a single user means they deleted their account. otherwise...
        raise

  updated = False
  for user in users:
    source = sources.get(user['screen_name'])
    if source:
      new_actor = auther.gr_source.user_to_actor(user)
      # BUG FIX: accumulate with `or` instead of plain assignment — the
      # original overwrote an earlier True with a later user's False, so a
      # change early in the loop could fail to invalidate the cached page.
      updated = maybe_update_picture(source, new_actor, self) or updated

  if updated:
    util.CachedPage.invalidate('/users')
def get_activities_response(self, **kwargs):
  """Fetches activities, handling dead or expired Facebook tokens.

  On HTTP 401: if the token may still be alive and this is a user account,
  notifies the user to reauthenticate, then disables the source either way.
  On success, updates the resolved_object_ids cache and evaluates each
  activity's publicness.
  """
  type = self.auth_entity.get().type
  kwargs.setdefault('fetch_events', True)
  kwargs.setdefault('fetch_news', type == 'user')
  kwargs.setdefault('event_owner_id', self.key.id())

  # temporary workaround for http://github.com/snarfed/bridgy/issues/689
  if self.key.id() == '10207093222641618':
    kwargs['count'] = 38

  try:
    activities = super(FacebookPage, self).get_activities_response(**kwargs)
  except urllib2.HTTPError as e:
    code, body = util.interpret_http_exception(e)

    # use a function so any new exceptions (JSON decoding, missing keys) don't
    # clobber the original exception so we can re-raise it below.
    def dead_token():
      try:
        err = json.loads(body)['error']
        return (err.get('code') in DEAD_TOKEN_ERROR_CODES or
                err.get('error_subcode') in DEAD_TOKEN_ERROR_SUBCODES or
                err.get('message') in DEAD_TOKEN_ERROR_MESSAGES)
      except:
        logging.exception("Couldn't determine whether token is still valid")
        return False

    if code == '401':
      if not dead_token() and type == 'user':
        # ask the user to reauthenticate. if this API call fails, it will raise
        # urllib2.HTTPError instead of DisableSource, so that we don't disable
        # the source without notifying.
        #
        # TODO: for pages, fetch the owners/admins and notify them.
        self.gr_source.create_notification(
          self.key.id(),
          "Brid.gy's access to your account has expired. Click here to renew it now!",
          'https://brid.gy/facebook/start')
      raise models.DisableSource()

    raise

  # update the resolved_object_ids and post_publics caches
  def parsed_post_id(id):
    # prefer the post component of a composite FB id; fall back to raw id
    parsed = gr_facebook.Facebook.parse_id(id)
    return parsed.post if parsed.post else id

  resolved = self._load_cache('resolved_object_ids')
  for activity in activities['items']:
    obj = activity.get('object', {})
    obj_id = parsed_post_id(obj.get('fb_id'))
    ids = obj.get('fb_object_for_ids')
    if obj_id and ids:
      resolved[obj_id] = obj_id
      for id in ids:
        resolved[parsed_post_id(id)] = obj_id

  # evaluating publicness has side effects (populates the post_publics cache)
  for activity in activities['items']:
    self.is_activity_public(activity)

  return activities
def handle_exception(self, e, debug):
  """A webapp2 exception handler that propagates HTTP exceptions into the response.

  Use this as a :meth:`webapp2.RequestHandler.handle_exception()` method by
  adding this line to your handler class definition::

    handle_exception = handlers.handle_exception

  I originally tried to put this in a :class:`webapp2.RequestHandler`
  subclass, but it gave me this exception::

    File ".../webapp2-2.5.1/webapp2_extras/local.py", line 136, in _get_current_object
      raise RuntimeError('no object bound to %s' % self.__name__)
    RuntimeError: no object bound to app

  These are probably related:

  * http://eemyop.blogspot.com/2013/05/digging-around-in-webapp2-finding-out.html
  * http://code.google.com/p/webapp-improved/source/detail?r=d962ac4625ce3c43a3e59fd7fc07daf8d7b7c46a
  """
  # Guard-clause form: each recognized error kind writes a response and
  # returns; anything unrecognized is re-raised for webapp2 to handle.
  status, message = util.interpret_http_exception(e)
  if status:
    self.response.set_status(int(status))
    self.response.write('HTTP Error %s: %s' % (status, message))
    return
  if util.is_connection_failure(e):
    self.response.set_status(502)
    self.response.write('Upstream server request failed: %s' % e)
    return
  raise
def get_activities_response(self, **kwargs):
  """Fetches activities; on HTTP 401, notifies the user to reauthenticate
  (unless the token is provably dead) and disables the source."""
  kwargs.setdefault("fetch_events", True)
  kwargs.setdefault("event_owner_id", self.key.id())
  try:
    return super(FacebookPage, self).get_activities_response(**kwargs)
  except urllib2.HTTPError as e:
    code, body = util.interpret_http_exception(e)

    # use a function so any new exceptions (JSON decoding, missing keys) don't
    # clobber the original exception so we can re-raise it below.
    def dead_token():
      try:
        err = json.loads(body)["error"]
        return err["code"] in DEAD_TOKEN_ERROR_CODES or err["error_subcode"] in DEAD_TOKEN_ERROR_SUBCODES
      except:
        return False

    if code == "401":
      if not dead_token():
        # ask the user to reauthenticate. if this API call fails, it will raise
        # urllib2.HTTPError instead of DisableSource, so that we don't disable
        # the source without notifying.
        self.gr_source.create_notification(
          self.key.id(),
          "Brid.gy's access to your account has expired. Click here to renew it now!",
          "https://brid.gy/facebook/start",
        )
      raise models.DisableSource()
    raise
def get_activities_response(self, **kwargs):
  """Fetches activities, handling dead or expired Facebook tokens.

  On HTTP 401: if the token may still be alive and this is a user account,
  notifies the user to reauthenticate, then disables the source either way.
  On success, updates the resolved_object_ids cache and evaluates each
  activity's publicness.
  """
  type = self.auth_entity.get().type
  kwargs.setdefault('fetch_events', True)
  kwargs.setdefault('fetch_news', type == 'user')
  kwargs.setdefault('event_owner_id', self.key.id())

  try:
    activities = super(FacebookPage, self).get_activities_response(**kwargs)
  except urllib2.HTTPError as e:
    code, body = util.interpret_http_exception(e)

    # use a function so any new exceptions (JSON decoding, missing keys) don't
    # clobber the original exception so we can re-raise it below.
    def dead_token():
      try:
        err = json.loads(body)['error']
        return (err.get('code') in DEAD_TOKEN_ERROR_CODES or
                err.get('error_subcode') in DEAD_TOKEN_ERROR_SUBCODES or
                err.get('message') in DEAD_TOKEN_ERROR_MESSAGES)
      except:
        logging.exception(
          "Couldn't determine whether token is still valid")
        return False

    if code == '401':
      if not dead_token() and type == 'user':
        # ask the user to reauthenticate. if this API call fails, it will raise
        # urllib2.HTTPError instead of DisableSource, so that we don't disable
        # the source without notifying.
        #
        # TODO: for pages, fetch the owners/admins and notify them.
        self.gr_source.create_notification(
          self.key.id(),
          "Bridgy's access to your account has expired. Click here to renew it now!",
          'https://brid.gy/facebook/start')
      raise models.DisableSource()

    raise

  # update the resolved_object_ids and post_publics caches
  def parsed_post_id(id):
    # prefer the post component of a composite FB id; fall back to raw id
    parsed = gr_facebook.Facebook.parse_id(id)
    return parsed.post if parsed.post else id

  resolved = self._load_cache('resolved_object_ids')
  for activity in activities['items']:
    obj = activity.get('object', {})
    obj_id = parsed_post_id(obj.get('fb_id'))
    ids = obj.get('fb_object_for_ids')
    if obj_id and ids:
      resolved[obj_id] = obj_id
      for id in ids:
        resolved[parsed_post_id(id)] = obj_id

  # evaluating publicness has side effects (populates the post_publics cache)
  for activity in activities['items']:
    self.is_activity_public(activity)

  return activities
def poll(self, source):
  """Actually runs the poll.

  Returns: dict of source property names and values to update (transactionally)

  Raises:
    models.DisableSource: on HTTP 401 (the user revoked access)
  """
  if source.last_activities_etag or source.last_activity_id:
    logging.debug('Using ETag %s, last activity id %s',
                  source.last_activities_etag, source.last_activity_id)

  source_updates = {}

  #
  # Step 1: fetch activities
  #
  cache = util.CacheDict()
  if source.last_activities_cache_json:
    cache.update(json.loads(source.last_activities_cache_json))

  try:
    response = source.get_activities_response(
      fetch_replies=True, fetch_likes=True, fetch_shares=True, count=50,
      etag=source.last_activities_etag, min_id=source.last_activity_id,
      cache=cache)
  except Exception, e:
    code, body = util.interpret_http_exception(e)
    if code == '401':
      msg = 'Unauthorized error: %s' % e
      logging.warning(msg, exc_info=True)
      raise models.DisableSource(msg)
    elif code in util.HTTP_RATE_LIMIT_CODES:
      # rate limits are transient: record the error and stop without raising
      logging.warning('Rate limited. Marking as error and finishing. %s', e)
      source_updates.update({'status': 'error', 'rate_limited': True})
      return source_updates
    else:
      raise
def get(self):
  """Cron handler: refreshes profile pictures for all enabled sources of this
  handler's type, then invalidates the cached /users page if any changed.
  """
  updated = False
  for source in self.source_query():
    if source.features and source.status != 'disabled':
      logging.debug('checking for updated profile pictures for: %s',
                    source.bridgy_url(self))
      try:
        actor = source.gr_source.get_actor(self.user_id(source))
      except requests.HTTPError as e:
        # Mastodon API returns HTTP 404 for deleted (etc) users
        util.interpret_http_exception(e)  # logs the error
        continue
      # BUG FIX: accumulate with `or` instead of plain assignment — the
      # original overwrote an earlier True with a later source's False, so a
      # change early in the loop could fail to invalidate the cached page.
      updated = maybe_update_picture(source, actor, self) or updated

  if updated:
    util.CachedPage.invalidate('/users')
def post(self):
  """Starts the OAuth flow required to delete a source (or one feature of it),
  encoding the delete operation into the OAuth state parameter."""
  source = self.load_source(param='key')
  module = self.OAUTH_MODULES[source.key.kind()]
  feature = util.get_required_param(self, 'feature')
  state = util.encode_oauth_state({
    'operation': 'delete',
    'feature': feature,
    'source': source.key.urlsafe(),
    'callback': self.request.get('callback'),
  })

  # Blogger doesn't support redirect_url() yet
  if module is oauth_blogger_v2:
    return self.redirect('/blogger/delete/start?state=%s' % state)

  # per-silo callback paths; everything else uses the generic finish handler
  path = ('/instagram/callback' if module is indieauth
          else '/wordpress/add' if module is oauth_wordpress_rest
          else '/%s/delete/finish' % source.SHORT_NAME)
  kwargs = {}
  if module is oauth_twitter:
    # listen only needs read access; publish needs write
    kwargs['access_type'] = 'read' if feature == 'listen' else 'write'

  handler = module.StartHandler.to(path, **kwargs)(self.request, self.response)
  try:
    self.redirect(handler.redirect_url(state=state))
  except Exception as e:
    code, body = util.interpret_http_exception(e)
    if not code and util.is_connection_failure(e):
      code = '-'
      body = unicode(e)
    if code:
      # recognized API/connection failure: report it and send the user back
      self.messages.add('%s API error %s: %s' % (source.GR_CLASS.NAME, code, body))
      self.redirect(source.bridgy_url(self))
    else:
      raise
def post(self):
  """Task queue handler: fetches a single post and backfeeds its responses.

  Drops the task (returns 200) if the source is missing/disabled, the post is
  gone, or the API failure is terminal; retries otherwise.
  """
  logging.debug('Params: %s', self.request.params)

  key = util.get_required_param(self, 'source_key')
  source = ndb.Key(urlsafe=key).get()
  if not source or source.status == 'disabled' or 'listen' not in source.features:
    logging.error('Source not found or disabled. Dropping task.')
    return
  logging.info('Source: %s %s, %s', source.label(), source.key.string_id(),
               source.bridgy_url(self))

  post_id = util.get_required_param(self, 'post_id')
  source.updates = {}

  try:
    activities = source.get_activities(
      fetch_replies=True, fetch_likes=True, fetch_shares=True,
      activity_id=post_id, user_id=source.key.id())
    if not activities:
      logging.info('Post %s not found.', post_id)
      return
    assert len(activities) == 1
    self.backfeed(source, activities={activities[0]['id']: activities[0]})
  except Exception, e:
    code, body = util.interpret_http_exception(e)
    # give up (with the error return code) on rate limits, bad requests,
    # server errors, and connection failures; anything else is unexpected
    if (code and (code in util.HTTP_RATE_LIMIT_CODES or
                  code == '400' or
                  int(code) / 100 == 5)
        or util.is_connection_failure(e)):
      logging.error('API call failed; giving up. %s: %s\n%s', code, body, e)
      self.abort(util.ERROR_HTTP_RETURN_CODE)
    else:
      raise
def get(self, type, source_short_name, string_id, *ids):
  """Handler entry point: looks up the source, validates the request, fetches
  the item, and passes identifiable API HTTP errors through to the client.

  Args:
    type: string, item type (used only for the log label here)
    source_short_name: string, key into models.sources
    string_id: string, source entity id
    *ids: strings, item id path components, validated against self.VALID_ID
  """
  source_cls = models.sources.get(source_short_name)
  if not source_cls:
    self.abort(400, "Source type '%s' not found. Known sources: %s" %
               (source_short_name, models.sources))

  self.source = source_cls.get_by_id(string_id)
  if not self.source:
    self.abort(400, '%s %s not found' % (source_short_name, string_id))

  format = self.request.get('format', 'html')
  if format not in ('html', 'json'):
    self.abort(400, 'Invalid format %s, expected html or json' % format)

  for id in ids:
    if not self.VALID_ID.match(id):
      self.abort(404, 'Invalid id %s' % id)

  label = '%s:%s %s %s' % (source_short_name, string_id, type, ids)
  logging.info('Fetching %s', label)
  try:
    obj = self.get_item(*ids)
  except Exception, e:
    # pass through all API HTTP errors if we can identify them
    code, body = util.interpret_http_exception(e)
    if code:
      self.response.status_int = int(code)
      self.response.headers['Content-Type'] = 'text/plain'
      self.response.write('%s error:\n%s' % (self.source.GR_CLASS.NAME, body))
      return
    else:
      raise
def do_post(self, source):
  """Polls one source: fetches its activities (Step 1).

  Raises:
    models.DisableSource: on HTTP 401 (user revoked access)
  """
  if source.last_activities_etag or source.last_activity_id:
    logging.debug('Using ETag %s, last activity id %s',
                  source.last_activities_etag, source.last_activity_id)

  #
  # Step 1: fetch activities
  #
  try:
    response = source.get_activities_response(
      fetch_replies=True, fetch_likes=True, fetch_shares=True, count=50,
      etag=source.last_activities_etag, min_id=source.last_activity_id,
      cache=memcache)
  except Exception, e:
    code, body = util.interpret_http_exception(e)
    if code == '401':
      # TODO: also interpret oauth2client.AccessTokenRefreshError with
      # {'error': 'invalid_grant'} as disabled? it can mean the user revoked
      # access. it can also mean the token expired, or they deleted their
      # account, or even other things.
      # http://code.google.com/p/google-api-python-client/issues/detail?id=187#c1
      msg = 'Unauthorized error: %s' % e
      logging.exception(msg)
      raise models.DisableSource(msg)
    elif code in util.HTTP_RATE_LIMIT_CODES:
      # rate limits are transient: record the error and stop without raising
      logging.warning('Rate limited. Marking as error and finishing. %s', e)
      source.status = 'error'
      return
    else:
      raise
def get_activities_response(self, **kwargs):
  """Fetches activities; on HTTP 401, notifies the user to reauthenticate
  (unless the token is provably dead) and disables the source."""
  kwargs.setdefault('fetch_events', True)
  kwargs.setdefault('fetch_news', self.auth_entity.get().type == 'user')
  kwargs.setdefault('event_owner_id', self.key.id())

  try:
    return super(FacebookPage, self).get_activities_response(**kwargs)
  except urllib2.HTTPError as e:
    code, body = util.interpret_http_exception(e)

    # use a function so any new exceptions (JSON decoding, missing keys) don't
    # clobber the original exception so we can re-raise it below.
    def dead_token():
      try:
        err = json.loads(body)['error']
        return (err.get('code') in DEAD_TOKEN_ERROR_CODES or
                err.get('error_subcode') in DEAD_TOKEN_ERROR_SUBCODES or
                err.get('message') in DEAD_TOKEN_ERROR_MESSAGES)
      except:
        logging.exception("Couldn't determine whether token is still valid")
        return False

    if code == '401':
      if not dead_token():
        # ask the user to reauthenticate. if this API call fails, it will raise
        # urllib2.HTTPError instead of DisableSource, so that we don't disable
        # the source without notifying.
        self.gr_source.create_notification(
          self.key.id(),
          "Brid.gy's access to your account has expired. Click here to renew it now!",
          'https://brid.gy/facebook/start')
      raise models.DisableSource()
    raise
def post(self):
  """Starts the OAuth flow required to delete a source (or one feature of it),
  encoding the delete operation into the OAuth state parameter."""
  source = self.load_source(param='key')
  kind = source.key.kind()
  feature = util.get_required_param(self, 'feature')
  state = util.encode_oauth_state({
    'operation': 'delete',
    'feature': feature,
    'source': source.key.urlsafe().decode(),
    'callback': self.request.get('callback'),
  })

  # Blogger doesn't support redirect_url() yet
  if kind == 'Blogger':
    return self.redirect('/blogger/delete/start?state=%s' % state)

  # per-silo callback paths; everything else uses the generic finish handler
  path = ('/reddit/callback' if kind == 'Reddit'
          else '/wordpress/add' if kind == 'WordPress'
          else '/%s/delete/finish' % source.SHORT_NAME)
  kwargs = {}
  if kind == 'Twitter':
    # listen only needs read access; publish needs write
    kwargs['access_type'] = 'read' if feature == 'listen' else 'write'

  handler = source.OAUTH_START_HANDLER.to(path, **kwargs)(self.request, self.response)
  try:
    self.redirect(handler.redirect_url(state=state))
  except Exception as e:
    code, body = util.interpret_http_exception(e)
    if not code and util.is_connection_failure(e):
      code = '-'
      body = str(e)
    if code:
      # recognized API/connection failure: report it and send the user back
      self.messages.add('%s API error %s: %s' % (source.GR_CLASS.NAME, code, body))
      self.redirect(source.bridgy_url(self))
    else:
      raise
def create_comment(self, post_url, author_name, author_url, content):
  """Creates a new comment in the source silo.

  If the last part of the post URL is numeric, e.g. http://site/post/123999,
  it's used as the post id. Otherwise, we extract the last part of
  the path as the slug, e.g. http: / / site / post / the-slug,
  and look up the post id via the API.

  Args:
    post_url: string
    author_name: string
    author_url: string
    content: string

  Returns:
    JSON response dict with 'id' and other fields
  """
  auth_entity = self.auth_entity.get()
  logging.info('Determining WordPress.com post id for %s', post_url)

  # extract the post's slug and look up its post id
  path = urllib.parse.urlparse(post_url).path
  if path.endswith('/'):
    path = path[:-1]
  slug = path.split('/')[-1]
  try:
    post_id = int(slug)
  except ValueError:
    logging.info('Looking up post id for slug %s', slug)
    url = API_POST_SLUG_URL % (auth_entity.blog_id, slug)
    post_id = self.urlopen(auth_entity, url).get('ID')
    if not post_id:
      return self.error('Could not find post id', report=False)

  logging.info('Post id is %d', post_id)

  # create the comment
  url = API_CREATE_COMMENT_URL % (auth_entity.blog_id, post_id)
  content = '<a href="%s">%s</a>: %s' % (author_url, author_name, content)
  data = {'content': content.encode()}
  try:
    resp = self.urlopen(auth_entity, url, data=urllib.parse.urlencode(data))
  except urllib.error.HTTPError as e:
    code, body = util.interpret_http_exception(e)
    try:
      parsed = json_loads(body) if body else {}
      if ((code == '400' and parsed.get('error') == 'invalid_input') or
          (code == '403' and
           parsed.get('message') == 'Comments on this post are closed')):
        return parsed  # known error: https://github.com/snarfed/bridgy/issues/161
    except ValueError:
      pass  # fall through
    raise e

  # normalize WordPress's 'ID' key to the 'id' promised in the docstring
  resp['id'] = resp.pop('ID', None)
  return resp
def post(self):
  """Kicks off the OAuth redirect for the given token, reporting fetch
  failures (connection errors or identifiable HTTP errors) as a user-visible
  message instead of a 500."""
  try:
    state = util.get_required_param(self, 'token')
    self.redirect(self.redirect_url(state=state))
  except Exception as e:
    recognized = (util.is_connection_failure(e) or
                  util.interpret_http_exception(e)[0])
    if not recognized:
      raise
    self.messages.add("Couldn't fetch your web site: %s" % e)
    return self.redirect('/')
def get_post(self, id):
  """Fetch a single post from the source.

  Args:
    id: string, site-specific post id

  Returns:
    ActivityStreams object dict, or None if the post wasn't found or the
    fetch failed (HTTP errors are logged, not raised)
  """
  try:
    found = self.source.get_activities(
      activity_id=id, user_id=self.source.key.id())
  except Exception as e:
    util.interpret_http_exception(e)  # logs identifiable HTTP errors
    return None

  if not found:
    logging.warning('Source post %s not found', id)
    return None

  return found[0]
def finish(self, auth_entity, state=None):
  """Completes Instagram signup via IndieAuth.

  Verifies, in order: the site has an Instagram rel-me link, the Instagram
  profile is fetchable (with a friendly message when Instagram blocks us),
  the profile links back to the site, and the account is public. Each failed
  check adds a message and redirects home.
  """
  if auth_entity:
    user_json = json.loads(auth_entity.user_json)

    # find instagram profile URL
    urls = user_json.get('rel-me', [])
    logging.info('rel-mes: %s', urls)
    for url in util.trim_nulls(urls):
      if util.domain_from_link(url) == gr_instagram.Instagram.DOMAIN:
        # Instagram profile path is the username, e.g. instagram.com/snarfed
        username = urllib.parse.urlparse(url).path.strip('/')
        break
    else:
      self.messages.add(
        'No Instagram profile found. Please <a href="https://indieauth.com/setup">add an Instagram rel-me link</a>, then try again.')
      return self.redirect('/')

    # check that instagram profile links to web site
    try:
      actor = gr_instagram.Instagram(scrape=True).get_actor(
        username, ignore_rate_limit=True)
    except Exception as e:
      code, _ = util.interpret_http_exception(e)
      if code in Instagram.RATE_LIMIT_HTTP_CODES:
        self.messages.add(
          '<a href="https://github.com/snarfed/bridgy/issues/665#issuecomment-524977427">Apologies, Instagram is temporarily blocking us.</a> Please try again later!')
        return self.redirect('/')
      else:
        raise

    if not actor:
      self.messages.add(
        "Couldn't find Instagram user '%s'. Please check your site's rel-me link and your Instagram account." % username)
      return self.redirect('/')

    canonicalize = util.UrlCanonicalizer(redirects=False)
    website = canonicalize(auth_entity.key.id())
    urls = [canonicalize(u) for u in microformats2.object_urls(actor)]
    logging.info('Looking for %s in %s', website, urls)
    if website not in urls:
      self.messages.add(
        "Please add %s to your Instagram profile's website or bio field and try again." % website)
      return self.redirect('/')

    # check that the instagram account is public
    if not gr_source.Source.is_public(actor):
      self.messages.add(
        'Your Instagram account is private. Bridgy only supports public accounts.')
      return self.redirect('/')

    # all checks passed: create (or delete) the source
    self.maybe_add_or_delete_source(Instagram, auth_entity, state, actor=actor)
def post(self, *path_args):
  """Poll task handler: marks the source as polling, runs self.poll(), and
  classifies any failure (deauthorized, rate-limited, transient, unexpected).
  """
  logging.debug('Params: %s', self.request.params)

  key = self.request.params['source_key']
  source = ndb.Key(urlsafe=key).get()
  if not source or source.status == 'disabled' or 'listen' not in source.features:
    logging.error('Source not found or disabled. Dropping task.')
    return
  logging.info('Source: %s %s, %s', source.label(), source.key.string_id(),
               source.bridgy_url(self))

  # detect and drop duplicate tasks for the same poll cycle
  last_polled = self.request.params['last_polled']
  if last_polled != source.last_polled.strftime(util.POLL_TASK_DATETIME_FORMAT):
    logging.warning('duplicate poll task! deferring to the other task.')
    return

  logging.info('Last poll: %s', self._last_poll_url(source))

  # mark this source as polling
  source.updates = {
    'poll_status': 'polling',
    'last_poll_attempt': util.now_fn(),
    'rate_limited': False,
  }
  source = models.Source.put_updates(source)

  source.updates = {}
  try:
    self.poll(source)
  except Exception, e:
    source.updates['poll_status'] = 'error'
    code, body = util.interpret_http_exception(e)
    if code == '401' or isinstance(e, models.DisableSource):
      # the user deauthorized the bridgy app, so disable this source.
      # let the task complete successfully so that it's not retried.
      logging.warning('Disabling source due to: %s' % e, exc_info=True)
      source.updates.update({
        'status': 'disabled',
        'poll_status': 'ok',
      })
      body = '%s\nLast poll: %s' % (source.bridgy_url(self),
                                    self._last_poll_url(source))
      if source.is_beta_user():
        util.email_me(subject='Bridgy: disabled %s' % source.label(), body=body)
    elif code in util.HTTP_RATE_LIMIT_CODES:
      logging.info('Rate limited. Marking as error and finishing. %s', e)
      source.updates['rate_limited'] = True
    elif ((code and int(code) / 100 == 5) or
          (code == '400' and isinstance(source, flickr.Flickr)) or
          util.is_connection_failure(e)):
      # transient failure: return an error code so the task queue retries
      logging.error('API call failed. Marking as error and finishing. %s: %s\n%s',
                    code, body, e)
      self.abort(util.ERROR_HTTP_RETURN_CODE)
    else:
      raise
def post(self, *path_args):
  """Poll task handler: marks the source as polling, runs self.poll(), and
  classifies any failure. Always stores updates and re-enqueues the next
  poll task in the finally block.
  """
  self.request.headers['Content-Type'] = 'application/x-www-form-urlencoded'
  logging.debug('Params: %s', list(self.request.params.items()))

  key = self.request.params['source_key']
  source = self.source = ndb.Key(urlsafe=key).get()
  if not source or source.status == 'disabled' or 'listen' not in source.features:
    logging.error('Source not found or disabled. Dropping task.')
    return
  logging.info('Source: %s %s, %s', source.label(), source.key.string_id(),
               source.bridgy_url(self))

  # detect and drop duplicate tasks for the same poll cycle
  last_polled = self.request.params['last_polled']
  if last_polled != source.last_polled.strftime(util.POLL_TASK_DATETIME_FORMAT):
    logging.warning('duplicate poll task! deferring to the other task.')
    return

  logging.info('Last poll: %s', self._last_poll_url(source))

  # mark this source as polling
  source.updates = {
    'poll_status': 'polling',
    'last_poll_attempt': util.now_fn(),
    'rate_limited': False,
  }
  source = models.Source.put_updates(source)

  source.updates = {}
  try:
    self.poll(source)
  except Exception as e:
    source.updates['poll_status'] = 'error'
    code, body = util.interpret_http_exception(e)
    if code in source.DISABLE_HTTP_CODES or isinstance(e, models.DisableSource):
      # the user deauthorized the bridgy app, so disable this source.
      # let the task complete successfully so that it's not retried.
      logging.warning('Disabling source due to: %s' % e, stack_info=True)
      source.updates.update({
        'status': 'disabled',
        'poll_status': 'ok',
      })
      body = '%s\nLast poll: %s' % (source.bridgy_url(self),
                                    self._last_poll_url(source))
    elif code in source.RATE_LIMIT_HTTP_CODES:
      logging.info('Rate limited. Marking as error and finishing. %s', e)
      source.updates['rate_limited'] = True
    else:
      raise
  finally:
    source = models.Source.put_updates(source)

  util.add_poll_task(source)

  # feeble attempt to avoid hitting the instance memory limit
  source = None
  gc.collect()
def post(self, *path_args):
  """Poll task handler: marks the source as polling, runs self.poll(), and
  classifies any failure (deauthorized, rate-limited, transient, unexpected).
  """
  logging.debug('Params: %s', self.request.params)

  key = self.request.params['source_key']
  source = ndb.Key(urlsafe=key).get()
  if not source or source.status == 'disabled' or 'listen' not in source.features:
    logging.error('Source not found or disabled. Dropping task.')
    return
  logging.info('Source: %s %s, %s', source.label(), source.key.string_id(),
               source.bridgy_url(self))

  # detect and drop duplicate tasks for the same poll cycle
  last_polled = self.request.params['last_polled']
  if last_polled != source.last_polled.strftime(util.POLL_TASK_DATETIME_FORMAT):
    logging.warning('duplicate poll task! deferring to the other task.')
    return

  logging.info('Last poll: %s', self._last_poll_url(source))

  # mark this source as polling
  source.updates = {
    'poll_status': 'polling',
    'last_poll_attempt': util.now_fn(),
    'rate_limited': False,
  }
  source = models.Source.put_updates(source)

  source.updates = {}
  try:
    self.poll(source)
  except Exception, e:
    source.updates['poll_status'] = 'error'
    code, body = util.interpret_http_exception(e)
    if code in source.DISABLE_HTTP_CODES or isinstance(e, models.DisableSource):
      # the user deauthorized the bridgy app, so disable this source.
      # let the task complete successfully so that it's not retried.
      logging.warning('Disabling source due to: %s' % e, exc_info=True)
      source.updates.update({
        'status': 'disabled',
        'poll_status': 'ok',
      })
      body = '%s\nLast poll: %s' % (source.bridgy_url(self),
                                    self._last_poll_url(source))
      if source.is_beta_user():
        util.email_me(subject='Bridgy: disabled %s' % source.label(), body=body)
    elif code in source.RATE_LIMIT_HTTP_CODES:
      logging.info('Rate limited. Marking as error and finishing. %s', e)
      source.updates['rate_limited'] = True
    elif ((code and int(code) / 100 == 5) or
          (code == '400' and isinstance(source, flickr.Flickr)) or
          util.is_connection_failure(e)):
      # transient failure: return an error code so the task queue retries
      logging.error('API call failed. Marking as error and finishing. %s: %s\n%s',
                    code, body, e)
      self.abort(util.ERROR_HTTP_RETURN_CODE)
    else:
      raise
def get_post(self, id, **kwargs):
  """Fetch a single post from the source.

  Args:
    id: string, site-specific post id
    kwargs: passed through to :meth:`get_activities`

  Returns:
    ActivityStreams object dict, or None if the post wasn't found or the
    fetch failed (HTTP errors are logged, not raised)
  """
  try:
    matches = self.source.get_activities(
      activity_id=id, user_id=self.source.key_id(), **kwargs)
  except Exception as e:
    util.interpret_http_exception(e)  # logs identifiable HTTP errors
    return None

  if not matches:
    logging.warning('Source post %s not found', id)
    return None

  return matches[0]
def post(self):
  """Starts the IndieAuth flow for Instagram signup, reporting fetch failures
  (connection errors or identifiable HTTP errors) as a user-visible message."""
  start_handler = util.oauth_starter(indieauth.StartHandler).to(
    '/instagram/callback')(self.request, self.response)
  try:
    me = util.get_required_param(self, 'user_url')
    self.redirect(start_handler.redirect_url(me=me))
  except Exception as e:
    if not (util.is_connection_failure(e) or
            util.interpret_http_exception(e)[0]):
      raise
    self.messages.add("Couldn't fetch your web site: %s" % e)
    return self.redirect('/')
def post(self):
  """Starts the IndieAuth flow for Instagram signup.

  Fetch failures (connection errors or errors with an identifiable HTTP
  status) become a friendly message plus a redirect home; anything else
  propagates.
  """
  starter_cls = util.oauth_starter(indieauth.StartHandler).to('/instagram/callback')
  starter = starter_cls(self.request, self.response)
  try:
    self.redirect(
      starter.redirect_url(me=util.get_required_param(self, 'user_url')))
  except Exception as e:
    known = util.is_connection_failure(e) or util.interpret_http_exception(e)[0]
    if known:
      self.messages.add("Couldn't fetch your web site: %s" % e)
      return self.redirect('/')
    raise
def get(self):
  """Cron handler: exports the datastore (excluding the Response and
  SyndicatedPost kinds) to Google Cloud Storage via the Datastore admin
  export API."""
  # https://cloud.google.com/appengine/docs/standard/python/ndb/admin#Metadata_queries
  kinds = [k for k in metadata.get_kinds() if not k.startswith('_')]
  # these two kinds are too big/churny to back up weekly
  kinds.remove('Response')
  kinds.remove('SyndicatedPost')
  logging.info('Backing up %s', kinds)

  access_token, _ = app_identity.get_access_token(
    'https://www.googleapis.com/auth/datastore')
  app_id = app_identity.get_application_id()
  request = {
    'project_id': app_id,
    'output_url_prefix': ('gs://brid-gy.appspot.com/weekly/' +
                          datetime.datetime.now().strftime('%Y%m%d')),
    'entity_filter': {
      'kinds': kinds,
      # 'namespace_ids': self.request.get_all('namespace_id'),
    },
  }
  headers = {
    'Content-Type': 'application/json',
    'Authorization': 'Bearer ' + access_token,
  }
  try:
    result = urlfetch.fetch(
      url='https://datastore.googleapis.com/v1/projects/%s:export' % app_id,
      payload=json_dumps(request),
      method=urlfetch.POST,
      headers=headers)
    if result.status_code == http.client.OK:
      logging.info(result.content)
    else:
      # propagate the export API's status code to the cron response
      logging.error(result.content)
      self.abort(result.status_code)
  except urlfetch.Error as e:
    util.interpret_http_exception(e)  # logs the error
    raise
def create_comment(self, post_url, author_name, author_url, content): """Creates a new comment in the source silo. If the last part of the post URL is numeric, e.g. http://site/post/123999, it's used as the post id. Otherwise, we extract the last part of the path as the slug, e.g. http: / / site / post / the-slug, and look up the post id via the API. Args: post_url: string author_name: string author_url: string content: string Returns: JSON response dict with 'id' and other fields """ auth_entity = self.auth_entity.get() logging.info("Determining WordPress.com post id for %s", post_url) # extract the post's slug and look up its post id path = urlparse.urlparse(post_url).path if path.endswith("/"): path = path[:-1] slug = path.split("/")[-1] try: post_id = int(slug) except ValueError: logging.info("Looking up post id for slug %s", slug) url = API_POST_SLUG_URL % (auth_entity.blog_id, slug) post_id = self.urlopen(auth_entity, url).get("ID") if not post_id: return self.error("Could not find post id") logging.info("Post id is %d", post_id) # create the comment url = API_CREATE_COMMENT_URL % (auth_entity.blog_id, post_id) content = u'<a href="%s">%s</a>: %s' % (author_url, author_name, content) data = {"content": content.encode("utf-8")} try: resp = self.urlopen(auth_entity, url, data=urllib.urlencode(data)) except urllib2.HTTPError, e: code, body = util.interpret_http_exception(e) try: parsed = json.loads(body) if body else {} if (code == "400" and parsed.get("error") == "invalid_input") or ( code == "403" and parsed.get("message") == "Comments on this post are closed" ): return parsed # known error: https://github.com/snarfed/bridgy/issues/161 except ValueError: pass # fall through raise e
def post(self):
  """Discover task: fetches a single silo post and backfeeds its responses.

  Side effects: may enqueue a follow-up discover task for the post this one
  replies to.
  """
  logging.debug('Params: %s', self.request.params)

  type = self.request.get('type')
  if type:
    assert type in ('event',)

  source = util.load_source(self)
  if not source or source.status == 'disabled' or 'listen' not in source.features:
    logging.error('Source not found or disabled. Dropping task.')
    return
  logging.info('Source: %s %s, %s', source.label(),
               source.key.string_id(), source.bridgy_url(self))

  post_id = util.get_required_param(self, 'post_id')
  source.updates = {}

  try:
    if type == 'event':
      activities = [source.gr_source.get_event(post_id)]
    else:
      activities = source.get_activities(
        fetch_replies=True, fetch_likes=True, fetch_shares=True,
        activity_id=post_id, user_id=source.key.id())

    if not activities or not activities[0]:
      logging.info('Post %s not found.', post_id)
      return
    assert len(activities) == 1, activities
    self.backfeed(source, activities={activities[0]['id']: activities[0]})

    obj = activities[0].get('object') or activities[0]
    in_reply_to = util.get_first(obj, 'inReplyTo')
    if in_reply_to:
      # TODO: fall back to url
      parsed = util.parse_tag_uri(in_reply_to.get('id', ''))
      if parsed:
        util.add_discover_task(source, parsed[1])
  except Exception as e:  # fixed py2-only `except Exception, e` syntax
    code, body = util.interpret_http_exception(e)
    # // keeps integer division on both py2 and py3 (/ float-divides on py3)
    if (code and (code in source.RATE_LIMIT_HTTP_CODES
                  or code in ('400', '404')
                  or int(code) // 100 == 5)
        or util.is_connection_failure(e)):
      logging.error('API call failed; giving up. %s: %s\n%s', code, body, e)
      self.abort(util.ERROR_HTTP_RETURN_CODE)
    else:
      raise
def poll(self, source):
  """Actually runs the poll.

  Stores property names and values to update in source.updates.
  """
  if source.last_activities_etag or source.last_activity_id:
    logging.debug('Using ETag %s, last activity id %s',
                  source.last_activities_etag, source.last_activity_id)

  #
  # Step 1: fetch activities:
  # * posts by the user
  # * search all posts for the user's domain URLs to find links
  #
  cache = util.CacheDict()
  if source.last_activities_cache_json:
    cache.update(json.loads(source.last_activities_cache_json))

  try:
    # search for links first so that the user's activities and responses
    # override them if they overlap
    links = source.search_for_links()

    # this user's own activities (and user mentions)
    resp = source.get_activities_response(
      fetch_replies=True, fetch_likes=True, fetch_shares=True,
      fetch_mentions=True, count=50, etag=source.last_activities_etag,
      min_id=source.last_activity_id, cache=cache)
    etag = resp.get('etag')  # used later
    user_activities = resp.get('items', [])

    # these map ids to AS objects
    responses = {a['id']: a for a in links}
    activities = {a['id']: a for a in links + user_activities}
  except Exception as e:  # fixed py2-only `except Exception, e` syntax
    code, body = util.interpret_http_exception(e)
    if code == '401':
      msg = 'Unauthorized error: %s' % e
      logging.warning(msg, exc_info=True)
      source.updates['poll_status'] = 'ok'
      raise models.DisableSource(msg)
    elif code in util.HTTP_RATE_LIMIT_CODES:
      logging.warning('Rate limited. Marking as error and finishing. %s', e)
      source.updates.update({'poll_status': 'error', 'rate_limited': True})
      return
    # // keeps integer division on both py2 and py3
    elif (code and int(code) // 100 == 5) or util.is_connection_failure(e):
      logging.error('API call failed. Marking as error and finishing. %s: %s\n%s',
                    code, body, e)
      self.abort(ERROR_HTTP_RETURN_CODE)
    else:
      raise
def dispatch_request(self):
  """Computes the OAuth redirect URL and sends the user there.

  Fetch failures (connection errors or identifiable HTTP errors) are reported
  via flash and redirect home instead of raising.
  """
  token = request.form['token']

  try:
    destination = self.redirect_url(state=token)
  except Exception as e:
    fetch_failed = (util.is_connection_failure(e)
                    or util.interpret_http_exception(e)[0])
    if not fetch_failed:
      raise
    flash(f"Couldn't fetch your web site: {e}")
    return redirect('/')

  return redirect(destination)
def get(self):
  """Kicks off a weekly Datastore export of all kinds to Google Cloud Storage.

  Skips internal kinds (leading underscore) plus Response and SyndicatedPost.
  """
  # https://cloud.google.com/appengine/docs/standard/python/ndb/admin#Metadata_queries
  kinds = [kind for kind in metadata.get_kinds() if not kind.startswith('_')]
  for skipped in ('Response', 'SyndicatedPost'):
    kinds.remove(skipped)
  logging.info('Backing up %s', kinds)

  access_token, _ = app_identity.get_access_token(
    'https://www.googleapis.com/auth/datastore')
  app_id = app_identity.get_application_id()

  export_request = {
    'project_id': app_id,
    'output_url_prefix': ('gs://brid-gy.appspot.com/weekly/' +
                          datetime.datetime.now().strftime('%Y%m%d')),
    'entity_filter': {
      'kinds': kinds,
      # 'namespace_ids': self.request.get_all('namespace_id'),
    },
  }

  try:
    result = urlfetch.fetch(
      url='https://datastore.googleapis.com/v1/projects/%s:export' % app_id,
      payload=json.dumps(export_request),
      method=urlfetch.POST,
      headers={
        'Content-Type': 'application/json',
        'Authorization': 'Bearer ' + access_token,
      })
    if result.status_code != httplib.OK:
      logging.error(result.content)
      self.abort(result.status_code)
    logging.info(result.content)
  except urlfetch.Error as e:
    util.interpret_http_exception(e)  # logs the failure
    raise
def dispatch_request(self):
  """Checks one page of sources for updated profile pictures.

  Resumes from the key stored in LastUpdatedPicture and stores the new cursor
  at the end; wraps around to the beginning when it runs out of sources.
  """
  g.TRANSIENT_ERROR_HTTP_CODES = (self.SOURCE_CLS.TRANSIENT_ERROR_HTTP_CODES +
                                  self.SOURCE_CLS.RATE_LIMIT_HTTP_CODES)

  query = self.SOURCE_CLS.query().order(self.SOURCE_CLS.key)
  last = LastUpdatedPicture.get_by_id(self.SOURCE_CLS.SHORT_NAME)
  if last and last.last:
    query = query.filter(self.SOURCE_CLS.key > last.last)
  results, _, more = query.fetch_page(PAGE_SIZE)

  for source in results:
    if source.features and source.status != 'disabled':
      user_id = self.user_id(source)
      logger.debug(f'checking for updated profile pictures for {source.bridgy_url()} {user_id}')
      try:
        actor = source.gr_source.get_actor(user_id)
      except BaseException as e:
        # use the module-level logger consistently (was the root `logging`)
        logger.debug('Failed', exc_info=True)
        # Mastodon API returns HTTP 404 for deleted (etc) users, and
        # often one or more users' Mastodon instances are down.
        code, _ = util.interpret_http_exception(e)
        if code:
          continue
        raise

      if not actor:
        logger.info("Couldn't fetch user")
        continue

      new_pic = actor.get('image', {}).get('url')
      if not new_pic or source.picture == new_pic:
        logger.info('No new picture found')
        continue

      @ndb.transactional()
      def update():
        src = source.key.get()
        src.picture = new_pic
        src.put()

      logger.info(f'Updating profile picture from {source.picture} to {new_pic}')
      update()

  # guard against NameError when results is empty: use results[-1] (the last
  # source processed) instead of leaking the loop variable
  LastUpdatedPicture(id=self.SOURCE_CLS.SHORT_NAME,
                     last=results[-1].key if more and results else None).put()
  return 'OK'
def post(self):
  """Discover task (key-addressed variant): backfeeds a single silo post.

  May enqueue a follow-up discover task for the post this one replies to.
  """
  logging.debug('Params: %s', self.request.params)

  type = self.request.get('type')
  if type:
    assert type in ('event',)

  key = util.get_required_param(self, 'source_key')
  source = ndb.Key(urlsafe=key).get()
  if not source or source.status == 'disabled' or 'listen' not in source.features:
    logging.error('Source not found or disabled. Dropping task.')
    return
  logging.info('Source: %s %s, %s', source.label(),
               source.key.string_id(), source.bridgy_url(self))

  post_id = util.get_required_param(self, 'post_id')
  source.updates = {}

  try:
    if type == 'event':
      activities = [source.gr_source.get_event(post_id)]
    else:
      activities = source.get_activities(
        fetch_replies=True, fetch_likes=True, fetch_shares=True,
        activity_id=post_id, user_id=source.key.id())

    if not activities or not activities[0]:
      logging.info('Post %s not found.', post_id)
      return
    assert len(activities) == 1, activities
    self.backfeed(source, activities={activities[0]['id']: activities[0]})

    # TODO: fall back to url
    in_reply_to = util.get_first(activities[0]['object'], 'inReplyTo')
    if in_reply_to:
      parsed = util.parse_tag_uri(in_reply_to.get('id', ''))
      if parsed:
        util.add_discover_task(source, parsed[1])
  except Exception as e:  # fixed py2-only `except Exception, e` syntax
    code, body = util.interpret_http_exception(e)
    # // keeps integer division on both py2 and py3
    if (code and (code in util.HTTP_RATE_LIMIT_CODES
                  or code in ('400', '404')
                  or int(code) // 100 == 5)
        or util.is_connection_failure(e)):
      logging.error('API call failed; giving up. %s: %s\n%s', code, body, e)
      self.abort(util.ERROR_HTTP_RETURN_CODE)
    else:
      raise
def get(self, type, source_short_name, string_id, *ids):
  """Renders a cached silo item (response, post, etc) as HTML or JSON.

  Args:
    type: string, item type, e.g. 'comment' or 'like'
    source_short_name: string, source short name, e.g. 'twitter'
    string_id: string, the source entity's key id
    *ids: strings, item ids
  """
  source_cls = models.sources.get(source_short_name)
  if not source_cls:
    self.abort(400, "Source type '%s' not found. Known sources: %s" %
               (source_short_name, filter(None, models.sources.keys())))

  self.source = source_cls.get_by_id(string_id)
  if not self.source:
    self.abort(400, 'Source %s %s not found' % (source_short_name, string_id))

  format = self.request.get('format', 'html')
  if format not in ('html', 'json'):
    self.abort(400, 'Invalid format %s, expected html or json' % format)

  for id in ids:
    if not self.VALID_ID.match(id):
      self.abort(404, 'Invalid id %s' % id)

  label = '%s:%s %s %s' % (source_short_name, string_id, type, ids)
  cache_key = 'H ' + label
  obj = memcache.get(cache_key)
  if obj:
    logging.info('Using cached object for %s', label)
  else:
    logging.info('Fetching %s', label)
    try:
      obj = self.get_item(*ids)
    except Exception as e:  # fixed py2-only `except Exception, e` syntax
      # pass through all API HTTP errors if we can identify them
      code, body = util.interpret_http_exception(e)
      if not code and util.is_connection_failure(e):
        code = 503
        body = str(e)
      if code:
        self.response.status_int = int(code)
        self.response.headers['Content-Type'] = 'text/plain'
        self.response.write('%s error:\n%s' % (self.source.GR_CLASS.NAME, body))
        return
      else:
        raise
    memcache.set(cache_key, obj, time=CACHE_TIME)
def post(self, *path_args):
  """Poll task entry point: polls one source and records the result.

  Drops the task for missing/disabled sources and for duplicate poll tasks
  (detected via the last_polled timestamp).
  """
  logging.debug("Params: %s", self.request.params)

  key = self.request.params["source_key"]
  source = ndb.Key(urlsafe=key).get()
  if not source or source.status == "disabled" or "listen" not in source.features:
    logging.error("Source not found or disabled. Dropping task.")
    return
  logging.info("Source: %s %s, %s", source.label(),
               source.key.string_id(), source.bridgy_url(self))

  last_polled = self.request.params["last_polled"]
  if last_polled != source.last_polled.strftime(util.POLL_TASK_DATETIME_FORMAT):
    logging.warning("duplicate poll task! deferring to the other task.")
    return

  logging.info("Last poll: %s/log?start_time=%s&key=%s",
               self.request.host_url,
               calendar.timegm(source.last_poll_attempt.utctimetuple()),
               source.key.urlsafe())

  # mark this source as polling
  source.updates = {"poll_status": "polling", "last_poll_attempt": util.now_fn()}
  source = models.Source.put_updates(source)

  source.updates = {}
  try:
    self.poll(source)
  except Exception as e:  # fixed py2-only `except Exception, e` syntax
    source.updates["poll_status"] = "error"
    code, body = util.interpret_http_exception(e)
    if code == "401" or isinstance(e, models.DisableSource):
      # the user deauthorized the bridgy app, so disable this source.
      # let the task complete successfully so that it's not retried.
      logging.warning("Disabling source due to: %s" % e, exc_info=True)
      source.updates.update({"status": "disabled", "poll_status": "ok"})
    elif code in util.HTTP_RATE_LIMIT_CODES:
      logging.warning("Rate limited. Marking as error and finishing. %s", e)
      source.updates["rate_limited"] = True
    # // keeps integer division on both py2 and py3
    elif (code and int(code) // 100 == 5) or util.is_connection_failure(e):
      logging.error("API call failed. Marking as error and finishing. %s: %s\n%s",
                    code, body, e)
      self.abort(ERROR_HTTP_RETURN_CODE)
    else:
      raise
def post(self):
  """Starts the OAuth flow to delete a source, via the silo's auth endpoint."""
  key = ndb.Key(urlsafe=util.get_required_param(self, 'key'))
  module = self.OAUTH_MODULES[key.kind()]
  feature = util.get_required_param(self, 'feature')

  state = util.encode_oauth_state({
    'operation': 'delete',
    'feature': feature,
    'source': key.urlsafe(),
    'callback': self.request.get('callback'),
  })

  # Google+ and Blogger don't support redirect_url() yet
  if module is oauth_googleplus:
    return self.redirect('/googleplus/delete/start?state=%s' % state)
  if module is oauth_blogger_v2:
    return self.redirect('/blogger/delete/start?state=%s' % state)

  source = key.get()
  if module is indieauth:
    path = '/instagram/callback'
  elif module is oauth_wordpress_rest:
    path = '/wordpress/add'
  else:
    path = '/%s/delete/finish' % source.SHORT_NAME

  kwargs = {}
  if module is oauth_twitter:
    kwargs['access_type'] = 'read' if feature == 'listen' else 'write'

  handler = module.StartHandler.to(path, **kwargs)(self.request, self.response)
  try:
    self.redirect(handler.redirect_url(state=state))
  except Exception as e:
    code, body = util.interpret_http_exception(e)
    if not code and util.is_connection_failure(e):
      code = '-'
      body = unicode(e)
    if not code:
      raise
    self.messages.add('%s API error %s: %s' % (source.GR_CLASS.NAME, code, body))
    self.redirect(source.bridgy_url(self))
def get_site_info(cls, auth_entity):
  """Fetches the site info from the API.

  Args:
    auth_entity: :class:`oauth_dropins.wordpress_rest.WordPressAuth`

  Returns:
    site info dict, or None if API calls are disabled for this blog
  """
  try:
    return cls.urlopen(auth_entity, API_SITE_URL % auth_entity.blog_id)
  except urllib.error.HTTPError as e:
    code, body = util.interpret_http_exception(e)
    jetpack_disabled = (
      code == '403' and
      '"API calls to this blog have been disabled."' in body)
    if not jetpack_disabled:
      raise
    flash(f'You need to <a href="http://jetpack.me/support/json-api/">enable the Jetpack JSON API</a> in {util.pretty_link(auth_entity.blog_url)}\'s WordPress admin console.')
    redirect('/')
    return None
def get_post(self, post_id, source_fn=None):
  """Utility method fetches the original post.

  Args:
    post_id: string, site-specific post id
    source_fn: optional reference to a Source method, defaults to
      Source.get_post.

  Returns:
    ActivityStreams object dict, or None if the post wasn't found or the
    fetch failed
  """
  try:
    post = (source_fn or self.source.get_post)(post_id)
    if not post:
      logging.warning('Source post %s not found', post_id)
    return post
  except Exception as e:  # fixed py2-only `except Exception, e` syntax
    # use interpret_http_exception to log HTTP errors
    if not util.interpret_http_exception(e)[0]:
      logging.warning('Error fetching source post %s', post_id, exc_info=True)
def get(self):
  """Looks up all Twitter sources' user info in batches via the Twitter API."""
  sources = {source.key.id(): source for source in Twitter.query()}
  if not sources:
    return

  # just auth as me or the first user. TODO: use app-only auth instead.
  auther = sources.get('schnarfed') or sources.values()[0]
  usernames = sources.keys()

  users = []
  for i in range(0, len(usernames), TWITTER_USERS_PER_LOOKUP):
    username_batch = usernames[i:i + TWITTER_USERS_PER_LOOKUP]
    url = TWITTER_API_USER_LOOKUP % ','.join(username_batch)
    try:
      users += auther.gr_source.urlopen(url)
    except Exception as e:  # fixed py2-only `except Exception, e` syntax
      code, body = util.interpret_http_exception(e)
      if not (code == '404' and len(username_batch) == 1):
        # 404 for a single user means they deleted their account. otherwise...
        raise
def get(self):
  """Looks up all Twitter sources' user info in batches via the Twitter API."""
  sources = {source.key.id(): source for source in Twitter.query()}
  if not sources:
    return

  # just auth as me or the first user. TODO: use app-only auth instead.
  auther = sources.get('schnarfed') or sources.values()[0]
  usernames = sources.keys()

  users = []
  for i in range(0, len(usernames), TWITTER_USERS_PER_LOOKUP):
    username_batch = usernames[i:i + TWITTER_USERS_PER_LOOKUP]
    url = TWITTER_API_USER_LOOKUP % ','.join(username_batch)
    try:
      users += auther.gr_source.urlopen(url)
    except Exception as e:  # fixed py2-only `except Exception, e` syntax
      code, body = util.interpret_http_exception(e)
      if not (code == '404' and len(username_batch) == 1):
        # 404 for a single user means they deleted their account. otherwise...
        raise
def get(self, type, source_short_name, string_id, *ids):
  """Renders a cached silo item (response, post, etc) as HTML or JSON.

  Args:
    type: string, item type, e.g. 'comment' or 'like'
    source_short_name: string, source short name, e.g. 'twitter'
    string_id: string, the source entity's key id
    *ids: strings, item ids
  """
  source_cls = models.sources.get(source_short_name)
  if not source_cls:
    self.abort(400, "Source type '%s' not found. Known sources: %s" %
               (source_short_name, filter(None, models.sources.keys())))

  self.source = source_cls.get_by_id(string_id)
  if not self.source:
    self.abort(400, 'Source %s %s not found' % (source_short_name, string_id))

  format = self.request.get('format', 'html')
  if format not in ('html', 'json'):
    self.abort(400, 'Invalid format %s, expected html or json' % format)

  for id in ids:
    if not self.VALID_ID.match(id):
      self.abort(404, 'Invalid id %s' % id)

  label = '%s:%s %s %s' % (source_short_name, string_id, type, ids)
  cache_key = 'H ' + label
  obj = memcache.get(cache_key)
  if obj:
    logging.info('Using cached object for %s', label)
  else:
    logging.info('Fetching %s', label)
    try:
      obj = self.get_item(*ids)
    except Exception as e:  # fixed py2-only `except Exception, e` syntax
      # pass through all API HTTP errors if we can identify them
      code, body = util.interpret_http_exception(e)
      if not code and util.is_connection_failure(e):
        code = 503
        body = str(e)
      if code:
        self.response.status_int = int(code)
        self.response.headers['Content-Type'] = 'text/plain'
        self.response.write('%s error:\n%s' % (self.source.GR_CLASS.NAME, body))
        return
      else:
        raise
    memcache.set(cache_key, obj, time=CACHE_TIME)
def get_site_info(cls, handler, auth_entity):
  """Fetches the site info from the API.

  Args:
    handler: the current RequestHandler
    auth_entity: oauth_dropins.wordpress.WordPressAuth

  Returns:
    site info dict, or None if API calls are disabled for this blog
  """
  try:
    return cls.urlopen(auth_entity, API_SITE_URL % auth_entity.blog_id)
  except urllib2.HTTPError as e:  # fixed py2-only `except X, e` syntax
    code, body = util.interpret_http_exception(e)
    if (code == '403' and
        '"API calls to this blog have been disabled."' in body):
      handler.messages.add(
        'You need to <a href="http://jetpack.me/support/json-api/">enable '
        'the Jetpack JSON API</a> in %s\'s WordPress admin console.' %
        util.pretty_link(auth_entity.blog_url))
      handler.redirect('/')
      return None
    raise
def background_handle_exception(e):
  """Common exception handler for background tasks.

  Catches failed outbound HTTP requests and returns HTTP 304.
  """
  if isinstance(e, HTTPException):
    # raised by this app itself, pass it through
    return str(e), e.code

  code, body = util.interpret_http_exception(e)

  transients = getattr(g, 'TRANSIENT_ERROR_HTTP_CODES', ())
  source = getattr(g, 'source', None)
  if source:
    transients += source.RATE_LIMIT_HTTP_CODES + source.TRANSIENT_ERROR_HTTP_CODES

  is_transient = (code in transients
                  or (code and int(code) // 100 == 5)
                  or util.is_connection_failure(e))
  if not is_transient:
    raise e

  logger.error(f'Marking as error and finishing. {code}: {body}\n{e}')
  return '', util.ERROR_HTTP_RETURN_CODE
def get_post(self, id, is_event=False):
  """Fetch a post.

  Args:
    id: string, site-specific post id
    is_event: bool

  Returns:
    ActivityStreams object dict, or None if the post wasn't found or the
    fetch failed
  """
  try:
    if is_event:
      post = self.source.gr_source.get_event(id)
    else:
      posts = self.source.get_activities(activity_id=id,
                                         user_id=self.source.key.id())
      post = posts[0] if posts else None

    if not post:
      logging.warning("Source post %s not found", id)
    return post
  except Exception as e:  # fixed py2-only `except Exception, e` syntax
    # use interpret_http_exception to log HTTP errors
    if not util.interpret_http_exception(e)[0]:
      logging.warning("Error fetching source post %s", id, exc_info=True)
def poll(self, source):
  """Actually runs the poll.

  Returns: dict of source property names and values to update (transactionally)
  """
  if source.last_activities_etag or source.last_activity_id:
    logging.debug('Using ETag %s, last activity id %s',
                  source.last_activities_etag, source.last_activity_id)
  source_updates = {}

  #
  # Step 1: fetch activities
  #
  cache = util.CacheDict()
  if source.last_activities_cache_json:
    cache.update(json.loads(source.last_activities_cache_json))

  try:
    response = source.get_activities_response(
      fetch_replies=True, fetch_likes=True, fetch_shares=True, count=50,
      etag=source.last_activities_etag, min_id=source.last_activity_id,
      cache=cache)
  except Exception as e:  # fixed py2-only `except Exception, e` syntax
    code, body = util.interpret_http_exception(e)
    if code == '401':
      # TODO: also interpret oauth2client.AccessTokenRefreshError with
      # {'error': 'invalid_grant'} as disabled? it can mean the user revoked
      # access. it can also mean the token expired, or they deleted their
      # account, or even other things.
      # http://code.google.com/p/google-api-python-client/issues/detail?id=187#c1
      msg = 'Unauthorized error: %s' % e
      logging.warning(msg, exc_info=True)
      raise models.DisableSource(msg)
    elif code in util.HTTP_RATE_LIMIT_CODES:
      logging.warning('Rate limited. Marking as error and finishing. %s', e)
      source_updates.update({'status': 'error', 'rate_limited': True})
      return source_updates
    else:
      raise
def get_site_info(cls, handler, auth_entity):
  """Fetches the site info from the API.

  Args:
    handler: the current RequestHandler
    auth_entity: oauth_dropins.wordpress.WordPressAuth

  Returns:
    site info dict, or None if API calls are disabled for this blog
  """
  try:
    return cls.urlopen(auth_entity, API_SITE_URL % auth_entity.blog_id)
  except urllib2.HTTPError as e:  # fixed py2-only `except X, e` syntax
    code, body = util.interpret_http_exception(e)
    if (code == '403' and
        '"API calls to this blog have been disabled."' in body):
      handler.messages.add(
        'You need to <a href="http://jetpack.me/support/json-api/">enable '
        'the Jetpack JSON API</a> in %s\'s WordPress admin console.' %
        util.pretty_link(auth_entity.blog_url))
      handler.redirect('/')
      return None
    raise
def delete_start():
  """Kicks off the OAuth flow to delete a source account."""
  source = util.load_source()
  kind = source.key.kind()
  feature = request.form['feature']

  state = util.encode_oauth_state({
    'operation': 'delete',
    'feature': feature,
    'source': source.key.urlsafe().decode(),
    'callback': request.values.get('callback'),
  })

  # Blogger don't support redirect_url() yet
  if kind == 'Blogger':
    return redirect(f'/blogger/delete/start?state={state}')

  if kind == 'Reddit':
    path = '/reddit/callback'
  elif kind == 'WordPress':
    path = '/wordpress/add'
  else:
    path = f'/{source.SHORT_NAME}/delete/finish'

  # NOTE(review): kwargs is built but never passed to OAUTH_START below —
  # looks like Twitter's access_type is silently dropped. Confirm intent.
  kwargs = {}
  if kind == 'Twitter':
    kwargs['access_type'] = 'read' if feature == 'listen' else 'write'

  try:
    return redirect(source.OAUTH_START(path).redirect_url(state=state))
  except werkzeug.exceptions.HTTPException:
    # raised by us, probably via self.error()
    raise
  except Exception as e:
    code, body = util.interpret_http_exception(e)
    if not code and util.is_connection_failure(e):
      code = '-'
      body = str(e)
    if not code:
      raise
    flash(f'{source.GR_CLASS.NAME} API error {code}: {body}')
    return redirect(source.bridgy_url())
def _run(self):
  """Returns CreationResult on success, None otherwise.

  Validates the target and source URLs, finds the matching publish-enabled
  source account, fetches and parses the source page's microformats, then
  tries each publishable item until one succeeds.
  """
  logging.info('Params: %s', self.request.params.items())
  assert self.PREVIEW in (True, False)

  # parse and validate target URL
  try:
    parsed = urlparse.urlparse(self.target_url())
  except BaseException:
    return self.error('Could not parse target URL %s' % self.target_url())

  domain = parsed.netloc
  path_parts = parsed.path.rsplit('/', 1)
  source_cls = SOURCE_NAMES.get(path_parts[-1])
  if (domain not in ('brid.gy', 'www.brid.gy', 'localhost:8080')
      or len(path_parts) != 2 or path_parts[0] != '/publish' or not source_cls):
    return self.error(
      'Target must be brid.gy/publish/{facebook,flickr,twitter,instagram}')
  elif source_cls == GooglePlusPage:
    return self.error('Sorry, %s is not yet supported.' %
                      source_cls.GR_CLASS.NAME)

  # resolve source URL
  url, domain, ok = util.get_webmention_target(
    self.source_url(), replace_test_domains=False)
  # show nice error message if they're trying to publish a silo post
  if domain in SOURCE_DOMAINS:
    return self.error(
      "Looks like that's a %s URL. Try one from your web site instead!" %
      SOURCE_DOMAINS[domain].GR_CLASS.NAME)
  elif not ok:
    return self.error('Unsupported source URL %s' % url)
  elif not domain:
    return self.error('Could not parse source URL %s' % url)

  # look up source by domain
  domain = domain.lower()
  sources = source_cls.query().filter(source_cls.domains == domain).fetch(100)
  if not sources:
    return self.error(
      "Could not find <b>%(type)s</b> account for <b>%(domain)s</b>. Check that your %(type)s profile has %(domain)s in its <em>web site</em> or <em>link</em> field, then try signing up again." %
      {'type': source_cls.GR_CLASS.NAME, 'domain': domain})

  current_url = ''
  for source in sources:
    logging.info('Source: %s , features %s, status %s, poll status %s',
                 source.bridgy_url(self), source.features, source.status,
                 source.poll_status)
    if source.status != 'disabled' and 'publish' in source.features:
      # use a source that has a domain_url matching the url provided.
      # look through each source to find the one with the closest match.
      for domain_url in source.domain_urls:
        if (url.lower().startswith(domain_url.lower().strip('/'))
            and len(domain_url) > len(current_url)):
          self.source = source
          current_url = domain_url

  if not self.source:
    return self.error(
      'Publish is not enabled for your account. Please visit https://brid.gy and sign up!')

  content_param = 'bridgy_%s_content' % self.source.SHORT_NAME
  if content_param in self.request.params:
    return self.error('The %s parameter is not supported' % content_param)

  # show nice error message if they're trying to publish their home page
  for domain_url in self.source.domain_urls:
    domain_url_parts = urlparse.urlparse(domain_url)
    source_url_parts = urlparse.urlparse(self.source_url())
    if (source_url_parts.netloc == domain_url_parts.netloc
        and source_url_parts.path.strip('/') == domain_url_parts.path.strip('/')
        and not source_url_parts.query):
      return self.error(
        "Looks like that's your home page. Try one of your posts instead!")

  # done with the sanity checks, ready to fetch the source url. create the
  # Publish entity so we can store the result.
  entity = self.get_or_add_publish_entity(url)
  if (entity.status == 'complete' and entity.type != 'preview'
      and not self.PREVIEW and not appengine_config.DEBUG):
    return self.error(
      "Sorry, you've already published that page, and Bridgy Publish doesn't yet support updating or deleting existing posts. Ping Ryan if you want that feature!")
  self.entity = entity

  # fetch source page
  resp = self.fetch_mf2(url)
  if not resp:
    return
  self.fetched, data = resp

  # find rel-shortlink, if any
  # http://microformats.org/wiki/rel-shortlink
  # https://github.com/snarfed/bridgy/issues/173
  soup = util.beautifulsoup_parse(self.fetched.text)
  shortlinks = (soup.find_all('link', rel='shortlink') +
                soup.find_all('a', rel='shortlink') +
                soup.find_all('a', class_='shortlink'))
  if shortlinks:
    self.shortlink = shortlinks[0]['href']

  # loop through each item and its children and try to preview/create it. if
  # it fails, try the next one. break after the first one that works.
  resp = None
  types = set()
  queue = collections.deque(data.get('items', []))
  while queue:
    item = queue.popleft()
    item_types = set(item.get('type'))
    if 'h-feed' in item_types and 'h-entry' not in item_types:
      queue.extend(item.get('children', []))
      continue
    elif not item_types & PUBLISHABLE_TYPES:
      continue

    try:
      result = self.attempt_single_item(item)
      if self.entity.published:
        break
      if result.abort:
        if result.error_plain:
          self.error(result.error_plain, html=result.error_html, data=item)
        return
      # try the next item
      for embedded in ('rsvp', 'invitee', 'repost', 'repost-of', 'like',
                       'like-of', 'in-reply-to'):
        if embedded in item.get('properties', []):
          item_types.add(embedded)
      logging.info('Object type(s) %s not supported; error=%s; trying next.',
                   item_types, result.error_plain)
      types = types.union(item_types)
      queue.extend(item.get('children', []))
    except BaseException as e:  # fixed py2-only `except BaseException, e` syntax
      code, body = util.interpret_http_exception(e)
      return self.error('Error: %s %s' % (body or '', e),
                        status=code or 500, mail=True)
def fetch_mf2(self, url, id=None, require_mf2=True, raise_errors=False):
  """Fetches a URL and extracts its mf2 data.

  Side effects: sets :attr:`entity`\ .html on success, calls :attr:`error()`
  on errors.

  Args:
    url: string
    id: string, optional id of specific element to extract and parse. defaults
      to the whole page.
    require_mf2: boolean, whether to return error if no mf2 are found
    raise_errors: boolean, whether to let error exceptions propagate up or
      handle them

  Returns:
    (:class:`requests.Response`, mf2 data dict) on success, None on failure
  """
  try:
    resp = util.requests_get(url)
    resp.raise_for_status()
  except werkzeug.exceptions.HTTPException:
    # raised by us, probably via self.error()
    raise
  except BaseException as e:
    if raise_errors:
      raise
    util.interpret_http_exception(e)  # log exception
    self.error(f'Could not fetch source URL {url}')

  if self.entity:
    self.entity.html = resp.text

  # parse microformats
  soup = util.parse_html(resp)
  mf2 = util.parse_mf2(soup, url=resp.url, id=id)
  if id and not mf2:
    self.error(f'Got fragment {id} but no element found with that id.')

  # special case tumblr's markup: div#content > div.post > div.copy
  # convert to mf2 and re-parse.
  # NOTE: the mutations below tag tumblr's classes as mf2 classes in place,
  # then re-serialize and re-parse the whole post element.
  if not mf2.get('items'):
    contents = soup.find_all(id='content')
    if contents:
      post = contents[0].find_next(class_='post')
      if post:
        post['class'] = 'h-entry'
        copy = post.find_next(class_='copy')
        if copy:
          copy['class'] = 'e-content'
        photo = post.find_next(class_='photo-wrapper')
        if photo:
          img = photo.find_next('img')
          if img:
            img['class'] = 'u-photo'
        # TODO: i should be able to pass post or contents[0] to mf2py instead
        # here, but it returns no items. mf2py bug?
        doc = str(post)
        mf2 = util.parse_mf2(doc, resp.url)

  logger.debug(f'Parsed microformats2: {json_dumps(mf2, indent=2)}')

  items = mf2.get('items', [])
  if require_mf2 and (not items or not items[0]):
    self.error('No microformats2 data found in ' + resp.url, data=mf2, html=f"""
No <a href="http://microformats.org/get-started">microformats</a> or
<a href="http://microformats.org/wiki/microformats2">microformats2</a> found in
<a href="{resp.url}">{util.pretty_link(resp.url)}</a>! See
<a href="http://indiewebify.me/">indiewebify.me</a> for details (skip to level 2,
<em>Publishing on the IndieWeb</em>).
""")

  return resp, mf2
def post(self, source_short_name):
  """Handles an incoming webmention for a blog source: creates a comment.

  Args:
    source_short_name: string, source short name, e.g. 'wordpress'
  """
  # fixed format string: was 'Params: %self', which rendered as '<params>elf'
  logging.info('Params: %s', self.request.params.items())

  # strip fragments from source and target url
  self.source_url = urlparse.urldefrag(util.get_required_param(self, 'source'))[0]
  self.target_url = urlparse.urldefrag(util.get_required_param(self, 'target'))[0]

  # follow target url through any redirects, strip utm_* query params
  resp = util.follow_redirects(self.target_url)
  redirected_target_urls = [r.url for r in resp.history]
  self.target_url = util.clean_url(resp.url)

  # parse and validate target URL
  domain = util.domain_from_link(self.target_url)
  if not domain:
    return self.error('Could not parse target URL %s' % self.target_url)

  # look up source by domain
  source_cls = models.sources[source_short_name]
  domain = domain.lower()
  self.source = (source_cls.query()
                 .filter(source_cls.domains == domain)
                 .filter(source_cls.features == 'webmention')
                 .filter(source_cls.status == 'enabled')
                 .get())
  if not self.source:
    # check for a rel-canonical link. Blogger uses these when it serves a post
    # from multiple domains, e.g country TLDs like epeus.blogspot.co.uk vs
    # epeus.blogspot.com.
    # https://github.com/snarfed/bridgy/issues/805
    mf2 = self.fetch_mf2(self.target_url, require_mf2=False)
    if not mf2:
      # fetch_mf2() already wrote the error response
      return
    domains = util.dedupe_urls(
      util.domain_from_link(url)
      for url in mf2[1].get('rels', {}).get('canonical', []))
    if domains:
      self.source = (source_cls.query()
                     .filter(source_cls.domains.IN(domains))
                     .filter(source_cls.features == 'webmention')
                     .filter(source_cls.status == 'enabled')
                     .get())

  if not self.source:
    return self.error(
      'Could not find %s account for %s. Is it registered with Bridgy?' %
      (source_cls.GR_CLASS.NAME, domain))

  # check that the target URL path is supported
  target_path = urlparse.urlparse(self.target_url).path
  if target_path in ('', '/'):
    return self.error('Home page webmentions are not currently supported.',
                      status=202)
  for pattern in self.source.PATH_BLACKLIST:
    if pattern.match(target_path):
      return self.error('%s webmentions are not supported for URL path: %s' %
                        (self.source.GR_CLASS.NAME, target_path), status=202)

  # create BlogWebmention entity
  id = '%s %s' % (self.source_url, self.target_url)
  self.entity = BlogWebmention.get_or_insert(
    id, source=self.source.key, redirected_target_urls=redirected_target_urls)
  if self.entity.status == 'complete':
    # TODO: response message saying update isn't supported
    self.response.write(self.entity.published)
    return
  logging.debug("BlogWebmention entity: '%s'", self.entity.key.urlsafe())

  # fetch source page
  resp = self.fetch_mf2(self.source_url)
  if not resp:
    return
  self.fetched, data = resp

  item = self.find_mention_item(data.get('items', []))
  if not item:
    return self.error('Could not find target URL %s in source page %s' %
                      (self.target_url, self.fetched.url),
                      data=data, log_exception=False)

  # default author to target domain
  author_name = domain
  author_url = 'http://%s/' % domain

  # extract author name and URL from h-card, if any
  props = item['properties']
  author = first_value(props, 'author')
  if author:
    if isinstance(author, basestring):
      author_name = author
    else:
      author_props = author.get('properties', {})
      author_name = first_value(author_props, 'name')
      author_url = first_value(author_props, 'url')

  # if present, u-url overrides source url
  u_url = first_value(props, 'url')
  if u_url:
    self.entity.u_url = u_url

  # generate content
  content = props['content'][0]  # find_mention_item() guaranteed this is here
  text = (content.get('html') or content.get('value')).strip()
  source_url = self.entity.source_url()
  text += ' <br /> <a href="%s">via %s</a>' % (
    source_url, util.domain_from_link(source_url))

  # write comment
  try:
    self.entity.published = self.source.create_comment(
      self.target_url, author_name, author_url, text)
  except Exception as e:
    code, body = util.interpret_http_exception(e)
    msg = 'Error: %s %s; %s' % (code, e, body)
    if code == '401':
      logging.warning('Disabling source due to: %s' % e, exc_info=True)
      self.source.status = 'disabled'
      self.source.put()
      return self.error(msg, status=code, mail=self.source.is_beta_user())
    elif code == '404':
      # post is gone
      return self.error(msg, status=code, mail=False)
    elif util.is_connection_failure(e) or (code and int(code) // 100 == 5):
      return self.error(msg, status=util.ERROR_HTTP_RETURN_CODE, mail=False)
    elif code or body:
      return self.error(msg, status=code, mail=True)
    else:
      raise

  # write results to datastore
  self.entity.status = 'complete'
  self.entity.put()
  self.response.write(json.dumps(self.entity.published))
def get(self, type, source_short_name, string_id, *ids):
    """Renders a single silo object (comment, like, repost, etc) as mf2.

    Looks up the source, fetches the item (memcached), converts it to
    microformats2, and writes it out as HTML or JSON depending on the
    ?format= query parameter.

    Args:
      type: string, the kind of object being served (used only in the log
        label here; presumably routed from the URL — confirm against router)
      source_short_name: string, key into models.sources
      string_id: string, datastore id of the source entity
      *ids: strings, silo-specific object ids; validated against VALID_ID

    Side effects: writes the HTTP response; aborts with 4xx on bad input.
    """
    source_cls = models.sources.get(source_short_name)
    if not source_cls:
      self.abort(
          400, "Source type '%s' not found. Known sources: %s" %
          (source_short_name, filter(None, models.sources.keys())))

    self.source = source_cls.get_by_id(string_id)
    if not self.source:
      self.abort(
          400, 'Source %s %s not found' % (source_short_name, string_id))
    # backfeed requires the source to be enabled and have the listen (or
    # email) feature turned on.
    elif (self.source.status == 'disabled'
          or ('listen' not in self.source.features
              and 'email' not in self.source.features)):
      self.abort(
          400, 'Source %s is disabled for backfeed' % self.source.bridgy_path())

    format = self.request.get('format', 'html')
    if format not in ('html', 'json'):
      self.abort(400, 'Invalid format %s, expected html or json' % format)

    for id in ids:
      if not self.VALID_ID.match(id):
        self.abort(404, 'Invalid id %s' % id)

    label = '%s:%s %s %s' % (source_short_name, string_id, type, ids)
    cache_key = 'H ' + label
    obj = memcache.get(cache_key)
    # in DEBUG mode always refetch so local dev sees fresh data
    if obj and not appengine_config.DEBUG:
      logging.info('Using cached object for %s', label)
    else:
      logging.info('Fetching %s', label)
      try:
        obj = self.get_item(*ids)
      except models.DisableSource as e:
        self.abort(
            401,
            "Bridgy's access to your account has expired. Please visit https://brid.gy/ to refresh it!"
        )
      except ValueError as e:
        self.abort(400, '%s error:\n%s' % (self.source.GR_CLASS.NAME, e))
      except Exception as e:
        # pass through all API HTTP errors if we can identify them
        code, body = util.interpret_http_exception(e)
        # temporary, trying to debug a flaky test failure
        # eg https://circleci.com/gh/snarfed/bridgy/769
        if code:
          self.response.status_int = int(code)
          self.response.headers['Content-Type'] = 'text/plain'
          self.response.write('%s error:\n%s' %
                              (self.source.GR_CLASS.NAME, body))
          return
        else:
          raise
      memcache.set(cache_key, obj, time=CACHE_TIME)

    if not obj:
      self.abort(404, label)

    if self.source.is_blocked(obj):
      self.abort(410, 'That user is currently blocked')

    # use https for profile pictures so we don't cause SSL mixed mode errors
    # when serving over https.
    author = obj.get('author', {})
    image = author.get('image', {})
    url = image.get('url')
    if url:
      image['url'] = util.update_scheme(url, self)

    mf2_json = microformats2.object_to_json(obj, synthesize_content=False)

    # try to include the author's silo profile url
    author = first_props(mf2_json.get('properties', {})).get('author', {})
    author_uid = first_props(author.get('properties', {})).get('uid', '')
    if author_uid:
      parsed = util.parse_tag_uri(author_uid)
      if parsed:
        silo_url = self.source.gr_source.user_url(parsed[1])
        urls = author.get('properties', {}).setdefault('url', [])
        if silo_url not in microformats2.get_string_urls(urls):
          urls.append(silo_url)

    # write the response!
    self.response.headers['Access-Control-Allow-Origin'] = '*'
    if format == 'html':
      self.response.headers['Content-Type'] = 'text/html; charset=utf-8'
      url = obj.get('url', '')
      self.response.out.write(
          TEMPLATE.substitute({
              # redirect browsers to the original object's url, if it has one
              'refresh': (('<meta http-equiv="refresh" content="0;url=%s">' %
                           url) if url else ''),
              'url': url,
              'body': microformats2.json_to_html(mf2_json),
              'title': self.get_title(obj),
          }))
    elif format == 'json':
      self.response.headers[
          'Content-Type'] = 'application/json; charset=utf-8'
      self.response.out.write(json.dumps(mf2_json, indent=2))
def fetch_mf2(self, url, require_mf2=True, raise_errors=False):
    """Fetches a URL and extracts its mf2 data.

    Side effects: sets :attr:`entity`\ .html on success, calls :attr:`error()`
    on errors.

    Args:
      url: string
      require_mf2: boolean, whether to return error if no mf2 are found
      raise_errors: boolean, whether to let error exceptions propagate up or
        handle them

    Returns:
      (:class:`requests.Response`, mf2 data dict) on success, None on failure
    """
    try:
      fetched = util.requests_get(url)
      fetched.raise_for_status()
    except BaseException as e:
      if raise_errors:
        raise
      util.interpret_http_exception(e)  # log exception
      return self.error('Could not fetch source URL %s' % url)

    if self.entity:
      self.entity.html = fetched.text

    # .text is the decoded unicode string, .content is the raw bytes. when the
    # HTTP headers don't declare a charset, hand BeautifulSoup the raw bytes so
    # it can look for a <meta> tag with a charset and decode itself.
    if 'charset' in fetched.headers.get('content-type', ''):
      markup = fetched.text
    else:
      markup = fetched.content
    soup = util.beautifulsoup_parse(markup)

    # parse microformats
    data = util.mf2py_parse(soup, fetched.url)

    # special case tumblr's markup: div#content > div.post > div.copy
    # convert to mf2 and re-parse
    if not data.get('items'):
      content_divs = soup.find_all(id='content')
      post = content_divs[0].find_next(class_='post') if content_divs else None
      if post:
        post['class'] = 'h-entry'
        copy = post.find_next(class_='copy')
        if copy:
          copy['class'] = 'e-content'
        photo = post.find_next(class_='photo-wrapper')
        if photo:
          img = photo.find_next('img')
          if img:
            img['class'] = 'u-photo'
        data = util.mf2py_parse(unicode(post), fetched.url)

    logging.debug('Parsed microformats2: %s', json.dumps(data, indent=2))

    items = data.get('items', [])
    if require_mf2 and (not items or not items[0]):
      return self.error('No microformats2 data found in ' + fetched.url,
                        data=data, html="""
No <a href="http://microformats.org/get-started">microformats</a> or
<a href="http://microformats.org/wiki/microformats2">microformats2</a>
found in <a href="%s">%s</a>! See <a href="http://indiewebify.me/">indiewebify.me</a>
for details (skip to level 2, <em>Publishing on the IndieWeb</em>).
""" % (fetched.url, util.pretty_link(fetched.url)))

    return fetched, data
def get(self, type, source_short_name, string_id, *ids):
    """Renders a single silo object as microformats2 HTML or JSON.

    Older variant of the item handler: caches unconditionally (no DEBUG
    bypass) and maps connection failures to HTTP 503.

    Args:
      type: string, kind of object being served (used in the log label)
      source_short_name: string, key into models.sources
      string_id: string, datastore id of the source entity
      *ids: strings, silo-specific object ids; validated against VALID_ID

    Side effects: writes the HTTP response; aborts with 4xx on bad input.
    """
    source_cls = models.sources.get(source_short_name)
    if not source_cls:
      self.abort(400, "Source type '%s' not found. Known sources: %s" %
                 (source_short_name, filter(None, models.sources.keys())))

    self.source = source_cls.get_by_id(string_id)
    if not self.source:
      self.abort(400, 'Source %s %s not found' % (source_short_name, string_id))
    # backfeed requires an enabled source with the listen feature
    elif self.source.status == 'disabled' or 'listen' not in self.source.features:
      self.abort(400, 'Source %s is disabled for backfeed' % self.source.bridgy_path())

    format = self.request.get('format', 'html')
    if format not in ('html', 'json'):
      self.abort(400, 'Invalid format %s, expected html or json' % format)

    for id in ids:
      if not self.VALID_ID.match(id):
        self.abort(404, 'Invalid id %s' % id)

    label = '%s:%s %s %s' % (source_short_name, string_id, type, ids)
    cache_key = 'H ' + label
    obj = memcache.get(cache_key)
    if obj:
      logging.info('Using cached object for %s', label)
    else:
      logging.info('Fetching %s', label)
      try:
        obj = self.get_item(*ids)
      except models.DisableSource as e:
        self.abort(401, "Bridgy's access to your account has expired. Please visit https://brid.gy/ to refresh it!")
      except Exception as e:
        # pass through all API HTTP errors if we can identify them
        code, body = util.interpret_http_exception(e)
        # treat connection failures as a 503 so clients see a retryable error
        if not code and util.is_connection_failure(e):
          code = 503
          body = str(e)
        if code:
          self.response.status_int = int(code)
          self.response.headers['Content-Type'] = 'text/plain'
          self.response.write('%s error:\n%s' % (self.source.GR_CLASS.NAME, body))
          return
        else:
          raise
      memcache.set(cache_key, obj, time=CACHE_TIME)

    if not obj:
      self.abort(404, label)

    # use https for profile pictures so we don't cause SSL mixed mode errors
    # when serving over https.
    author = obj.get('author', {})
    image = author.get('image', {})
    url = image.get('url')
    if url:
      image['url'] = util.update_scheme(url, self)

    mf2_json = microformats2.object_to_json(obj, synthesize_content=False)

    # try to include the author's silo profile url
    author = first_props(mf2_json.get('properties', {})).get('author', {})
    author_uid = first_props(author.get('properties', {})).get('uid', '')
    if author_uid:
      parsed = util.parse_tag_uri(author_uid)
      if parsed:
        silo_url = self.source.gr_source.user_url(parsed[1])
        urls = author.get('properties', {}).setdefault('url', [])
        if silo_url not in microformats2.get_string_urls(urls):
          urls.append(silo_url)

    # write the response!
    self.response.headers['Access-Control-Allow-Origin'] = '*'
    if format == 'html':
      self.response.headers['Content-Type'] = 'text/html; charset=utf-8'
      self.response.out.write(TEMPLATE.substitute({
        'url': obj.get('url', ''),
        'body': microformats2.json_to_html(mf2_json),
        'title': self.get_title(obj),
      }))
    elif format == 'json':
      self.response.headers['Content-Type'] = 'application/json; charset=utf-8'
      self.response.out.write(json.dumps(mf2_json, indent=2))
def post(self, source_short_name):
    """Handles an incoming webmention for a blog source.

    Validates source/target URLs, finds the registered source account for the
    target's domain, extracts the mention from the source page's mf2, and
    posts it as a comment via source.create_comment().

    Args:
      source_short_name: string, key into models.sources

    Side effects: writes the HTTP response; may create a BlogWebmention
    entity and disable the source on auth errors.
    """
    # fix: format string was 'Params: %self' — '%s' plus a stray 'elf' that
    # leaked into the log line. sibling handlers use 'Params: %s'.
    logging.info('Params: %s', self.request.params.items())

    # strip fragments from source and target url
    self.source_url = urlparse.urldefrag(util.get_required_param(self, 'source'))[0]
    self.target_url = urlparse.urldefrag(util.get_required_param(self, 'target'))[0]

    # follow target url through any redirects, strip utm_* query params
    resp = util.follow_redirects(self.target_url)
    redirected_target_urls = [r.url for r in resp.history]
    self.target_url = util.clean_url(resp.url)

    # parse and validate target URL
    domain = util.domain_from_link(self.target_url)
    if not domain:
      return self.error('Could not parse target URL %s' % self.target_url)

    # look up source by domain
    source_cls = models.sources[source_short_name]
    domain = domain.lower()
    self.source = (source_cls.query()
                   .filter(source_cls.domains == domain)
                   .filter(source_cls.features == 'webmention')
                   .filter(source_cls.status == 'enabled')
                   .get())
    if not self.source:
      return self.error(
        'Could not find %s account for %s. Is it registered with Bridgy?' %
        (source_cls.GR_CLASS.NAME, domain))

    if urlparse.urlparse(self.target_url).path in ('', '/'):
      return self.error('Home page webmentions are not currently supported.')

    # create BlogWebmention entity. one entity per (source, target) pair.
    id = u'%s %s' % (self.source_url, self.target_url)
    self.entity = BlogWebmention.get_or_insert(
      id, source=self.source.key, redirected_target_urls=redirected_target_urls)
    if self.entity.status == 'complete':
      # TODO: response message saying update isn't supported
      self.response.write(self.entity.published)
      return
    logging.debug('BlogWebmention entity: %s', self.entity.key.urlsafe())

    # fetch source page
    resp = self.fetch_mf2(self.source_url)
    if not resp:
      return
    self.fetched, data = resp

    item = self.find_mention_item(data)
    if not item:
      return self.error('Could not find target URL %s in source page %s' %
                        (self.target_url, self.fetched.url),
                        data=data, log_exception=False)

    # default author to target domain
    author_name = domain
    author_url = 'http://%s/' % domain

    # extract author name and URL from h-card, if any
    props = item['properties']
    author = first_value(props, 'author')
    if author:
      if isinstance(author, basestring):
        author_name = author
      else:
        author_props = author.get('properties', {})
        author_name = first_value(author_props, 'name')
        author_url = first_value(author_props, 'url')

    # if present, u-url overrides source url
    u_url = first_value(props, 'url')
    if u_url:
      self.entity.u_url = u_url

    # generate content
    content = props['content'][0]  # find_mention_item() guaranteed this is here
    text = (content.get('html') or content.get('value')).strip()
    source_url = self.entity.source_url()
    text += ' <br /> <a href="%s">via %s</a>' % (
      source_url, util.domain_from_link(source_url))

    # write comment
    try:
      self.entity.published = self.source.create_comment(
        self.target_url, author_name, author_url, text)
    # 'except X as e' matches the other handlers in this file (and is the only
    # form that parses under Python 3)
    except Exception as e:
      code, body = util.interpret_http_exception(e)
      msg = 'Error: %s %s; %s' % (code, e, body)
      if code == '401':
        # auth revoked/expired: disable the source so we stop retrying
        logging.warning('Disabling source!')
        self.source.status = 'disabled'
        self.source.put()
        return self.error(msg, status=code, mail=False)
      elif code == '404':
        # post is gone
        return self.error(msg, status=code, mail=False)
      elif code or body:
        return self.error(msg, status=code, mail=True)
      else:
        raise
def _run(self):
    """Returns CreationResult on success, None otherwise.

    Publish pipeline: validate the target (must be a brid.gy/publish/... URL),
    resolve the source URL, find a registered source account for its domain,
    fetch the page's mf2, then walk the mf2 items trying to preview/create
    each until one succeeds.
    """
    logging.info('Params: %s', self.request.params.items())
    # PREVIEW is a class-level flag distinguishing preview vs publish handlers
    assert self.PREVIEW in (True, False)

    # parse and validate target URL
    try:
      parsed = urlparse.urlparse(self.target_url())
    except BaseException:
      return self.error('Could not parse target URL %s' % self.target_url())

    domain = parsed.netloc
    path_parts = parsed.path.rsplit('/', 1)
    source_cls = SOURCE_NAMES.get(path_parts[-1])
    if (domain not in ('brid.gy', 'www.brid.gy', 'localhost:8080')
        or len(path_parts) != 2 or path_parts[0] != '/publish' or not source_cls):
      return self.error(
        'Target must be brid.gy/publish/{facebook,flickr,twitter,instagram}')
    elif source_cls == GooglePlusPage:
      return self.error('Sorry, %s is not yet supported.' %
                        source_cls.GR_CLASS.NAME)

    # resolve source URL
    url, domain, ok = util.get_webmention_target(
      self.source_url(), replace_test_domains=False)
    # show nice error message if they're trying to publish a silo post
    if domain in SOURCE_DOMAINS:
      return self.error(
        "Looks like that's a %s URL. Try one from your web site instead!" %
        SOURCE_DOMAINS[domain].GR_CLASS.NAME)
    elif not ok:
      return self.error('Unsupported source URL %s' % url)
    elif not domain:
      return self.error('Could not parse source URL %s' % url)

    # look up source by domain
    domain = domain.lower()
    sources = source_cls.query().filter(source_cls.domains == domain).fetch(100)
    if not sources:
      return self.error("Could not find <b>%(type)s</b> account for <b>%(domain)s</b>. Check that your %(type)s profile has %(domain)s in its <em>web site</em> or <em>link</em> field, then try signing up again." %
        {'type': source_cls.GR_CLASS.NAME, 'domain': domain})

    # pick the first enabled source with the publish feature; for/else fires
    # the error only when no source matched
    for source in sources:
      logging.info('Source: %s , features %s, status %s, poll status %s',
                   source.bridgy_url(self), source.features, source.status,
                   source.poll_status)
      if source.status != 'disabled' and 'publish' in source.features:
        self.source = source
        break
    else:
      return self.error(
        'Publish is not enabled for your account(s). Please visit %s and sign up!' %
        ' or '.join(s.bridgy_url(self) for s in sources))

    content_param = 'bridgy_%s_content' % self.source.SHORT_NAME
    if content_param in self.request.params:
      return self.error('The %s parameter is not supported' % content_param)

    # show nice error message if they're trying to publish their home page
    for domain_url in self.source.domain_urls:
      domain_url_parts = urlparse.urlparse(domain_url)
      source_url_parts = urlparse.urlparse(self.source_url())
      if (source_url_parts.netloc == domain_url_parts.netloc and
          source_url_parts.path.strip('/') == domain_url_parts.path.strip('/') and
          not source_url_parts.query):
        return self.error(
          "Looks like that's your home page. Try one of your posts instead!")

    # done with the sanity checks, ready to fetch the source url. create the
    # Publish entity so we can store the result.
    entity = self.get_or_add_publish_entity(url)
    if (entity.status == 'complete' and entity.type != 'preview' and
        not self.PREVIEW and not appengine_config.DEBUG):
      return self.error("Sorry, you've already published that page, and Bridgy Publish doesn't yet support updating or deleting existing posts. Ping Ryan if you want that feature!")
    self.entity = entity

    # fetch source page
    resp = self.fetch_mf2(url)
    if not resp:
      return
    self.fetched, data = resp

    # find rel-shortlink, if any
    # http://microformats.org/wiki/rel-shortlink
    # https://github.com/snarfed/bridgy/issues/173
    soup = BeautifulSoup(self.fetched.text)
    shortlinks = (soup.find_all('link', rel='shortlink') +
                  soup.find_all('a', rel='shortlink') +
                  soup.find_all('a', class_='shortlink'))
    if shortlinks:
      self.shortlink = shortlinks[0]['href']

    # loop through each item and its children and try to preview/create it. if
    # it fails, try the next one. break after the first one that works.
    resp = None
    types = set()
    queue = collections.deque(data.get('items', []))
    while queue:
      item = queue.popleft()
      item_types = set(item.get('type'))
      # descend into h-feeds instead of publishing them directly
      if 'h-feed' in item_types and 'h-entry' not in item_types:
        queue.extend(item.get('children', []))
        continue
      elif not item_types & PUBLISHABLE_TYPES:
        continue

      try:
        result = self.attempt_single_item(item)
        if self.entity.published:
          break
        if result.abort:
          if result.error_plain:
            self.error(result.error_plain, html=result.error_html, data=item)
          return
        # try the next item. record embedded response types so the final
        # "unsupported type" message can mention them.
        for embedded in ('rsvp', 'invitee', 'repost', 'repost-of', 'like',
                         'like-of', 'in-reply-to'):
          if embedded in item.get('properties', []):
            item_types.add(embedded)
        logging.info(
          'Object type(s) %s not supported; error=%s; trying next.',
          item_types, result.error_plain)
        types = types.union(item_types)
        queue.extend(item.get('children', []))
      except BaseException, e:
        code, body = util.interpret_http_exception(e)
        return self.error('Error: %s %s' % (body or '', e), status=code or 500,
                          mail=True)
def _run(self):
    """Returns CreationResult on success, None otherwise.

    Publish pipeline (mid-era variant): validate target, resolve source URL,
    look up the source via _find_source(), fetch mf2 (with 410 -> delete
    handling), then walk mf2 items attempting preview/create until one works.
    """
    logging.info('Params: %s', self.request.params.items())
    # PREVIEW is a class-level flag distinguishing preview vs publish handlers
    assert self.PREVIEW in (True, False)

    # parse and validate target URL
    try:
      parsed = urlparse.urlparse(self.target_url())
    except BaseException:
      return self.error('Could not parse target URL %s' % self.target_url())

    domain = parsed.netloc
    path_parts = parsed.path.rsplit('/', 1)
    source_cls = SOURCE_NAMES.get(path_parts[-1])
    if (domain not in ('brid.gy', 'www.brid.gy', 'localhost:8080')
        or len(path_parts) != 2 or path_parts[0] != '/publish' or not source_cls):
      return self.error(
        'Target must be brid.gy/publish/{facebook,flickr,github,twitter}')
    elif source_cls == Instagram:
      return self.error('Sorry, %s is not supported.' % source_cls.GR_CLASS.NAME)

    # resolve source URL
    url, domain, ok = util.get_webmention_target(
      self.source_url(), replace_test_domains=False)
    # show nice error message if they're trying to publish a silo post
    if domain in SOURCE_DOMAINS:
      return self.error(
        "Looks like that's a %s URL. Try one from your web site instead!" %
        SOURCE_DOMAINS[domain].GR_CLASS.NAME)
    elif not ok:
      return self.error('Unsupported source URL %s' % url)
    elif not domain:
      return self.error('Could not parse source URL %s' % url)

    # look up source by domain
    self.source = self._find_source(source_cls, url, domain)
    if not self.source:
      return  # _find_source rendered the error

    content_param = 'bridgy_%s_content' % self.source.SHORT_NAME
    if content_param in self.request.params:
      return self.error('The %s parameter is not supported' % content_param)

    # show nice error message if they're trying to publish their home page.
    # checks both the resolved url and the raw source url.
    for domain_url in self.source.domain_urls:
      domain_url_parts = urlparse.urlparse(domain_url)
      for source_url in url, self.source_url():
        parts = urlparse.urlparse(source_url)
        if (parts.netloc == domain_url_parts.netloc and
            parts.path.strip('/') == domain_url_parts.path.strip('/') and
            not parts.query):
          return self.error(
            "Looks like that's your home page. Try one of your posts instead!")

    # done with the sanity checks, ready to fetch the source url. create the
    # Publish entity so we can store the result.
    self.entity = self.get_or_add_publish_entity(url)
    try:
      resp = self.fetch_mf2(url, raise_errors=True)
    except BaseException as e:
      status, body = util.interpret_http_exception(e)
      # a 410 Gone source page means the original was deleted
      if status == '410':
        return self.delete(url)
      return self.error('Could not fetch source URL %s' % url)

    if not resp:
      return
    self.fetched, data = resp

    # create the Publish entity so we can store the result.
    if (self.entity.status == 'complete' and self.entity.type != 'preview' and
        not self.PREVIEW and not appengine_config.DEBUG):
      return self.error("Sorry, you've already published that page, and Bridgy Publish doesn't yet support updating or deleting existing posts. Details: https://github.com/snarfed/bridgy/issues/84")

    # find rel-shortlink, if any
    # http://microformats.org/wiki/rel-shortlink
    # https://github.com/snarfed/bridgy/issues/173
    soup = util.beautifulsoup_parse(self.fetched.text)
    shortlinks = (soup.find_all('link', rel='shortlink') +
                  soup.find_all('a', rel='shortlink') +
                  soup.find_all('a', class_='shortlink'))
    if shortlinks:
      self.shortlink = shortlinks[0]['href']

    # loop through each item and its children and try to preview/create it. if
    # it fails, try the next one. break after the first one that works.
    result = None
    types = set()
    queue = collections.deque(data.get('items', []))
    while queue:
      item = queue.popleft()
      item_types = set(item.get('type'))
      # descend into h-feeds instead of publishing them directly
      if 'h-feed' in item_types and 'h-entry' not in item_types:
        queue.extend(item.get('children', []))
        continue
      elif not item_types & PUBLISHABLE_TYPES:
        types = types.union(item_types)
        continue

      try:
        result = self.attempt_single_item(item)
        if self.entity.published:
          break
        if result.abort:
          if result.error_plain:
            self.error(result.error_plain, html=result.error_html, data=item)
          return
        # try the next item. record embedded response types so the final
        # "unsupported type" message can mention them.
        for embedded in ('rsvp', 'invitee', 'repost', 'repost-of', 'like',
                         'like-of', 'in-reply-to'):
          if embedded in item.get('properties', []):
            item_types.add(embedded)
        logging.info(
          'Object type(s) %s not supported; error=%s; trying next.',
          item_types, result.error_plain)
        types = types.union(item_types)
        queue.extend(item.get('children', []))
      except BaseException, e:
        code, body = util.interpret_http_exception(e)
        if code in self.source.DISABLE_HTTP_CODES or isinstance(e, models.DisableSource):
          # the user deauthorized the bridgy app, or the token expired, so
          # disable this source.
          logging.warning('Disabling source due to: %s' % e, exc_info=True)
          self.source.status = 'disabled'
          self.source.put()
          # TODO: eventually drop this to just if source.is_beta_user(). leaving
          # for everyone right now for initial monitoring.
          util.email_me(subject='Bridgy Publish: disabled %s' % self.source.label(),
                        body=body)
        if isinstance(e, (NotImplementedError, ValueError, urllib2.URLError)):
          code = '400'
        elif not code:
          raise
        msg = 'Error: %s %s' % (body or '', e)
        return self.error(msg, status=code,
                          mail=code not in ('400', '404', '502', '503', '504'))
def _run(self):
    """Returns CreationResult on success, None otherwise.

    Publish pipeline (Python 3 variant): validate target, resolve source URL,
    look up the source via _find_source(), fetch mf2 (with 410 -> delete
    handling), walk mf2 items attempting preview/create, then store the
    result on the Publish entity.
    """
    logging.info('Params: %s', list(self.request.params.items()))
    # PREVIEW is a class-level flag distinguishing preview vs publish handlers
    assert self.PREVIEW in (True, False)

    # parse and validate target URL
    try:
      parsed = urllib.parse.urlparse(self.target_url())
    except BaseException:
      return self.error('Could not parse target URL %s' % self.target_url())

    domain = parsed.netloc
    path_parts = parsed.path.rsplit('/', 1)
    source_cls = SOURCE_NAMES.get(path_parts[-1])
    if (domain not in util.DOMAINS
        or len(path_parts) != 2 or path_parts[0] != '/publish' or not source_cls):
      return self.error(
        'Target must be brid.gy/publish/{flickr,github,mastodon,twitter}')
    elif source_cls == Instagram:
      return self.error('Sorry, %s is not supported.' % source_cls.GR_CLASS.NAME)

    # resolve source URL
    url, domain, ok = util.get_webmention_target(
      self.source_url(), replace_test_domains=False)
    # show nice error message if they're trying to publish a silo post
    if domain in SOURCE_DOMAINS:
      return self.error(
        "Looks like that's a %s URL. Try one from your web site instead!" %
        SOURCE_DOMAINS[domain].GR_CLASS.NAME)
    elif not ok:
      return self.error('Unsupported source URL %s' % url)
    elif not domain:
      return self.error('Could not parse source URL %s' % url)

    # look up source by domain
    self.source = self._find_source(source_cls, url, domain)
    if not self.source:
      return  # _find_source rendered the error

    content_param = 'bridgy_%s_content' % self.source.SHORT_NAME
    if content_param in self.request.params:
      return self.error('The %s parameter is not supported' % content_param)

    # show nice error message if they're trying to publish their home page.
    # checks both the resolved url and the raw source url.
    for domain_url in self.source.domain_urls:
      domain_url_parts = urllib.parse.urlparse(domain_url)
      for source_url in url, self.source_url():
        parts = urllib.parse.urlparse(source_url)
        if (parts.netloc == domain_url_parts.netloc and
            parts.path.strip('/') == domain_url_parts.path.strip('/') and
            not parts.query):
          return self.error(
            "Looks like that's your home page. Try one of your posts instead!")

    # done with the sanity checks, ready to fetch the source url. create the
    # Publish entity so we can store the result.
    self.entity = self.get_or_add_publish_entity(url)
    try:
      resp = self.fetch_mf2(url, raise_errors=True)
    except BaseException as e:
      status, body = util.interpret_http_exception(e)
      # a 410 Gone source page means the original was deleted
      if status == '410':
        return self.delete(url)
      return self.error('Could not fetch source URL %s' % url)

    if not resp:
      return
    self.fetched, mf2 = resp

    # create the Publish entity so we can store the result.
    if (self.entity.status == 'complete' and self.entity.type != 'preview' and
        not self.PREVIEW and not appengine_info.LOCAL):
      return self.error("Sorry, you've already published that page, and Bridgy Publish doesn't support updating existing posts. Details: https://github.com/snarfed/bridgy/issues/84",
                        extra_json={'original': self.entity.published})

    # find rel-shortlink, if any
    # http://microformats.org/wiki/rel-shortlink
    # https://github.com/snarfed/bridgy/issues/173
    shortlinks = mf2['rels'].get('shortlink')
    if shortlinks:
      self.shortlink = urllib.parse.urljoin(url, shortlinks[0])

    # loop through each item and its children and try to preview/create it. if
    # it fails, try the next one. break after the first one that works.
    result = None
    types = set()
    queue = collections.deque(mf2.get('items', []))
    while queue:
      item = queue.popleft()
      item_types = set(item.get('type'))
      # descend into h-feeds instead of publishing them directly
      if 'h-feed' in item_types and 'h-entry' not in item_types:
        queue.extend(item.get('children', []))
        continue
      elif not item_types & PUBLISHABLE_TYPES:
        types = types.union(item_types)
        continue

      try:
        result = self.attempt_single_item(item)
        if self.entity.published:
          break
        if result.abort:
          if result.error_plain:
            self.error(result.error_plain, html=result.error_html, data=item)
          return
        # try the next item. record embedded response types so the final
        # "unsupported type" message can mention them.
        for embedded in ('rsvp', 'invitee', 'repost', 'repost-of', 'like',
                         'like-of', 'in-reply-to'):
          if embedded in item.get('properties', []):
            item_types.add(embedded)
        logging.info(
          'Object type(s) %s not supported; error=%s; trying next.',
          item_types, result.error_plain)
        types = types.union(item_types)
        queue.extend(item.get('children', []))
      except BaseException as e:
        code, body = util.interpret_http_exception(e)
        if code in self.source.DISABLE_HTTP_CODES or isinstance(e, models.DisableSource):
          # the user deauthorized the bridgy app, or the token expired, so
          # disable this source.
          logging.warning('Disabling source due to: %s' % e, stack_info=True)
          self.source.status = 'disabled'
          self.source.put()
          # util.email_me(subject='Bridgy Publish: disabled %s' % self.source.label(),
          #               body=body)
        if isinstance(e, (NotImplementedError, ValueError, urllib.error.URLError)):
          code = '400'
        elif not code:
          raise
        msg = 'Error: %s %s' % (body or '', e)
        return self.error(msg, status=code,
                          report=code not in ('400', '404', '502', '503', '504'))

    if not self.entity.published:  # tried all the items
      types.discard('h-entry')
      types.discard('h-note')
      if types:
        msg = ("%s doesn't support type(s) %s, or no content was found." %
               (source_cls.GR_CLASS.NAME, ' + '.join(types)))
      else:
        msg = 'Could not find content in <a href="http://microformats.org/wiki/h-entry">h-entry</a> or any other element!'
      return self.error(msg, data=mf2)

    # write results to datastore, but don't overwrite a previous publish with a
    # preview.
    if not (self.PREVIEW and self.entity.type != 'preview'):
      self.entity.status = 'complete'
      self.entity.put()

    return result
def poll(self, source):
    """Actually runs the poll.

    Stores property names and values to update in source.updates.

    Fetches the user's own activities plus link-search results, with ETag /
    last-activity-id support, and classifies API failures: 401 disables the
    source, rate limits mark it 'error', 5xx/connection failures abort with
    a retryable status.

    Args:
      source: a source entity with last_activities_etag, last_activity_id,
        last_activities_cache_json, and an `updates` dict attribute
    """
    if source.last_activities_etag or source.last_activity_id:
      logging.debug('Using ETag %s, last activity id %s',
                    source.last_activities_etag, source.last_activity_id)

    #
    # Step 1: fetch activities:
    #   * posts by the user
    #   * search all posts for the user's domain URLs to find links
    #
    cache = util.CacheDict()
    if source.last_activities_cache_json:
      cache.update(json.loads(source.last_activities_cache_json))

    try:
      # search for links first so that the user's activities and responses
      # override them if they overlap
      links = source.search_for_links()

      # this user's own activities (and user mentions)
      resp = source.get_activities_response(
        fetch_replies=True, fetch_likes=True, fetch_shares=True,
        fetch_mentions=True, count=50,
        etag=source.last_activities_etag, min_id=source.last_activity_id,
        cache=cache)
      etag = resp.get('etag')  # used later
      user_activities = resp.get('items', [])

      # these map ids to AS objects
      responses = {a['id']: a for a in links}
      activities = {a['id']: a for a in links + user_activities}
    # 'except X as e' for consistency with the other handlers in this file
    # (and the only form that parses under Python 3)
    except Exception as e:
      code, body = util.interpret_http_exception(e)
      if code == '401':
        msg = 'Unauthorized error: %s' % e
        logging.warning(msg, exc_info=True)
        # "Sometimes pauses a couple seconds..." — leave poll_status ok so the
        # source isn't flagged as failing before it's disabled
        source.updates['poll_status'] = 'ok'
        raise models.DisableSource(msg)
      elif code in util.HTTP_RATE_LIMIT_CODES:
        logging.warning('Rate limited. Marking as error and finishing. %s', e)
        source.updates.update({'poll_status': 'error', 'rate_limited': True})
        return
      # fix: use floor division so the 5xx check also works under Python 3
      # true division (int('503') / 100 == 5.03, which never equals 5); the
      # sibling handler in this file already uses `int(code) // 100 == 5`.
      elif (code and int(code) // 100 == 5) or util.is_connection_failure(e):
        logging.error(
          'API call failed. Marking as error and finishing. %s: %s\n%s',
          code, body, e)
        self.abort(ERROR_HTTP_RETURN_CODE)
      else:
        raise
def post(self, source_short_name):
  """Handles an incoming webmention for a blog source.

  Validates the source and target URLs, looks up the registered source
  account by the target's domain, extracts the mention from the source
  page's microformats2, and publishes it as a comment on the target post.
  Side effects: creates/updates a BlogWebmention entity, writes the
  published comment JSON to the HTTP response.

  Args:
    source_short_name: string, short name key into models.sources
  """
  logging.info('Params: %s', list(self.request.params.items()))
  # strip fragments from source and target url
  self.source_url = urllib.parse.urldefrag(
    util.get_required_param(self, 'source'))[0]
  self.target_url = urllib.parse.urldefrag(
    util.get_required_param(self, 'target'))[0]

  # follow target url through any redirects, strip utm_* query params
  resp = util.follow_redirects(self.target_url)
  redirected_target_urls = [r.url for r in resp.history]
  self.target_url = util.clean_url(resp.url)

  # parse and validate target URL
  domain = util.domain_from_link(self.target_url)
  if not domain:
    return self.error('Could not parse target URL %s' % self.target_url)

  # look up source by domain
  source_cls = models.sources[source_short_name]
  domain = domain.lower()
  self.source = (source_cls.query()
                 .filter(source_cls.domains == domain)
                 .filter(source_cls.features == 'webmention')
                 .filter(source_cls.status == 'enabled')
                 .get())
  if not self.source:
    # check for a rel-canonical link. Blogger uses these when it serves a post
    # from multiple domains, e.g country TLDs like epeus.blogspot.co.uk vs
    # epeus.blogspot.com.
    # https://github.com/snarfed/bridgy/issues/805
    mf2 = self.fetch_mf2(self.target_url, require_mf2=False)
    if not mf2:
      # fetch_mf2() already wrote the error response
      return
    # retry the source lookup against each domain the target page declares
    # canonical, in case the registered domain differs from the request's.
    domains = util.dedupe_urls(
      util.domain_from_link(url)
      for url in mf2[1]['rels'].get('canonical', []))
    if domains:
      self.source = (source_cls.query()
                     .filter(source_cls.domains.IN(domains))
                     .filter(source_cls.features == 'webmention')
                     .filter(source_cls.status == 'enabled')
                     .get())

  if not self.source:
    return self.error(
      'Could not find %s account for %s. Is it registered with Bridgy?'
      % (source_cls.GR_CLASS.NAME, domain))

  # check that the target URL path is supported
  target_path = urllib.parse.urlparse(self.target_url).path
  if target_path in ('', '/'):
    # 202 Accepted: acknowledged but intentionally not processed
    return self.error(
      'Home page webmentions are not currently supported.', status=202)
  for pattern in self.source.PATH_BLOCKLIST:
    if pattern.match(target_path):
      return self.error(
        '%s webmentions are not supported for URL path: %s' %
        (self.source.GR_CLASS.NAME, target_path), status=202)

  # create BlogWebmention entity
  # key is "SOURCE_URL TARGET_URL", so re-sent mentions reuse the entity
  id = '%s %s' % (self.source_url, self.target_url)
  self.entity = BlogWebmention.get_or_insert(
    id, source=self.source.key,
    redirected_target_urls=redirected_target_urls)
  if self.entity.status == 'complete':
    # TODO: response message saying update isn't supported
    self.response.write(self.entity.published)
    return
  logging.debug("BlogWebmention entity: '%s'",
                self.entity.key.urlsafe().decode())

  # fetch source page
  fetched = self.fetch_mf2(self.source_url)
  if not fetched:
    # fetch_mf2() already wrote the error response
    return
  resp, mf2 = fetched

  item = self.find_mention_item(mf2.get('items', []))
  if not item:
    return self.error(
      'Could not find target URL %s in source page %s' %
      (self.target_url, resp.url),
      data=mf2, log_exception=False)

  # default author to target domain
  author_name = domain
  author_url = 'http://%s/' % domain

  # extract author name and URL from h-card, if any
  props = item['properties']
  author = first_value(props, 'author')
  if author:
    if isinstance(author, str):
      # plain-text author name, no embedded h-card
      author_name = author
    else:
      author_props = author.get('properties', {})
      author_name = first_value(author_props, 'name')
      author_url = first_value(author_props, 'url')

  # if present, u-url overrides source url
  u_url = first_value(props, 'url')
  if u_url:
    self.entity.u_url = u_url

  # generate content
  content = props['content'][0]  # find_mention_item() guaranteed this is here
  # NOTE(review): assumes at least one of html/value is non-None here --
  # presumably guaranteed by find_mention_item(); confirm, else .strip() raises
  text = (content.get('html') or content.get('value')).strip()
  source_url = self.entity.source_url()
  text += ' <br /> <a href="%s">via %s</a>' % (
    source_url, util.domain_from_link(source_url))

  # write comment
  try:
    self.entity.published = self.source.create_comment(
      self.target_url, author_name, author_url, text)
  except Exception as e:
    code, body = util.interpret_http_exception(e)
    msg = 'Error: %s %s; %s' % (code, e, body)
    if code == '401':
      # auth revoked upstream; stop polling/publishing for this source
      logging.warning('Disabling source due to: %s' % e, stack_info=True)
      self.source.status = 'disabled'
      self.source.put()
      return self.error(msg, status=code, report=self.source.is_beta_user())
    elif code == '404':
      # post is gone
      return self.error(msg, status=code, report=False)
    elif util.is_connection_failure(e) or (code and int(code) // 100 == 5):
      # transient failure: report a retryable status, don't alert
      return self.error(msg, status=util.ERROR_HTTP_RETURN_CODE, report=False)
    elif code or body:
      return self.error(msg, status=code, report=True)
    else:
      raise

  # write results to datastore
  self.entity.status = 'complete'
  self.entity.put()
  self.response.write(json_dumps(self.entity.published))