def test_in_webmention_blocklist(self):
  # Blocked domains and their subdomains should match; unrelated
  # domains (including ones that merely contain a blocked domain as a
  # prefix, like t.co.com) should not.
  for domain in ('t.co', 'x.t.co', 'X.Y.T.CO', 'abc.onion'):
    self.assertTrue(util.in_webmention_blocklist(domain), domain)

  for domain in ('snarfed.org', 'www.snarfed.org', 't.co.com'):
    self.assertFalse(util.in_webmention_blocklist(domain), domain)

  # localhost is only allowed when running locally (util.LOCAL is set).
  self.mox.StubOutWithMock(util, 'LOCAL')
  util.LOCAL = False
  self.assertTrue(util.in_webmention_blocklist('localhost'))
  util.LOCAL = True
  self.assertFalse(util.in_webmention_blocklist('localhost'))
def verify(self, force=False):
  """Checks that this source is ready to be used.

  For blog and listen sources, fetches the author's front page HTML and
  performs webmention endpoint discovery on it. For publish sources,
  checks that they have a domain.

  May be overridden by subclasses, e.g. :class:`tumblr.Tumblr`.

  Args:
    force: if True, fully verifies (e.g. re-fetches the blog's HTML and
      performs webmention discovery) even we already think this source is
      verified.
  """
  # Pair author URLs with their domains so blocklisted domains can be
  # filtered out before discovery.
  candidates = []
  for url, domain in zip(self.get_author_urls(), self.domains):
    if not util.in_webmention_blocklist(domain):
      candidates.append(url)

  already_ok = self.verified() and not force
  if (already_ok or self.status == 'disabled' or not self.features
      or not candidates):
    return

  first_url = candidates[0]
  try:
    discovered = webmention.discover(first_url, timeout=util.HTTP_TIMEOUT)
    self.webmention_endpoint = discovered.endpoint
    self._fetched_html = discovered.response.text
  except BaseException as e:
    # Discovery is best effort; record the failure and clear the endpoint.
    logger.info('Error discovering webmention endpoint', exc_info=e)
    self.webmention_endpoint = None

  self.put()
def verify(self, force=False):
  """Checks that this source is ready to be used.

  For blog and listen sources, fetches the author's front page HTML and
  performs webmention endpoint discovery on it. For publish sources,
  checks that they have a domain.

  May be overridden by subclasses, e.g. :class:`tumblr.Tumblr`.

  Args:
    force: if True, fully verifies (e.g. re-fetches the blog's HTML and
      performs webmention discovery) even we already think this source is
      verified.
  """
  candidates = [
      url for url, domain in zip(self.get_author_urls(), self.domains)
      if not util.in_webmention_blocklist(domain)
  ]
  skip = ((self.verified() and not force) or self.status == 'disabled'
          or not self.features or not candidates)
  if skip:
    return

  target = candidates[0]
  logging.info('Attempting to discover webmention endpoint on %s', target)
  mention = send.WebmentionSend('https://brid.gy/', target)
  mention.requests_kwargs = {
      'timeout': util.HTTP_TIMEOUT,
      'headers': util.REQUEST_HEADERS,
  }
  try:
    # NOTE(review): relies on WebmentionSend's private _discoverEndpoint;
    # failures are recorded on the mention object rather than raised.
    mention._discoverEndpoint()
  except BaseException as e:
    logging.info('Error discovering webmention endpoint', exc_info=e)
    mention.error = {'code': 'EXCEPTION'}

  self._fetched_html = getattr(mention, 'html', None)
  err = getattr(mention, 'error', None)
  found = getattr(mention, 'receiver_endpoint', None)
  if not err and found:
    logging.info("Discovered webmention endpoint %s", found)
    self.webmention_endpoint = found
  else:
    logging.info("No webmention endpoint found: %s %r", err, found)
    self.webmention_endpoint = None

  self.put()
def search_for_links(self):
  """Searches for activities with links to any of this source's web sites.

  Returns:
    sequence of ActivityStreams activity dicts
  """
  # Collect scheme-less, fragment-less versions of the user's URLs,
  # skipping any whose domain is blocklisted for webmentions.
  sites = set()
  for url in self.domain_urls:
    if util.in_webmention_blocklist(util.domain_from_link(url)):
      continue
    sites.add(util.schemeless(util.fragmentless(url), slashes=False))

  if not sites:
    return []

  # Search syntax: https://www.reddit.com/wiki/search
  url_query = ' OR '.join(f'site:"{u}" OR selftext:"{u}"' for u in sites)
  return self.get_activities(
      search_query=url_query, group_id=gr_source.SEARCH,
      etag=self.last_activities_etag, fetch_replies=False,
      fetch_likes=False, fetch_shares=False, count=50)
def search_for_links(self):
  """Searches for activities with links to any of this source's web sites.

  Returns:
    sequence of ActivityStreams activity dicts
  """
  # Set comprehension instead of set(generator) (ruff C401); skip URLs
  # whose domain is blocklisted for webmentions.
  urls = {util.schemeless(util.fragmentless(url), slashes=False)
          for url in self.domain_urls
          if not util.in_webmention_blocklist(util.domain_from_link(url))}
  if not urls:
    return []

  url_query = ' OR '.join(f'"{u}"' for u in urls)
  return self.get_activities(search_query=url_query,
                             group_id=gr_source.SEARCH,
                             etag=self.last_activities_etag,
                             fetch_replies=True, fetch_likes=False,
                             fetch_shares=False, count=50)
def search_for_links(self):
  """Searches for activities with links to any of this source's web sites.

  Twitter search supports OR:
  https://dev.twitter.com/rest/public/search

  ...but it only returns complete(ish) results if we strip scheme from URLs,
  ie search for example.com instead of http://example.com/, and that also
  returns false positivies, so we check that the returned tweets actually
  have matching links. https://github.com/snarfed/bridgy/issues/565

  Returns:
    sequence of ActivityStreams activity dicts
  """
  urls = {util.schemeless(util.fragmentless(url), slashes=False)
          for url in self.domain_urls
          if not util.in_webmention_blocklist(util.domain_from_link(url))}
  if not urls:
    return []

  candidates = self.get_activities(
      search_query=' OR '.join(sorted(urls)), group_id=gr_source.SEARCH,
      etag=self.last_activities_etag, fetch_replies=False,
      fetch_likes=False, fetch_shares=False, count=50)

  # filter out retweets and search false positives that don't actually
  # link to us
  matches = []
  for activity in candidates:
    if activity.get('verb') == 'share':
      continue
    obj = activity['object']
    items = obj.get('tags', []) + obj.get('attachments', [])
    linked = [util.schemeless(item.get('url', ''), slashes=False)
              for item in items]
    if any(link.startswith(url) for link in linked for url in urls):
      matches.append(activity)

  return matches
def test_in_webmention_blocklist(self):
  # Blocked domains and subdomains of them should match; unrelated
  # domains should not, even ones that contain a blocked domain.
  for domain in ('t.co', 'x.t.co', 'x.y.t.co', 'abc.onion'):
    self.assertTrue(util.in_webmention_blocklist(domain), domain)

  for domain in ('snarfed.org', 'www.snarfed.org', 't.co.com'):
    self.assertFalse(util.in_webmention_blocklist(domain), domain)