def process_message(msgs, chan):
    """Update get_domain_links(), the Links by domain precomputed query.

    get_domain_links() is a CachedResult which is stored in permacache. To
    update these objects we need to do a read-modify-write which requires
    obtaining a lock. Sharding these updates by domain allows us to run
    multiple consumers (but ideally just one per shard) to avoid lock
    contention.

    """
    from v1.lib.db.queries import add_queries, get_domain_links

    link_names = {msg.body for msg in msgs}
    links = Link._by_fullname(link_names, return_dict=False)
    print 'Processing %r' % (links,)

    links_by_domain = defaultdict(list)
    for link in links:
        parsed = UrlParser(link.url)

        # update the listings for all permutations of the link's domain
        for domain in parsed.domain_permutations():
            links_by_domain[domain].append(link)

    for d, links in links_by_domain.iteritems():
        with g.stats.get_timer("link_vote_processor.domain_queries"):
            add_queries(
                queries=[
                    get_domain_links(d, sort, "all") for sort in SORTS],
                insert_items=links,
            )
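# A minimal sketch of how this handler could be wired up with one consumer
# per shard, assuming the ten-shard queue naming used by
# add_to_domain_query_q() below; `watch_queue` is a hypothetical stand-in
# for the real consumer-registration helper.
def _start_domain_query_consumers(watch_queue, num_shards=10):
    for shard in xrange(num_shards):
        # one consumer per shard means no two consumers ever contend for
        # the same domain's CachedResult lock
        watch_queue("domain_query_%s_q" % shard, process_message)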
def _get_scrape_url(link):
    if not link.is_self:
        sr_name = link.subverbify_slow.name
        if not feature.is_enabled("imgur_gif_conversion", subverbify=sr_name):
            return link.url
        p = UrlParser(link.url)
        # If it's a gif link on imgur, replacing it with gifv should
        # give us the embedly friendly video url
        if is_subdomain(p.hostname, "imgur.com"):
            if p.path_extension().lower() == "gif":
                p.set_extension("gifv")
                return p.unparse()
        return link.url

    urls = extract_urls_from_markdown(link.selftext)
    second_choice = None
    for url in urls:
        p = UrlParser(url)
        if p.is_verbify_url():
            continue
        # If we don't find anything we like better, use the first image.
        if not second_choice:
            second_choice = url
        # This is an optimization for "proof images" in AMAs.
        if is_subdomain(p.netloc, 'imgur.com') or p.has_image_extension():
            return url

    return second_choice
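# Worked example of the imgur rewrite above (illustrative values only):
#
#     link.url == "http://i.imgur.com/example.gif"
#     _get_scrape_url(link) == "http://i.imgur.com/example.gifv"
#
# while a non-gif imgur link, or a gif hosted elsewhere, falls through and
# returns link.url unchanged.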
def test_default_prefix(self):
    u = UrlParser('http://i.verbify.com/r/verbifydev')
    u.switch_subdomain_by_extension()
    self.assertEquals('http://www.verbify.com/r/verbifydev', u.unparse())

    u = UrlParser('http://i.verbify.com/r/verbifydev')
    u.switch_subdomain_by_extension('does-not-exist')
    self.assertEquals('http://www.verbify.com/r/verbifydev', u.unparse())
def allowed_media_preview_url(url):
    p = UrlParser(url)
    if p.has_static_image_extension():
        return True
    for allowed_domain in g.media_preview_domain_whitelist:
        if is_subdomain(p.hostname, allowed_domain):
            return True
    return False
def test_normal_urls(self):
    u = UrlParser('http://www.verbify.com/r/verbifydev')
    u.switch_subdomain_by_extension('compact')
    result = u.unparse()
    self.assertEquals('http://i.verbify.com/r/verbifydev', result)

    u = UrlParser(result)
    u.switch_subdomain_by_extension('mobile')
    result = u.unparse()
    self.assertEquals('http://simple.verbify.com/r/verbifydev', result)
def add_to_domain_query_q(link):
    parsed = UrlParser(link.url)
    if not parsed.domain_permutations():
        # no valid domains found
        return

    if g.shard_domain_query_queues:
        domain_shard = hash(parsed.hostname) % 10
        queue_name = "domain_query_%s_q" % domain_shard
    else:
        queue_name = "domain_query_q"
    amqp.add_item(queue_name, link._fullname)
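# A helper sketch mirroring the sharding above: Python 2's string hash() is
# deterministic within a deployment (absent PYTHONHASHSEED randomization),
# so every update for a given hostname routes to the same queue and
# serializes onto a single consumer.
def _domain_query_queue_for(hostname, num_shards=10):
    return "domain_query_%s_q" % (hash(hostname) % num_shards)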
def test_url_mutation(self):
    u = UrlParser("http://example.com/")
    u.hostname = g.domain
    self.assertTrue(u.is_verbify_url())

    u = UrlParser("http://%s/" % g.domain)
    u.hostname = "example.com"
    self.assertFalse(u.is_verbify_url())
def validate_secure_oembed(self, oembed):
    """Check that the "secure" embed is safe to embed, and not a placeholder."""
    if not oembed.get("html"):
        return False

    # Get the embed.ly iframe's src
    iframe_src = lxml.html.fromstring(oembed['html']).get('src')
    if not iframe_src:
        return False

    iframe_src_url = UrlParser(iframe_src)

    # Per embed.ly support: if the URL for the provider is HTTP, we're
    # going to get a placeholder image instead
    provider_src_url = UrlParser(iframe_src_url.query_dict.get('src'))
    return not provider_src_url.scheme or provider_src_url.scheme == "https"
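# Illustrative input (a sketch, not captured embed.ly output): the iframe's
# `src` query parameter carries the provider URL whose scheme is checked.
#
#     oembed = {"html": '<iframe src="https://cdn.embedly.com/widgets/media.html'
#                       '?src=https%3A%2F%2Fplayer.example.com%2Fvideo"></iframe>'}
#     validate_secure_oembed(self, oembed)  # -> True (provider is HTTPS)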
def process(link):
    assert link.thing_type == 'link'

    timestamp = link.timestamp
    fname = make_fullname(Link, link.thing_id)

    if not link.spam and not link.deleted:
        if link.url:
            domains = UrlParser(link.url).domain_permutations()
        else:
            domains = []
        ups, downs = link.ups, link.downs

        for tkey, oldest in oldests.iteritems():
            if timestamp > oldest:
                sc = score(ups, downs)
                contr = controversy(ups, downs)
                h = _hot(ups, downs, timestamp)

                for domain in domains:
                    yield ('domain/top/%s/%s' % (tkey, domain),
                           sc, timestamp, fname)
                    yield ('domain/controversial/%s/%s' % (tkey, domain),
                           contr, timestamp, fname)
                    if tkey == "all":
                        yield ('domain/hot/%s/%s' % (tkey, domain),
                               h, timestamp, fname)
                        yield ('domain/new/%s/%s' % (tkey, domain),
                               timestamp, timestamp, fname)
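# For reference, a sketch of the fan-out driven by domain_permutations()
# (assumed behavior: it yields each subdomain suffix of the hostname):
#
#     UrlParser("http://blog.example.com/post").domain_permutations()
#     # -> roughly {"blog.example.com", "example.com"}
#
# so a single link contributes to the listings of both its exact host and
# its parent domain.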
def _oembed_post(thing, **embed_options):
    subverbify = thing.subverbify_slow
    if (not can_view_link_comments(thing) or
            subverbify.type in Subverbify.private_types):
        raise ForbiddenError(errors.POST_NOT_ACCESSIBLE)

    live = ''
    if embed_options.get('live'):
        time = datetime.now(g.tz).isoformat()
        live = 'data-card-created="{}"'.format(time)

    script = ''
    if not embed_options.get('omitscript', False):
        script = format_html(SCRIPT_TEMPLATE,
                             embedly_script=EMBEDLY_SCRIPT,
                             )

    link_url = UrlParser(thing.make_permalink_slow(force_domain=True))
    link_url.update_query(ref='share', ref_source='embed')

    author_name = ""
    if not thing._deleted:
        author = thing.author_slow
        if author._deleted:
            author_name = _("[account deleted]")
        else:
            author_name = author.name

    html = format_html(POST_EMBED_TEMPLATE,
                       live_data_attr=live,
                       link_url=link_url.unparse(),
                       title=websafe(thing.title),
                       subverbify_url=make_url_https(subverbify.path),
                       subverbify_name=subverbify.name,
                       script=script,
                       )

    oembed_response = dict(_OEMBED_BASE,
                           type="rich",
                           title=thing.title,
                           author_name=author_name,
                           html=html,
                           )

    return oembed_response
def GET_quarantine(self, dest):
    sr = UrlParser(dest).get_subverbify()

    # if dest doesn't include a quarantined subverbify,
    # redirect to the homepage or the original destination
    if not sr:
        return self.redirect('/')
    elif isinstance(sr, FakeSubverbify) or sr.is_exposed(c.user):
        return self.redirect(dest)

    errpage = InterstitialPage(
        _("quarantined"),
        content=QuarantineInterstitial(
            sr_name=sr.name,
            logged_in=c.user_is_loggedin,
            email_verified=c.user_is_loggedin and c.user.email_verified,
        ),
    )
    request.environ['usable_error_content'] = errpage.render()
    self.abort403()
def process(thing):
    if thing.deleted:
        return

    thing_cls = thingcls_by_name[thing.thing_type]
    fname = make_fullname(thing_cls, thing.thing_id)
    thing_score = score(thing.ups, thing.downs)
    thing_controversy = controversy(thing.ups, thing.downs)

    for interval, cutoff in cutoff_by_interval.iteritems():
        if thing.timestamp < cutoff:
            continue

        yield ("user/%s/top/%s/%d" %
               (thing.thing_type, interval, thing.author_id),
               thing_score, thing.timestamp, fname)
        yield ("user/%s/controversial/%s/%d" %
               (thing.thing_type, interval, thing.author_id),
               thing_controversy, thing.timestamp, fname)

        if thing.spam:
            continue

        if thing.thing_type == "link":
            yield ("sr/link/top/%s/%d" % (interval, thing.sr_id),
                   thing_score, thing.timestamp, fname)
            yield ("sr/link/controversial/%s/%d" % (interval, thing.sr_id),
                   thing_controversy, thing.timestamp, fname)

            if thing.url:
                try:
                    parsed = UrlParser(thing.url)
                except ValueError:
                    continue

                for domain in parsed.domain_permutations():
                    yield ("domain/link/top/%s/%s" % (interval, domain),
                           thing_score, thing.timestamp, fname)
                    yield ("domain/link/controversial/%s/%s" %
                           (interval, domain),
                           thing_controversy, thing.timestamp, fname)
def POST_options(self, all_langs, **prefs):
    if feature.is_enabled("autoexpand_media_previews"):
        validator = VOneOf('media_preview', ('on', 'off', 'subverbify'))
        value = request.params.get('media_preview')
        prefs["pref_media_preview"] = validator.run(value)

    u = UrlParser(c.site.path + "prefs")

    filter_prefs(prefs, c.user)
    if c.errors.errors:
        for error in c.errors.errors:
            if error[1] == 'stylesheet_override':
                u.update_query(error_style_override=error[0])
            else:
                u.update_query(generic_error=error[0])
        return self.redirect(u.unparse())

    set_prefs(c.user, prefs)
    c.user._commit()
    u.update_query(done='true')
    return self.redirect(u.unparse())
def format_output_url(cls, url, **kw):
    """
    Helper method used during redirect to ensure that the redirect
    url (assisted by frame busting code or javascript) will point to
    the correct domain and not have any extra dangling get parameters.
    The extensions are also made to match and the resulting url is
    utf8 encoded.

    Note: for development purposes, also checks that the port
    matches the request port
    """
    preserve_extension = kw.pop("preserve_extension", True)
    u = UrlParser(url)

    if u.is_verbify_url():
        # make sure to pass the port along if not 80
        if 'port' not in kw:
            kw['port'] = request.port

        # make sure the extensions agree with the current page
        if preserve_extension and c.extension:
            u.set_extension(c.extension)

    # unparse and encode it in utf8
    rv = _force_unicode(u.unparse()).encode('utf8')
    if "\n" in rv or "\r" in rv:
        abort(400)
    return rv
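# A usage sketch (values illustrative): on a request served with the "json"
# extension, a redirect to the preferences page keeps that extension.
#
#     format_output_url(cls, "http://%s/prefs" % g.domain)
#     # -> "http://<g.domain>/prefs.json" when c.extension == "json"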
def _update_redirect_uri(base_redirect_uri, params, as_fragment=False):
    parsed = UrlParser(base_redirect_uri)
    if as_fragment:
        parsed.fragment = urlencode(params)
    else:
        parsed.update_query(**params)
    return parsed.unparse()
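# Usage sketch (OAuth2-style redirects, values illustrative): fragments are
# used for implicit-grant responses so the token never reaches the redirect
# server; query parameters are used for authorization-code responses.
#
#     _update_redirect_uri("https://app.example.com/cb", {"code": "abc"})
#     # -> "https://app.example.com/cb?code=abc"
#
#     _update_redirect_uri("https://app.example.com/cb",
#                          {"access_token": "abc"}, as_fragment=True)
#     # -> "https://app.example.com/cb#access_token=abc"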
def OPTIONS_report_cache_poisoning(self):
    """Send CORS headers for cache poisoning reports."""
    if "Origin" not in request.headers:
        return

    origin = request.headers["Origin"]
    parsed_origin = UrlParser(origin)
    if not is_subdomain(parsed_origin.hostname, g.domain):
        return

    response.headers["Access-Control-Allow-Origin"] = origin
    response.headers["Access-Control-Allow-Methods"] = "POST"
    response.headers["Access-Control-Allow-Headers"] = \
        "Authorization, X-Loggit, "
    response.headers["Access-Control-Allow-Credentials"] = "false"
    response.headers['Access-Control-Expose-Headers'] = \
        self.COMMON_VERBIFY_HEADERS
def test_sign_url(self):
    u = UrlParser('http://examples.imgix.net/frog.jpg?w=100')
    signed_url = self.provider._sign_url(u, 'abcdef')
    self.assertEqual(
        signed_url.unparse(),
        'http://examples.imgix.net/frog.jpg?w=100'
        '&s=cd3bdf071108af73b15c21bdcee5e49c')

    u = UrlParser('http://examples.imgix.net/frog.jpg')
    u.update_query(w=100)
    signed_url = self.provider._sign_url(u, 'abcdef')
    self.assertEqual(
        signed_url.unparse(),
        'http://examples.imgix.net/frog.jpg?w=100'
        '&s=cd3bdf071108af73b15c21bdcee5e49c')
def test_different_protocols(self):
    u = UrlParser('http://example.com')
    u2 = UrlParser('https://example.com')
    self.assertNotEquals(u, u2)
def test_different_objects(self):
    u = UrlParser('http://example.com')
    self.assertNotEquals(u, None)
def test_unicode_query_params(self):
    u = UrlParser(u'http://example.com/?page=unicode:(')
    u2 = UrlParser('http://example.com/')
    u2.update_query(page=u'unicode:(')
    self.assertEquals(u, u2)
def _is_safe_verbify_url(self, url, subverbify=None):
    # parse once rather than building two identical UrlParsers
    parsed = UrlParser(url)
    return parsed.is_web_safe_url() and parsed.is_verbify_url(subverbify)
def test_same_url(self):
    u = UrlParser('http://example.com:8000/a;b?foo=bar&bar=baz#spam')
    u2 = UrlParser('http://example.com:8000/a;b?bar=baz&foo=bar#spam')
    self.assertEquals(u, u2)

    u3 = UrlParser('')
    u3.scheme = 'http'
    u3.hostname = 'example.com'
    u3.port = 8000
    u3.path = '/a'
    u3.params = 'b'
    u3.update_query(foo='bar', bar='baz')
    u3.fragment = 'spam'
    self.assertEquals(u, u3)
def test_integer_query_params(self):
    u = UrlParser('http://example.com/?page=1234')
    u2 = UrlParser('http://example.com/')
    u2.update_query(page=1234)
    self.assertEquals(u, u2)
def test_different_fragments(self):
    u = UrlParser('http://example.com/')
    u2 = UrlParser('http://example.com/#foo')
    u3 = UrlParser('http://example.com/#bar')
    self.assertNotEquals(u, u2)
    self.assertNotEquals(u2, u3)
def test_different_queries(self):
    u = UrlParser('http://example.com/')
    u2 = UrlParser('http://example.com/?foo')
    u3 = UrlParser('http://example.com/?foo=bar')
    self.assertNotEquals(u, u2)
    self.assertNotEquals(u2, u3)
def test_different_params(self):
    u = UrlParser('http://example.com/')
    u2 = UrlParser('http://example.com/;foo')
    u3 = UrlParser('http://example.com/;bar')
    self.assertNotEquals(u, u2)
    self.assertNotEquals(u2, u3)
def test_different_paths(self):
    u = UrlParser('http://example.com')
    u2 = UrlParser('http://example.com/a')
    u3 = UrlParser('http://example.com/b')
    self.assertNotEquals(u, u2)
    self.assertNotEquals(u2, u3)
def test_different_ports(self):
    u = UrlParser('http://example.com')
    u2 = UrlParser('http://example.com:8000')
    u3 = UrlParser('http://example.com:8008')
    self.assertNotEquals(u, u2)
    self.assertNotEquals(u2, u3)
def add_ext_to_link(link):
    url = UrlParser(link.get('href'))
    if url.is_verbify_url():
        link['href'] = add_sr(link.get('href'), sr_path=False)
def test_different_domains(self):
    u = UrlParser('http://example.com')
    u2 = UrlParser('http://example.org')
    self.assertNotEquals(u, u2)