def validate_link(url, whitelist=False):
    """Return the sanitized ``url`` if it may be used, otherwise a falsy value.

    The URL is rejected (``False`` is returned) when:

    * ``url`` is falsy, or ``sanitize_url`` returns a falsy value for it;
    * ``whitelist`` is truthy and ``domain(url)`` is not in
      ``DOMAIN_WHITELIST`` (a message is printed);
    * a ``LinksByUrl`` row exists for the URL and at least one of the links
      it references is not deleted (a message is printed).

    Otherwise the sanitized URL is returned.
    """
    if not url:
        return False

    url = sanitize_url(url)
    if not url:
        return False

    if whitelist and domain(url) not in DOMAIN_WHITELIST:
        print("Domain %s not in whitelist." % domain(url))
        return False

    try:
        by_url = LinksByUrl._byID(LinksByUrl._key_from_url(url))
    except tdb_cassandra.NotFound:
        # No row is stored for this URL, so nothing can conflict with it.
        return url

    id36s = by_url._values()
    candidates = Link._byID36(id36s, data=True, return_dict=False)
    # Deleted links do not count as an existing submission.
    live_links = [link for link in candidates if not link._deleted]
    if live_links:
        print("Link %s exists..." % url)
        return False
    return url
def port_cassaurls(after_id=None, estimate=15231317):
    """Populate ``LinksByUrl`` from the existing ``Link`` things.

    Links are queried sorted by descending ``_date``, filtered down to those
    that carry a real URL, and written to the ``LinksByUrl`` column family
    in batches of 500.

    after_id -- when truthy, the link with this id is loaded and passed to
                the query's ``_after`` (presumably to resume a previous
                run from that link -- confirm against the query API).
    estimate -- forwarded to ``progress`` as its ``estimate`` argument.

    NOTE(review): this file defines ``port_cassaurls`` a second time right
    after this definition; the later definition is the one that ends up
    bound to the name.
    """
    from r2.models import Link, LinksByUrl
    # NOTE(review): tdb_cassandra is not referenced in this function.
    from r2.lib.db import tdb_cassandra
    from r2.lib.db.operators import desc
    from r2.lib.db.tdb_cassandra import CL
    from r2.lib.utils import fetch_things2, in_chunks, progress

    def has_real_url(link):
        # A missing url attribute is treated the same as the "self" marker.
        if getattr(link, "url", "self") != "self":
            return not getattr(link, "is_self", False)
        return False

    query = Link._query(Link.c._spam == (True, False),
                        sort=desc("_date"),
                        data=True)
    if after_id:
        # Return value is not used by the original code either.
        query._after(Link._byID(after_id, data=True))

    things = progress(fetch_things2(query, chunk_size=500),
                      estimate=estimate)
    url_links = (link for link in things if has_real_url(link))

    for group in in_chunks(url_links, 500):
        # One batch per chunk, written at consistency level ONE.
        with LinksByUrl._cf.batch(write_consistency_level=CL.ONE) as batch:
            for link in group:
                row_key = LinksByUrl._key_from_url(link.url)
                if not row_key:
                    # No usable key could be derived from this URL.
                    continue
                batch.insert(row_key, {link._id36: link._id36})
def port_cassaurls(after_id=None, estimate=15231317):
    """Write a ``LinksByUrl`` entry for every ``Link`` that has a URL.

    Steps, in order:

    1. Query ``Link`` things (``_spam`` in ``(True, False)``), newest
       ``_date`` first; if ``after_id`` is truthy the link with that id is
       handed to the query's ``_after``.
    2. Stream the results through ``fetch_things2`` (chunks of 500) and
       ``progress`` (given ``estimate``).
    3. Drop things whose ``url`` is missing or equal to ``'self'``, and
       things with a truthy ``is_self``.
    4. For each group of 500, open a batch on ``LinksByUrl._cf`` with write
       consistency ``CL.ONE`` and insert ``{id36: id36}`` under the key
       derived from the link's URL; links with a falsy key are skipped.

    Nothing is returned.
    """
    from r2.lib.db import tdb_cassandra  # not referenced below
    from r2.lib.db.operators import desc
    from r2.lib.db.tdb_cassandra import CL
    from r2.lib.utils import fetch_things2, in_chunks, progress
    from r2.models import Link, LinksByUrl

    # Same size is used for fetching and for write batches.
    batch_size = 500

    spam_filter = Link.c._spam == (True, False)
    q = Link._query(spam_filter, sort=desc('_date'), data=True)
    if after_id:
        start_link = Link._byID(after_id, data=True)
        q._after(start_link)

    fetched = fetch_things2(q, chunk_size=batch_size)
    tracked = progress(fetched, estimate=estimate)
    with_urls = (
        thing for thing in tracked
        if getattr(thing, 'url', 'self') != 'self'
        and not getattr(thing, 'is_self', False)
    )

    for links in in_chunks(with_urls, batch_size):
        with LinksByUrl._cf.batch(write_consistency_level=CL.ONE) as writer:
            for thing in links:
                key = LinksByUrl._key_from_url(thing.url)
                if key:
                    id36 = thing._id36
                    writer.insert(key, {id36: id36})