import lxml.html

import tldextracter


def process(func):
    # Decorator: run the wrapped fetcher, then turn the page it returns
    # into an lxml tree whose links have been made absolute.
    def wrapper(*args, **kwargs):
        url, urlhash, status, domain, content = func(*args, **kwargs)
        # Only non-empty unicode page content is worth parsing.
        if not content or not isinstance(content, unicode):
            return []
        try:
            url = url.encode('utf8')
        except Exception as e:
            print e
        rootdomain = tldextracter.extract_rootdomain(url)
        if not rootdomain:
            return []
        try:
            # Resolve relative hrefs against the page URL, then parse.
            absolute_content = lxml.html.make_links_absolute(content, url)
            tree = lxml.html.fromstring(absolute_content)
        except Exception as e:
            print e
            return []
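# A minimal usage sketch, assuming the truncated body above goes on to
# collect the links out of `tree` and that `process` finally returns
# `wrapper`. The fetcher below and its use of `requests` are hypothetical;
# they only illustrate the five-tuple the decorator expects back.
import requests


@process
def fetch_page(url, urlhash, status, domain):
    content = requests.get(url, timeout=10).text  # unicode under requests
    return url, urlhash, status, domain, content

# links = fetch_page('http://example.com/', 0, 200, 'example.com')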
import json

import config
import tldextracter


def filter_recent(r, jobs):
    # Drop jobs whose root domain was crawled within the last
    # config.EXPIRE seconds; requeue anything we cannot keep.
    filtered_jobs = []
    for job in jobs:
        try:
            task = json.loads(job)
            url = task['url']
            del task
            rootdomain = tldextracter.extract_rootdomain(url)
        except Exception as e:
            print e
            r.lpush(config.QUEUE, job)
            continue
        try:
            status_key = '%s_status' % rootdomain
            if not r.exists(status_key):
                # First sighting in the window: mark the domain, keep the job.
                r.set(status_key, '')
                r.expire(status_key, config.EXPIRE)
                filtered_jobs.append(job)
            else:
                # Seen too recently: push the job back for later.
                r.lpush(config.QUEUE, job)
        except Exception as e:
            print e
    return filtered_jobs
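# Wiring sketch (assumed names: config.QUEUE is a Redis list of JSON jobs,
# config.EXPIRE the per-domain cooldown in seconds). Note the exists/set
# pair above is not atomic; with a newer redis-py, a single
# r.set(status_key, '', nx=True, ex=config.EXPIRE) would close that race.
import redis

r = redis.StrictRedis(host='localhost', port=6379, db=0)
batch = r.lrange(config.QUEUE, 0, 99)   # peek at a batch of queued jobs
fresh_jobs = filter_recent(r, batch)    # only cooled-down domains survive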
            `domain` VARCHAR(100) NOT NULL DEFAULT '',
            `url` VARCHAR(512) NOT NULL DEFAULT ''
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8;
    '''
    pass


def insert_site(name, language, url):
    # Store URLs as UTF-8 byte strings before hashing and inserting.
    if isinstance(url, unicode):
        try:
            url = url.strip().encode('utf8')
        except Exception as e:
            print e
    rootdomain = tldextracter.extract_rootdomain(url)
    # A 64-bit CityHash of the root domain serves as the dedup key.
    domainhash = cityhash.CityHash64(rootdomain)
    query = PySQLPool.getNewQuery(connection)
    query.Query('select domainhash from news_sites where domainhash = %s;',
                (domainhash,))
    if query.record:
        # The domain is already registered.
        return False
    query.Query('insert into news_sites(domainhash, language, name, '
                'domain, url) values(%s, %s, %s, %s, %s);',
                (domainhash, language, name, rootdomain, url))
    query.Pool.Commit()
    return True


if __name__ == '__main__':
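    # Hypothetical smoke test (not in the original source): register one
    # site and report whether its domain was new.
    print insert_site('Example News', 'en', u'http://news.example.com/')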