def s3_put_cache(b64url, url_hash, batch_id, groups, content, refresh=False):
    """Upload page content to S3 and record it in the database.

    :param groups: comma-separated group names, e.g. 'g1, g2'
    """
    def put_cache(batch_key, filename, content):
        try:
            ret = S3.Object(batch_key, filename).put(Body=content)
            if ret['ResponseMetadata']['HTTPStatusCode'] == 200:
                return {'success': True}
            return {'error': 'HTTP {}'.format(
                ret['ResponseMetadata']['HTTPStatusCode'])}
        except botocore.exceptions.ClientError as e:
            if e.response['Error']['Code'] == 'NoSuchBucket':
                # Bucket is missing: create it, then let the caller retry.
                ret = create_bucket(batch_key)
                if isinstance(ret, str):
                    return {'error': ret}
            return {'error': e.response['Error']['Code']}
        except Exception as e:
            return {'error': e}

    try:
        filename = '{}_{}'.format(batch_id, url_hash)
        # Bucket name is the batch id without its trailing '-<n>' suffix.
        batch_key = batch_id.rsplit('-', 1)[0]
        ret = put_cache(batch_key, filename, content)
        if ret.get('error') == 'NoSuchBucket':
            # The bucket was just created inside put_cache; retry the upload once.
            ret = put_cache(batch_key, filename, content)
        if 'error' in ret:
            return {'success': False, 'error': ret['error']}

        # Build a Postgres array literal such as {"g1", "g2"} from 'g1, g2'.
        _groups = ', '.join('"{}"'.format(g.strip()) for g in groups.split(','))
        sql1 = (
            "insert into accessed (batch_id, groups, status, b64url, url_hash) "
            "values ('{}', '{{{}}}', '{}', '{}', '{}');".format(
                batch_id, _groups, 0, b64url, url_hash))
        sql2 = ("insert into cached (b64url, url_hash) values ('{}', '{}');"
                "".format(b64url, url_hash))
        dbwrapper.execute(sql1)
        dbwrapper.execute(sql2)

        now = datetime.now()
        log_line = json.dumps({
            'date': str(now),
            'batch_id': batch_id,
            'groups': groups,
            'url': base64.urlsafe_b64decode(b64url),
        })
        cachelog.get_logger(batch_id, now.strftime('%Y%m%d'),
                            FSCACHEDIR).info(log_line)
    except Exception as e:
        return {'success': False, 'error': e}
    return {'success': True}
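

# Illustrative sketch (not part of the production path): how the bucket name,
# object key and Postgres array literal used above are derived. All input
# values here are hypothetical examples.
def _example_s3_cache_layout():
    batch_id = 'crawl-2017-3'       # hypothetical batch id
    url_hash = 'deadbeef'           # hypothetical url hash
    groups = 'g1, g2'
    batch_key = batch_id.rsplit('-', 1)[0]          # bucket: 'crawl-2017'
    filename = '{}_{}'.format(batch_id, url_hash)   # key: 'crawl-2017-3_deadbeef'
    _groups = ', '.join('"{}"'.format(g.strip()) for g in groups.split(','))
    return batch_key, filename, '{{{}}}'.format(_groups)  # -> '{"g1", "g2"}'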
def ufile_set_cache(b64url, url_hash, batch_id, groups, content, refresh=False):
    """Upload page content to UFile and record it in the database."""
    # Lazily create the UFile uploader and cache it on the function object.
    if not hasattr(ufile_set_cache, '_auth'):
        put_auth = putufile.PutUFile(public_key, private_key)
        setattr(ufile_set_cache, '_auth', put_auth)
    try:
        sio = StringIO(content)
        filename = '{}_{}'.format(batch_id, url_hash)
        # Bucket name is the batch id without its trailing '-<n>' suffix.
        batch_key = batch_id.rsplit('-', 1)[0]
        ret, resp = ufile_set_cache._auth.putstream(batch_key, filename, sio)
        if resp.status_code != 200:
            if (resp.status_code == 400 and
                    json.loads(resp.content)[u'ErrMsg'] == u'bucket not exist'):
                # Create the missing bucket; the caller is expected to retry.
                ensure_bucket(batch_key)
                raise Exception(
                    '{} bucket not exist, created, upload again'.format(batch_key))
            raise Exception('{} upload ufile error: {}'.format(batch_id, b64url))

        # Build a Postgres array literal such as {"g1", "g2"} from 'g1, g2'.
        _groups = ', '.join('"{}"'.format(g.strip()) for g in groups.split(','))
        sql1 = (
            "insert into accessed (batch_id, groups, status, b64url, url_hash) "
            "values ('{}', '{{{}}}', '{}', '{}', '{}');".format(
                batch_id, _groups, 0, b64url, url_hash))
        sql2 = ("insert into cached (b64url, url_hash) values ('{}', '{}');"
                "".format(b64url, url_hash))
        dbwrapper.execute(sql1)
        dbwrapper.execute(sql2)

        now = datetime.now()
        log_line = json.dumps({
            'date': str(now),
            'batch_id': batch_id,
            'groups': groups,
            'url': base64.urlsafe_b64decode(b64url),
        })
        cachelog.get_logger(batch_id, now.strftime('%Y%m%d'),
                            FSCACHEDIR).info(log_line)
    except Exception as e:
        return {'success': False, 'error': e}
    return {'success': True}
def access_with_cache(url_hash):
    """Download the cached object for a url_hash.

    The qiniu object key is the sha256 of url + created_time of the most
    recent cached row for that url_hash.
    """
    sql = ("select url, created_time from cached where url_hash='{}' "
           "order by created_time desc limit 1;".format(url_hash))
    ret = dbwrapper.execute(sql).results
    url, created_time = ret[0]
    qiniukey = hashlib.sha256(url + created_time).hexdigest()
    return download(qiniukey)
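

# Illustrative sketch (hypothetical values) of how the qiniu object key used by
# access_with_cache is derived: sha256 over the concatenation of the cached url
# and its created_time.
def _example_qiniu_key():
    import hashlib
    url = b'http://example.com/page'         # hypothetical cached.url value
    created_time = b'2017-01-01 00:00:00'    # hypothetical cached.created_time value
    return hashlib.sha256(url + created_time).hexdigest()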
def db_set_cache(b64url, url_hash, batch_id, groups, content, refresh):
    """Store page content in the database.

    A sha1 content_hash is stored with the cached row so later writes can be
    compared against it, e.g. to tell a 404/502 error page from a real webpage.
    """
    try:
        content_hash = hashlib.sha1(content).hexdigest()
        sql1 = ("insert into accessed (batch_id, status, b64url, url_hash) "
                "values ('{}', '{}', '{}', '{}');".format(
                    batch_id, 0, b64url, url_hash))
        # Insert the cached row and its base64-encoded content in one statement,
        # using the RETURNING id of the new cached row as contents.cached_id.
        sql2 = (
            "with inserted as ("
            "insert into cached (b64url, url_hash, content_hash) "
            "values ('{}', '{}', '{}') RETURNING id) "
            "insert into contents (cached_id, content) "
            "values ((select id from inserted), '{}');"
            "".format(b64url, url_hash, content_hash,
                      base64.standard_b64encode(content)))
        dbwrapper.execute(sql1)
        dbwrapper.execute(sql2)
    except Exception as e:
        return {'success': False, 'error': e}
    return {'success': True}
def db_get_cache(url_hash):
    """Return the cached content for a url_hash, base64-decoded."""
    # Alternative query that picks only the most recent cached row (unused).
    sql1 = ("select content from contents where cached_id = "
            "(select id from cached where url_hash='{}' "
            "order by created_time desc limit 1);".format(url_hash))
    sql2 = ("select content from contents as a "
            "inner join cached as b on a.cached_id=b.id "
            "where b.url_hash='{}';".format(url_hash))
    try:
        # e.g. RowResult(columns=['content'], results=[('WkhWdFpTNWpiMjA9',)])
        ret = dbwrapper.execute(sql2, result=True)
        if ret.results == []:
            return {'success': False}
        html = base64.standard_b64decode(ret.results[0][0])
    except Exception as e:
        return {'success': False, 'error': e}
    return {'success': True, 'content': html}
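

# Illustrative sketch of the encode/decode round trip that db_set_cache and
# db_get_cache rely on: content is stored base64-encoded in contents.content,
# with a sha1 content_hash kept alongside it in cached. Sample content only.
def _example_content_round_trip():
    import base64
    import hashlib
    content = b'<html>hello</html>'                       # hypothetical page body
    content_hash = hashlib.sha1(content).hexdigest()      # stored in cached.content_hash
    stored = base64.standard_b64encode(content)           # stored in contents.content
    assert base64.standard_b64decode(stored) == content   # what db_get_cache returns
    return content_hash, stored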
def db_get_all_cache(batch_id):
    """Get content for every distinct url accessed across all runs of a batch.

    The trailing run number of batch_id, if any, is stripped so the query
    covers every batch_id sharing that prefix.
    """
    sql = (
        "select max(url) as url, url_hash, max(content) as content, max(created_time) "
        "from (select max(c.content) as content, max(b.url) as url, b.url_hash, b.created_time "
        "from accessed as a "
        "left join cached as b on a.url_hash=b.url_hash "
        "inner join contents as c on b.id=c.cached_id "
        "where a.batch_id like '{}%' "
        "group by b.url_hash, b.created_time "
        "order by b.created_time desc) as result "
        "group by url_hash;".format(re.sub(r'\d+$', '', batch_id)))
    try:
        ret = dbwrapper.execute(sql).results
        if ret == []:
            return {'success': False}
    except Exception as e:
        return {'success': False, 'error': e}
    result = [(url, url_hash, content) for url, url_hash, content, _ in ret]
    return {'success': True, 'hash_content_pair': result}
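

# Illustrative sketch of the batch_id matching used in db_get_all_cache: the
# trailing run number is stripped so one query covers every run of a batch.
# The batch_id value is a hypothetical example.
def _example_batch_prefix():
    import re
    batch_id = 'crawl-2017-3'                  # hypothetical batch id
    prefix = re.sub(r'\d+$', '', batch_id)     # 'crawl-2017-'
    like_pattern = "{}%".format(prefix)        # matches crawl-2017-1, crawl-2017-2, ...
    return like_pattern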