예제 #1
0
def s3_put_cache(b64url, url_hash, batch_id, groups, content, refresh=False):
    """Upload crawled page content to S3 and record the access in the DB.

    :param b64url: urlsafe-base64-encoded URL being cached
    :param url_hash: hash identifying the URL (part of the object filename)
    :param batch_id: batch identifier, e.g. 'job-3'; the part before the
        last '-' is used as the bucket name
    :param groups: comma-separated group names, e.g. 'g1, g2'
    :param content: raw page body to upload
    :param refresh: accepted for interface compatibility; currently unused
    :return: {'success': True} on success,
             {'success': False, 'error': ...} on failure
    """
    def put_cache(batch_key, filename, content):
        # Upload one object; returns {'success': True} or {'error': ...}.
        try:
            ret = S3.Object(batch_key, filename).put(Body=content)
            if ret['ResponseMetadata']['HTTPStatusCode'] == 200:
                return {'success': True}
            # BUG FIX: the original fell through and implicitly returned
            # None on a non-200 status, which broke the caller's
            # "'error' in ret" membership test with a TypeError.
            return {'error': 'HTTPStatusCode {}'.format(
                ret['ResponseMetadata']['HTTPStatusCode'])}
        except botocore.exceptions.ClientError as e:
            if e.response['Error']['Code'] == 'NoSuchBucket':
                # Create the missing bucket; the outer caller retries once.
                ret = create_bucket(batch_key)
                if isinstance(ret, str):
                    # NOTE(review): create_bucket appears to return an error
                    # string on failure — confirm against its definition.
                    return {'error': ret}

            return {'error': e.response['Error']['Code']}
        except Exception as e:
            return {'error': e}

    try:
        filename = '{}_{}'.format(batch_id, url_hash)
        batch_key = batch_id.rsplit('-', 1)[0]

        ret = put_cache(batch_key, filename, content)
        if ret.get('error') == 'NoSuchBucket':
            # Bucket was created on the fly above; retry the upload once.
            ret = put_cache(batch_key, filename, content)
        if 'error' in ret:
            return {'success': False, 'error': ret['error']}

        # Render 'g1, g2' as '"g1", "g2"' for the Postgres array literal.
        # Replaces the fragile str(map(string.strip, ...)) repr hack, which
        # also breaks on Python 3 (map() no longer reprs as a list).
        _groups = ', '.join('"{}"'.format(g.strip())
                            for g in groups.split(','))
        # NOTE(review): SQL is built by string interpolation — injectable if
        # any field can contain quotes; parameterize via dbwrapper if possible.
        sql1 = (
            "insert into accessed (batch_id, groups, status, b64url, url_hash) "
            "values ('{}', '{{{}}}', '{}', '{}', '{}');".format(
                batch_id, _groups, 0, b64url, url_hash))
        sql2 = ("insert into cached (b64url, url_hash) values ('{}', '{}')"
                ";".format(b64url, url_hash))
        dbwrapper.execute(sql1)
        dbwrapper.execute(sql2)

        now = datetime.now()
        log_line = json.dumps({
            'date': str(now),
            'batch_id': batch_id,
            'groups': groups,
            'url': base64.urlsafe_b64decode(b64url),
        })
        cachelog.get_logger(batch_id, now.strftime('%Y%m%d'),
                            FSCACHEDIR).info(log_line)
    except Exception as e:
        return {'success': False, 'error': e}
    return {'success': True}
예제 #2
0
def ufile_set_cache(b64url,
                    url_hash,
                    batch_id,
                    groups,
                    content,
                    refresh=False):
    """Upload crawled page content to UFile and record the access in the DB.

    :param b64url: urlsafe-base64-encoded URL being cached
    :param url_hash: hash identifying the URL (part of the object filename)
    :param batch_id: batch identifier; the part before the last '-' is the
        UFile bucket name
    :param groups: comma-separated group names, e.g. 'g1, g2'
    :param content: raw page body to upload
    :param refresh: accepted for interface compatibility; currently unused
    :return: {'success': True} on success,
             {'success': False, 'error': ...} on failure
    :raises: nothing — all exceptions are caught and reported in the result
    """
    # Lazily build the auth client once and memoize it on the function object.
    if not hasattr(ufile_set_cache, '_auth'):
        put_auth = putufile.PutUFile(public_key, private_key)
        setattr(ufile_set_cache, '_auth', put_auth)

    try:
        sio = StringIO(content)
        filename = '{}_{}'.format(batch_id, url_hash)
        batch_key = batch_id.rsplit('-', 1)[0]
        ret, resp = ufile_set_cache._auth.putstream(batch_key, filename, sio)
        if resp.status_code != 200:

            if resp.status_code == 400:
                if json.loads(resp.content)[u'ErrMsg'] == u'bucket not exist':
                    # Create the bucket, then signal the caller (via the
                    # error result) that the upload should be retried.
                    ensure_bucket(batch_key)
                    raise Exception(
                        '{} bucket not exist, create, upload again'.format(
                            batch_key))

            raise Exception('{} upload ufile error: {}'.format(
                batch_id, b64url))

        # Render 'g1, g2' as '"g1", "g2"' for the Postgres array literal.
        # Replaces the fragile str(map(string.strip, ...)) repr hack, which
        # also breaks on Python 3 (map() no longer reprs as a list).
        _groups = ', '.join('"{}"'.format(g.strip())
                            for g in groups.split(','))
        # NOTE(review): SQL is built by string interpolation — injectable if
        # any field can contain quotes; parameterize via dbwrapper if possible.
        sql1 = (
            "insert into accessed (batch_id, groups, status, b64url, url_hash) "
            "values ('{}', '{{{}}}', '{}', '{}', '{}');".format(
                batch_id, _groups, 0, b64url, url_hash))
        sql2 = ("insert into cached (b64url, url_hash) values ('{}', '{}')"
                ";".format(b64url, url_hash))
        dbwrapper.execute(sql1)
        dbwrapper.execute(sql2)

        now = datetime.now()
        log_line = json.dumps({
            'date': str(now),
            'batch_id': batch_id,
            'groups': groups,
            'url': base64.urlsafe_b64decode(b64url),
        })
        cachelog.get_logger(batch_id, now.strftime('%Y%m%d'),
                            FSCACHEDIR).info(log_line)
    except Exception as e:
        return {'success': False, 'error': e}
    return {'success': True}
예제 #3
0
def access_with_cache(qiniukey):
    """Look up the newest cache row for a URL hash and download its content.

    :param qiniukey: hash of the URL (matched against cached.url_hash).
        NOTE(review): the name is misleading — the actual storage key
        (sha256 of url + created_time) is derived below; confirm callers
        pass the url_hash here.
    :return: result of download() for the derived storage key, or
             {'success': False} when no cache entry exists.
    """
    # BUG FIX: the original interpolated an undefined name `hashkey`,
    # raising NameError on every call; use the parameter instead.
    sql = ("select url, created_time from cached where url_hash='{}' "
           "order by created_time desc limit 1;".format(qiniukey))
    rows = dbwrapper.execute(sql).results
    if not rows:
        # No cached entry for this hash; report in the style of the other
        # cache getters instead of crashing with an IndexError.
        return {'success': False}

    url, created_time = rows[0]
    # The storage key is the sha256 of the URL plus its creation time.
    storage_key = hashlib.sha256(url + created_time).hexdigest()
    return download(storage_key)
예제 #4
0
def db_set_cache(b64url, url_hash, batch_id, groups, content, refresh):
    """Persist a crawled page body directly into the database cache.

    Records the access in `accessed`, then inserts the cache row and its
    base64-encoded body in one statement via a data-modifying CTE
    (`cached` RETURNING id -> `contents`).

    :param b64url: urlsafe-base64-encoded URL
    :param url_hash: hash identifying the URL
    :param batch_id: crawl batch identifier
    :param groups: group names; not stored by this backend
    :param content: raw page body
    :param refresh: accepted for interface compatibility; unused here
    :return: {'success': True} or {'success': False, 'error': ...}
    """
    try:
        # SHA-1 of the raw body lets later runs detect unchanged content
        # (or error pages) by comparing content_hash.
        digest = hashlib.sha1(content).hexdigest()
        encoded_body = base64.standard_b64encode(content)

        accessed_sql = ("insert into accessed (batch_id, status, b64url, url_hash) "
                        "values ('{}', '{}', '{}', '{}');".format(
                            batch_id, 0, b64url, url_hash))

        cached_sql = (
            "with inserted as ("
            "insert into cached (b64url, url_hash, content_hash) values ('{}', '{}', '{}')"
            " RETURNING id )"
            "insert into contents (cached_id, content) values ((select id from inserted), '{}');"
            "".format(b64url, url_hash, digest, encoded_body))

        dbwrapper.execute(accessed_sql)
        dbwrapper.execute(cached_sql)
    except Exception as e:
        return {'success': False, 'error': e}
    return {'success': True}
예제 #5
0
def db_get_cache(url_hash):
    """Fetch the most recently cached page body for *url_hash*.

    :param url_hash: hash identifying the URL
    :return: {'success': True, 'content': html} when found,
             {'success': False} when no row matches,
             {'success': False, 'error': ...} on failure.
    """
    # Join cached -> contents and pin the newest row explicitly: the
    # original ran an unordered join and took results[0][0], which is
    # nondeterministic when several versions of the page exist.  A second,
    # unused query string that carried the intended ordering was removed
    # and its "order by ... limit 1" folded in here.
    sql = ("select content from contents as a "
           "inner join cached as b on a.cached_id=b.id "
           "where b.url_hash='{}' "
           "order by b.created_time desc limit 1;".format(url_hash))

    try:
        # RowResult(columns=['content'], results=[('WkhWdFpTNWpiMjA9',)])
        ret = dbwrapper.execute(sql, result=True)
        if not ret.results:
            return {'success': False}

        # Bodies are stored base64-encoded; decode back to raw HTML.
        html = base64.standard_b64decode(ret.results[0][0])
    except Exception as e:
        return {'success': False, 'error': e}
    return {'success': True, 'content': html}
예제 #6
0
def db_get_all_cache(batch_id):
    """Return the newest cached content for every distinct URL in a batch family.

    The trailing run of digits is stripped from *batch_id* so all batches
    sharing the same prefix (e.g. 'job-1', 'job-2') are matched via LIKE.

    :param batch_id: batch identifier, e.g. 'job-3'
    :return: {'success': True, 'hash_content_pair': [(url, url_hash, content), ...]}
             on success; {'success': False[, 'error': ...]} when nothing
             matched or the query failed.
    """
    # BUG FIX: raw string for the regex — '\d' in a plain literal is an
    # invalid escape sequence (DeprecationWarning / future SyntaxError).
    batch_prefix = re.sub(r'\d+$', '', batch_id)
    sql = (
        "select max(url) as url, url_hash, max(content) as content, max(created_time) "
        "from (select max(c.content) as content, max(b.url) as url, b.url_hash, b.created_time "
        "from accessed as a "
        "left join cached as b on a.url_hash=b.url_hash "
        "inner join contents as c on b.id=c.cached_id "
        "where a.batch_id like '{}%' "
        "group by b.url_hash, b.created_time "
        "order by b.created_time desc) as result "
        "group by url_hash;".format(batch_prefix))
    try:
        rows = dbwrapper.execute(sql).results
        if not rows:
            return {'success': False}
    except Exception as e:
        return {'success': False, 'error': e}

    # Drop the created_time column; callers only need (url, hash, content).
    result = [(url, url_hash, content) for url, url_hash, content, _ in rows]
    return {'success': True, 'hash_content_pair': result}