Example #1
    def get(self, request, *args, **kwargs):
        file_storage = S3FileStorage()
        path = request.GET.get("path")
        if path is None:
            return HttpResponseBadRequest("Expected a `path` parameter")

        if not path.startswith(file_storage.base_prefix):
            return HttpResponseNotFound()

        client = get_s3_client()
        try:
            file_object = client.get_object(Bucket=file_storage.bucket,
                                            Key=path)
        except ClientError as ex:
            try:
                return HttpResponse(
                    status=ex.response["ResponseMetadata"]["HTTPStatusCode"])
            except KeyError:
                return HttpResponseServerError()

        response = StreamingHttpResponseWithoutDjangoDbConnection(
            file_object["Body"].iter_chunks(chunk_size=65536),
            content_type=file_object["ContentType"],
        )
        # The download filename is the key's basename with its "!" suffix removed
        filename = os.path.split(path)[-1].rpartition("!")[0]
        response["Content-Disposition"] = f'attachment; filename="{filename}"'
        response["Content-Length"] = file_object["ContentLength"]
        return response
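
All four examples stream through StreamingHttpResponseWithoutDjangoDbConnection, whose definition is not shown here. Below is a minimal sketch of what the name suggests, assuming the class simply releases Django's database connections before a long-running stream begins so the download does not hold a connection open; this is a hypothetical illustration, not the project's actual implementation:

from django.db import connections
from django.http import StreamingHttpResponse


class StreamingHttpResponseWithoutDjangoDbConnection(StreamingHttpResponse):
    # Hypothetical sketch: close all Django DB connections up front, then
    # behave exactly like a regular StreamingHttpResponse
    def __init__(self, streaming_content=(), *args, **kwargs):
        connections.close_all()
        super().__init__(streaming_content, *args, **kwargs)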
Example #2
    def get(self, request, *args, **kwargs):
        dataset = find_dataset(self.kwargs.get('dataset_uuid'), request.user)

        if not dataset.user_has_access(request.user):
            return HttpResponseForbidden()

        source_link = get_object_or_404(SourceLink,
                                        id=self.kwargs.get('source_link_id'),
                                        dataset=dataset)

        log_event(
            request.user,
            EventLog.TYPE_DATASET_SOURCE_LINK_DOWNLOAD,
            source_link.dataset,
            extra={
                'path': request.get_full_path(),
                **serializers.serialize('python', [source_link])[0],
            },
        )
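        # F() increments the counter atomically in SQL
        # (SET number_of_downloads = number_of_downloads + 1), avoiding lost
        # updates when concurrent downloads hit the same row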
        dataset.number_of_downloads = F('number_of_downloads') + 1
        dataset.save(update_fields=['number_of_downloads'])

        if source_link.link_type == source_link.TYPE_EXTERNAL:
            return HttpResponseRedirect(source_link.url)

        client = boto3.client('s3')
        try:
            file_object = client.get_object(Bucket=settings.AWS_UPLOADS_BUCKET,
                                            Key=source_link.url)
        except ClientError as ex:
            try:
                return HttpResponse(
                    status=ex.response['ResponseMetadata']['HTTPStatusCode'])
            except KeyError:
                return HttpResponseServerError()

        response = StreamingHttpResponseWithoutDjangoDbConnection(
            file_object['Body'].iter_chunks(chunk_size=65536),
            content_type=file_object['ContentType'],
        )
        filename = source_link.get_filename()
        response['Content-Disposition'] = f'attachment; filename="{filename}"'
        response['Content-Length'] = file_object['ContentLength']

        return response
Example #3
def table_api_rows_POST(request, table_id):
    # POST is used so that Google Data Studio can send an HTTP body: it
    # doesn't appear to be able to send GETs with bodies
    sourcetable = SourceTable.objects.get(id=table_id)
    request_dict = json.loads(request.body)
    column_names = [field['name'] for field in request_dict['fields']]

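    # Keyset ("search after") pagination: resume strictly after the last-seen
    # primary key values rather than using OFFSET, which degrades on deep pages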
    def query_vars_search_after(fields_sql, schema_sql, table_sql,
                                primary_key_sql):
        search_after = request_dict['$searchAfter']

        return (
            sql.SQL('SELECT {},{} FROM {}.{} WHERE ({}) > ({}) ORDER BY {}')
            .format(
                primary_key_sql,
                fields_sql,
                schema_sql,
                table_sql,
                primary_key_sql,
                sql.SQL(',').join(sql.Placeholder() * len(search_after)),
                primary_key_sql,
            ),
            tuple(search_after),
        )

    def query_vars_paginated(fields_sql, schema_sql, table_sql,
                             primary_key_sql):
        pagination = request_dict['pagination']
        limit = int(pagination['rowCount'])
        offset = int(pagination['startRow']) - 1  # Google Data Studio startRow is 1-indexed

        return (
            sql.SQL('''
            SELECT {},{} FROM {}.{} ORDER BY {} LIMIT %s OFFSET %s
        ''').format(primary_key_sql, fields_sql, schema_sql, table_sql,
                    primary_key_sql),
            (limit, offset),
        )

    def query_vars_non_paginated(fields_sql, schema_sql, table_sql,
                                 primary_key_sql):
        return (
            sql.SQL('''
            SELECT {},{} FROM {}.{} ORDER BY {}
        ''').format(primary_key_sql, fields_sql, schema_sql, table_sql,
                    primary_key_sql),
            (),
        )

    if '$searchAfter' in request_dict:
        query_vars = query_vars_search_after
    elif 'pagination' in request_dict:
        query_vars = query_vars_paginated
    else:
        query_vars = query_vars_non_paginated

    schema_value_funcs = [
        (schema, value_func)
        for schema, value_func in schema_value_func_for_data_types(sourcetable)
        if schema['name'] in column_names
    ]

    # https://developers.google.com/apps-script/guides/services/quotas#current_limitations
    # URL Fetch response size limit is 50 MB; keep a buffer for HTTP headers and
    # the $searchAfter payload
    num_bytes_max = 49_990_000

    # StreamingHttpResponse translates to HTTP/1.1 chunked encoding performed by
    # gunicorn. However, we have no visibility of the actual bytes sent as part of
    # the HTTP body, i.e. each chunk header and footer, and we don't appear to be
    # able to work around this by implementing our own chunked encoder that makes
    # them visible. The best we can do is make a good guess at their sizes and add
    # them to the total number of bytes sent
    def len_chunk_header(num_chunk_bytes):
        return len('%X\r\n' % num_chunk_bytes)

    len_chunk_footer = len('\r\n')

    chunk_size = 16384
    queue = []
    num_bytes_queued = 0
    num_bytes_sent = 0
    num_bytes_sent_and_queued = 0

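    # Coalesce incoming row bytes into fixed-size chunks, tracking how many
    # bytes (payload plus estimated chunk framing) have been sent so far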
    def yield_chunks(row_bytes):
        nonlocal queue
        nonlocal num_bytes_queued
        nonlocal num_bytes_sent
        nonlocal num_bytes_sent_and_queued

        queue.append(row_bytes)
        num_bytes_queued += len(row_bytes)
        num_bytes_sent_and_queued += len(row_bytes)

        while num_bytes_queued >= chunk_size:
            to_send_bytes = b''.join(queue)
            chunk, to_send_bytes = (
                to_send_bytes[:chunk_size],
                to_send_bytes[chunk_size:],
            )
            queue = [to_send_bytes] if to_send_bytes else []
            num_bytes_queued = len(to_send_bytes)
            num_bytes_sent += (len(chunk) + len_chunk_header(len(chunk)) +
                               len_chunk_footer)
            yield chunk

    def yield_remaining():
        if queue:
            yield b''.join(queue)

    def yield_schema_and_rows_bytes():
        try:
            yield from yield_chunks(
                b'{"schema":' +
                json.dumps(get_schema(schema_value_funcs)).encode('utf-8') +
                b',"rows":[')

            for i, (row, search_after) in enumerate(
                    get_rows(sourcetable, schema_value_funcs, query_vars)):
                row_bytes = json.dumps(row).encode('utf-8')
                # Separate rows with commas so the output is a valid JSON array
                yield from yield_chunks(b',' + row_bytes if i != 0 else row_bytes)

                if num_bytes_sent_and_queued > num_bytes_max:
                    yield from yield_chunks(
                        b'],"$searchAfter":' +
                        json.dumps(search_after).encode('utf-8') + b'}')
                    break
            else:
                yield from yield_chunks(b']}')

            yield from yield_remaining()
        except Exception:
            logger.exception('Error streaming to Google Data Studio')
            raise

    return StreamingHttpResponseWithoutDjangoDbConnection(
        yield_schema_and_rows_bytes(),
        content_type='application/json',
        status=200)
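
The chunk-framing guess in len_chunk_header can be checked by hand: HTTP/1.1 chunked encoding frames each chunk with a hexadecimal size line plus CRLF and a trailing CRLF, so a full 16384-byte chunk costs 16384 + 6 + 2 bytes on the wire:

# Worked check of the chunk-framing arithmetic used in Example #3
assert '%X' % 16384 == '4000'       # hex size of a full chunk
assert len('%X\r\n' % 16384) == 6   # chunk header: size line plus CRLF
assert len('\r\n') == 2             # chunk footer
# Total on the wire per full 16384-byte chunk: 16384 + 6 + 2 = 16392 bytes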
Example #4
    def post(self, request, *args, **kwargs):
        if request.GET.get("download"):
            start = 0
            dataset = self.get_object()
            limit = (dataset.data_grid_download_limit
                     if dataset.data_grid_download_enabled else 5000)
            filters = json.loads(request.POST.get("filters", "{}"))
        else:
            start = int(request.POST.get("start", 0))
            limit = min(int(request.POST.get("limit", 100)), 100)
            filters = json.loads(request.body.decode("utf-8")).get("filters", {})

        search_term = request.GET.get("q")
        filters = build_grid_filters(self._get_column_config(), filters)

        result_count = es_client.get_count(search_term,
                                           self._get_index_alias(),
                                           filters=filters)

        results_proxy = ResultsProxy(
            es_client=es_client,
            index_alias=self._get_index_alias(),
            phrase=search_term,
            count=result_count,
            filters=filters,
        )
        paginator = Paginator(results_proxy, limit)
        page_number = 1 if start <= 0 else start // limit + 1
        results = paginator.get_page(page_number)
        records = []
        if len(results) > 0 and "_source" in results[0]:
            records = [result["_source"] for result in results]

        if request.GET.get("download"):

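            # csv.writer calls write() on the object it is given; by returning
            # the value instead of storing it, each writerow() call returns the
            # encoded line, letting rows be streamed without buffering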
            class PseudoBuffer:
                def write(self, value):
                    return value

            field_names = request.POST.getlist("columns")

            pseudo_buffer = PseudoBuffer()
            csv_writer = csv.DictWriter(pseudo_buffer,
                                        field_names,
                                        extrasaction="ignore",
                                        quoting=csv.QUOTE_NONNUMERIC)

            def data():
                # Header row first, then one CSV line per record
                yield csv_writer.writerow({name: name for name in field_names})
                yield from (csv_writer.writerow(record) for record in records)

            filename = request.POST.get("export_file_name")
            response = StreamingHttpResponseWithoutDjangoDbConnection(
                data(),
                content_type="text/csv",
            )
            response["Content-Disposition"] = f'attachment; filename="{filename}"'
            return response

        return JsonResponse({"total": result_count, "records": records})