def get(self, request, document_type_code: str, *_args, **_kwargs):
    start = time.time()
    try:
        document_type = DocumentType.objects.get(code=document_type_code)
        project_ids = as_int_list(request.GET, 'project_ids')  # type: List[int]
        columns = as_str_list(request.GET, 'columns')
        include_annotations = as_bool(request.GET, 'associated_text')
        if include_annotations:
            all_annotation_columns = get_annotation_columns(document_type)
            # Add the annotation column for every requested base column.
            # Note: the original rstrip(FIELD_CODE_ANNOTATION_SUFFIX) strips a character
            # set, not a suffix, and can over-strip field codes; an explicit suffix
            # check avoids that.
            columns += [c.field_code for c in all_annotation_columns
                        if c.field_code.endswith(FIELD_CODE_ANNOTATION_SUFFIX) and
                        c.field_code[:-len(FIELD_CODE_ANNOTATION_SUFFIX)] in columns]

        fmt = request.GET.get('fmt') or self.FMT_JSON

        offset = as_int(request.GET, 'offset', None)
        if offset is not None and offset < 0:
            offset = None

        limit = as_int(request.GET, 'limit', None)
        if limit is not None and limit <= 0:
            limit = None
        # For JSON output we cap the number of returned documents because we don't use
        # a streaming response for JSON and want to keep it fast.
        if fmt == self.FMT_JSON and self.MAX_RETURNED_DOCUMENTS_JSON is not None \
                and (limit is None or limit > self.MAX_RETURNED_DOCUMENTS_JSON):
            limit = self.MAX_RETURNED_DOCUMENTS_JSON

        saved_filters = as_int_list(request.GET, 'saved_filters')  # type: List[int]

        # Column filters arrive as URL params prefixed with URL_PARAM_PREFIX_FILTER.
        column_filters = []
        for param, value in request.GET.items():  # type: str, str
            if param.startswith(self.URL_PARAM_PREFIX_FILTER):
                column_filters.append((param[len(self.URL_PARAM_PREFIX_FILTER):], value))

        order_by = request.GET.get('order_by') or None  # type: str
        order_by = parse_order_by(order_by) if order_by else None

        save_filter = as_bool(request.GET, 'save_filter', False)  # type: bool
        return_reviewed = as_bool(request.GET, 'return_reviewed', False)
        return_total = as_bool(request.GET, 'return_total', True)
        return_data = as_bool(request.GET, 'return_data', True)
        ignore_errors = as_bool(request.GET, 'ignore_errors', True)

        if project_ids and save_filter:
            column_filters_dict = {c: f for c, f in column_filters}
            for project_id in project_ids:
                # Atomically replace the saved grid config for this user/project:
                # create the new filter, then drop any older ones.
                with transaction.atomic():
                    obj = SavedFilter.objects.create(
                        user=request.user,
                        document_type=document_type,
                        filter_type=FT_USER_DOC_GRID_CONFIG,
                        project_id=project_id,
                        columns=columns,
                        column_filters=column_filters_dict,
                        title=None,
                        order_by=[(column, direction.value)
                                  for column, direction in order_by] if order_by else None)
                    SavedFilter.objects \
                        .filter(user=request.user,
                                document_type=document_type,
                                filter_type=FT_USER_DOC_GRID_CONFIG,
                                project_id=project_id) \
                        .exclude(pk=obj.pk) \
                        .delete()

        query_results = query_documents(
            requester=request.user,
            document_type=document_type,
            project_ids=project_ids,
            column_names=columns,
            saved_filter_ids=saved_filters,
            column_filters=column_filters,
            order_by=order_by,
            offset=offset,
            limit=limit,
            return_documents=return_data,
            return_reviewed_count=return_reviewed,
            return_total_count=return_total,
            ignore_errors=ignore_errors,
            include_annotation_fields=True)  # type: DocumentQueryResults

        if fmt in {self.FMT_XLSX, self.FMT_CSV} and not return_data:
            raise APIRequestError('Export to csv/xlsx requested with return_data=false')

        if fmt == self.FMT_CSV:
            return _query_results_to_csv(query_results)
        elif fmt == self.FMT_XLSX:
            return _query_results_to_xlsx(query_results)
        else:
            if query_results is None:
                return Response({'time': time.time() - start})
            return _query_results_to_json(query_results, time.time() - start)
    except APIRequestError as e:
        return e.to_response()
    except Exception as e:
        return APIRequestError(message='Unable to process request',
                               caused_by=e,
                               http_status_code=500).to_response()
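# ---------------------------------------------------------------------------
# The as_int()/as_bool()/as_int_list()/as_str_list() helpers used above are
# imported from elsewhere in the codebase. Below is a minimal sketch of the
# behavior this view appears to rely on; the exact semantics (separators,
# accepted boolean spellings) are assumptions inferred from the call sites,
# not the real implementation.
# ---------------------------------------------------------------------------
from typing import List, Optional

from django.http import QueryDict


def as_int(params: QueryDict, key: str, default: Optional[int] = None) -> Optional[int]:
    # Parse the param as an int; fall back to the default when absent or unparsable.
    try:
        return int(params[key])
    except (KeyError, TypeError, ValueError):
        return default


def as_bool(params: QueryDict, key: str, default: bool = False) -> bool:
    # Treat "true"/"1" (case-insensitive) as True; missing param -> default.
    value = params.get(key)
    return default if value is None else value.lower() in ('true', '1')


def as_str_list(params: QueryDict, key: str) -> List[str]:
    # Assumed comma-separated form: ?columns=a,b,c
    value = params.get(key)
    return [v.strip() for v in value.split(',') if v.strip()] if value else []


def as_int_list(params: QueryDict, key: str) -> List[int]:
    # Assumed comma-separated form: ?project_ids=1,2,3
    return [int(v) for v in as_str_list(params, key)]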
def get(self, request, document_type_code: str, *_args, **_kwargs):
    start = time.time()
    try:
        document_type = DocumentType.objects.get(code=document_type_code)
        project_ids = as_int_list(request.GET, 'project_ids')  # type: List[int]
        columns = as_str_list(request.GET, 'columns')

        fmt = request.GET.get('fmt') or self.FMT_JSON

        offset = as_int(request.GET, 'offset', None)
        if offset is not None and offset < 0:
            offset = None

        limit = as_int(request.GET, 'limit', None)
        if limit is not None and limit <= 0:
            limit = None
        # For JSON output we cap the number of returned documents because we don't use
        # a streaming response for JSON and want to keep it fast.
        if fmt == self.FMT_JSON and (limit is None or limit > self.MAX_RETURNED_DOCUMENTS_JSON):
            limit = self.MAX_RETURNED_DOCUMENTS_JSON

        saved_filters = as_int_list(request.GET, 'saved_filters')  # type: List[int]

        # Column filters arrive as URL params prefixed with URL_PARAM_PREFIX_FILTER.
        column_filters = []
        for param, value in request.GET.items():  # type: str, str
            if param.startswith(self.URL_PARAM_PREFIX_FILTER):
                column_filters.append((param[len(self.URL_PARAM_PREFIX_FILTER):], value))

        order_by = request.GET.get('order_by') or None  # type: str
        order_by = parse_order_by(order_by) if order_by else None

        save_filter = as_bool(request.GET, 'save_filter', False)  # type: bool
        return_reviewed = as_bool(request.GET, 'return_reviewed', False)
        return_total = as_bool(request.GET, 'return_total', True)
        return_data = as_bool(request.GET, 'return_data', True)
        ignore_errors = as_bool(request.GET, 'ignore_errors', True)

        if project_ids and save_filter:
            column_filters_dict = {c: f for c, f in column_filters}
            for project_id in project_ids:
                # The lookup kwargs don't need to be repeated in defaults:
                # update_or_create() applies them to created objects anyway.
                SavedFilter.objects.update_or_create(
                    user=request.user,
                    document_type=document_type,
                    filter_type=FT_USER_DOC_GRID_CONFIG,
                    project_id=project_id,
                    defaults={
                        'columns': columns,
                        'column_filters': column_filters_dict,
                        'title': None,
                        'order_by': [(column, direction.value)
                                     for column, direction in order_by] if order_by else None
                    })

        query_results = get_documents(
            requester=request.user,
            document_type=document_type,
            project_ids=project_ids,
            column_names=columns,
            saved_filter_ids=saved_filters,
            column_filters=column_filters,
            order_by=order_by,
            offset=offset,
            limit=limit,
            return_documents=return_data,
            return_reviewed_count=return_reviewed,
            return_total_count=return_total,
            ignore_errors=ignore_errors)  # type: DocumentQueryResults

        if fmt.lower() == 'csv':
            if not return_data:
                raise APIRequestError('Export to csv requested with return_data=false')
            resp = StreamingHttpResponse(
                csv_gen(query_results.column_codes,
                        query_results.fetch(),
                        query_results.column_titles),
                content_type='text/csv')
            resp['Content-Disposition'] = 'attachment; filename="export.csv"'
            return resp
        else:
            if query_results is None:
                return Response({'time': time.time() - start})
            # As we limit the number of returned documents for JSON we can keep the
            # response in non-streaming form.
            # Switch to StreamingHttpResponse if/when we really need to return very big
            # JSON output. _query_results_to_json() returns a dict whose document items
            # are backed by a generator. But in local tests on a small number of
            # documents the streaming JSON output worked two times slower than the
            # non-streaming response; CSV was equally fast either way.
            # return StreamingHttpResponse(
            #     json_gen(_query_results_to_json(query_results, time.time() - start)),
            #     content_type='application/json')
            return Response(_query_results_to_json(query_results, time.time() - start))
    except APIRequestError as e:
        return e.to_response()
    except Exception as e:
        return APIRequestError(message='Unable to process request',
                               caused_by=e,
                               http_status_code=500).to_response()
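# ---------------------------------------------------------------------------
# csv_gen() above is imported from elsewhere; its (column_codes, rows,
# column_titles) signature is inferred from the call site. A minimal sketch of
# a compatible generator, using the standard Django "pseudo-buffer" pattern for
# streaming CSV responses, could look like this (an illustration, not the real
# implementation):
# ---------------------------------------------------------------------------
import csv
from typing import Any, Iterable, Iterator, List, Optional


class _Echo:
    # Pseudo-buffer: csv.writer "writes" each formatted row by returning it,
    # so the generator can yield rows one by one to StreamingHttpResponse.
    def write(self, value: str) -> str:
        return value


def csv_gen(column_codes: List[str],
            rows: Iterable[List[Any]],
            column_titles: Optional[List[str]] = None) -> Iterator[str]:
    writer = csv.writer(_Echo())
    # Header row: prefer human-readable titles, fall back to raw column codes.
    yield writer.writerow(column_titles or column_codes)
    for row in rows:
        yield writer.writerow(row)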
def get(self, request, document_type_code: str, *_args, **_kwargs):
    start = time.time()
    try:
        document_type = DocumentType.objects.get(code=document_type_code)
        project_ids = as_int_list(request.GET, 'project_ids')  # type: List[int]
        columns = as_str_list(request.GET, 'columns')
        include_annotations = as_bool(request.GET, 'associated_text')
        if include_annotations:
            all_annotation_columns = get_annotation_columns(document_type)
            # Add the annotation column for every requested base column.
            # Note: the original rstrip(FIELD_CODE_ANNOTATION_SUFFIX) strips a character
            # set, not a suffix, and can over-strip field codes; an explicit suffix
            # check avoids that.
            columns += [c.field_code for c in all_annotation_columns
                        if c.field_code.endswith(FIELD_CODE_ANNOTATION_SUFFIX) and
                        c.field_code[:-len(FIELD_CODE_ANNOTATION_SUFFIX)] in columns]

        fmt = request.GET.get('fmt') or self.FMT_JSON
        as_zip = request.GET.get('as_zip') == 'true'

        offset = as_int(request.GET, 'offset', None)
        if offset is not None and offset < 0:
            offset = None

        limit = as_int(request.GET, 'limit', None)
        if limit is not None and limit <= 0:
            limit = None
        # For JSON output we cap the number of returned documents because we don't use
        # a streaming response for JSON and want to keep it fast.
        if fmt == self.FMT_JSON and self.MAX_RETURNED_DOCUMENTS_JSON is not None \
                and (limit is None or limit > self.MAX_RETURNED_DOCUMENTS_JSON):
            limit = self.MAX_RETURNED_DOCUMENTS_JSON

        saved_filters = as_int_list(request.GET, 'saved_filters')  # type: List[int]

        # Column filters arrive as URL params prefixed with URL_PARAM_PREFIX_FILTER.
        column_filters = []
        for param, value in request.GET.items():  # type: str, str
            if param.startswith(self.URL_PARAM_PREFIX_FILTER):
                column_filters.append((param[len(self.URL_PARAM_PREFIX_FILTER):], value))

        # Filter params may also be packed into a single "filters" param,
        # e.g. &filters=a=b&c=d.
        filter_query_string = request.GET.get('filters')
        if filter_query_string:
            for param, value in ast.literal_eval(filter_query_string).items():  # type: str, str
                if param.startswith(self.URL_PARAM_PREFIX_FILTER):
                    column_filters.append((param[len(self.URL_PARAM_PREFIX_FILTER):], value))

        order_by = request.GET.get('order_by') or None  # type: str
        order_by = parse_order_by(order_by) if order_by else None

        save_filter = as_bool(request.GET, 'save_filter', False)  # type: bool
        return_reviewed = as_bool(request.GET, 'return_reviewed', False)
        return_total = as_bool(request.GET, 'return_total', True)
        return_data = as_bool(request.GET, 'return_data', True)
        ignore_errors = as_bool(request.GET, 'ignore_errors', True)

        if project_ids and save_filter:
            column_filters_dict = {c: f for c, f in column_filters}
            for project_id in project_ids:
                # Atomically replace the saved grid config for this user/project:
                # create the new filter, then drop any older ones.
                with transaction.atomic():
                    obj = SavedFilter.objects.create(
                        user=request.user,
                        document_type=document_type,
                        filter_type=FT_USER_DOC_GRID_CONFIG,
                        project_id=project_id,
                        columns=columns,
                        column_filters=column_filters_dict,
                        title=None,
                        order_by=[(column, direction.value)
                                  for column, direction in order_by] if order_by else None)
                    SavedFilter.objects \
                        .filter(user=request.user,
                                filter_type=FT_USER_DOC_GRID_CONFIG,
                                project_id=project_id) \
                        .exclude(pk=obj.pk) \
                        .delete()

        # show_unprocessed = as_bool(request.GET, 'show_unprocessed', False)
        # if show_unprocessed is False:
        #     column_filters.append((FIELD_CODE_DOC_PROCESSED, 'true'))

        total_documents_query = Document.objects.filter(document_type=document_type)
        if project_ids:
            total_documents_query = total_documents_query.filter(project_id__in=project_ids)
        total_documents_of_type = total_documents_query.count()

        columns_to_query = columns
        if columns_to_query:
            columns_to_query = leave_unique_values(['document_id', 'document_name'] + columns)

        query_results = query_documents(
            requester=request.user,
            document_type=document_type,
            project_ids=project_ids,
            column_names=columns_to_query,
            saved_filter_ids=saved_filters,
            column_filters=column_filters,
            order_by=order_by,
            offset=offset,
            limit=limit,
            return_documents=return_data,
            return_reviewed_count=return_reviewed,
            return_total_count=return_total,
            ignore_errors=ignore_errors,
            include_annotation_fields=True)  # type: DocumentQueryResults

        if query_results is None:
            if fmt in {self.FMT_XLSX, self.FMT_CSV} and not return_data:
                raise APIRequestError('Empty data, nothing to export')
            return Response({'time': time.time() - start})

        # Get per-assignee stats: document ids and counts grouped by assignee.
        assignees_query_results = query_documents(
            requester=request.user,
            document_type=document_type,
            project_ids=project_ids,
            column_names=['document_id', 'assignee_name', 'assignee_id'],
            saved_filter_ids=saved_filters,
            column_filters=column_filters,
            return_documents=True,
            return_reviewed_count=False,
            include_annotation_fields=include_annotations)  # type: DocumentQueryResults
        query_results.assignees = []
        if assignees_query_results is not None:
            df = pd.DataFrame(assignees_query_results.fetch_dicts())
            if not df.empty:
                df = df.groupby(['assignee_id', 'assignee_name']) \
                    .agg({'document_id': [('document_ids', lambda x: list(x)),
                                          ('documents_count', 'count')]})
                if not df.empty:
                    df.columns = df.columns.droplevel()
                    df = df.reset_index()
                    df['assignee_id'] = df['assignee_id'].astype(int)
                    query_results.assignees = df.to_dict('records')
        query_results.unfiltered_count = total_documents_of_type

        if fmt in {self.FMT_XLSX, self.FMT_CSV} and not return_data:
            raise APIRequestError('Export to csv/xlsx requested with return_data=false')

        if fmt == self.FMT_CSV:
            return query_results.to_csv(as_zip=as_zip)
        elif fmt == self.FMT_XLSX:
            return query_results.to_xlsx(as_zip=as_zip)
        else:
            query_dict = query_results.to_json(time_start=start)
            if columns and 'items' in query_dict:
                # Drop service columns that were added for querying but not requested.
                columns_to_remove = []
                if 'document_id' not in columns:
                    columns_to_remove.append('document_id')
                query_dict['items'] = self.expand_items(query_dict['items'], columns_to_remove)
            return Response(query_dict)
    except APIRequestError as e:
        return e.to_response()
    except Exception as e:
        return APIRequestError(message='Unable to process request',
                               caused_by=e,
                               http_status_code=500).to_response()
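# ---------------------------------------------------------------------------
# A standalone illustration of the assignee-stats aggregation above, run on toy
# data (the column names match the view; the values are made up). It shows why
# droplevel()/reset_index()/astype(int) are needed: agg() produces MultiIndex
# columns, groupby() moves the group keys into the index, and assignee_id can
# come back as float when the source column is NaN-capable.
# ---------------------------------------------------------------------------
import pandas as pd

df = pd.DataFrame([
    {'document_id': 1, 'assignee_id': 10.0, 'assignee_name': 'alice'},
    {'document_id': 2, 'assignee_id': 10.0, 'assignee_name': 'alice'},
    {'document_id': 3, 'assignee_id': 11.0, 'assignee_name': 'bob'},
])
df = df.groupby(['assignee_id', 'assignee_name']) \
    .agg({'document_id': [('document_ids', lambda x: list(x)),
                          ('documents_count', 'count')]})
df.columns = df.columns.droplevel()  # drop the outer 'document_id' column level
df = df.reset_index()                # turn the group keys back into columns
df['assignee_id'] = df['assignee_id'].astype(int)
print(df.to_dict('records'))
# [{'assignee_id': 10, 'assignee_name': 'alice', 'document_ids': [1, 2], 'documents_count': 2},
#  {'assignee_id': 11, 'assignee_name': 'bob', 'document_ids': [3], 'documents_count': 1}]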