def test_dump_restore(log, spec, releases, tmpdir):
    for _ in spec.process_items(releases):
        pass
    spec.dump(tmpdir / "result.json")
    spec2 = DataPreprocessor.restore(tmpdir / "result.json")

    for name, table in spec.tables.items():
        assert table == spec2.tables[name]
    for key in (
        "schema",
        "root_tables",
        "combined_tables",
        "header_separator",
        "tables",
        "table_threshold",
        "total_items",
    ):
        assert key in spec2.__dict__

    with patch("builtins.open", mock_open(read_data="invalid")):
        spec2 = DataPreprocessor.restore(tmpdir / "result.json")
        log.assert_has_calls([call("Invalid pickle file. Can't restore.")])

    with patch("builtins.open", mock_open(read_data=b"invalid")):
        spec2 = DataPreprocessor.restore(tmpdir / "result.json")
        log.assert_has_calls([call("Invalid pickle file. Can't restore.")])
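# For context, a minimal sketch of the dump/restore round-trip exercised by this test,
# built from the constructor and calls visible in the other snippets; import paths and
# fixture shapes (schema, releases, workdir) are assumptions, not verified library API.

spec = DataPreprocessor(
    schema,
    ROOT_TABLES,
    combined_tables=COMBINED_TABLES,
    language=LOCALE,
    table_threshold=TABLE_THRESHOLD,
)
for _ in spec.process_items(releases):           # walk the package once to collect table stats
    pass
spec.dump(workdir / "analyzed.json")             # persist the analyzed state to disk
spec2 = DataPreprocessor.restore(workdir / "analyzed.json")
assert spec.tables.keys() == spec2.tables.keys()
# On unreadable input, restore() logs "Invalid pickle file. Can't restore." (see the test above).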
def create(self, request, *args, upload_id=None, url_id=None):
    data = request.data or request.POST
    kind = data.get("kind", DataSelection.CUSTOM)
    headings_type = DataSelection.OCDS
    if kind != DataSelection.OCDS_LITE:
        serializer = self.get_serializer_class()(data=data)
        if serializer.is_valid():
            datasource = Url.objects.get(id=url_id) if url_id else Upload.objects.get(id=upload_id)
            selection = DataSelection.objects.create(kind=kind, headings_type=headings_type)
            spec = DataPreprocessor.restore(datasource.analyzed_file.path)
            for table in serializer.data["tables"]:
                _table = Table.objects.create(**table)
                _table.should_split = spec[_table.name].splitted
                _table.save()
                selection.tables.add(_table)
            datasource.selections.add(selection)
            return Response(
                self.get_serializer_class()(selection).data,
                status=status.HTTP_201_CREATED,
            )
        else:
            return Response({"detail": serializer.errors}, status=status.HTTP_400_BAD_REQUEST)
    else:
        datasource = Url.objects.get(id=url_id) if url_id else Upload.objects.get(id=upload_id)
        if not datasource.available_tables:
            return Response(
                {"detail": _("Datasource without available tables")},
                status=status.HTTP_400_BAD_REQUEST,
            )
        lang_code = get_language()
        lang_prefix = lang_code.split("-")[0]
        headings_type = f"{lang_prefix}_user_friendly"
        selection = DataSelection.objects.create(kind=kind, headings_type=headings_type)
        spec = DataPreprocessor.restore(datasource.analyzed_file.path)
        for available_table in datasource.available_tables:
            if available_table["name"] in OCDS_LITE_CONFIG["tables"]:
                _name = available_table["name"]
                _split = OCDS_LITE_CONFIG["tables"][_name].get("split", False)
                _table = Table.objects.create(name=_name, split=_split)
                child_tables_data = spec.tables[_name].child_tables
                if _split and child_tables_data:
                    for child_table in child_tables_data:
                        _include = child_table in OCDS_LITE_CONFIG["tables"][_name].get("child_tables", {})
                        _child_table = Table.objects.create(name=child_table, include=_include)
                        _table.array_tables.add(_child_table)
                selection.tables.add(_table)
        datasource.selections.add(selection)
        return Response(
            self.get_serializer_class()(selection).data,
            status=status.HTTP_201_CREATED,
        )
def __init__(
    self,
    workdir,
    schema=None,
    state_file=None,
    root_tables=ROOT_TABLES,
    combined_tables=COMBINED_TABLES,
    pkg_type="releases",
    language=LOCALE,
    table_threshold=TABLE_THRESHOLD,
):
    self.workdir = Path(workdir)
    self.multiple_values = False
    self.schema = schema
    self.root_tables = root_tables
    self.combined_tables = combined_tables
    self.language = language
    self.table_threshold = table_threshold
    if state_file:
        self.spec = DataPreprocessor.restore(state_file)
        self.sort_tables()
    else:
        self.spec = None
    self.pkg_type = pkg_type
    self.order = None
def update(self, request, *args, **kwargs):
    try:
        if "url_id" in kwargs:
            datasource = Url.objects.get(id=kwargs["url_id"])
        elif "upload_id" in kwargs:
            datasource = Upload.objects.get(id=kwargs["upload_id"])
        table = Table.objects.get(id=kwargs["id"])
        spec = DataPreprocessor.restore(datasource.analyzed_file.path)
        update_fields = []
        for key in ("split", "include", "heading"):
            if key in request.data:
                setattr(table, key, request.data[key])
                update_fields.append(key)
        if update_fields:
            table.save(update_fields=update_fields)
        is_array_tables = len(table.array_tables.all())
        if "split" in request.data and request.data["split"] and not is_array_tables:
            child_tables = spec.tables[table.name].child_tables
            self._split_table(table, spec.tables, datasource, child_tables)
        serializer = self.get_serializer_class()(table)
        sources = table.dataselection_set.all() or table.array_tables.all()[0].dataselection_set.all()
        if sources:
            sources[0].flattens.all().delete()
        return Response(serializer.data)
    except FileNotFoundError as e:
        extra = {
            "MESSAGE_ID": "update_table_failed",
            "DATASOURCE_ID": str(datasource.id),
            "TABLE_ID": kwargs["id"],
            "ERROR_MSG": str(e),
            "EXPIRED_AT": datasource.expired_at.isoformat(),
        }
        logger.info("Error while update table %s" % str(e), extra=extra)
        return Response({"detail": _("Datasource expired.")}, status=status.HTTP_404_NOT_FOUND)
    except OSError as e:
        extra = {
            "MESSAGE_ID": "update_table_failed",
            "DATASOURCE_ID": str(datasource.id),
            "TABLE_ID": kwargs["id"],
            "ERROR_MSG": str(e),
        }
        logger.info("Error while update table %s" % str(e), extra=extra)
        return Response(
            {"detail": _("Currently, the space limit was reached. Please try again later.")},
            status=status.HTTP_413_REQUEST_ENTITY_TOO_LARGE,
        )
def set_column_headings(selection, analyzed_file_path):
    current_language_code = get_language()
    spec = DataPreprocessor.restore(analyzed_file_path)
    if selection.headings_type.startswith("es"):
        activate("es")
    for table in selection.tables.all():
        table.column_headings = get_column_headings(selection, spec, table)
        table.save(update_fields=["column_headings"])
        if table.split:
            for a_table in table.array_tables.all():
                a_table.column_headings = get_column_headings(selection, spec, a_table)
                a_table.save(update_fields=["column_headings"])
    activate(current_language_code)
def get_flatten_options(selection):
    selections = {}
    exclude_tables_list = []
    spec = None
    if selection.kind == selection.OCDS_LITE:
        datasource = selection.url_set.all() or selection.upload_set.all()
        spec = DataPreprocessor.restore(datasource[0].analyzed_file.path)
    get_options_for_table(selections, exclude_tables_list, selection, selection.tables, analyzed_data=spec)
    options = {"selection": selections}
    if exclude_tables_list:
        options["exclude"] = exclude_tables_list
    return options
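# A rough illustration of the dict this helper returns and how the flatten tasks below
# consume it. The table names are invented and the exact per-table keys depend on
# get_options_for_table(); only "selection"/"exclude" and the "split" flag are visible
# in the surrounding snippets.

opt = get_flatten_options(selection)
# opt might look roughly like:
# {
#     "selection": {"tenders": {"split": True}, "parties": {"split": False}},
#     "exclude": ["tenders_items"],   # present only when some child tables are excluded
# }
options = FlattenOptions(**opt)          # unpacked into spoonbill's FlattenOptions (see flatten_data)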
def __init__(
    self,
    workdir,
    schema=None,
    state_file=None,
    root_tables=ROOT_TABLES,
    combined_tables=COMBINED_TABLES,
    root_key="releases",
    language=LOCALE,
    table_threshold=TABLE_THRESHOLD,
):
    self.workdir = Path(workdir)
    if state_file:
        self.spec = DataPreprocessor.restore(state_file)
    else:
        self.spec = DataPreprocessor(
            schema,
            root_tables,
            combined_tables=combined_tables,
            language=language,
            table_threshold=table_threshold,
        )
    self.root_key = root_key
def available_tables():
    spec = DataPreprocessor.restore(ANALYZED_DATA_PATH)
    # with open(ANALYZED_DATA_PATH) as fd:
    #     data = json.loads(fd.read())
    _available_tables, unavailable_tables = retrieve_tables(spec)
    return _available_tables, unavailable_tables
def list(self, request, url_id=None, upload_id=None, selection_id=None, table_id=None):
    table = Table.objects.get(id=table_id)
    if url_id:
        datasource = Url.objects.get(id=url_id)
    elif upload_id:
        datasource = Upload.objects.get(id=upload_id)
    datasource_dir = os.path.dirname(datasource.file.path)
    selection = DataSelection.objects.get(id=selection_id)
    try:
        spec = DataPreprocessor.restore(datasource.analyzed_file.path)
        data = []
        if table.split:
            preview_path = f"{datasource_dir}/{table.name}.csv"
            if not os.path.exists(preview_path):
                store_preview_csv(COLUMNS, PREVIEW_ROWS, spec.tables[table.name], preview_path)
            with open(preview_path) as csvfile:
                preview = {
                    "name": spec.tables[table.name].name,
                    "id": str(table.id),
                    "preview": csvfile.read(),
                    "heading": table.heading,
                }
                if selection.headings_type != selection.OCDS:
                    preview["column_headings"] = table.column_headings
                data.append(preview)
            for child_table in table.array_tables.all():
                if not child_table.include:
                    continue
                preview_path = f"{datasource_dir}/{child_table.name}_combined.csv"
                with open(preview_path) as csvfile:
                    preview = {
                        "name": spec.tables[child_table.name].name,
                        "id": str(child_table.id),
                        "preview": csvfile.read(),
                        "heading": child_table.heading,
                    }
                    if selection.headings_type != selection.OCDS:
                        preview["column_headings"] = child_table.column_headings
                    data.append(preview)
        else:
            preview_path = f"{datasource_dir}/{table.name}_combined.csv"
            if not os.path.exists(preview_path):
                store_preview_csv(COMBINED_COLUMNS, COMBINED_PREVIEW_ROWS, spec.tables[table.name], preview_path)
            with open(preview_path) as csvfile:
                preview = {
                    "name": spec.tables[table.name].name,
                    "id": str(table.id),
                    "preview": csvfile.read(),
                    "heading": table.heading,
                }
                if selection.headings_type != selection.OCDS:
                    preview["column_headings"] = table.column_headings
                data.append(preview)
        return Response(data)
    except FileNotFoundError as e:
        extra = {
            "MESSAGE_ID": "get_preview_failed",
            "DATASOURCE_ID": str(datasource.id),
            "TABLE_ID": table_id,
            "ERROR_MSG": str(e),
            "EXPIRED_AT": datasource.expired_at.isoformat(),
        }
        logger.info("Error while get table preview %s" % str(e), extra=extra)
        return Response({"detail": _("Datasource expired.")}, status=status.HTTP_404_NOT_FOUND)
    except OSError as e:
        extra = {
            "MESSAGE_ID": "create_preview_failed",
            "DATASOURCE_ID": str(datasource.id),
            "TABLE_ID": table_id,
            "ERROR_MSG": str(e),
        }
        logger.info("Error while create preview %s" % str(e), extra=extra)
        return Response(
            {"detail": _("Currently, the space limit was reached. Please try again later.")},
            status=status.HTTP_413_REQUEST_ENTITY_TOO_LARGE,
        )
def validate_data(object_id, model=None, lang_code="en"):
    with internationalization(lang_code=lang_code):
        logger_context = {"DATASOURCE_ID": object_id, "TASK": "validate_data"}
        ds_model, serializer = get_serializer_by_model(model, logger_context)
        channel_layer = get_channel_layer()
        if not ds_model:
            async_to_sync(channel_layer.group_send)(
                f"datasource_{object_id}",
                {"type": "task.validate", "error": _("Model %s for datasource not found") % model},
            )
            return
        try:
            is_valid = False
            datasource = ds_model.objects.get(id=object_id)
            datasource.status = "validation"
            datasource.save(update_fields=["status"])
            async_to_sync(channel_layer.group_send)(
                f"datasource_{datasource.id}",
                {"type": "task.validate", "datasource": serializer.to_representation(instance=datasource)},
            )
            logger.debug("Start validation for %s file" % object_id)
            with open(SCHEMA_PATH) as fd:
                schema = json.loads(fd.read())
            resource = ""
            if is_release_package(datasource.file.path):
                resource = "releases"
            elif is_record_package(datasource.file.path):
                resource = "records"
            if resource:
                path = pathlib.Path(datasource.file.path)
                workdir = path.parent
                filename = path.name
                total = path.stat().st_size
                analyzer = FileAnalyzer(
                    workdir,
                    schema=schema,
                    root_key=resource,
                    root_tables=ROOT_TABLES,
                    combined_tables=COMBINED_TABLES,
                )
                timestamp = time.time()
                for read, count in analyzer.analyze_file(filename, with_preview=True):
                    if (time.time() - timestamp) <= 1:
                        continue
                    async_to_sync(channel_layer.group_send)(
                        f"datasource_{datasource.id}",
                        {
                            "type": "task.validate",
                            "datasource": {"id": str(datasource.id)},
                            "progress": {
                                "rows": count,
                                "percentage": (read / total) * 100 if total else 0,
                                "size": total,
                                "read": read,
                            },
                        },
                    )
                    timestamp = time.time()
                is_valid = True
            datasource.validation.is_valid = is_valid
            datasource.root_key = resource
            datasource.validation.save(update_fields=["is_valid"])
            datasource.save(update_fields=["root_key"])
            if is_valid and not datasource.available_tables and not datasource.analyzed_file:
                _file = ContentFile(b"")
                datasource.analyzed_file.save("new", _file)
                analyzer.spec.dump(datasource.analyzed_file.path)
                available_tables, unavailable_tables = retrieve_tables(analyzer.spec)
                datasource.available_tables = available_tables
                datasource.unavailable_tables = unavailable_tables
                datasource.save(update_fields=["available_tables", "unavailable_tables"])
            elif is_valid and datasource.analyzed_file:
                spec = DataPreprocessor.restore(datasource.analyzed_file.path)
                available_tables, unavailable_tables = retrieve_tables(spec)
                datasource.available_tables = available_tables
                datasource.unavailable_tables = unavailable_tables
                datasource.save(update_fields=["available_tables", "unavailable_tables"])
            async_to_sync(channel_layer.group_send)(
                f"datasource_{datasource.id}",
                {"type": "task.validate", "datasource": serializer.to_representation(instance=datasource)},
            )
        except ObjectDoesNotExist:
            logger_context["MODEL"] = model
            logger_context["MESSAGE_ID"] = "datasource_not_found"
            logger.info("Datasource %s %s not found" % (model, object_id), extra=logger_context)
            async_to_sync(channel_layer.group_send)(
                f"datasource_{object_id}",
                {"type": "task.validate", "error": _("Datasource %s not found") % object_id},
            )
        except (ijson.JSONError, ijson.IncompleteJSONError) as e:
            logger.info(
                "Error while validating data %s" % object_id,
                extra={
                    "MESSAGE_ID": "validation_exception",
                    "MODEL": model,
                    "ID": object_id,
                    "STR_ERROR": str(e),
                },
            )
            message = _("Error while validating data `%s`") % str(e)
            datasource.validation.errors = message
            datasource.validation.is_valid = False
            datasource.validation.save(update_fields=["errors", "is_valid"])
            async_to_sync(channel_layer.group_send)(
                f"datasource_{datasource.id}",
                {"type": "task.validate", "error": message},
            )
        except OSError as e:
            logger.exception(
                "Error while validating data %s" % object_id,
                extra={
                    "MESSAGE_ID": "validation_exception",
                    "MODEL": model,
                    "ID": object_id,
                    "STR_ERROR": str(e),
                },
            )
            message = _("Currently, the space limit was reached. Please try again later.")
            datasource.validation.errors = message
            datasource.validation.is_valid = False
            datasource.validation.save(update_fields=["errors", "is_valid"])
            async_to_sync(channel_layer.group_send)(
                f"datasource_{datasource.id}",
                {"type": "task.validate", "error": message},
            )
        except Exception as e:
            logger.exception(
                "Error while validating data %s" % object_id,
                extra={
                    "MESSAGE_ID": "validation_exception",
                    "MODEL": model,
                    "ID": object_id,
                    "STR_ERROR": str(e),
                },
            )
            message = _("Error while validating data `%s`") % str(e)
            datasource.validation.errors = message
            datasource.validation.is_valid = False
            datasource.validation.save(update_fields=["errors", "is_valid"])
            async_to_sync(channel_layer.group_send)(
                f"datasource_{datasource.id}",
                {"type": "task.validate", "error": message},
            )
def flatten_data(flatten_id, model=None, lang_code="en_US"):
    with internationalization(lang_code=lang_code):
        logger_context = {"FLATTEN_ID": flatten_id, "TASK": "flatten_data", "MODEL": model}
        channel_layer = get_channel_layer()
        if model not in getters:
            extra = {
                "MESSAGE_ID": "model_not_registered",
                "MODEL": model,
                "TASK": "flatten_data",
                "FLATTEN_ID": flatten_id,
            }
            logger.info("Model %s not registered in getters" % model, extra=extra)
            return
        try:
            serializer = FlattenSerializer()
            flatten = Flatten.objects.get(id=flatten_id)
            selection = flatten.dataselection_set.all()[0]
            datasource = getattr(selection, f"{model.lower()}_set").all()[0]
            flatten.status = "processing"
            flatten.save(update_fields=["status"])
            async_to_sync(channel_layer.group_send)(
                f"datasource_{datasource.id}",
                {"type": "task.flatten", "flatten": serializer.to_representation(instance=flatten)},
            )
            spec = DataPreprocessor.restore(datasource.analyzed_file.path)
            total_rows = spec.total_items
            opt = get_flatten_options(selection)
            logger.debug(
                "Generate options for export",
                extra={
                    "MESSAGE_ID": "generate_flatten_options",
                    "DATASOURCE_ID": str(datasource.id),
                    "MODEL": model,
                    "SELECTION_ID": str(selection.id),
                    "FLATTEN_ID": str(flatten.id),
                    "OPTIONS": opt,
                },
            )
            options = FlattenOptions(**opt)
            workdir = pathlib.Path(datasource.file.path).parent
            formats = {"csv": None, "xlsx": None}
            if flatten.export_format == flatten.CSV:
                workdir = workdir / "export"
                if not workdir.exists():
                    os.makedirs(workdir)
                formats[flatten.export_format] = workdir
            else:
                formats[flatten.export_format] = "result.xlsx"
            flattener = FileFlattener(workdir, options, spec.tables, root_key=datasource.root_key, **formats)
            timestamp = time.time()
            for count in flattener.flatten_file(datasource.file.path):
                if (time.time() - timestamp) <= 1:
                    continue
                async_to_sync(channel_layer.group_send)(
                    f"datasource_{datasource.id}",
                    {
                        "type": "task.flatten",
                        "flatten": {"id": str(flatten.id)},
                        "progress": {
                            "total_rows": total_rows,
                            "processed": count,
                            "percentage": (count / total_rows) * 100 if total_rows else total_rows,
                        },
                    },
                )
                timestamp = time.time()
            if flatten.export_format == flatten.CSV:
                target_file = f"{workdir}/{datasource.id}.zip"
                zip_files(workdir, target_file, extension="csv")
                with open(target_file, "rb") as fd:
                    file_ = File(fd)
                    file_.name = f"{datasource.id}.zip"
                    flatten.file = file_
                    flatten.status = "completed"
                    flatten.save(update_fields=["file", "status"])
                os.remove(fd.name)
            else:
                target_file = f"{workdir}/result.xlsx"
                with open(target_file, "rb") as fd:
                    file_ = File(fd)
                    file_.name = "result.xlsx"
                    flatten.file = file_
                    flatten.status = "completed"
                    flatten.save(update_fields=["file", "status"])
                os.remove(fd.name)
            async_to_sync(channel_layer.group_send)(
                f"datasource_{datasource.id}",
                {"type": "task.flatten", "flatten": serializer.to_representation(instance=flatten)},
            )
        except ObjectDoesNotExist:
            extra = deepcopy(logger_context)
            extra["MESSAGE_ID"] = "flatten_not_found"
            logger.info("Flatten %s for %s model not found" % (flatten_id, model), extra=extra)
        except OSError as e:
            extra = deepcopy(logger_context)
            extra.update({
                "MESSAGE_ID": "flatten_no_left_space",
                "DATASOURCE_ID": str(datasource.id),
                "ERROR_MSG": str(e),
            })
            logger.info("Flatten %s for %s model failed: %s" % (flatten_id, model, e), extra=extra)
            flatten.status = "failed"
            flatten.error = _("Currently, the space limit was reached. Please try again later.")
            flatten.save(update_fields=["error", "status"])
            async_to_sync(channel_layer.group_send)(
                f"datasource_{datasource.id}",
                {"type": "task.flatten", "flatten": serializer.to_representation(instance=flatten)},
            )
        except (TypeError, Exception) as e:
            error_message = str(e)
            extra = deepcopy(logger_context)
            extra["MESSAGE_ID"] = "flatten_failed"
            extra["ERROR_MESSAGE"] = error_message
            logger.error(
                "Flatten %s for %s datasource %s failed" % (flatten_id, model, datasource.id),
                extra=extra,
                exc_info=True,
            )
            flatten.status = "failed"
            flatten.error = error_message
            flatten.save(update_fields=["error", "status"])
            async_to_sync(channel_layer.group_send)(
                f"datasource_{datasource.id}",
                {"type": "task.flatten", "flatten": serializer.to_representation(instance=flatten)},
            )
def update(self, request, *args, **kwargs):
    try:
        datasource = (
            Url.objects.get(id=kwargs["url_id"])
            if "url_id" in kwargs
            else Upload.objects.get(id=kwargs["upload_id"])
        )
        table = Table.objects.get(id=kwargs["id"])
        spec = DataPreprocessor.restore(datasource.analyzed_file.path)
        update_fields = []
        for key in ("split", "include", "heading"):
            if key in request.data:
                setattr(table, key, request.data[key])
                # Remove "grandchildren" (child tables of child tables) if such are present
                if key in ("split", "include") and request.data[key] is False:
                    if table.array_tables and not table.parent:
                        for array_table in list(table.array_tables.all()):
                            setattr(array_table, key, False)
                            array_table.save()
                    if table.array_tables and table.parent:
                        parent = table.array_tables.all()[0]
                        for array_table in list(parent.array_tables.all()):
                            setattr(
                                array_table,
                                key,
                                False if array_table.parent == table.name else getattr(array_table, key),
                            )
                            array_table.save()
                # Forbid merge of table if any of child arrays is unmergeable
                if (
                    key == "split"
                    and request.data[key] is False
                    and table.array_tables
                    and False in [
                        _table.mergeable
                        for _table in list(table.array_tables.all())
                        if _table.include is True
                    ]
                ):
                    return Response(
                        {
                            "detail": _("Cannot merge '%(table_name)s' - child arrays are too large")
                            % {"table_name": table.name}
                        },
                        status=status.HTTP_400_BAD_REQUEST,
                    )
                update_fields.append(key)
        if update_fields:
            table.save(update_fields=update_fields)
        is_array_tables = len(table.array_tables.all())
        if "split" in request.data and request.data["split"] and not is_array_tables:
            child_tables = spec.tables[table.name].child_tables
            self._split_table(table, spec.tables, datasource, child_tables)
        serializer = self.get_serializer_class()(table)
        sources = table.dataselection_set.all() or table.array_tables.all()[0].dataselection_set.all()
        if sources:
            sources[0].flattens.all().delete()
        return Response(serializer.data)
    except FileNotFoundError as e:
        extra = {
            "MESSAGE_ID": "update_table_failed",
            "DATASOURCE_ID": str(datasource.id),
            "TABLE_ID": kwargs["id"],
            "ERROR_MSG": str(e),
            "EXPIRED_AT": datasource.expired_at.isoformat(),
        }
        logger.info("Error while update table %s" % str(e), extra=extra)
        return Response({"detail": _("Datasource expired.")}, status=status.HTTP_404_NOT_FOUND)
    except OSError as e:
        extra = {
            "MESSAGE_ID": "update_table_failed",
            "DATASOURCE_ID": str(datasource.id),
            "TABLE_ID": kwargs["id"],
            "ERROR_MSG": str(e),
        }
        logger.info("Error while update table %s" % str(e), extra=extra)
        return Response(
            {"detail": _("Currently, the space limit was reached. Please try again later.")},
            status=status.HTTP_413_REQUEST_ENTITY_TOO_LARGE,
        )
def validate_data(object_id, model=None, lang_code="en"):
    with internationalization(lang_code=lang_code):
        logger_context = {"DATASOURCE_ID": object_id, "TASK": "validate_data"}
        ds_model, serializer = get_serializer_by_model(model, logger_context)
        channel_layer = get_channel_layer()
        if not ds_model:
            async_to_sync(channel_layer.group_send)(
                f"datasource_{object_id}",
                {"type": "task.validate", "error": _("Model %s for datasource not found") % model},
            )
            return
        try:
            is_valid = False
            datasource = ds_model.objects.get(id=object_id)
            datasource.status = "validation"
            datasource.save(update_fields=["status"])
            async_to_sync(channel_layer.group_send)(
                f"datasource_{datasource.id}",
                {"type": "task.validate", "datasource": serializer.to_representation(instance=datasource)},
            )
            logger.debug("Start validation for %s file", object_id)
            paths = [pathlib.Path(file.file.path) for file in datasource.files.all()]
            workdir = paths[0].parent
            filenames = [pathlib.Path(path).name for path in paths]
            total = sum([
                pathlib.Path(path).stat().st_size if get_reader(path) == open else gz_size(path)
                for path in paths
            ])
            analyzer = FileAnalyzer(workdir, root_tables=ROOT_TABLES, combined_tables=COMBINED_TABLES)
            timestamp = time.time()
            filepaths = [workdir / filename for filename in filenames]
            for read, count in analyzer.analyze_file(filepaths, with_preview=True):
                if (time.time() - timestamp) <= 1:
                    continue
                async_to_sync(channel_layer.group_send)(
                    f"datasource_{datasource.id}",
                    {
                        "type": "task.validate",
                        "datasource": {"id": str(datasource.id)},
                        "progress": {
                            "rows": count,
                            "percentage": (read / total) * 100 if total else 0,
                            "size": total,
                            "read": read,
                        },
                    },
                )
                timestamp = time.time()
            is_valid = True
            datasource.validation.is_valid = is_valid
            datasource.root_key = analyzer.pkg_type
            datasource.validation.save(update_fields=["is_valid"])
            datasource.order = ", ".join(analyzer.order)
            datasource.save()
            if is_valid and not datasource.available_tables and not datasource.analyzed_file:
                _file = ContentFile(b"")
                datasource.analyzed_file.save("new", _file)
                analyzer.spec.dump(datasource.analyzed_file.path)
                available_tables, unavailable_tables = retrieve_tables(analyzer.spec)
                datasource.available_tables = available_tables
                datasource.unavailable_tables = unavailable_tables
                datasource.save(update_fields=["available_tables", "unavailable_tables"])
            elif is_valid and datasource.analyzed_file:
                spec = DataPreprocessor.restore(datasource.analyzed_file.path)
                available_tables, unavailable_tables = retrieve_tables(spec)
                datasource.available_tables = available_tables
                datasource.unavailable_tables = unavailable_tables
                datasource.save(update_fields=["available_tables", "unavailable_tables"])
            async_to_sync(channel_layer.group_send)(
                f"datasource_{datasource.id}",
                {"type": "task.validate", "datasource": serializer.to_representation(instance=datasource)},
            )
        except ObjectDoesNotExist:
            logger_context["MODEL"] = model
            logger_context["MESSAGE_ID"] = "datasource_not_found"
            logger.info("Datasource %s %s not found", model, object_id, extra=logger_context)
            async_to_sync(channel_layer.group_send)(
                f"datasource_{object_id}",
                {"type": "task.validate", "error": _("Datasource %s not found") % object_id},
            )
        except (ijson.JSONError, ijson.IncompleteJSONError) as e:
            logger.info(
                "Error while validating data %s",
                object_id,
                extra={
                    "MESSAGE_ID": "validation_exception",
                    "MODEL": model,
                    "ID": object_id,
                    "STR_EXCEPTION": e.__class__.__name__,
                    "STR_ERROR": str(e),
                },
            )
            message = _("Error while validating data `%s`") % str(e)
            datasource.validation.errors = message
            datasource.validation.is_valid = False
            datasource.validation.save(update_fields=["errors", "is_valid"])
            async_to_sync(channel_layer.group_send)(
                f"datasource_{datasource.id}",
                {"type": "task.validate", "error": message},
            )
        except OSError as e:
            logger.exception(
                "Error while validating data %s",
                object_id,
                extra={
                    "MESSAGE_ID": "validation_exception",
                    "MODEL": model,
                    "ID": object_id,
                    "STR_EXCEPTION": e.__class__.__name__,
                    "STR_ERROR": str(e),
                },
            )
            message = (
                _("Currently, the space limit was reached. Please try again later.")
                if "[Errno 28]" in str(e)
                else _("Something went wrong during processing of your file, please contact support")
            )
            datasource.validation.errors = message
            datasource.validation.is_valid = False
            datasource.validation.save(update_fields=["errors", "is_valid"])
            async_to_sync(channel_layer.group_send)(
                f"datasource_{datasource.id}",
                {"type": "task.validate", "error": message},
            )
        except Exception as e:
            logger.exception(
                "Error while validating data %s",
                object_id,
                extra={
                    "MESSAGE_ID": "validation_exception",
                    "MODEL": model,
                    "ID": object_id,
                    "STR_EXCEPTION": e.__class__.__name__,
                    "STR_ERROR": str(e),
                },
            )
            message = _("Error while validating data `%s`") % str(e)
            datasource.validation.errors = message
            datasource.validation.is_valid = False
            datasource.validation.save(update_fields=["errors", "is_valid"])
            async_to_sync(channel_layer.group_send)(
                f"datasource_{datasource.id}",
                {"type": "task.validate", "error": message},
            )
def flatten_data(flatten_id, model=None, lang_code="en_US"):
    with internationalization(lang_code=lang_code):
        logger_context = {"FLATTEN_ID": flatten_id, "TASK": "flatten_data", "MODEL": model}
        channel_layer = get_channel_layer()
        if model not in getters:
            extra = {
                "MESSAGE_ID": "model_not_registered",
                "MODEL": model,
                "TASK": "flatten_data",
                "FLATTEN_ID": flatten_id,
            }
            logger.info("Model %s not registered in getters", model, extra=extra)
            return
        try:
            serializer = FlattenSerializer()
            flatten = Flatten.objects.get(id=flatten_id)
            selection = flatten.dataselection_set.all()[0]
            datasource = getattr(selection, f"{model.lower()}_set").all()[0]
            flatten.status = "processing"
            flatten.save(update_fields=["status"])
            async_to_sync(channel_layer.group_send)(
                f"datasource_{datasource.id}",
                {"type": "task.flatten", "flatten": serializer.to_representation(instance=flatten)},
            )
            spec = DataPreprocessor.restore(datasource.analyzed_file.path)
            total_rows = spec.total_items
            opt = get_flatten_options(selection)
            # When child tables are excluded, 'split' of the root table must be set to 'False'
            # for a proper export.
            # TODO: There should be a better way to handle this (probably on the library side)
            if "exclude" in opt:
                for _table in opt["exclude"]:
                    _parent = spec.tables[_table].parent
                    if _parent != "" and _parent.name in opt["selection"]:
                        opt["selection"][_parent.name]["split"] = False
            logger.debug(
                "Generate options for export",
                extra={
                    "MESSAGE_ID": "generate_flatten_options",
                    "DATASOURCE_ID": str(datasource.id),
                    "MODEL": model,
                    "SELECTION_ID": str(selection.id),
                    "FLATTEN_ID": str(flatten.id),
                    "OPTIONS": opt,
                },
            )
            options = FlattenOptions(**opt)
            files = [file.file.path for file in datasource.files.all()]
            workdir = pathlib.Path(files[0]).parent
            formats = {"csv": None, "xlsx": None}
            if flatten.export_format == flatten.CSV:
                workdir = workdir / "export"
                if not workdir.exists():
                    os.makedirs(workdir)
                formats[flatten.export_format] = workdir
            else:
                formats[flatten.export_format] = "result.xlsx"
            flattener = FileFlattener(
                workdir,
                options,
                tables=spec.tables,
                pkg_type=datasource.root_key,
                multiple_values=getattr(spec, "multiple_values", False),
                schema=spec.schema,
                **formats,
            )
            timestamp = time.time()
            for count in flattener.flatten_file(files):
                if (time.time() - timestamp) <= 1:
                    continue
                async_to_sync(channel_layer.group_send)(
                    f"datasource_{datasource.id}",
                    {
                        "type": "task.flatten",
                        "flatten": {"id": str(flatten.id)},
                        "progress": {
                            "total_rows": total_rows,
                            "processed": count,
                            "percentage": (count / total_rows) * 100 if total_rows else total_rows,
                        },
                    },
                )
                timestamp = time.time()
            if flatten.export_format == flatten.CSV:
                target_file = f"{workdir}/{datasource.id}.zip"
                zip_files(workdir, target_file, extension="csv")
                with open(target_file, "rb") as fd:
                    file_ = File(fd)
                    file_.name = f"{datasource.id}.zip"
                    flatten.file = file_
                    flatten.status = "completed"
                    flatten.save(update_fields=["file", "status"])
                os.remove(fd.name)
            else:
                target_file = f"{workdir}/result.xlsx"
                with open(target_file, "rb") as fd:
                    file_ = File(fd)
                    file_.name = "result.xlsx"
                    flatten.file = file_
                    flatten.status = "completed"
                    flatten.save(update_fields=["file", "status"])
                os.remove(fd.name)
            async_to_sync(channel_layer.group_send)(
                f"datasource_{datasource.id}",
                {"type": "task.flatten", "flatten": serializer.to_representation(instance=flatten)},
            )
        except ObjectDoesNotExist:
            extra = deepcopy(logger_context)
            extra["MESSAGE_ID"] = "flatten_not_found"
            logger.info("Flatten %s for %s model not found", flatten_id, model, extra=extra)
        except OSError as e:
            extra = deepcopy(logger_context)
            extra.update({
                "MESSAGE_ID": "flatten_no_left_space",
                "DATASOURCE_ID": str(datasource.id),
                "ERROR_MSG": str(e),
            })
            logger.info("Flatten %s for %s model failed: %s", flatten_id, model, e, extra=extra)
            flatten.status = "failed"
            flatten.error = (
                _("Currently, the space limit was reached. Please try again later.")
                if "[Errno 28]" in str(e)
                else _("Something went wrong during processing of your file, please contact support")
            )
            flatten.save(update_fields=["error", "status"])
            async_to_sync(channel_layer.group_send)(
                f"datasource_{datasource.id}",
                {"type": "task.flatten", "flatten": serializer.to_representation(instance=flatten)},
            )
        except (TypeError, Exception) as e:
            error_message = str(e)
            extra = deepcopy(logger_context)
            extra["MESSAGE_ID"] = "flatten_failed"
            extra["ERROR_MESSAGE"] = error_message
            logger.error(
                "Flatten %s for %s datasource %s failed",
                flatten_id,
                model,
                datasource.id,
                extra=extra,
                exc_info=True,
            )
            flatten.status = "failed"
            flatten.error = error_message
            flatten.save(update_fields=["error", "status"])
            async_to_sync(channel_layer.group_send)(
                f"datasource_{datasource.id}",
                {"type": "task.flatten", "flatten": serializer.to_representation(instance=flatten)},
            )
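# Putting the pieces together: a condensed sketch of the analyze -> dump -> restore -> flatten
# flow that validate_data and flatten_data implement, with the Django/channels plumbing stripped
# out. Not part of the codebase; file paths are placeholders, imports are assumed, and
# `selection` would come from the API layer as in the views above.

workdir = pathlib.Path("/data/datasource")           # placeholder path
files = [workdir / "release-package.json"]           # placeholder input file(s)

# 1. Analyze the package(s) and persist the resulting spec.
analyzer = FileAnalyzer(workdir, root_tables=ROOT_TABLES, combined_tables=COMBINED_TABLES)
for read, count in analyzer.analyze_file(files, with_preview=True):
    pass                                             # progress reporting omitted
analyzer.spec.dump(workdir / "analyzed.json")

# 2. Later, restore the spec and flatten the same files to XLSX.
spec = DataPreprocessor.restore(workdir / "analyzed.json")
options = FlattenOptions(**get_flatten_options(selection))
flattener = FileFlattener(
    workdir,
    options,
    tables=spec.tables,
    pkg_type=analyzer.pkg_type,
    multiple_values=getattr(spec, "multiple_values", False),
    schema=spec.schema,
    csv=None,
    xlsx="result.xlsx",
)
for count in flattener.flatten_file(files):
    pass                                             # progress reporting omitted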