def create_test_dump_file(self):
    obj = HttpResourceMock.objects.get(id=1)
    queryset = HttpResourceMock.objects.filter(id__in=[2, 3, 4, 5, 6])
    dump_path = get_dumps_path(obj)
    with open(os.path.join(dump_path, "read-dump-test.json"), "w") as fd:
        object_to_disk(obj, fd)
        queryset_to_disk(queryset, fd, batch_size=2)
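# A minimal sketch of the matching read side, assuming objects_from_disk (used
# by the load commands below) yields one batch of deserialized model instances
# per line of the dump file: the first batch holds the single object written by
# object_to_disk, the following batches come from queryset_to_disk. The helper
# name read_test_dump_file is hypothetical and assumes a fresh database.
def read_test_dump_file(self):
    dump_path = get_dumps_path(HttpResourceMock)
    with open(os.path.join(dump_path, "read-dump-test.json"), "r") as fd:
        for objects in objects_from_disk(fd):
            HttpResourceMock.objects.bulk_create(objects)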
def handle_label(self, dataset_label, **options):
    dataset = Dataset.objects.get(name=dataset_label)
    destination = get_dumps_path(dataset)
    if not os.path.exists(destination):
        os.makedirs(destination)
    dataset_file = os.path.join(destination, "{}.{}.json".format(dataset.name, dataset.id))
    with open(dataset_file, "w") as json_file:
        object_to_disk(dataset, json_file)
        queryset_to_disk(dataset.harvestsource_set, json_file)
        queryset_to_disk(dataset.harvest_set, json_file)
        queryset_to_disk(dataset.versions.filter(is_current=True), json_file)
        for version in dataset.versions.filter(is_current=True):
            queryset_to_disk(version.indices, json_file)
            queryset_to_disk(version.collection_set, json_file)
            queryset_to_disk(version.document_set, json_file)
        queryset_to_disk(Extension.objects.all(), json_file)
    resource_files = self.dump_resources()
    # Sync files with AWS
    if environment.env != "localhost":
        logger.info("Uploading files to AWS")
        ctx = Context(environment)
        harvester_data_bucket = f"s3://{environment.aws.harvest_content_bucket}/datasets/harvester"
        for file in [dataset_file] + resource_files:
            remote_file = harvester_data_bucket + file.replace(settings.DATAGROWTH_DATA_DIR, "", 1)
            ctx.run(f"aws s3 cp {file} {remote_file}", echo=True)
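# Note on the dump layout, inferred from the deserialize("json", line) loops in
# the load commands below: object_to_disk and queryset_to_disk each append one
# JSON-serialized batch per line, so a dump file is a sequence of independent
# JSON arrays (one per line) rather than a single JSON document. That is what
# lets the loaders stream batches without parsing the whole file at once.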
def get_datasets(self):
    datasets = []
    for entry in os.scandir(get_dumps_path(self.model)):
        if entry.is_file() and not entry.name.startswith("."):
            instance = self.model()
            instance.signature = entry.name[:-5]  # strip the ".json" extension
            datasets.append(instance)
    return datasets
def test_get_dumps_path(self):
    instance = HttpResourceMock()
    instance.get_name = Mock(return_value="name")
    path = get_dumps_path(instance)
    self.assertEqual(
        path,
        os.path.join(datagrowth_settings.DATAGROWTH_DATA_DIR, "resources", "dumps", "name")
    )
    self.assertTrue(instance.get_name.called)
def handle_label(self, dataset_label, **options):
    skip_download = options["skip_download"]
    harvest_source = options.get("harvest_source", None)
    should_index = options.get("index")
    download_edurep = options["download_edurep"]
    assert harvest_source or environment.env != "localhost", \
        "Expected a harvest source argument for a localhost environment"
    source_environment = create_configuration(harvest_source, service="harvester") \
        if harvest_source else environment
    # Delete old datasets
    dataset = Dataset.objects.filter(name=dataset_label).last()
    if dataset is not None:
        dataset.harvestsource_set.all().delete()
        dataset.harvest_set.all().delete()
        dataset.delete()
    Extension.objects.all().delete()
    if harvest_source and not skip_download:
        logger.info(f"Downloading dump file for: {dataset_label}")
        ctx = Context(environment)
        harvester_data_bucket = f"s3://{source_environment.aws.harvest_content_bucket}/datasets/harvester"
        if download_edurep:
            ctx.run(f"aws s3 sync {harvester_data_bucket} {settings.DATAGROWTH_DATA_DIR}")
        else:
            ctx.run(f"aws s3 sync {harvester_data_bucket} {settings.DATAGROWTH_DATA_DIR} --exclude *edurepoaipmh*")
    logger.info(f"Importing dataset: {dataset_label}")
    for entry in os.scandir(get_dumps_path(Dataset)):
        if entry.is_file() and entry.name.startswith(dataset_label):
            dataset_file = entry.path
            break
    else:
        raise CommandError(f"Can't find a dump file for label: {dataset_label}")
    # Process dump file
    with open(dataset_file, "r") as dump_file:
        for objects in objects_from_disk(dump_file):
            self.bulk_create_objects(objects)
    # Load resources
    self.load_resources(download_edurep)
    self.reset_postgres_sequences()
    # Index data
    if should_index:
        latest_dataset_version = DatasetVersion.objects.get_current_version()
        call_command(
            "index_dataset_version",
            dataset=latest_dataset_version.dataset.name,
            harvester_version=latest_dataset_version.version
        )
def handle_label(self, label, **options):
    Resource = apps.get_model(label)
    destination = get_dumps_path(Resource)
    if not os.path.exists(destination):
        os.makedirs(destination)
    resource_name = Resource.get_name()
    file_path = os.path.join(destination, "{}.dump.json".format(resource_name))
    with open(file_path, "w") as dump_file:
        queryset_to_disk(Resource.objects, dump_file)
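# Usage sketch for the command above; the "dump_resource" label is confirmed by
# dump_resources() further down, which invokes it through call_command. The
# model label here is hypothetical but follows the "app_label.ModelName" format
# that apps.get_model expects:
#
#   python manage.py dump_resource core.HttpResourceMock
#
# This writes <resource_name>.dump.json inside get_dumps_path(Resource).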
def handle_dataset(self, dataset, *args, **options):
    destination = get_dumps_path(dataset)
    if not os.path.exists(destination):
        os.makedirs(destination)
    file_name = os.path.join(destination, "{}.json".format(dataset.signature))
    with open(file_name, "w") as json_file:
        self.object_to_disk(dataset, json_file)
        self.queryset_to_disk(dataset.growth_set, json_file)
        self.queryset_to_disk(dataset.collective_set, json_file)
        self.queryset_to_disk(dataset.individual_set, json_file)
def handle_label(self, label, **options):
    Resource = apps.get_model(label)
    source = get_dumps_path(Resource)
    resource_name = Resource.get_name()
    file_path = os.path.join(source, "{}.dump.json".format(resource_name))
    if not os.path.exists(file_path):
        log.error("Resource dump {} does not exist".format(file_path))
        exit(1)
    with open(file_path, "r") as dump_file:
        for objects in objects_from_disk(dump_file):
            Resource.objects.bulk_create(objects)
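# Note: bulk_create bypasses Model.save() and the pre_save/post_save signals,
# so any logic hooked into those will not run for loaded resources. Primary
# keys are restored verbatim from the dump, which is why the harvester load
# command above follows up with reset_postgres_sequences() after importing.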
def dump_resources(self):
    paths = []
    for resource_model in self.resources:
        clazz = apps.get_model(resource_model)
        dump_file = os.path.join(get_dumps_path(clazz), f"{clazz.get_name()}.dump.json")
        paths.append(dump_file)
        print(f"Dumping {clazz.get_name()} to {dump_file}")
        call_command("dump_resource", resource_model)
    return paths
def handle_dataset(self, dataset, *args, **options):
    setattr(dataset, "current_growth", None)  # resets the dataset
    destination = get_dumps_path(dataset)
    if not os.path.exists(destination):
        os.makedirs(destination)
    file_name = os.path.join(destination, "{}.{}.json".format(dataset.signature, dataset.id))
    with open(file_name, "w") as json_file:
        object_to_disk(dataset, json_file)
        queryset_to_disk(dataset.growth_set, json_file)
        queryset_to_disk(dataset.collections, json_file)
        queryset_to_disk(dataset.documents, json_file)
def get_datasets(self):
    datasets = []
    for entry in os.scandir(get_dumps_path(self.model)):
        if entry.is_file() and not entry.name.startswith("."):
            instance = self.model()
            # Raw string avoids invalid escape sequences for \. and \d
            file_match = re.search(r"(?P<signature>.+?)\.?(?P<pk>\d+)?\.json$", entry.name)
            file_info = file_match.groupdict()
            instance.signature = file_info["signature"]
            instance.file_path = entry.path  # this property gets added especially for the command
            datasets.append(instance)
    return datasets
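# A quick check of the filename regex above (standard re semantics), covering
# both the "<signature>.<pk>.json" dumps and the plain "<signature>.json" ones:
#
#   "dataset.12.json" -> {"signature": "dataset", "pk": "12"}
#   "dataset.json"    -> {"signature": "dataset", "pk": None}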
def handle_community(self, community, *args, **options):
    source = get_dumps_path(community)
    file_name = os.path.join(source, "{}.json".format(self.signature))
    if not os.path.exists(file_name):
        print("Dump with signature {} does not exist".format(self.signature))
        exit(1)
    with open(file_name, "r") as dump_file:
        # Count the lines first so tqdm can report progress, then rewind
        batch_count = 0
        for _ in dump_file.readlines():
            batch_count += 1
        dump_file.seek(0)
        for line in tqdm(dump_file, total=batch_count):
            objects = [wrapper.object for wrapper in deserialize("json", line)]
            self.bulk_create_objects(objects)
def handle_label(self, freeze_label, **options):
    freeze = Freeze.objects.get(name=freeze_label)
    destination = get_dumps_path(freeze)
    if not os.path.exists(destination):
        os.makedirs(destination)
    file_name = os.path.join(destination, "{}.{}.json".format(freeze.name, freeze.id))
    with open(file_name, "w") as json_file:
        object_to_disk(freeze, json_file)
        queryset_to_disk(freeze.edurepsource_set, json_file)
        queryset_to_disk(freeze.edurepharvest_set, json_file)
        queryset_to_disk(freeze.indices, json_file)
        queryset_to_disk(freeze.collection_set, json_file)
        queryset_to_disk(freeze.arrangement_set, json_file)
        queryset_to_disk(freeze.document_set, json_file)
    self.dump_resources()
def handle_dataset(self, dataset, *args, **options):
    # transform_community = options.get("transform_community", False)
    # if transform_community:
    #     self.Document = apps.get_model(dataset._meta.app_label, "Document")
    #     self.Individual = apps.get_model("core", "Individual")
    #     self.Collection = apps.get_model(dataset._meta.app_label, "Collection")
    #     self.Collective = apps.get_model("core", "Collective")
    source = get_dumps_path(dataset)
    file_name = os.path.join(source, "{}.json".format(dataset.signature))
    if not os.path.exists(file_name):
        log.error("Dump with signature {} does not exist".format(dataset.signature))
        exit(1)
    with open(file_name, "r") as dump_file:
        batch_count = 0
        for _ in dump_file.readlines():
            batch_count += 1
        dump_file.seek(0)
        for line in tqdm(dump_file, total=batch_count):
            objects = [wrapper.object for wrapper in deserialize("json", line)]
            self.bulk_create_objects(objects, False)
def get_file_path(self, mode):
    dump_path = get_dumps_path(HttpResourceMock)
    file_name = "{}-dump-test.json".format(mode)
    return os.path.join(dump_path, file_name)
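# Usage sketch, tying this helper back to create_test_dump_file at the top:
#
#   self.get_file_path("read")
#   # -> <DATAGROWTH_DATA_DIR>/resources/dumps/mock/read-dump-test.json
#
# The "resources/dumps/<name>" layout follows test_get_dumps_path above; the
# trailing "mock" directory assumes HttpResourceMock.get_name() returns "mock".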