Example #1
    def create_test_dump_file(self):
        obj = HttpResourceMock.objects.get(id=1)
        queryset = HttpResourceMock.objects.filter(id__in=[2, 3, 4, 5, 6])
        dump_path = get_dumps_path(obj)
        with open(os.path.join(dump_path, "read-dump-test.json"), "w") as fd:
            object_to_disk(obj, fd)
            queryset_to_disk(queryset, fd, batch_size=2)
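For context, a minimal sketch of reading such a dump back with objects_from_disk, the reader used by the import commands later in this listing (Examples #5 and #8); the file layout is assumed to match what create_test_dump_file writes:

# A minimal sketch, assuming the dump written above; objects_from_disk
# yields the stored instances in batches, ready for bulk_create.
with open(os.path.join(dump_path, "read-dump-test.json"), "r") as fd:
    for objects in objects_from_disk(fd):
        HttpResourceMock.objects.bulk_create(objects)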
Example #2
    def handle_label(self, dataset_label, **options):
        dataset = Dataset.objects.get(name=dataset_label)

        destination = get_dumps_path(dataset)
        if not os.path.exists(destination):
            os.makedirs(destination)
        dataset_file = os.path.join(
            destination, "{}.{}.json".format(dataset.name, dataset.id))
        with open(dataset_file, "w") as json_file:
            object_to_disk(dataset, json_file)
            queryset_to_disk(dataset.harvestsource_set, json_file)
            queryset_to_disk(dataset.harvest_set, json_file)
            queryset_to_disk(dataset.versions.filter(is_current=True),
                             json_file)
            for version in dataset.versions.filter(is_current=True):
                queryset_to_disk(version.indices, json_file)
                queryset_to_disk(version.collection_set, json_file)
                queryset_to_disk(version.document_set, json_file)
            queryset_to_disk(Extension.objects.all(), json_file)

        resource_files = self.dump_resources()

        # Sync files with AWS
        if environment.env != "localhost":
            logger.info("Uploading files to AWS")
            ctx = Context(environment)
            harvester_data_bucket = f"s3://{environment.aws.harvest_content_bucket}/datasets/harvester"
            for file in [dataset_file] + resource_files:
                remote_file = harvester_data_bucket + file.replace(
                    settings.DATAGROWTH_DATA_DIR, "", 1)
                ctx.run(f"aws s3 cp {file} {remote_file}", echo=True)
Example #3
    def get_datasets(self):
        datasets = []
        for entry in os.scandir(get_dumps_path(self.model)):
            if entry.is_file() and not entry.name.startswith("."):
                instance = self.model()
                instance.signature = entry.name[:-5]  # strip the ".json" suffix
                datasets.append(instance)
        return datasets
Example #4
    def test_get_dumps_path(self):
        instance = HttpResourceMock()
        instance.get_name = Mock(return_value="name")
        path = get_dumps_path(instance)
        self.assertEqual(
            path,
            os.path.join(datagrowth_settings.DATAGROWTH_DATA_DIR, "resources",
                         "dumps", "name"))
        self.assertTrue(instance.get_name.called)
Example #5
    def handle_label(self, dataset_label, **options):

        skip_download = options["skip_download"]
        harvest_source = options.get("harvest_source", None)
        should_index = options.get("index")
        download_edurep = options["download_edurep"]

        assert harvest_source or environment.env != "localhost", \
            "Expected a harvest source argument for a localhost environment"
        source_environment = create_configuration(harvest_source, service="harvester") \
            if harvest_source else environment

        # Delete old datasets
        dataset = Dataset.objects.filter(name=dataset_label).last()
        if dataset is not None:
            dataset.harvestsource_set.all().delete()
            dataset.harvest_set.all().delete()
            dataset.delete()
        Extension.objects.all().delete()

        if harvest_source and not skip_download:
            logger.info(f"Downloading dump file for: {dataset_label}")
            ctx = Context(environment)
            harvester_data_bucket = f"s3://{source_environment.aws.harvest_content_bucket}/datasets/harvester"
            if download_edurep:
                ctx.run(
                    f"aws s3 sync {harvester_data_bucket} {settings.DATAGROWTH_DATA_DIR}"
                )
            else:
                ctx.run(
                    f"aws s3 sync {harvester_data_bucket} {settings.DATAGROWTH_DATA_DIR} --exclude *edurepoaipmh*"
                )
        logger.info(f"Importing dataset: {dataset_label}")
        for entry in os.scandir(get_dumps_path(Dataset)):
            if entry.is_file() and entry.name.startswith(dataset_label):
                dataset_file = entry.path
                break
        else:
            raise CommandError(
                f"Can't find a dump file for label: {dataset_label}")

        # Process dump file
        with open(dataset_file, "r") as dump_file:
            for objects in objects_from_disk(dump_file):
                self.bulk_create_objects(objects)
        # Load resources
        self.load_resources(download_edurep)
        self.reset_postgres_sequences()

        # Index data
        if should_index:
            latest_dataset_version = DatasetVersion.objects.get_current_version()
            call_command("index_dataset_version",
                         dataset=latest_dataset_version.dataset.name,
                         harvester_version=latest_dataset_version.version)
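The dump file lookup above relies on Python's for/else: the else clause runs only when the loop finishes without a break, i.e. when no matching file was found. A minimal illustration:

# for/else: the else block runs only if the loop was never broken out of.
for name in ["a.json", "b.json"]:
    if name.startswith("missing"):
        break
else:
    print("no file matched")  # this prints, since no break occurred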
Example #6
    def handle_label(self, label, **options):
        Resource = apps.get_model(label)
        destination = get_dumps_path(Resource)
        if not os.path.exists(destination):
            os.makedirs(destination)
        resource_name = Resource.get_name()
        file_path = os.path.join(destination,
                                 "{}.dump.json".format(resource_name))
        with open(file_path, "w") as dump_file:
            queryset_to_disk(Resource.objects, dump_file)
Example #7
    def handle_dataset(self, dataset, *args, **options):
        destination = get_dumps_path(dataset)
        if not os.path.exists(destination):
            os.makedirs(destination)
        file_name = os.path.join(destination, "{}.json".format(dataset.signature))
        with open(file_name, "w") as json_file:
            self.object_to_disk(dataset, json_file)
            self.queryset_to_disk(dataset.growth_set, json_file)
            self.queryset_to_disk(dataset.collective_set, json_file)
            self.queryset_to_disk(dataset.individual_set, json_file)
Example #8
    def handle_label(self, label, **options):
        Resource = apps.get_model(label)
        source = get_dumps_path(Resource)
        resource_name = Resource.get_name()
        file_path = os.path.join(source, "{}.dump.json".format(resource_name))
        if not os.path.exists(file_path):
            log.error("Resource dump {} does not exist".format(file_path))
            exit(1)
        with open(file_path, "r") as dump_file:
            for objects in objects_from_disk(dump_file):
                Resource.objects.bulk_create(objects)
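A hand-rolled equivalent of the read loop above, using Django's deserialize directly as Examples #12 and #14 do; a sketch assuming one serialized batch per line of the dump file:

from django.core.serializers import deserialize

with open(file_path, "r") as dump_file:
    for line in dump_file:  # one serialized batch per line
        objects = [wrapper.object for wrapper in deserialize("json", line)]
        Resource.objects.bulk_create(objects)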
Example #9
    def dump_resources(self):
        paths = []
        for resource_model in self.resources:
            clazz = apps.get_model(resource_model)
            dump_file = os.path.join(get_dumps_path(clazz),
                                     f"{clazz.get_name()}.dump.json")
            paths.append(dump_file)
            print(f"Dumping {clazz.get_name()} to {dump_file}")
            call_command("dump_resource", resource_model)

        return paths
Example #10
    def handle_dataset(self, dataset, *args, **options):
        setattr(dataset, "current_growth", None)  # resets the dataset
        destination = get_dumps_path(dataset)
        if not os.path.exists(destination):
            os.makedirs(destination)
        file_name = os.path.join(
            destination, "{}.{}.json".format(dataset.signature, dataset.id))
        with open(file_name, "w") as json_file:
            object_to_disk(dataset, json_file)
            queryset_to_disk(dataset.growth_set, json_file)
            queryset_to_disk(dataset.collections, json_file)
            queryset_to_disk(dataset.documents, json_file)
Example #11
    def get_datasets(self):
        datasets = []
        for entry in os.scandir(get_dumps_path(self.model)):
            if entry.is_file() and not entry.name.startswith("."):
                instance = self.model()
                file_match = re.search(
                    r"(?P<signature>.+?)\.?(?P<pk>\d+)?\.json$", entry.name)
                file_info = file_match.groupdict()
                instance.signature = file_info["signature"]
                instance.file_path = entry.path  # this property gets added especially for the command
                datasets.append(instance)
        return datasets
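The pattern accepts both of the file naming schemes produced by the dump commands above, signature.json and signature.<pk>.json; for example:

import re

pattern = r"(?P<signature>.+?)\.?(?P<pk>\d+)?\.json$"
re.search(pattern, "my-dataset.42.json").groupdict()
# -> {'signature': 'my-dataset', 'pk': '42'}
re.search(pattern, "my-dataset.json").groupdict()
# -> {'signature': 'my-dataset', 'pk': None}
# Caveat: a signature that itself ends in digits is ambiguous;
# "dump2.json" parses as signature "dump" with pk "2".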
Example #12
    def handle_community(self, community, *args, **options):
        source = get_dumps_path(community)
        file_name = os.path.join(source, "{}.json".format(self.signature))
        if not os.path.exists(file_name):
            print("Dump with signature {} does not exist".format(self.signature))
            exit(1)
        with open(file_name, "r") as dump_file:
            # each line holds one serialized batch, so the line count drives the progress bar
            batch_count = sum(1 for _ in dump_file)
            dump_file.seek(0)
            for line in tqdm(dump_file, total=batch_count):
                objects = [wrapper.object for wrapper in deserialize("json", line)]
                self.bulk_create_objects(objects)
Example #13
    def handle_label(self, freeze_label, **options):

        freeze = Freeze.objects.get(name=freeze_label)

        destination = get_dumps_path(freeze)
        if not os.path.exists(destination):
            os.makedirs(destination)
        file_name = os.path.join(destination,
                                 "{}.{}.json".format(freeze.name, freeze.id))
        with open(file_name, "w") as json_file:
            object_to_disk(freeze, json_file)
            queryset_to_disk(freeze.edurepsource_set, json_file)
            queryset_to_disk(freeze.edurepharvest_set, json_file)
            queryset_to_disk(freeze.indices, json_file)
            queryset_to_disk(freeze.collection_set, json_file)
            queryset_to_disk(freeze.arrangement_set, json_file)
            queryset_to_disk(freeze.document_set, json_file)

        self.dump_resources()
Example #14
    def handle_dataset(self, dataset, *args, **options):
        # transform_community = options.get("transform_community", False)
        # if transform_community:
        #     self.Document = apps.get_model(dataset._meta.app_label, "Document")
        #     self.Individual = apps.get_model("core", "Individual")
        #     self.Collection = apps.get_model(dataset._meta.app_label, "Collection")
        #     self.Collective = apps.get_model("core", "Collective")
        source = get_dumps_path(dataset)
        file_name = os.path.join(source, "{}.json".format(dataset.signature))
        if not os.path.exists(file_name):
            log.error("Dump with signature {} does not exist".format(dataset.signature))
            exit(1)
        with open(file_name, "r") as dump_file:
            # each line holds one serialized batch, so the line count drives the progress bar
            batch_count = sum(1 for _ in dump_file)
            dump_file.seek(0)
            for line in tqdm(dump_file, total=batch_count):
                objects = [wrapper.object for wrapper in deserialize("json", line)]
                self.bulk_create_objects(objects, False)
Example #15
    def get_file_path(self, mode):
        dump_path = get_dumps_path(HttpResourceMock)
        file_name = "{}-dump-test.json".format(mode)
        return os.path.join(dump_path, file_name)
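For instance, the "read" mode resolves to the file that create_test_dump_file in Example #1 writes:

# e.g. in the read test set up by Example #1:
file_path = self.get_file_path("read")
# == os.path.join(get_dumps_path(HttpResourceMock), "read-dump-test.json")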