def test_prepare_harvest_new_version(self):
    prepare_harvest(self.dataset)
    # See if harvest state is correct
    self.assertEqual(Harvest.objects.all().count(), 3)
    self.assertEqual(Harvest.objects.filter(stage=HarvestStages.NEW).count(), 3)
    for harvest in Harvest.objects.filter(stage=HarvestStages.NEW):
        if harvest.source.delete_policy == DeletePolicies.NO:
            self.assertEqual(harvest.latest_update_at, self.begin_of_time)
            self.assertIsNone(harvest.harvested_at)
        else:
            self.assertEqual(harvest.latest_update_at, self.last_harvest)
            self.assertEqual(harvest.harvested_at, self.last_harvest)
    # Check what happened with resources
    self.assertEqual(EdurepOAIPMH.objects.all().count(), 2)
    self.assertEqual(
        EdurepOAIPMH.objects.filter(is_extracted=True).count(), 2,
        "Expected previously extracted resources to get marked as extracted"
    )
    self.assertEqual(
        SharekitMetadataHarvest.objects.all().count(), 0,
        "Expected sources with no delete policy to get completely refreshed "
        "and old irrelevant resources to get deleted"
    )
    # Check what happened with Dataset
    self.assertEqual(DatasetVersion.objects.all().count(), 2)
    self.assertEqual(DatasetVersion.objects.filter(is_current=False).count(), 1)
    dataset_version = DatasetVersion.objects.filter(is_current=False).last()
    self.assertEqual(dataset_version.version, settings.VERSION)
    self.assertEqual(dataset_version.collection_set.all().count(), 1)
    self.assertEqual(dataset_version.document_set.all().count(), 2)
    self.assertEqual(Collection.objects.all().count(), 3, "Expected 2 old + 1 new Collections")
    self.assertEqual(Document.objects.all().count(), 6, "Expected 4 old + 2 new Documents")

def test_prepare_harvest_purge(self):
    # Set Wikiwijs purge_after to the past to force an implicit "reset"
    self.wikiwijs_harvest.purge_after = self.last_harvest
    self.wikiwijs_harvest.save()
    prepare_harvest(self.dataset)
    # See if harvest state is correct
    self.assertEqual(Harvest.objects.all().count(), 3)
    self.assertEqual(Harvest.objects.filter(stage=HarvestStages.NEW).count(), 3)
    for harvest in Harvest.objects.filter(stage=HarvestStages.NEW):
        if harvest.source.spec != "edurep_delen":
            self.assertEqual(harvest.latest_update_at, self.begin_of_time)
            self.assertIsNone(harvest.harvested_at)
        else:
            self.assertEqual(harvest.latest_update_at, self.last_harvest)
            self.assertEqual(harvest.harvested_at, self.last_harvest)
    # Check what happened with resources
    self.assertEqual(EdurepOAIPMH.objects.all().count(), 1)
    self.assertEqual(
        EdurepOAIPMH.objects.filter(is_extracted=True).count(), 1,
        "Expected previously extracted resources to get marked as extracted"
    )
    self.assertEqual(
        SharekitMetadataHarvest.objects.all().count(), 0,
        "Expected sources with no delete policy to get completely refreshed "
        "and old irrelevant resources to get deleted"
    )
    # Check what happened with Dataset
    self.assertEqual(DatasetVersion.objects.all().count(), 2)
    self.assertEqual(DatasetVersion.objects.filter(is_current=False).count(), 1)
    dataset_version = DatasetVersion.objects.filter(is_current=False).last()
    self.assertEqual(dataset_version.version, settings.VERSION)
    self.assertEqual(dataset_version.collection_set.all().count(), 0)
    self.assertEqual(dataset_version.document_set.all().count(), 0)
    self.assertEqual(Collection.objects.all().count(), 2, "Expected no new Collections")
    self.assertEqual(Document.objects.all().count(), 4, "Expected no new Documents")

def test_prepare_harvest_reset(self):
    prepare_harvest(self.dataset, reset=True)
    # See if harvest state is correct
    self.assertEqual(Harvest.objects.all().count(), 3)
    self.assertEqual(Harvest.objects.filter(stage=HarvestStages.NEW).count(), 3)
    for harvest in Harvest.objects.filter(stage=HarvestStages.NEW):
        self.assertEqual(harvest.latest_update_at, self.begin_of_time)
        self.assertIsNone(harvest.harvested_at)
    # Check what happened with resources
    self.assertEqual(EdurepOAIPMH.objects.all().count(), 0)
    # Check what happened with Dataset
    self.assertEqual(DatasetVersion.objects.all().count(), 1)
    dataset_version = DatasetVersion.objects.last()
    self.assertFalse(dataset_version.is_current)
    self.assertEqual(dataset_version.version, settings.VERSION)
    self.assertEqual(dataset_version.collection_set.all().count(), 0)
    self.assertEqual(dataset_version.document_set.all().count(), 0)
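
# The tests above rely on fixtures that the surrounding TestCase is expected to
# provide: self.dataset, self.begin_of_time, self.last_harvest and
# self.wikiwijs_harvest. A minimal sketch of what such a setUp could look like,
# assuming timezone-aware datetimes and fixture-loaded rows; the concrete dates,
# dataset name and source spec below are illustrative assumptions, not the
# project's actual fixture data.
from datetime import datetime

from django.utils.timezone import make_aware


def setUp(self):
    # "Beginning of time" marker that from-scratch harvests are compared against
    self.begin_of_time = make_aware(datetime(1970, 1, 1))
    # Timestamp of the previous successful harvest
    self.last_harvest = make_aware(datetime(2020, 2, 10))
    # Dataset and Harvest rows are assumed to be loaded from test fixtures
    self.dataset = Dataset.objects.get(name="test")
    self.wikiwijs_harvest = Harvest.objects.get(source__spec="wikiwijsmaken")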


# Imports this task needs to run: call_command and CommandError come from
# Django, Context from invoke. The remaining names (Dataset, DatasetVersion,
# Harvest, HarvestStages, Repositories, HarvestLogger, prepare_harvest and
# environment) are assumed to come from the harvester project's own modules.
from django.conf import settings
from django.core.management import call_command, CommandError
from invoke import Context


def harvest(reset=False, no_promote=False, report_dataset_version=False):

    if reset:
        call_command("extend_resource_cache")

    # Iterate over all active datasets to get data updates
    for dataset in Dataset.objects.filter(is_active=True):
        # Prepare dataset state and delete old model instances
        prepare_harvest(dataset, reset=reset)
        # First we call the commands that will query the repository interfaces
        repositories = [
            Repositories.EDUREP, Repositories.SHAREKIT,
            Repositories.ANATOMY_TOOL, Repositories.HANZE, Repositories.HAN,
            Repositories.HKU, Repositories.GREENI, Repositories.HVA,
            Repositories.BUAS
        ]
        for repository in repositories:
            try:
                call_command("harvest_metadata", f"--dataset={dataset.name}",
                             f"--repository={repository}")
            except CommandError as exc:
                logger = HarvestLogger(dataset, "harvest_task", {
                    "dataset": dataset.name,
                    "repository": repository
                })
                logger.error(str(exc))

        # After getting all the metadata we'll download content
        call_command("harvest_basic_content", f"--dataset={dataset.name}",
                     "--async")
        # We skip any video downloading/processing for now
        # Later we want YoutubeDL to download the videos and Amber to process them
        # Thumbnails are only enabled for Edusources not NPPO
        if settings.PROJECT == "edusources":
            Harvest.objects.filter(stage=HarvestStages.BASIC).update(
                stage=HarvestStages.PREVIEW)
            call_command("generate_previews", f"--dataset={dataset.name}",
                         "--async")
        else:
            Harvest.objects.filter(stage=HarvestStages.BASIC).update(
                stage=HarvestStages.COMPLETE)
        # Based on the dataset we push to search engine
        index_command = ["index_dataset_version", f"--dataset={dataset.name}"]
        if no_promote or not dataset.is_latest:
            index_command += ["--no-promote"]
        if reset:
            index_command += ["--skip-evaluation"]
        call_command(*index_command)

    # When dealing with a harvest on AWS, seeds need to get copied to S3.
    # Localhost can use these copies instead of getting the seeds from behind Edurep's firewall.
    if settings.AWS_STORAGE_BUCKET_NAME:
        call_command("dump_resource", "edurep.EdurepOAIPMH")
        ctx = Context(environment)
        harvester_data_bucket = f"s3://{settings.AWS_STORAGE_BUCKET_NAME}/datasets/harvester/edurep"
        ctx.run(
            f"aws s3 sync --no-progress {settings.DATAGROWTH_DATA_DIR}/edurep {harvester_data_bucket}",
            echo=True)

    # Log the totals when scheduled
    if report_dataset_version:
        dataset_version = DatasetVersion.objects.get_current_version()
        logger = HarvestLogger(dataset_version.dataset.name, "harvest_task", {})
        logger.report_dataset_version(dataset_version)
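

# A usage sketch, assuming Django is already configured (for example inside a
# scheduled Celery task or a manage.py shell session); these call sites are
# illustrative and not taken from the project itself.

# Incremental run that also reports the totals of the resulting dataset version.
harvest(report_dataset_version=True)

# Full reset: purges resources and re-harvests every source; per the branches
# above this passes --no-promote and --skip-evaluation to the index command.
harvest(reset=True, no_promote=True)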