예제 #1
0
 def test_get_partial_set(self):
     # Request the "surf" seeds using 2020-02-10 22:22 as the cutoff and
     # leave include_deleted at its default.
     cutoff = make_aware(
         datetime(year=2020, month=2, day=10, hour=22, minute=22))
     partial_seeds = get_edurep_oaipmh_seeds("surf", cutoff)
     self.assertEqual(len(partial_seeds), 6)
     self.check_seed_integrity(partial_seeds)
예제 #2
0
 def test_get_complete_set_without_deletes(self):
     # Use the Unix epoch as the cutoff and explicitly switch off
     # include_deleted.
     epoch = make_aware(datetime(year=1970, month=1, day=1))
     active_seeds = get_edurep_oaipmh_seeds("surf",
                                            epoch,
                                            include_deleted=False)
     self.assertEqual(len(active_seeds), 10)
     self.check_seed_integrity(active_seeds, include_deleted=False)
예제 #3
0
 def test_get_partial_set_without_deletes(self):
     # Same cutoff as the partial-set test (2020-02-10 22:22), but with
     # include_deleted explicitly switched off.
     cutoff = make_aware(
         datetime(year=2020, month=2, day=10, hour=22, minute=22))
     active_seeds = get_edurep_oaipmh_seeds("surf",
                                            cutoff,
                                            include_deleted=False)
     self.assertEqual(len(active_seeds), 4)
     self.check_seed_integrity(active_seeds, include_deleted=False)
예제 #4
0
    def handle(self, *args, **options):
        """Download and transcribe videos for the BASIC-stage harvests of a freeze.

        Reads ``options["freeze"]`` (the freeze name used to select
        ``EdurepHarvest`` rows) and ``options["dummy"]`` (when truthy the
        command logs a notice, calls ``self.finish`` and returns without
        processing any seeds).

        Raises:
            EdurepHarvest.DoesNotExist: when no ``EdurepHarvest`` in the
                BASIC stage exists for the requested freeze.
        """
        freeze_name = options["freeze"]
        is_dummy = options["dummy"]

        harvest_queryset = EdurepHarvest.objects.filter(
            freeze__name=freeze_name, stage=HarvestStages.BASIC)
        if not harvest_queryset.exists():
            raise EdurepHarvest.DoesNotExist(
                f"There are no scheduled and BASIC EdurepHarvest objects for '{freeze_name}'"
            )

        self.header("HARVEST EDUREP VIDEO", options)

        if is_dummy:
            self.info("Skipping command because dummy mode was specified")
            self.finish(harvest_queryset)
            return

        # Gather the non-deleted seeds of every selected harvest in one list.
        self.info("Extracting data from sources ...")
        seeds = []
        tracked_harvests = self.progress(harvest_queryset,
                                         total=harvest_queryset.count())
        for harvest in tracked_harvests:
            seeds.extend(
                get_edurep_oaipmh_seeds(harvest.source.collection_name,
                                        harvest.latest_update_at,
                                        include_deleted=False))
        considered_total = len(seeds)
        self.info("Files considered for processing: {}".format(considered_total))

        self.info("Preparing video seeds ...")
        video_seeds = self.filter_video_seeds(seeds)
        video_total = len(video_seeds)
        self.info("Total videos: {}".format(video_total))

        self.info("Downloading videos ...")
        downloads = self.download_seed_videos(video_seeds.values())
        download_scc, download_err = downloads
        failed_downloads = len(download_err)
        self.info("Errors while downloading audio from videos: {}".format(
            failed_downloads))
        succeeded_downloads = len(download_scc)
        self.info("Audio downloaded successfully: {}".format(
            succeeded_downloads))

        self.info("Transcribing videos ...")
        transcription_counts = self.transcribe_video_resources(
            download_scc, video_seeds)
        (no_paths_count, invalid_paths_count, no_language_count,
         success_count, error_count) = transcription_counts
        # Missing and invalid audio paths are reported as one combined figure.
        missing_audio_total = no_paths_count + invalid_paths_count
        self.info("Skipped video content due to missing audio file: {}".format(
            missing_audio_total))
        self.info("Skipped video content due to unknown language: {}".format(
            no_language_count))
        self.info("Errors while transcribing videos: {}".format(error_count))
        self.info("Videos transcribed successfully: {}".format(success_count))

        self.finish(harvest_queryset)
예제 #5
0
 def test_handle_deletion_seeds(self):
     # Deletion seeds must not trigger errors when their targets do not
     # exist, which is the case for a freshly created collection.
     freeze = Freeze.objects.last()
     collection = Collection.objects.create(name="surf", freeze=freeze)
     command = self.get_command_instance()
     epoch = make_aware(datetime(year=1970, month=1, day=1))
     deletes = []
     for seed in get_edurep_oaipmh_seeds("surf", epoch):
         # Every seed whose state is not "active" counts as a delete.
         if seed.get("state", "active") != "active":
             deletes.append(seed)
     command.handle_deletion_seeds(collection, deletes)
     self.assertEqual(collection.document_set.count(), 0)
     self.assertEqual(collection.arrangement_set.count(), 0)
예제 #6
0
 def test_handle_upsert_seeds(self):
     freeze = Freeze.objects.last()
     collection = Collection.objects.create(name="surf", freeze=freeze)
     command = self.get_command_instance()
     epoch = make_aware(datetime(year=1970, month=1, day=1))
     upserts = []
     for seed in get_edurep_oaipmh_seeds("surf", epoch):
         # Seeds without an explicit state are treated as active.
         if seed.get("state", "active") == "active":
             upserts.append(seed)
     outcome = command.handle_upsert_seeds(collection, upserts)
     _skipped, dumped, documents_count = outcome
     # For a brand new collection the stored arrangement and document
     # counts should equal what handle_upsert_seeds reported.
     self.assertEqual(collection.arrangement_set.count(), dumped)
     self.assertEqual(collection.document_set.count(), documents_count)
예제 #7
0
 def test_handle_deletion_seeds(self):
     freeze = Freeze.objects.last()
     collection = Collection.objects.get(name="surf", freeze=freeze)
     command = self.get_command_instance()
     # Record the counts before any deletion is handled.
     arrangements_before = collection.arrangement_set.count()
     documents_before = collection.document_set.count()
     cutoff = make_aware(datetime(year=2019, month=12, day=31))
     deletes = []
     for seed in get_edurep_oaipmh_seeds("surf", cutoff):
         if seed.get("state", "active") != "active":
             deletes.append(seed)
     deletion_result = command.handle_deletion_seeds(collection, deletes)
     arrangement_deletes, document_deletes = deletion_result
     self.assertEqual(arrangement_deletes, 1)
     self.assertEqual(document_deletes, 1)
     # The test expects arrangements to stay in the table and only get a
     # deleted_at value, while the documents really disappear.
     self.assertEqual(
         collection.arrangement_set.count(), arrangements_before,
         "Did not expect arrangements to disappear after a delete")
     flagged_arrangements = collection.arrangement_set.filter(
         deleted_at__isnull=False)
     self.assertEqual(flagged_arrangements.count(), arrangement_deletes)
     self.assertEqual(collection.document_set.count(),
                      documents_before - document_deletes)
예제 #8
0
 def test_get_complete_set(self):
     # Use the Unix epoch as the cutoff and leave include_deleted at its
     # default.
     epoch = make_aware(datetime(year=1970, month=1, day=1))
     all_seeds = get_edurep_oaipmh_seeds("surf", epoch)
     self.assertEqual(len(all_seeds), 13)
     self.check_seed_integrity(all_seeds)
예제 #9
0
 def test_handle_upsert_seeds(self):
     freeze = Freeze.objects.last()
     collection = Collection.objects.get(name="surf", freeze=freeze)
     command = self.get_command_instance()

     # --- State before the upsert ---
     arrangements_before = collection.arrangement_set.count()
     documents_before = collection.document_set.count()
     vortex_original = freeze.documents.filter(
         properties__title="Using a Vortex | Wageningen UR")
     handson_original = freeze.documents.filter(
         properties__title=
         "Hands-on exercise based on WEKA - Tuning and Testing")
     self.assertEqual(
         vortex_original.count(), 1,
         "Expected the start state to contain 'Using a Vortex'")
     self.assertEqual(
         handson_original.count(), 1,
         "Expected the start state to contain 'Hands-on exercise'")
     # Nothing may have been modified yet: creation and modification
     # timestamps have to match exactly.
     for document in freeze.documents.all():
         self.assertEqual(
             document.created_at, document.modified_at,
             f"Document is unexpectedly updated: {document.id}")
     for arrangement in freeze.arrangement_set.all():
         self.assertEqual(
             arrangement.created_at, arrangement.modified_at,
             f"Arrangement is unexpectedly updated: {arrangement.id}")

     # --- Perform the upsert ---
     cutoff = make_aware(datetime(year=2019, month=12, day=31))
     upserts = []
     for seed in get_edurep_oaipmh_seeds("surf", cutoff):
         if seed.get("state", "active") == "active":
             upserts.append(seed)
     outcome = command.handle_upsert_seeds(collection, upserts)
     skipped, _dumped, _documents_count = outcome

     # --- State after the upsert ---
     self.assertEqual(skipped, 0)
     self.assertEqual(collection.arrangement_set.count(),
                      arrangements_before + 2,
                      "Upsert seeds should have added 2 Arrangements")
     self.assertEqual(
         collection.document_set.count(), documents_before + 3,
         "Upsert seeds should have added 3 Documents (1 video arrangement got 2 documents)"
     )
     vortex_updateset = freeze.documents.filter(
         properties__title="Using a Vortex (responsibly) | Wageningen UR")
     self.assertEqual(vortex_updateset.count(), 2)
     self.assertEqual(vortex_original.count(), 0)
     handson_updateset = freeze.documents.filter(
         properties__title=
         "Hands-off exercise based on WEKA - Tuning and Testing")
     self.assertEqual(handson_updateset.count(), 1)
     self.assertEqual(handson_original.count(), 0)

     # Both retitled sets get the same checks: the document and its
     # arrangement must carry a modification timestamp that differs from
     # the creation timestamp.
     update_ids = set()
     for updateset in (vortex_updateset, handson_updateset):
         for update in updateset:
             self.assertNotEqual(
                 update.created_at, update.modified_at,
                 f"Document is unexpectedly not updated: {update.id}")
             self.assertNotEqual(
                 update.arrangement.created_at,
                 update.arrangement.modified_at,
                 f"Arrangement of document is unexpectedly not updated: {update.id}"
             )
             update_ids.add(update.id)

     # All remaining documents are compared at second precision only.
     untouched = freeze.documents.exclude(id__in=update_ids)
     self.assertNotEqual(untouched.count(), 0)
     for document in untouched:
         created_second = document.created_at.replace(microsecond=0)
         modified_second = document.modified_at.replace(microsecond=0)
         self.assertEqual(
             created_second, modified_second,
             f"Document is unexpectedly updated after upsert: {document.id}"
         )
예제 #10
0
    def handle(self, *args, **options):
        """Store the seeds of all VIDEO-stage harvests into the freeze's collections.

        Reads ``options["freeze"]`` to look up the ``Freeze`` and the
        ``EdurepHarvest`` rows in the VIDEO stage. Seeds of every harvest are
        split into upserts (state "active", also the default when a seed has
        no state) and deletes (any other state) and grouped per collection
        name. Each group is then handed to ``handle_upsert_seeds`` and
        ``handle_deletion_seeds``. Finally all processed harvests are moved
        to the COMPLETE stage.

        Raises:
            Freeze.DoesNotExist: when no freeze with the given name exists
                (raised by ``Freeze.objects.get``).
            EdurepHarvest.DoesNotExist: when no ``EdurepHarvest`` in the
                VIDEO stage exists for the requested freeze.
        """
        freeze_name = options["freeze"]
        freeze = Freeze.objects.get(name=freeze_name)

        harvest_queryset = EdurepHarvest.objects.filter(
            freeze__name=freeze_name, stage=HarvestStages.VIDEO)
        if not harvest_queryset.exists():
            raise EdurepHarvest.DoesNotExist(
                f"There are no scheduled and VIDEO EdurepHarvest objects for '{freeze_name}'"
            )

        self.header("FREEZE EDUREP", options)

        self.info("Extracting data from sources ...")
        # Maps collection name -> (upserts, deletes). Harvests that share a
        # collection name append to the same pair of lists, so the unpacking
        # further down always sees exactly two items.
        seeds_by_collection = {}
        total_upserts = 0
        total_deletes = 0
        for harvest in self.progress(harvest_queryset,
                                     total=harvest_queryset.count()):
            set_specification = harvest.source.collection_name
            upserts, deletes = seeds_by_collection.setdefault(
                set_specification, ([], []))
            for seed in get_edurep_oaipmh_seeds(set_specification,
                                                harvest.latest_update_at):
                if seed.get("state", "active") == "active":
                    upserts.append(seed)
                    total_upserts += 1
                else:
                    deletes.append(seed)
                    total_deletes += 1
        # Report totals over all harvests instead of only the last one.
        self.info(
            f"Files considered for processing, upserts:{total_upserts} deletes:{total_deletes}"
        )

        for collection_name, seeds in seeds_by_collection.items():
            # Unpacking seeds
            upserts, deletes = seeds

            # Get or create the collection these seeds belong to
            collection, created = Collection.objects.get_or_create(
                name=collection_name, freeze=freeze)
            collection.referee = "id"
            collection.save()
            if created:
                self.info("Created collection " + collection_name)
            else:
                self.info("Adding to collection " + collection_name)

            skipped, dumped, documents_count = self.handle_upsert_seeds(
                collection, upserts)
            deleted_arrangements, deleted_documents = self.handle_deletion_seeds(
                collection, deletes)

            self.info(
                f"Skipped URL's for {collection.name} during dump: {skipped}")
            self.info(f"Dumped Arrangements for {collection.name}: {dumped}")
            self.info(
                f"Dumped Documents for {collection.name}: {documents_count}")
            self.info(
                f"Deleted Arrangements for {collection.name}: {deleted_arrangements}"
            )
            self.info(
                f"Deleted Documents for {collection.name}: {deleted_documents}"
            )

        # Finish the freeze and harvest
        for harvest in harvest_queryset:
            harvest.stage = HarvestStages.COMPLETE
            harvest.save()