def test_get_partial_set(self):
    """Fetching "surf" seeds from 2020-02-10 22:22 onwards yields 6 seeds that pass the integrity check."""
    since = make_aware(datetime(year=2020, month=2, day=10, hour=22, minute=22))
    partial_seeds = get_edurep_oaipmh_seeds("surf", since)
    self.assertEqual(len(partial_seeds), 6)
    self.check_seed_integrity(partial_seeds)
def test_get_complete_set_without_deletes(self):
    """Fetching "surf" seeds from 1970-01-01 with include_deleted=False yields 10 seeds."""
    since = make_aware(datetime(year=1970, month=1, day=1))
    active_seeds = get_edurep_oaipmh_seeds("surf", since, include_deleted=False)
    self.assertEqual(len(active_seeds), 10)
    # The integrity check is told that delete seeds were left out of the result.
    self.check_seed_integrity(active_seeds, include_deleted=False)
def test_get_partial_set_without_deletes(self):
    """Fetching "surf" seeds from 2020-02-10 22:22 with include_deleted=False yields 4 seeds."""
    since = make_aware(datetime(year=2020, month=2, day=10, hour=22, minute=22))
    active_seeds = get_edurep_oaipmh_seeds("surf", since, include_deleted=False)
    self.assertEqual(len(active_seeds), 4)
    # The integrity check is told that delete seeds were left out of the result.
    self.check_seed_integrity(active_seeds, include_deleted=False)
def handle(self, *args, **options):
    """
    Download and transcribe the videos found in the seeds of every BASIC-stage harvest of a freeze.

    Reads the freeze name from ``options["freeze"]`` and the dummy flag from ``options["dummy"]``.
    In dummy mode nothing is downloaded or transcribed; the harvests are only passed to ``finish``.

    :raises EdurepHarvest.DoesNotExist: when no BASIC-stage harvest exists for the freeze
    """
    freeze_name = options["freeze"]
    dummy_mode = options["dummy"]

    harvest_queryset = EdurepHarvest.objects.filter(
        freeze__name=freeze_name,
        stage=HarvestStages.BASIC,
    )
    if not harvest_queryset.exists():
        raise EdurepHarvest.DoesNotExist(
            f"There are no scheduled and BASIC EdurepHarvest objects for '{freeze_name}'"
        )

    self.header("HARVEST EDUREP VIDEO", options)

    # Dummy mode: report, hand the harvests to finish() and stop.
    if dummy_mode:
        self.info("Skipping command because dummy mode was specified")
        self.finish(harvest_queryset)
        return

    # Collect the non-deleted seeds of every harvest into a single flat list.
    self.info("Extracting data from sources ...")
    seeds = []
    for harvest in self.progress(harvest_queryset, total=harvest_queryset.count()):
        seeds.extend(
            get_edurep_oaipmh_seeds(
                harvest.source.collection_name,
                harvest.latest_update_at,
                include_deleted=False,
            )
        )
    self.info("Files considered for processing: {}".format(len(seeds)))

    self.info("Preparing video seeds ...")
    video_seeds = self.filter_video_seeds(seeds)
    self.info("Total videos: {}".format(len(video_seeds)))

    self.info("Downloading videos ...")
    downloaded, failed_downloads = self.download_seed_videos(video_seeds.values())
    self.info("Errors while downloading audio from videos: {}".format(len(failed_downloads)))
    self.info("Audio downloaded successfully: {}".format(len(downloaded)))

    self.info("Transcribing videos ...")
    transcription_counts = self.transcribe_video_resources(downloaded, video_seeds)
    missing_paths, invalid_paths, unknown_language, transcribed, transcribe_errors = transcription_counts
    # Missing and invalid audio paths are reported together as one "missing audio file" figure.
    self.info("Skipped video content due to missing audio file: {}".format(missing_paths + invalid_paths))
    self.info("Skipped video content due to unknown language: {}".format(unknown_language))
    self.info("Errors while transcribing videos: {}".format(transcribe_errors))
    self.info("Videos transcribed successfully: {}".format(transcribed))

    self.finish(harvest_queryset)
def test_handle_deletion_seeds(self):
    """Deletion seeds applied to a freshly created, empty "surf" collection leave it empty."""
    freeze = Freeze.objects.last()
    collection = Collection.objects.create(name="surf", freeze=freeze)
    command = self.get_command_instance()

    # Keep only the seeds whose state is anything other than "active" (a missing state counts as active).
    since = make_aware(datetime(year=1970, month=1, day=1))
    deletes = []
    for seed in get_edurep_oaipmh_seeds("surf", since):
        if seed.get("state", "active") != "active":
            deletes.append(seed)

    # The point of this test: deletion seeds must not raise when the things they target do not exist.
    command.handle_deletion_seeds(collection, deletes)

    self.assertEqual(collection.document_set.count(), 0)
    self.assertEqual(collection.arrangement_set.count(), 0)
def test_handle_upsert_seeds(self):
    """Upserting all active seeds into a freshly created "surf" collection matches the reported counts."""
    freeze = Freeze.objects.last()
    collection = Collection.objects.create(name="surf", freeze=freeze)
    command = self.get_command_instance()

    # Keep only the seeds that are active (a missing state counts as active).
    since = make_aware(datetime(year=1970, month=1, day=1))
    upserts = []
    for seed in get_edurep_oaipmh_seeds("surf", since):
        if seed.get("state", "active") == "active":
            upserts.append(seed)

    _skipped, dumped, documents_count = command.handle_upsert_seeds(collection, upserts)

    # The collection was created empty in this test, so what it holds afterwards
    # must equal what handle_upsert_seeds reports to have dumped.
    self.assertEqual(collection.arrangement_set.count(), dumped)
    self.assertEqual(collection.document_set.count(), documents_count)
def test_handle_deletion_seeds(self):
    """Deletion seeds since 2019-12-31 delete one document and flag one arrangement in the existing "surf" collection."""
    freeze = Freeze.objects.last()
    collection = Collection.objects.get(name="surf", freeze=freeze)
    command = self.get_command_instance()

    # Snapshot of the collection before any deletion is handled.
    arrangements_before = collection.arrangement_set.count()
    documents_before = collection.document_set.count()

    # Keep only the seeds whose state is anything other than "active" (a missing state counts as active).
    since = make_aware(datetime(year=2019, month=12, day=31))
    deletes = []
    for seed in get_edurep_oaipmh_seeds("surf", since):
        if seed.get("state", "active") != "active":
            deletes.append(seed)

    arrangement_deletes, document_deletes = command.handle_deletion_seeds(collection, deletes)

    self.assertEqual(arrangement_deletes, 1)
    self.assertEqual(document_deletes, 1)
    # Arrangement rows stay in place; a delete only shows up as a filled deleted_at.
    self.assertEqual(
        collection.arrangement_set.count(),
        arrangements_before,
        "Did not expect arrangements to disappear after a delete"
    )
    flagged_arrangements = collection.arrangement_set.filter(deleted_at__isnull=False)
    self.assertEqual(flagged_arrangements.count(), arrangement_deletes)
    # Document rows are expected to be gone from the collection.
    self.assertEqual(collection.document_set.count(), documents_before - document_deletes)
def test_get_complete_set(self):
    """Fetching "surf" seeds from 1970-01-01 onwards yields 13 seeds that pass the integrity check."""
    since = make_aware(datetime(year=1970, month=1, day=1))
    all_seeds = get_edurep_oaipmh_seeds("surf", since)
    self.assertEqual(len(all_seeds), 13)
    self.check_seed_integrity(all_seeds)
def test_handle_upsert_seeds(self):
    """
    Upserting the active seeds since 2019-12-31 into the existing "surf" collection
    adds 2 arrangements and 3 documents, retitles two known items and leaves the rest untouched.
    """
    freeze = Freeze.objects.last()
    collection = Collection.objects.get(name="surf", freeze=freeze)
    command = self.get_command_instance()

    # --- State before the upsert ---
    arrangements_before = collection.arrangement_set.count()
    documents_before = collection.document_set.count()
    # These querysets are lazy and get re-counted after the upsert as well.
    vortex_queryset = freeze.documents.filter(properties__title="Using a Vortex | Wageningen UR")
    handson_queryset = freeze.documents.filter(
        properties__title="Hands-on exercise based on WEKA - Tuning and Testing"
    )
    self.assertEqual(vortex_queryset.count(), 1, "Expected the start state to contain 'Using a Vortex'")
    self.assertEqual(handson_queryset.count(), 1, "Expected the start state to contain 'Hands-on exercise'")
    # Nothing may look modified yet: creation and modification timestamps must coincide.
    for doc in freeze.documents.all():
        self.assertEqual(doc.created_at, doc.modified_at, f"Document is unexpectedly updated: {doc.id}")
    for arrangement in freeze.arrangement_set.all():
        self.assertEqual(
            arrangement.created_at,
            arrangement.modified_at,
            f"Arrangement is unexpectedly updated: {arrangement.id}"
        )

    # --- The upsert itself ---
    since = make_aware(datetime(year=2019, month=12, day=31))
    upserts = []
    for seed in get_edurep_oaipmh_seeds("surf", since):
        # A seed without an explicit state counts as active.
        if seed.get("state", "active") == "active":
            upserts.append(seed)
    skipped, _dumped, _documents_count = command.handle_upsert_seeds(collection, upserts)

    # --- State after the upsert ---
    self.assertEqual(skipped, 0)
    self.assertEqual(
        collection.arrangement_set.count(),
        arrangements_before + 2,
        "Upsert seeds should have added 2 Arrangements"
    )
    self.assertEqual(
        collection.document_set.count(),
        documents_before + 3,
        "Upsert seeds should have added 3 Documents (1 video arrangement got 2 documents)"
    )
    # The old titles must be gone and the new titles present.
    vortex_updateset = freeze.documents.filter(
        properties__title="Using a Vortex (responsibly) | Wageningen UR"
    )
    self.assertEqual(vortex_updateset.count(), 2)
    self.assertEqual(vortex_queryset.count(), 0)
    handson_updateset = freeze.documents.filter(
        properties__title="Hands-off exercise based on WEKA - Tuning and Testing"
    )
    self.assertEqual(handson_updateset.count(), 1)
    self.assertEqual(handson_queryset.count(), 0)

    # Every retitled document, and its arrangement, must carry a changed modification timestamp.
    update_ids = set()
    for updateset in (vortex_updateset, handson_updateset):
        for update in updateset:
            self.assertNotEqual(
                update.created_at,
                update.modified_at,
                f"Document is unexpectedly not updated: {update.id}"
            )
            self.assertNotEqual(
                update.arrangement.created_at,
                update.arrangement.modified_at,
                f"Arrangement of document is unexpectedly not updated: {update.id}"
            )
            update_ids.add(update.id)

    # All other documents must be left alone; timestamps are compared at whole-second precision here.
    not_updated = freeze.documents.exclude(id__in=update_ids)
    self.assertNotEqual(not_updated.count(), 0)
    for not_update in not_updated:
        self.assertEqual(
            not_update.created_at.replace(microsecond=0),
            not_update.modified_at.replace(microsecond=0),
            f"Document is unexpectedly updated after upsert: {not_update.id}"
        )
def handle(self, *args, **options):
    """
    Freeze the Edurep seeds of every VIDEO-stage harvest into the Collections of a Freeze.

    Reads the freeze name from ``options["freeze"]``. For each matching EdurepHarvest the OAI-PMH
    seeds are fetched and split into upserts (state "active", which is also the default when a seed
    has no state) and deletes (any other state). The seeds are grouped per source collection name,
    applied to the Collection with that name inside the freeze (created when missing), and finally
    every processed harvest is moved to the COMPLETE stage.

    :raises EdurepHarvest.DoesNotExist: when no VIDEO-stage harvest exists for the freeze
    """
    freeze_name = options["freeze"]
    freeze = Freeze.objects.get(name=freeze_name)

    harvest_queryset = EdurepHarvest.objects.filter(
        freeze__name=freeze_name,
        stage=HarvestStages.VIDEO
    )
    if not harvest_queryset.exists():
        raise EdurepHarvest.DoesNotExist(
            f"There are no scheduled and VIDEO EdurepHarvest objects for '{freeze_name}'"
        )

    self.header("FREEZE EDUREP", options)

    self.info("Extracting data from sources ...")
    # Every collection owns exactly one (upserts, deletes) pair. Harvests that share a collection
    # name append into the same pair, so unpacking the pair later always yields two lists.
    # (Extending a flat list with a tuple per harvest would leave four or more items for a shared
    # name and make the two-way unpacking below raise a ValueError.)
    seeds_by_collection = defaultdict(lambda: ([], []))
    for harvest in self.progress(harvest_queryset, total=harvest_queryset.count()):
        set_specification = harvest.source.collection_name
        # Looking the pair up also registers the collection when the harvest yields no seeds at all.
        upserts, deletes = seeds_by_collection[set_specification]
        for seed in get_edurep_oaipmh_seeds(set_specification, harvest.latest_update_at):
            if seed.get("state", "active") == "active":
                upserts.append(seed)
            else:
                deletes.append(seed)

    # Report totals over all harvests, so the figures do not depend on which harvest came last.
    upsert_total = sum(len(upserts) for upserts, _ in seeds_by_collection.values())
    delete_total = sum(len(deletes) for _, deletes in seeds_by_collection.values())
    self.info(f"Files considered for processing, upserts:{upsert_total} deletes:{delete_total}")

    for collection_name, (upserts, deletes) in seeds_by_collection.items():
        # Get or create the collection these seeds belong to
        collection, created = Collection.objects.get_or_create(name=collection_name, freeze=freeze)
        collection.referee = "id"
        collection.save()
        if created:
            self.info("Created collection " + collection_name)
        else:
            self.info("Adding to collection " + collection_name)

        # Upserts are handled before deletes, in that order.
        skipped, dumped, documents_count = self.handle_upsert_seeds(collection, upserts)
        deleted_arrangements, deleted_documents = self.handle_deletion_seeds(collection, deletes)

        self.info(f"Skipped URL's for {collection.name} during dump: {skipped}")
        self.info(f"Dumped Arrangements for {collection.name}: {dumped}")
        self.info(f"Dumped Documents for {collection.name}: {documents_count}")
        self.info(f"Deleted Arrangements for {collection.name}: {deleted_arrangements}")
        self.info(f"Deleted Documents for {collection.name}: {deleted_documents}")

    # Finish the freeze and harvest
    for harvest in harvest_queryset:
        harvest.stage = HarvestStages.COMPLETE
        harvest.save()