def test_deserialize_wrong_collection_set(self):
    """Deserialization is refused from a misplaced collection set directory.

    The collection set is serialized, then its directory is renamed so the
    collection no longer sits in the expected collection set directory.
    Deserializing from the renamed directory must raise
    DeserializationError.
    """
    serializer = serialize.RecordSerializer(data_dir=self.data_dir)
    serializer.serialize_collection_set(self.collection_set)

    # Partially clean the database in preparation for deserializing.
    self.collection1.delete()
    self.collection_set.delete()

    # Rename the collection set directory; the records directory must
    # exist before the move and be gone from its original path afterwards.
    self.assertTrue(os.path.exists(self.collection1_records_path))
    new_collection_set_path = "{}x".format(self.collection_set_path)
    shutil.move(self.collection_set_path, new_collection_set_path)
    self.assertFalse(os.path.exists(self.collection1_records_path))

    deserializer = serialize.RecordDeserializer(data_dir=self.data_dir)
    with self.assertRaises(DeserializationError):
        deserializer.deserialize_collection_set(new_collection_set_path)
def test_serialize_by_collection(self):
    """Round-trip both collections through per-collection deserialization.

    The collection set is serialized once. Deserializing each collection
    while all records still exist must leave every row count unchanged.
    After most records are deleted, deserializing the two collections
    again must bring the row counts back, with extra collection history
    entries and one fewer credential history entry.
    """
    # Expected row counts, keyed by (model, manager attribute name).
    baseline_counts = {
        (Group, "objects"): 2,
        (CollectionSet, "objects"): 1,
        (CollectionSet, "history"): 2,
        (Collection, "objects"): 2,
        (Collection, "history"): 3,
        (Credential, "objects"): 3,
        (Credential, "history"): 4,
        (User, "objects"): 2,
        (Seed, "objects"): 2,
        (Seed, "history"): 3,
        (Harvest, "objects"): 3,
        (HarvestStat, "objects"): 3,
        (Warc, "objects"): 2,
    }

    def assert_counts(expected_counts):
        """Assert the row count behind every (model, manager) pair."""
        for (model, manager_name), expected in expected_counts.items():
            # The manager is looked up at check time; the label tells a
            # failing count apart from the others in the table.
            self.assertEqual(
                expected,
                getattr(model, manager_name).count(),
                "{}.{}".format(model.__name__, manager_name))

    serializer = serialize.RecordSerializer(data_dir=self.data_dir)
    serializer.serialize_collection_set(self.collection_set)

    deserializer = serialize.RecordDeserializer(data_dir=self.data_dir)
    deserializer.deserialize_collection(self.collection2_path)
    deserializer.deserialize_collection(self.collection1_path)

    # Nothing should change
    assert_counts(baseline_counts)

    # Partially clean the database in preparation for deserializing
    Warc.objects.all().delete()
    HarvestStat.objects.all().delete()
    self.harvest1.delete()
    self.harvest2.delete()
    self.harvest3.delete()
    Seed.objects.all().delete()
    Seed.history.all().delete()
    self.collection1.delete()
    self.collection2.delete()
    Collection.history.all().delete()
    # Note that credential1 still exists.
    self.credential2.delete()
    self.credential3.delete()
    # This is also deleting credential1's history
    Credential.history.all().delete()
    # No group is deleted here, so group1 and group2 both still exist.
    self.assertEqual(2, Group.objects.count())
    # Note that user1 still exists
    self.user2.delete()
    self.assertEqual(1, User.objects.count())
    self.assertEqual(1, Credential.objects.count())
    # Note that collection set still exists
    self.assertEqual(1, CollectionSet.objects.count())
    self.assertEqual(2, CollectionSet.history.count())

    # Now deserialize again
    deserializer.deserialize_collection(self.collection2_path)
    deserializer.deserialize_collection(self.collection1_path)

    # And check the deserialization
    restored_counts = dict(baseline_counts)
    # +2 for turning off collection after it is deserialized
    # +2 for added history note
    restored_counts[(Collection, "history")] = 7
    # This is one less since credential1's history was deleted.
    restored_counts[(Credential, "history")] = 3
    assert_counts(restored_counts)
    self.assertEqual(Collection.history.first().instance.history_note,
                     "Collection imported.")

    # Number of historical records for particular objects
    restored_collection = Collection.objects.get(
        collection_id=self.collection1.collection_id)
    self.assertEqual(4, restored_collection.history.count())
    # Make sure we got the right historical objects
    for h_collection in restored_collection.history.all():
        self.assertEqual("test_collection1", h_collection.name)

    restored_collection_set = CollectionSet.objects.get(
        collection_set_id=self.collection_set.collection_set_id)
    self.assertEqual(2, restored_collection_set.history.count())

    restored_credential = Credential.objects.get(
        credential_id=self.credential2.credential_id)
    self.assertEqual(2, restored_credential.history.count())
    # Make sure we got the right historical objects
    for h_credential in restored_credential.history.all():
        self.assertEqual("test_platform2", h_credential.platform)

    self.assertEqual(
        2, Seed.objects.get(seed_id=self.seed1.seed_id).history.count())
def test_should_serialize(self):
    """Check when RecordSerializer._should_serialize asks for a new dump.

    It must return True when no serialization info has been written or
    when the recorded date is a day in the past, and False when the
    recorded date is a day in the future. After info dated "now" is
    written it must return False, then flip to True once the collection
    set, the collection, a seed, a harvest or a warc is changed or added.
    """
    records_path = self.collection1_records_path
    if not os.path.exists(records_path):
        os.makedirs(records_path)
    serializer = serialize.RecordSerializer(data_dir=self.data_dir)

    def should_serialize():
        """Ask the serializer whether collection1 needs serializing."""
        return serializer._should_serialize(self.collection1, records_path)

    def mark_serialized_now():
        """Write info dated now and confirm nothing is pending."""
        serializer._write_info(datetime.utcnow(), records_path)
        # Presumably gives the next change a timestamp clearly later than
        # the recorded serialization date -- TODO confirm.
        sleep(.5)
        self.assertFalse(should_serialize())

    # No existing info.json
    self.assertTrue(should_serialize())

    # Existing serialization_date before last update
    serializer._write_info(datetime.utcnow() - timedelta(days=1),
                           records_path)
    self.assertTrue(should_serialize())

    # Existing serialization after last update
    serializer._write_info(datetime.utcnow() + timedelta(days=1),
                           records_path)
    self.assertFalse(should_serialize())

    # Update collection set
    mark_serialized_now()
    self.collection_set.description = "Changing the description"
    self.collection_set.save()
    self.assertTrue(should_serialize())

    # Update collection
    mark_serialized_now()
    self.collection1.description = "Changing the description"
    self.collection1.save()
    self.assertTrue(should_serialize())

    # Update seed
    mark_serialized_now()
    self.seed1.token = '{"token":"token1.2}'
    self.seed1.save()
    self.assertTrue(should_serialize())

    # Add seed
    mark_serialized_now()
    Seed.objects.create(collection=self.collection1,
                        token='{"token":"token3}',
                        is_active=False)
    self.assertTrue(should_serialize())

    # Update harvest
    mark_serialized_now()
    self.harvest1.status = Harvest.SUCCESS
    self.harvest1.save()
    self.assertTrue(should_serialize())

    # Add harvest
    mark_serialized_now()
    Harvest.objects.create(
        collection=self.collection1,
        historical_collection=self.historical_collection,
        historical_credential=self.historical_credential)
    self.assertTrue(should_serialize())

    # Add warc
    mark_serialized_now()
    Warc.objects.create(harvest=self.harvest1,
                        warc_id=default_uuid(),
                        path="/data/warc3.warc.gz",
                        sha1="warc3sha",
                        bytes=10,
                        date_created=datetime.utcnow())
    self.assertTrue(should_serialize())
def test_serialize(self):
    """Round-trip a whole collection set through serialize/deserialize.

    Serializes the collection set, checks the files written, deserializes
    while every record still exists (row counts must not change), deletes
    most records, deserializes again, and checks the restored row counts,
    history entries, deactivated collection and user, README files and
    info file.
    """
    # Expected row counts, keyed by (model, manager attribute name).
    baseline_counts = {
        (Group, "objects"): 2,
        (CollectionSet, "objects"): 1,
        (CollectionSet, "history"): 2,
        (Collection, "objects"): 2,
        (Collection, "history"): 3,
        (Credential, "objects"): 3,
        (Credential, "history"): 4,
        (User, "objects"): 2,
        (Seed, "objects"): 2,
        (Seed, "history"): 3,
        (Harvest, "objects"): 3,
        (HarvestStat, "objects"): 3,
        (Warc, "objects"): 2,
    }

    def assert_counts(expected_counts):
        """Assert the row count behind every (model, manager) pair."""
        for (model, manager_name), expected in expected_counts.items():
            # The manager is looked up at check time; the label tells a
            # failing count apart from the others in the table.
            self.assertEqual(
                expected,
                getattr(model, manager_name).count(),
                "{}.{}".format(model.__name__, manager_name))

    def purge(model, manager_name):
        """Delete every row behind a manager and confirm none remain."""
        getattr(model, manager_name).all().delete()
        self.assertEqual(
            0,
            getattr(model, manager_name).count(),
            "{}.{}".format(model.__name__, manager_name))

    self.assertTrue(self.collection1.is_active)

    serializer = serialize.RecordSerializer(data_dir=self.data_dir)
    serializer.serialize_collection_set(self.collection_set)

    # Records files exist
    self.assertTrue(os.path.exists(self.collection1_records_path))
    # collection set, historical collection set, groups
    # collection, historical collection, credentials, historical credentials
    # users, seed, historical seeds
    # harvests, harvest_stats, warcs, info
    self.assertEqual(14, len(os.listdir(self.collection1_records_path)))

    # Number of historical records for particular objects
    self.assertEqual(2, self.collection1.history.count())
    self.assertEqual(2, self.collection_set.history.count())
    self.assertEqual(2, self.credential2.history.count())
    self.assertEqual(2, self.seed1.history.count())

    # Deserialize while collection set already exists
    assert_counts(baseline_counts)
    deserializer = serialize.RecordDeserializer(data_dir=self.data_dir)
    deserializer.deserialize_collection_set(self.collection_set_path)

    # Nothing should change
    assert_counts(baseline_counts)

    # Partially clean the database in preparation for deserializing
    purge(Warc, "objects")
    purge(HarvestStat, "objects")
    self.harvest1.delete()
    self.harvest2.delete()
    self.harvest3.delete()
    self.assertEqual(0, Harvest.objects.count())
    purge(Seed, "objects")
    purge(Seed, "history")
    self.collection1.delete()
    self.collection2.delete()
    self.assertEqual(0, Collection.objects.count())
    purge(Collection, "history")
    # Note that credential1 still exists.
    self.credential2.delete()
    self.credential3.delete()
    self.assertEqual(1, Credential.objects.count())
    # This is also deleting credential1's history
    purge(Credential, "history")
    self.collection_set.delete()
    self.assertEqual(0, CollectionSet.objects.count())
    purge(CollectionSet, "history")
    self.group2.delete()
    # Note that group1 still exists.
    self.assertEqual(1, Group.objects.count())
    # Purge again so no collection set history remains after the group
    # deletion.
    purge(CollectionSet, "history")
    # Note that user1 still exists
    self.user2.delete()
    self.assertEqual(1, User.objects.count())
    self.assertEqual(1, Credential.objects.count())

    # Now deserialize again
    deserializer.deserialize_collection_set(self.collection_set_path)

    # And check the deserialization
    restored_counts = dict(baseline_counts)
    # +1 for added history note
    restored_counts[(CollectionSet, "history")] = 3
    # +2 for turning off collection after it is deserialized
    # +2 for added history note
    restored_counts[(Collection, "history")] = 7
    # This is one less since credential1's history was deleted.
    restored_counts[(Credential, "history")] = 3
    assert_counts(restored_counts)
    self.assertEqual(CollectionSet.history.first().instance.history_note,
                     "Collection set imported.")
    self.assertEqual(Collection.history.first().instance.history_note,
                     "Collection imported.")

    # Number of historical records for particular objects
    restored_collection = Collection.objects.get(
        collection_id=self.collection1.collection_id)
    self.assertEqual(4, restored_collection.history.count())
    # Make sure we got the right historical objects
    for h_collection in restored_collection.history.all():
        self.assertEqual("test_collection1", h_collection.name)

    restored_collection_set = CollectionSet.objects.get(
        collection_set_id=self.collection_set.collection_set_id)
    self.assertEqual(3, restored_collection_set.history.count())

    restored_credential = Credential.objects.get(
        credential_id=self.credential2.credential_id)
    self.assertEqual(2, restored_credential.history.count())
    # Make sure we got the right historical objects
    for h_credential in restored_credential.history.all():
        self.assertEqual("test_platform2", h_credential.platform)

    self.assertEqual(
        2, Seed.objects.get(seed_id=self.seed1.seed_id).history.count())

    # Collection turned off
    collection = Collection.objects.get_by_natural_key(
        self.collection1.collection_id)
    self.assertFalse(collection.is_active)

    # User2 is inactive
    self.assertFalse(User.objects.get(username="******").is_active)

    # READMEs
    self.assertTrue(
        os.path.exists(os.path.join(self.collection_set_path, "README.txt")))
    self.assertTrue(
        os.path.exists(os.path.join(self.collection1_path, "README.txt")))

    # Info file
    self.assertTrue(
        os.path.exists(
            os.path.join(self.collection1_records_path, "info.json")))