def test_deleted_activity_removal(self):
    """A previously-deleted activity that reappears in a parsed resource
    must be removed from the deleted_activity table."""
    # Seed the deleted_activity table with the identifier under test.
    tombstone = DeletedActivity(
        iati_identifier='test_deleted_activity',
        deletion_date=datetime.datetime(2000, 1, 1))
    db.session.add(tombstone)
    db.session.commit()

    # A resource whose document re-declares that same identifier.
    resource = fac.ResourceFactory.create(
        url=u"http://test",
        document=""" <iati-activities> <iati-activity> <iati-identifier>test_deleted_activity</iati-identifier> <title>test_deleted_activity</title> <reporting-org ref="GB-CHC-202918" type="21">Oxfam GB</reporting-org> </iati-activity> </iati-activities> """,
    )

    # Sanity check: the tombstone is present before parsing.
    identifiers_before = [
        row.iati_identifier
        for row in db.session.query(DeletedActivity).all()
    ]
    self.assertIn("test_deleted_activity", identifiers_before)

    resource = crawler.parse_resource(resource)
    db.session.commit()

    # Parsing the resource must have cleared the tombstone.
    identifiers_after = [
        row.iati_identifier for row in DeletedActivity.query.all()
    ]
    self.assertNotIn("test_deleted_activity", identifiers_after)
def parse_resource(resource):
    """Parse every activity in ``resource.document`` and sync the database.

    Activities previously stored for ``resource.url`` are replaced.
    An activity whose raw XML hash is unchanged keeps its old
    ``last_change_datetime``; changed or new activities are stamped with
    the current time.  Identifiers that disappeared are recorded in the
    deleted_activity table, and identifiers that reappeared are removed
    from it.

    :param resource: the Resource whose document should be (re)parsed.
    :returns: the same resource with ``last_parsed`` updated.
    """
    db.session.add(resource)

    current = Activity.query.filter_by(resource_url=resource.url)
    current_identifiers = {i.iati_identifier for i in current.all()}

    # identifier -> (last_change_datetime, hash(raw_xml)) for every
    # activity currently stored against this url.
    old_xml = {
        row[0]: (row[1], hash(row[2]))
        for row in db.session.query(
            Activity.iati_identifier,
            Activity.last_change_datetime,
            Activity.raw_xml).filter_by(resource_url=resource.url)
    }

    db.session.query(Activity).filter_by(resource_url=resource.url).delete()

    new_identifiers = set()
    activities = []
    for activity in parse.document(resource.document, resource):
        activity.resource = resource
        new_identifiers.add(activity.iati_identifier)
        try:
            if hash(activity.raw_xml) == old_xml[activity.iati_identifier][1]:
                # XML unchanged: preserve the previous change timestamp.
                activity.last_change_datetime = old_xml[
                    activity.iati_identifier][0]
            else:
                activity.last_change_datetime = datetime.datetime.now()
        except KeyError:
            # Identifier not previously stored for this resource.
            activity.last_change_datetime = datetime.datetime.now()
        activities.append(activity)
        db.session.add(activity)
        if len(db.session.new) > 50:
            # Commit in batches so the session doesn't grow unboundedly.
            activities = check_for_duplicates(activities)
            db.session.commit()
            activities = []

    # Fix: the remaining activities were already added to the session
    # individually above; the old redundant add_all() is dropped, and the
    # dedupe/commit order now matches the batched branch in the loop.
    activities = check_for_duplicates(activities)
    db.session.commit()

    resource.version = parse.document_metadata(resource.document)

    # Record identifiers no longer present in the deleted_activity table.
    diff = current_identifiers - new_identifiers
    now = datetime.datetime.utcnow()
    deleted = [
        DeletedActivity(iati_identifier=deleted_activity, deletion_date=now)
        for deleted_activity in diff
    ]
    if deleted:
        db.session.add_all(deleted)

    # Remove any reappeared identifiers from the deleted_activity table.
    if new_identifiers:
        db.session.query(DeletedActivity)\
            .filter(DeletedActivity.iati_identifier.in_(new_identifiers))\
            .delete(synchronize_session="fetch")

    log.info("Parsed %d activities from %s",
             len(resource.activities), resource.url)
    resource.last_parsed = now
    return resource  # , new_identifiers
def delete_dataset(dataset):
    """Delete one dataset by name, tombstoning its activities first.

    Every activity belonging to the dataset is merged into the
    deleted_activity table before the dataset row itself is removed.

    :param dataset: name of the dataset to delete.
    :returns: the number of dataset rows deleted (the final delete is
        left uncommitted for the caller).
    """
    dataset_query = db.session.query(Dataset). \
        filter(Dataset.name == dataset)
    doomed_activities = db.session.query(Activity). \
        filter(Activity.resource_url == Resource.url). \
        filter(Resource.dataset_id == dataset)

    deletion_time = datetime.datetime.now()
    for activity in doomed_activities:
        # merge() upserts, so an existing tombstone is refreshed in place.
        tombstone = DeletedActivity(
            iati_identifier=activity.iati_identifier,
            deletion_date=deletion_time)
        db.session.merge(tombstone)
    db.session.commit()

    return dataset_query.delete(synchronize_session='fetch')
def delete_datasets(datasets):
    """Delete the named datasets, tombstoning their activities first.

    :param datasets: iterable of dataset names to delete.
    :returns: the number of dataset rows deleted.
    """
    deleted_datasets = db.session.query(Dataset).filter(
        Dataset.name.in_(datasets))
    activities_to_delete = db.session.query(Activity).\
        filter(Activity.resource_url == Resource.url).\
        filter(Resource.dataset_id.in_(datasets))

    now = datetime.datetime.now()
    # Fix: walk the query in slices instead of materializing a tombstone
    # for every activity at once, so large datasets don't exhaust memory.
    batch_size = 100
    for offset in range(0, activities_to_delete.count(), batch_size):
        db.session.add_all([
            DeletedActivity(iati_identifier=a.iati_identifier,
                            deletion_date=now)
            for a in activities_to_delete.slice(offset, offset + batch_size)
        ])
    db.session.commit()

    deleted = deleted_datasets.delete(synchronize_session='fetch')
    log.info("Deleted {0} datasets".format(deleted))
    return deleted
def delete_datasets(datasets):
    """Delete the named datasets, recording their activities as deleted.

    Activities belonging to the datasets are read in fixed-size slices
    (to bound memory use), turned into DeletedActivity tombstones, and
    committed before the dataset rows themselves are removed.

    :param datasets: iterable of dataset names to delete.
    :returns: the number of dataset rows deleted.
    """
    dataset_query = db.session.query(Dataset). \
        filter(Dataset.name.in_(datasets))
    activity_query = db.session.query(Activity). \
        filter(Activity.resource_url == Resource.url). \
        filter(Resource.dataset_id.in_(datasets))

    deletion_time = datetime.datetime.now()
    tombstones = []
    step = 100
    total = activity_query.count()
    offset = 0
    while offset < total:
        # Slice the query to make sure it doesn't use up all the memory
        for activity in activity_query.slice(offset, offset + step):
            tombstones.append(
                DeletedActivity(iati_identifier=activity.iati_identifier,
                                deletion_date=deletion_time))
        offset += step
    db.session.add_all(tombstones)
    db.session.commit()

    removed = dataset_query.delete(synchronize_session='fetch')
    log.info("Deleted {0} datasets".format(removed))
    return removed
def parse_resource(resource):
    """Re-parse a resource's document and synchronise its activities.

    Previously stored activities for ``resource.url`` are replaced via
    ``parse_activity``; identifiers that vanished from the document are
    recorded in the deleted_activity table, and identifiers that
    reappeared are removed from it.

    :param resource: the Resource whose document should be (re)parsed.
    :returns: the same resource with ``last_parsed`` updated.
    """
    db.session.add(resource)

    existing = Activity.query.filter_by(resource_url=resource.url)
    existing_identifiers = {a.iati_identifier for a in existing.all()}

    # Map each stored identifier to its last-change timestamp and a hash
    # of its raw XML, so unchanged activities can keep their timestamps.
    old_xml = {
        row[0]: (row[1], hash(row[2].encode('utf-8')))
        for row in db.session.query(
            Activity.iati_identifier,
            Activity.last_change_datetime,
            Activity.raw_xml).filter_by(resource_url=resource.url)
    }

    db.session.query(Activity).filter_by(resource_url=resource.url).delete()

    new_identifiers = set()
    parse_activity(new_identifiers, old_xml, resource)

    resource.version = parse.document_metadata(resource.document)

    # Identifiers that disappeared from the document get tombstones.
    now = datetime.datetime.utcnow()
    vanished = existing_identifiers - new_identifiers
    tombstones = [
        DeletedActivity(iati_identifier=identifier, deletion_date=now)
        for identifier in vanished
    ]
    if tombstones:
        db.session.add_all(tombstones)

    # Identifiers present again must not stay flagged as deleted.
    if new_identifiers:
        db.session.query(DeletedActivity) \
            .filter(DeletedActivity.iati_identifier.in_(new_identifiers)) \
            .delete(synchronize_session="fetch")

    log.info("Parsed %d activities from %s",
             resource.activities.count(), resource.url)
    resource.last_parsed = now
    return resource  # , new_identifiers