def document(xml_resource, resource=None):
    """Yield parsed activities from an IATI XML document, one at a time.

    Streams the XML with ``iterparse`` and yields ``activity(elem)`` for
    each ``iati-activity`` element.  An activity that fails to parse is
    logged to the database (and to the module logger) and skipped, so one
    bad activity does not abort the whole document.

    :param xml_resource: value accepted by ``_open_resource`` (path/URL/handle).
    :param resource: optional Resource ORM object used to attribute Log rows;
        when omitted, dataset/resource fields in the Log row are left blank.
    :raises XMLError: if the document itself is not well-formed XML.
    """
    xmlfile = _open_resource(xml_resource)
    try:
        for event, elem in ET.iterparse(xmlfile):
            if elem.tag == 'iati-activity':
                try:
                    yield activity(elem)
                # BUG FIX: was Python 2 `except Exception, exe` syntax,
                # a SyntaxError under Python 3 (used elsewhere in this file).
                except Exception as exe:
                    resource_url = resource.url if resource else ""
                    dataset = resource.dataset_id if resource else ""
                    db_log = Log()
                    db_log.dataset = dataset  # was assigned twice; once is enough
                    db_log.resource = resource_url
                    db_log.logger = "Parser"
                    db_log.msg = "Failed to parse activity {0} for {1}".format(
                        exe, resource_url)
                    db_log.level = "warn"
                    db_log.trace = traceback.format_exc()
                    db.session.add(db_log)
                    db.session.commit()
                    log.warn("Failed to parse activity %r", exe)
                # Free the element after handling to keep iterparse memory flat.
                elem.clear()
    except ET.XMLSyntaxError as exe:
        raise XMLError(exe.msg)
def update_activities(resource_url):
    """Parse and store the activities of the resource at ``resource_url``.

    Clears stale job-queue and importer Log rows for this resource, then
    parses it via ``parse_resource``.  On a parse failure the session is
    rolled back, ``last_parse_error`` is recorded on the resource, and an
    error row is written to the Log table.

    NOTE(review): another ``update_activities(dataset_name)`` definition
    exists in this file; if both live in the same module the later one
    shadows this one — confirm which is intended.

    :param resource_url: primary key of the Resource row to (re)parse.
    """
    # clear up previous job queue log errors
    db.session.query(Log).filter(
        sa.and_(
            Log.logger == 'job iatilib.crawler.update_activities',
            Log.resource == resource_url,
        )).delete(synchronize_session=False)
    db.session.commit()
    resource = Resource.query.get(resource_url)
    try:
        # Drop stale importer/parser errors before re-parsing.
        db.session.query(Log).filter(
            sa.and_(
                Log.logger.in_(
                    ['activity_importer', 'failed_activity', 'xml_parser']),
                Log.resource == resource_url,
            )).delete(synchronize_session=False)
        parse_resource(resource)
        db.session.commit()
    # BUG FIX: was Python 2 `except parse.ParserError, exc` syntax.
    except parse.ParserError as exc:
        db.session.rollback()
        resource.last_parse_error = str(exc)
        db.session.add(
            Log(dataset=resource.dataset_id,
                resource=resource.url,
                logger="xml_parser",
                # BUG FIX: format string had one placeholder for two
                # arguments, silently dropping the error text.
                msg="Failed to parse XML file {0} error was {1}".format(
                    resource_url, exc),
                level="error",
                trace=traceback.format_exc(),
                created_at=datetime.datetime.now()))
        db.session.commit()
def emit(self, record):
    """Persist a logging record as a row in the Log table.

    Expects ``record.msg`` to be an object exposing ``dataset``,
    ``resource``, ``logger`` and ``message`` attributes rather than a
    plain string.  NOTE(review): the row is only added to the session,
    not committed — presumably a later commit flushes it; confirm.
    """
    trace = None
    exc = record.__dict__['exc_info']
    if exc:
        # BUG FIX: traceback.format_exc() formats the exception *currently
        # being handled*, which is None inside a logging handler (yielding
        # the string "NoneType: None"). Format the record's exc_info
        # (type, value, traceback) tuple instead.
        trace = ''.join(traceback.format_exception(*exc))
    log = Log(
        dataset=record.msg.dataset,
        resource=record.msg.resource,
        logger=record.msg.logger,
        level=record.__dict__['levelname'],
        trace=trace,
        msg=record.msg.message,
        created_at=datetime.fromtimestamp(record.created)
    )
    db.session.add(log)
def update_dataset(dataset_name, ignore_hashes):
    '''
    Takes the dataset name and determines whether or not an update is
    needed based on whether or not the last successful update detail
    exists, and whether or not it last updated since the contained data
    was updated.

    If ignore_hashes is set to true, an update will be triggered
    regardless of whether there appears to be any change in the dataset
    hash compared with that stored in the database.

    :param dataset_name: primary key of the Dataset row to update.
    :param ignore_hashes: when true, force a re-fetch even if unchanged.
    :return: None
    '''
    # clear up previous job queue log errors
    db.session.query(Log).filter(
        sa.and_(
            Log.logger == 'job iatilib.crawler.update_dataset',
            Log.dataset == dataset_name,
        )).delete(synchronize_session=False)
    db.session.commit()
    queue = rq.get_queue()
    dataset = Dataset.query.get(dataset_name)
    fetch_dataset_metadata(dataset)
    try:
        db.session.commit()
    except sa.exc.IntegrityError as exc:
        db.session.rollback()
        # the resource can't be added, so we should give up.
        db.session.add(
            Log(dataset=dataset_name,
                resource=None,
                logger="update_dataset",
                # BUG FIX: format string had one placeholder for two
                # arguments, silently dropping the error text.
                msg="Failed to update dataset {0}, error was {1}".format(
                    dataset_name, exc),
                level="error",
                trace=traceback.format_exc(),
                created_at=datetime.datetime.now()))
        db.session.commit()
        return
    resource = fetch_resource(dataset, ignore_hashes)
    db.session.commit()
    # Only queue a parse when the fetch succeeded and nothing parsed yet.
    if resource.last_status_code == 200 and not resource.last_parsed:
        queue.enqueue(update_activities,
                      args=(dataset_name, ),
                      result_ttl=0,
                      job_timeout=100000)
def db_log_exception(job, exc_type, exc_value, tb):
    """rq failure handler: record the failed job in the Log table, then
    cancel and delete the job so it is not retried.

    :param job: the rq job; ``job.args[0]`` is assumed to be a dataset
        name — TODO confirm against enqueue call sites.
    :param exc_type: exception class raised by the job.
    :param exc_value: exception instance.
    :param tb: traceback object.
    """
    # as this is called when an exception occurs session is probably a mess
    db.session.remove()
    url = "noresource"
    dataset_name = job.args[0]
    dataset = Dataset.query.get(dataset_name)
    if not dataset:
        dataset_name = "nodataset"
    elif dataset.resources:
        url = dataset.resources[0].url
    log = Log(logger="job {0}".format(job.func_name),
              # BUG FIX: previously the Dataset ORM object itself was stored
              # here when the dataset was found; Log.dataset holds the
              # dataset-name string everywhere else in this module.
              dataset=dataset_name,
              resource=url,
              msg="Exception in job %r" % job.description,
              level="error",
              # BUG FIX: format_exception returns a list of strings; join it
              # so Log.trace gets a single string, as in the other writers.
              trace=''.join(
                  traceback.format_exception(exc_type, exc_value, tb)))
    db.session.add(log)
    db.session.commit()
    job.cancel()
    job.delete()
def update_activities(dataset_name):
    '''
    Parses and stores the raw XML associated with the dataset's first
    resource [see parse_resource()], or logs the invalid resource.

    :param dataset_name: primary key of the Dataset whose resource to parse.
    :return: None
    '''
    # clear up previous job queue log errors
    db.session.query(Log).filter(
        sa.and_(
            Log.logger == 'job iatilib.crawler.update_activities',
            Log.resource == dataset_name,
        )).delete(synchronize_session=False)
    db.session.commit()
    dataset = Dataset.query.get(dataset_name)
    # NOTE(review): raises IndexError if the dataset has no resources —
    # confirm callers only enqueue datasets that have at least one.
    resource = dataset.resources[0]
    try:
        # Drop stale importer/parser errors before re-parsing.
        db.session.query(Log).filter(
            sa.and_(
                Log.logger.in_(
                    ['activity_importer', 'failed_activity', 'xml_parser']),
                Log.resource == dataset_name,
            )).delete(synchronize_session=False)
        parse_resource(resource)
        db.session.commit()
    except parse.ParserError as exc:
        db.session.rollback()
        resource.last_parse_error = str(exc)
        db.session.add(resource)
        db.session.add(
            Log(dataset=resource.dataset_id,
                resource=resource.url,
                logger="xml_parser",
                # BUG FIX: format string had one placeholder for two
                # arguments, silently dropping the error text.
                msg="Failed to parse XML file {0} error was {1}".format(
                    dataset_name, exc),
                level="error",
                trace=traceback.format_exc(),
                created_at=datetime.datetime.now()))
        db.session.commit()