예제 #1
0
def document(xml_resource, resource=None):
    '''
    Yield the result of ``activity(elem)`` for every ``iati-activity`` element.

    The resource is opened with ``_open_resource`` and walked incrementally
    with ``ET.iterparse``. A failure while handling a single activity does not
    abort the run: it is stored as a ``Log`` row (committed immediately),
    reported through ``log``, and iteration continues with the next element.

    :param xml_resource: value handed to ``_open_resource``.
    :param resource: optional object providing ``url`` and ``dataset_id``;
        when omitted (or falsy) both are logged as empty strings.
    :raises XMLError: when ``ET.iterparse`` raises ``ET.XMLSyntaxError``.
    '''
    xmlfile = _open_resource(xml_resource)
    try:
        for _event, elem in ET.iterparse(xmlfile):
            if elem.tag == 'iati-activity':
                try:
                    yield activity(elem)
                except Exception as exe:
                    resource_url = ""
                    dataset = ""
                    if resource:
                        resource_url = resource.url
                        dataset = resource.dataset_id

                    db_log = Log()
                    db_log.dataset = dataset
                    db_log.resource = resource_url
                    db_log.logger = "Parser"
                    db_log.msg = "Failed to parse activity {0} for {1}".format(
                        exe, resource_url)
                    db_log.level = "warn"
                    # Must run inside the except block so the traceback of
                    # the failure being handled is the one captured.
                    db_log.trace = traceback.format_exc()
                    db.session.add(db_log)
                    db.session.commit()
                    log.warning("Failed to parse activity %r", exe)

                # Reset the element once it has been handled, whether or not
                # parsing succeeded (presumably to bound memory while
                # streaming large files).
                elem.clear()
    except ET.XMLSyntaxError as exe:
        raise XMLError(exe.msg)
예제 #2
0
def update_activities(resource_url):
    '''
    Re-parse the resource stored under ``resource_url`` and refresh its logs.

    Previous job-queue error logs for this resource are deleted and committed
    first. Then the importer/parser logs for the resource are deleted and
    ``parse_resource`` is run in the same transaction. If parsing raises
    ``parse.ParserError`` the transaction is rolled back, the error is stored
    on ``resource.last_parse_error`` and an "xml_parser" ``Log`` row is
    committed.

    :param resource_url: key passed to ``Resource.query.get`` and matched
        against ``Log.resource``.
    :return: None
    '''
    # clear up previous job queue log errors
    db.session.query(Log).filter(
        sa.and_(
            Log.logger == 'job iatilib.crawler.update_activities',
            Log.resource == resource_url,
        )).delete(synchronize_session=False)
    db.session.commit()

    resource = Resource.query.get(resource_url)
    try:
        # Deleted in the same transaction as the parse, so a rollback on
        # failure also restores these rows.
        db.session.query(Log).filter(
            sa.and_(
                Log.logger.in_(
                    ['activity_importer', 'failed_activity', 'xml_parser']),
                Log.resource == resource_url,
            )).delete(synchronize_session=False)
        parse_resource(resource)
        db.session.commit()
    except parse.ParserError as exc:
        db.session.rollback()
        resource.last_parse_error = str(exc)
        db.session.add(
            Log(dataset=resource.dataset_id,
                resource=resource.url,
                logger="xml_parser",
                # {1} was missing, so the error text never reached the log.
                msg="Failed to parse XML file {0} error was {1}".format(
                    resource_url, exc),
                level="error",
                trace=traceback.format_exc(),
                created_at=datetime.datetime.now()))
        db.session.commit()
예제 #3
0
 def emit(self, record):
     '''
     Store a logging record as a ``Log`` row on the current DB session.

     ``record.msg`` is expected to be an object exposing ``dataset``,
     ``resource``, ``logger`` and ``message`` attributes (not a plain
     string). The row is only added to the session here; nothing in this
     method commits it.

     :param record: the ``logging.LogRecord`` being handled.
     '''
     trace = None
     exc_info = record.exc_info
     if exc_info:
         if isinstance(exc_info, tuple):
             # Format the exception attached to the record rather than
             # whatever exception the current thread happens to be handling.
             trace = "".join(traceback.format_exception(*exc_info))
         else:
             # Non-tuple truthy value (hand-built record): fall back to the
             # exception currently being handled.
             trace = traceback.format_exc()
     payload = record.msg
     log = Log(
         dataset=payload.dataset,
         resource=payload.resource,
         logger=payload.logger,
         level=record.levelname,
         trace=trace,
         msg=payload.message,
         created_at=datetime.fromtimestamp(record.created)
     )
     db.session.add(log)
예제 #4
0
def update_dataset(dataset_name, ignore_hashes):
    '''
    Refresh a dataset's metadata and resource, and queue a re-parse if needed.

    Takes the dataset name and determines whether or not an update is needed
    based on whether or not the last successful update detail exists, and
    whether or not it was last updated since the contained data was updated.
    If ignore_hashes is set to true, an update will be triggered regardless of
    whether there appears to be any change in the dataset hash compared with
    that stored in the database.

    :param dataset_name: key passed to ``Dataset.query.get`` and matched
        against ``Log.dataset``.
    :param ignore_hashes: forwarded unchanged to ``fetch_resource``.
    :return: None
    '''
    # clear up previous job queue log errors
    db.session.query(Log).filter(
        sa.and_(
            Log.logger == 'job iatilib.crawler.update_dataset',
            Log.dataset == dataset_name,
        )).delete(synchronize_session=False)
    db.session.commit()

    queue = rq.get_queue()
    dataset = Dataset.query.get(dataset_name)

    fetch_dataset_metadata(dataset)
    try:
        db.session.commit()
    except sa.exc.IntegrityError as exc:
        db.session.rollback()
        # the resource can't be added, so we should
        # give up.
        db.session.add(
            Log(dataset=dataset_name,
                resource=None,
                logger="update_dataset",
                # {1} was missing, so the error text never reached the log.
                msg="Failed to update dataset {0}, error was {1}".format(
                    dataset_name, exc),
                level="error",
                trace=traceback.format_exc(),
                created_at=datetime.datetime.now()))
        db.session.commit()
        return

    resource = fetch_resource(dataset, ignore_hashes)
    db.session.commit()

    # Only queue a parse when the last fetch returned 200 and the resource
    # has no last_parsed value.
    if resource.last_status_code == 200 and not resource.last_parsed:
        queue.enqueue(update_activities,
                      args=(dataset_name, ),
                      result_ttl=0,
                      job_timeout=100000)
예제 #5
0
def db_log_exception(job, exc_type, exc_value, tb):
    '''
    Record a failed job as an error ``Log`` row, then cancel and delete it.

    The signature matches an rq exception handler
    (``job, exc_type, exc_value, traceback``).

    :param job: the failed job; ``job.args[0]`` is used as the dataset key.
    :param exc_type: class of the raised exception.
    :param exc_value: the raised exception instance.
    :param tb: traceback object of the raised exception.
    :return: None
    '''
    # as this is called when an exception occurs session is probably a mess
    db.session.remove()

    url = "noresource"
    dataset_name = "nodataset"
    dataset = Dataset.query.get(job.args[0])
    if dataset:
        # Log the lookup key (a string), not the model instance: Log.dataset
        # is compared against dataset names elsewhere.
        dataset_name = job.args[0]
        if dataset.resources:
            url = dataset.resources[0].url

    log = Log(logger="job {0}".format(job.func_name),
              dataset=dataset_name,
              resource=url,
              msg="Exception in job %r" % job.description,
              level="error",
              # format_exception returns a list of lines; store one string.
              trace="".join(
                  traceback.format_exception(exc_type, exc_value, tb)))
    db.session.add(log)
    db.session.commit()
    job.cancel()
    job.delete()
예제 #6
0
def update_activities(dataset_name):
    '''
    Parses and stores the raw XML associated with a dataset's first resource
    [see parse_resource()], or logs the invalid resource.

    Previous job-queue error logs for the dataset are deleted and committed
    first. Then the importer/parser logs for the resource are deleted and
    ``parse_resource`` is run in the same transaction. If parsing raises
    ``parse.ParserError`` the transaction is rolled back, the error is stored
    on ``resource.last_parse_error`` and an "xml_parser" ``Log`` row is
    committed.

    :param dataset_name: key passed to ``Dataset.query.get``.
    :return: None
    '''
    # clear up previous job queue log errors
    # Matched on the dataset column: Log.resource holds a URL, so comparing
    # it with the dataset name would never match a row.
    db.session.query(Log).filter(
        sa.and_(
            Log.logger == 'job iatilib.crawler.update_activities',
            Log.dataset == dataset_name,
        )).delete(synchronize_session=False)
    db.session.commit()

    dataset = Dataset.query.get(dataset_name)
    resource = dataset.resources[0]
    try:
        # Deleted in the same transaction as the parse, so a rollback on
        # failure also restores these rows. Matched on the resource URL,
        # which is what the "xml_parser" row below writes to Log.resource.
        db.session.query(Log).filter(
            sa.and_(
                Log.logger.in_(
                    ['activity_importer', 'failed_activity', 'xml_parser']),
                Log.resource == resource.url,
            )).delete(synchronize_session=False)
        parse_resource(resource)
        db.session.commit()
    except parse.ParserError as exc:
        db.session.rollback()
        resource.last_parse_error = str(exc)
        db.session.add(resource)
        db.session.add(
            Log(dataset=resource.dataset_id,
                resource=resource.url,
                logger="xml_parser",
                # {1} was missing, so the error text never reached the log.
                msg="Failed to parse XML file {0} error was {1}".format(
                    dataset_name, exc),
                level="error",
                trace=traceback.format_exc(),
                created_at=datetime.datetime.now()))
        db.session.commit()