def _read(self, ti, try_number):
    """
    Return the log text for one try of a task instance, preferring GCS.

    When the rendered log path exists at the remote location, its content
    (or the error text produced while reading it) is returned behind a
    header line naming the remote location. Otherwise the read is
    delegated to the parent handler.

    :param ti: task instance object
    :param try_number: task instance try_number to read logs from
    :return: the log content as a string
    """
    # The path is rendered here rather than reused from the handler state
    # because the task instance given to this call may differ from the one
    # that was passed to set_context.
    # NOTE(review): the filename is rendered with try_number + 1 while the
    # parent fallback receives try_number unchanged -- confirm intended.
    relative_path = self._render_filename(ti, try_number + 1)
    gcs_path = os.path.join(self.remote_base, relative_path)
    reader = logging_utils.GCSLog()

    if not reader.log_exists(gcs_path):
        # Nothing stored remotely: let the parent handler produce the log.
        return super(GCSTaskHandler, self)._read(ti, try_number)

    # Once the remote file exists we never fall back to the parent handler:
    # return_error=True makes a failed read come back as the error message,
    # which is then shown in place of the log body.
    contents = reader.read(gcs_path, return_error=True)
    return '*** Reading remote log from {}.\n{}\n'.format(gcs_path, contents)
def close(self):
    """
    Close the handler and upload the local log file through GCSLog.

    The call is a no-op when the handler is already marked as closed.
    """
    # On application exit the logging system shuts down every handler by
    # calling close(). The closed flag keeps the log from being uploaded
    # more than once when `logging.shutdown` runs after an earlier close.
    if self.closed:
        return

    super(GCSTaskHandler, self).close()

    local_path = os.path.join(self.local_base, self.log_relative_path)
    remote_path = os.path.join(self.remote_base, self.log_relative_path)

    if os.path.exists(local_path):
        # The whole local file is read and handed to the remote writer.
        with open(local_path, 'r') as handle:
            contents = handle.read()
        logging_utils.GCSLog().write(contents, remote_path)

    self.closed = True
def test_gcs_url_parse(self):
    """
    Check how GCSLog.parse_gcs_url splits URLs into (bucket, blob) pairs.
    """
    _log.info(
        'About to create a GCSLog object without a connection. This will '
        'log an error but testing will proceed.')
    parse = logging_utils.GCSLog().parse_gcs_url

    # well-formed URL with a nested blob path
    self.assertEqual(
        parse('gs://bucket/path/to/blob'),
        ('bucket', 'path/to/blob'))

    # invalid URI (single slash after the scheme)
    with self.assertRaises(AirflowException):
        parse('gs:/bucket/path/to/blob')

    # trailing slash is dropped from the blob name
    self.assertEqual(
        parse('gs://bucket/path/to/blob/'),
        ('bucket', 'path/to/blob'))

    # bucket only yields an empty blob name
    self.assertEqual(parse('gs://bucket/'), ('bucket', ''))
def run(args, dag=None):
    """
    Run one task instance and then copy its log file to remote storage.

    The task log is written to
    ``<BASE_LOG_FOLDER>/<dag_id>/<task_id>/<execution_date isoformat>``.
    Depending on ``args`` the task is run through a ``LocalTaskJob``
    (``args.local``), directly in this process (``args.raw``), or queued on
    the default executor (optionally shipping a pickled DAG when
    ``args.ship_dag`` is set). Afterwards the log file is uploaded when
    ``REMOTE_BASE_LOG_FOLDER`` (or the deprecated ``S3_LOG_FOLDER``) points
    at an ``s3:/`` or ``gs:/`` location.

    :param args: parsed command line arguments; reads ``dag_id``,
        ``task_id``, ``execution_date``, ``pickle``, ``local``, ``raw``,
        ``ship_dag``, ``mark_success``, ``force``, ``ignore_dependencies``,
        ``ignore_depends_on_past``, ``pool`` and ``job_id``
    :param dag: optional DAG object; when given, its ``dag_id`` overrides
        ``args.dag_id`` and no DAG lookup or pickle load is performed
    :raises AirflowException: when ``args.pickle`` is set but no matching
        ``DagPickle`` row exists
    """
    db_utils.pessimistic_connection_handling()
    if dag:
        args.dag_id = dag.dag_id

    # Setting up logging
    log_base = os.path.expanduser(conf.get('core', 'BASE_LOG_FOLDER'))
    directory = log_base + "/{args.dag_id}/{args.task_id}".format(args=args)
    if not os.path.exists(directory):
        try:
            os.makedirs(directory)
        except OSError:
            # Another process may have created the directory between the
            # existence check and makedirs; only a genuine failure is fatal.
            if not os.path.isdir(directory):
                raise
    iso = args.execution_date.isoformat()
    filename = "{directory}/{iso}".format(directory=directory, iso=iso)

    # Route all root logging of this process into the task's log file.
    logging.root.handlers = []
    logging.basicConfig(
        filename=filename,
        level=settings.LOGGING_LEVEL,
        format=settings.LOG_FORMAT)

    if not args.pickle and not dag:
        dag = get_dag(args)
    elif not dag:
        session = settings.Session()
        logging.info('Loading pickle id {args.pickle}'.format(args=args))
        dag_pickle = session.query(
            DagPickle).filter(DagPickle.id == args.pickle).first()
        if not dag_pickle:
            raise AirflowException("Who hid the pickle!? [missing pickle]")
        dag = dag_pickle.pickle
    task = dag.get_task(task_id=args.task_id)

    ti = TaskInstance(task, args.execution_date)

    if args.local:
        print("Logging into: " + filename)
        run_job = jobs.LocalTaskJob(
            task_instance=ti,
            mark_success=args.mark_success,
            force=args.force,
            pickle_id=args.pickle,
            ignore_dependencies=args.ignore_dependencies,
            ignore_depends_on_past=args.ignore_depends_on_past,
            pool=args.pool)
        run_job.run()
    elif args.raw:
        ti.run(
            mark_success=args.mark_success,
            force=args.force,
            ignore_dependencies=args.ignore_dependencies,
            ignore_depends_on_past=args.ignore_depends_on_past,
            job_id=args.job_id,
            pool=args.pool,
        )
    else:
        pickle_id = None
        if args.ship_dag:
            try:
                # Running remotely, so pickling the DAG
                session = settings.Session()
                shipped_pickle = DagPickle(dag)
                session.add(shipped_pickle)
                session.commit()
                pickle_id = shipped_pickle.id
                print((
                    'Pickled dag {dag} '
                    'as pickle_id:{pickle_id}').format(
                        dag=dag, pickle_id=pickle_id))
            except Exception as e:
                print('Could not pickle the DAG')
                print(e)
                # Bare raise keeps the original traceback intact.
                raise

        executor = DEFAULT_EXECUTOR
        executor.start()
        print("Sending to executor.")
        executor.queue_task_instance(
            ti,
            mark_success=args.mark_success,
            pickle_id=pickle_id,
            ignore_dependencies=args.ignore_dependencies,
            ignore_depends_on_past=args.ignore_depends_on_past,
            force=args.force,
            pool=args.pool)
        executor.heartbeat()
        executor.end()

    # Force the log to flush, and set the handler to go back to normal so we
    # don't continue logging to the task's log file. The flush is important
    # because we subsequently read from the log to insert into S3 or Google
    # cloud storage. Every handler still present is flushed, so an empty
    # handler list (e.g. cleared by task code) no longer raises IndexError.
    for handler in logging.root.handlers:
        handler.flush()
    logging.root.handlers = []

    # store logs remotely
    remote_base = conf.get('core', 'REMOTE_BASE_LOG_FOLDER')

    # deprecated as of March 2016
    if not remote_base and conf.get('core', 'S3_LOG_FOLDER'):
        warnings.warn(
            'The S3_LOG_FOLDER conf key has been replaced by '
            'REMOTE_BASE_LOG_FOLDER. Your conf still works but please '
            'update airflow.cfg to ensure future compatibility.',
            DeprecationWarning)
        remote_base = conf.get('core', 'S3_LOG_FOLDER')

    # An empty value, None, or the literal string 'None' all mean that no
    # remote location is configured; skip reading the file in that case.
    remote_configured = bool(remote_base) and remote_base != 'None'

    if remote_configured and os.path.exists(filename):
        # The whole local log file is read and handed to the remote writer.
        with open(filename, 'r') as logfile:
            log = logfile.read()

        # Mirror the local layout underneath the remote base folder.
        remote_log_location = filename.replace(log_base, remote_base)
        # S3
        if remote_base.startswith('s3:/'):
            logging_utils.S3Log().write(log, remote_log_location)
        # GCS
        elif remote_base.startswith('gs:/'):
            logging_utils.GCSLog().write(
                log,
                remote_log_location,
                append=True)
        # Other
        else:
            logging.error(
                'Unsupported remote log location: {}'.format(remote_base))