def __init__(self, dag: DAG):
    self.dag_id = dag.dag_id
    self.fileloc = dag.full_filepath
    self.fileloc_hash = DagCode.dag_fileloc_hash(self.fileloc)
    self.data = SerializedDAG.to_dict(dag)
    self.last_updated = timezone.utcnow()
    self.dag_hash = hashlib.md5(
        json.dumps(self.data, sort_keys=True).encode("utf-8")).hexdigest()
def upgrade(): """Apply add source code table""" op.create_table( 'dag_code', # pylint: disable=no-member sa.Column('fileloc_hash', sa.BigInteger(), nullable=False, primary_key=True, autoincrement=False), sa.Column('fileloc', sa.String(length=2000), nullable=False), sa.Column('source_code', sa.UnicodeText(), nullable=False), sa.Column('last_updated', sa.TIMESTAMP(timezone=True), nullable=False)) conn = op.get_bind() if conn.dialect.name not in ('sqlite'): op.drop_index('idx_fileloc_hash', 'serialized_dag') op.alter_column(table_name='serialized_dag', column_name='fileloc_hash', type_=sa.BigInteger(), nullable=False) op.create_index( # pylint: disable=no-member 'idx_fileloc_hash', 'serialized_dag', ['fileloc_hash']) sessionmaker = sa.orm.sessionmaker() session = sessionmaker(bind=conn) serialized_dags = session.query(SerializedDagModel).all() for dag in serialized_dags: dag.fileloc_hash = DagCode.dag_fileloc_hash(dag.fileloc) session.merge(dag) session.commit()
def delete_workflow(self, project_name: Text,
                    workflow_name: Text) -> Optional[WorkflowInfo]:
    dag_id = self.airflow_dag_id(project_name, workflow_name)
    if not self.dag_exist(dag_id):
        return None
    deploy_path = self.config.properties().get('airflow_deploy_path')
    if deploy_path is None:
        raise Exception("airflow_deploy_path config not set!")
    # remove the generated DAG file so the scheduler stops re-parsing it
    airflow_file_path = os.path.join(deploy_path, dag_id + '.py')
    if os.path.exists(airflow_file_path):
        os.remove(airflow_file_path)

    # stop all workflow executions
    self.kill_all_workflow_execution(project_name, workflow_name)

    # clean db meta
    with create_session() as session:
        dag = session.query(DagModel).filter(
            DagModel.dag_id == dag_id).first()
        session.query(DagTag).filter(DagTag.dag_id == dag_id).delete()
        session.query(DagModel).filter(DagModel.dag_id == dag_id).delete()
        session.query(DagCode).filter(
            DagCode.fileloc_hash == DagCode.dag_fileloc_hash(
                dag.fileloc)).delete()
        session.query(SerializedDagModel).filter(
            SerializedDagModel.dag_id == dag_id).delete()
        session.query(DagRun).filter(DagRun.dag_id == dag_id).delete()
        session.query(TaskState).filter(
            TaskState.dag_id == dag_id).delete()
        session.query(TaskInstance).filter(
            TaskInstance.dag_id == dag_id).delete()
        session.query(TaskExecution).filter(
            TaskExecution.dag_id == dag_id).delete()
        return WorkflowInfo(namespace=project_name,
                            workflow_name=workflow_name)
def __init__(self, dag: DAG):
    self.dag_id = dag.dag_id
    self.fileloc = dag.full_filepath
    self.fileloc_hash = DagCode.dag_fileloc_hash(self.fileloc)
    self.data = SerializedDAG.to_dict(dag)
    self.last_updated = timezone.utcnow()
    dag_event_deps = DagEventDependencies(dag)
    self.event_relationships = DagEventDependencies.to_json(dag_event_deps)
    self.dag_hash = hashlib.md5(
        json.dumps(self.data, sort_keys=True).encode("utf-8")).hexdigest()
def _compare_example_dags(self, example_dags):
    with create_session() as session:
        for dag in example_dags.values():
            self.assertTrue(DagCode.has_dag(dag.fileloc))
            dag_fileloc_hash = DagCode.dag_fileloc_hash(dag.fileloc)
            result = session.query(
                DagCode.fileloc, DagCode.fileloc_hash, DagCode.source_code) \
                .filter(DagCode.fileloc == dag.fileloc) \
                .filter(DagCode.fileloc_hash == dag_fileloc_hash) \
                .one()

            self.assertEqual(result.fileloc, dag.fileloc)
            with open_maybe_zipped(dag.fileloc, 'r') as source:
                source_code = source.read()
            self.assertEqual(result.source_code, source_code)
def upgrade(): """Create DagCode Table.""" from sqlalchemy.ext.declarative import declarative_base Base = declarative_base() class SerializedDagModel(Base): __tablename__ = 'serialized_dag' # There are other columns here, but these are the only ones we need for the SELECT/UPDATE we are doing dag_id = sa.Column(sa.String(250), primary_key=True) fileloc = sa.Column(sa.String(2000), nullable=False) fileloc_hash = sa.Column(sa.BigInteger, nullable=False) """Apply add source code table""" op.create_table( 'dag_code', # pylint: disable=no-member sa.Column('fileloc_hash', sa.BigInteger(), nullable=False, primary_key=True, autoincrement=False), sa.Column('fileloc', sa.String(length=2000), nullable=False), sa.Column('source_code', sa.UnicodeText(), nullable=False), sa.Column('last_updated', sa.TIMESTAMP(timezone=True), nullable=False), ) conn = op.get_bind() if conn.dialect.name != 'sqlite': if conn.dialect.name == "mssql": op.drop_index('idx_fileloc_hash', 'serialized_dag') op.alter_column(table_name='serialized_dag', column_name='fileloc_hash', type_=sa.BigInteger(), nullable=False) if conn.dialect.name == "mssql": op.create_index('idx_fileloc_hash', 'serialized_dag', ['fileloc_hash']) sessionmaker = sa.orm.sessionmaker() session = sessionmaker(bind=conn) serialized_dags = session.query(SerializedDagModel).all() for dag in serialized_dags: dag.fileloc_hash = DagCode.dag_fileloc_hash(dag.fileloc) session.merge(dag) session.commit()
@classmethod
@provide_session
def remove_deleted_dags(cls, alive_dag_filelocs: List[str], session=None):
    """Deletes DAGs not included in alive_dag_filelocs.

    :param alive_dag_filelocs: file paths of alive DAGs
    :param session: ORM Session
    """
    alive_fileloc_hashes = [
        DagCode.dag_fileloc_hash(fileloc) for fileloc in alive_dag_filelocs
    ]

    log.debug("Deleting Serialized DAGs (for which DAG files are deleted) "
              "from %s table ", cls.__tablename__)

    # Both conditions must hold, so a row survives if either its hash or its
    # raw fileloc still matches an alive DAG file.
    session.execute(
        cls.__table__.delete().where(
            and_(cls.fileloc_hash.notin_(alive_fileloc_hashes),
                 cls.fileloc.notin_(alive_dag_filelocs))))
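# Hypothetical usage sketch for the method above; '/airflow/dags/still_present.py'
# is a made-up path. In Airflow the scheduler supplies the list of DAG files it
# currently sees on disk, and any serialized row whose hash AND path are both
# absent from that list is deleted.
with create_session() as session:
    SerializedDagModel.remove_deleted_dags(
        alive_dag_filelocs=['/airflow/dags/still_present.py'],
        session=session,
    )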
def _compare_example_dags(self, example_dags):
    with create_session() as session:
        for dag in example_dags.values():
            if dag.is_subdag:
                dag.fileloc = dag.parent_dag.fileloc
            assert DagCode.has_dag(dag.fileloc)
            dag_fileloc_hash = DagCode.dag_fileloc_hash(dag.fileloc)
            result = (
                session.query(DagCode.fileloc, DagCode.fileloc_hash, DagCode.source_code)
                .filter(DagCode.fileloc == dag.fileloc)
                .filter(DagCode.fileloc_hash == dag_fileloc_hash)
                .one()
            )

            assert result.fileloc == dag.fileloc
            with open_maybe_zipped(dag.fileloc, 'r') as source:
                source_code = source.read()
            assert result.source_code == source_code
def __init__(self, dag: DAG):
    self.dag_id = dag.dag_id
    self.fileloc = dag.fileloc
    self.fileloc_hash = DagCode.dag_fileloc_hash(self.fileloc)
    self.last_updated = timezone.utcnow()

    dag_data = SerializedDAG.to_dict(dag)
    dag_data_json = json.dumps(dag_data, sort_keys=True).encode("utf-8")
    self.dag_hash = hashlib.md5(dag_data_json).hexdigest()

    if COMPRESS_SERIALIZED_DAGS:
        self._data = None
        self._data_compressed = zlib.compress(dag_data_json)
    else:
        self._data = dag_data
        self._data_compressed = None

    # serve as cache so no need to decompress and load, when accessing data
    # field when COMPRESS_SERIALIZED_DAGS is True
    self.__data_cache = dag_data
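# A minimal sketch of the read side that pairs with the constructor above,
# assuming it lives on the same model class: a 'data' property that lazily
# decompresses the payload and reuses the cache field so repeated reads skip
# zlib.decompress/json.loads. The try/except covers instances the ORM loaded
# from the database, which never ran __init__ and so have no cache attribute.
@property
def data(self):
    try:
        cached = self.__data_cache
    except AttributeError:
        cached = None
    if cached is None:
        if self._data_compressed:
            cached = json.loads(zlib.decompress(self._data_compressed))
        else:
            cached = self._data
        self.__data_cache = cached
    return cached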
def test_dag_fileloc_hash(self):
    """Verifies the correctness of hashing file path."""
    self.assertEqual(DagCode.dag_fileloc_hash('/airflow/dags/test_dag.py'),
                     33826252060516589)
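# For reference, a hash compatible with the expected value in the test above
# can be computed the way Airflow derives it: take the sha1 of the path, keep
# the low 8 bytes, and shift right by 8 bits so the result fits in a signed
# 64-bit BigInteger column (fileloc itself can be up to 2000 characters, too
# long to index directly). A standalone sketch:
import hashlib
import struct

def dag_fileloc_hash(full_filepath: str) -> int:
    """Hash a DAG file path into an index-friendly integer."""
    return struct.unpack(
        '>Q', hashlib.sha1(full_filepath.encode('utf-8')).digest()[-8:])[0] >> 8

assert dag_fileloc_hash('/airflow/dags/test_dag.py') == 33826252060516589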
def __init__(self, dag):
    self.dag_id = dag.dag_id
    self.fileloc = dag.full_filepath
    self.fileloc_hash = DagCode.dag_fileloc_hash(self.fileloc)
    self.data = SerializedDAG.to_dict(dag)
    self.last_updated = timezone.utcnow()