def execute(self, context): """ Executed by task_instance at runtime """ mongo_conn = MongoHook(self.mongo_conn_id).get_conn() s3_conn = S3Hook(self.s3_conn_id) # Grab collection and execute query according to whether or not it is a pipeline collection = mongo_conn.get_database(self.mongo_db).get_collection( self.mongo_collection) results = collection.aggregate( self.mongo_query) if self.is_pipeline else collection.find( self.mongo_query) # Performs transform then stringifies the docs results into json format # docs_str = self._stringify(self.transform(results)) tmp_file = NamedTemporaryFile() print("writing results to temp file") start = datetime.now() with jsonlines.Writer(tmp_file) as writer: writer.write_all(results) tmp_file.close() end = datetime.now() print("took %i seconds" % (end - start).total_seconds()) s3_conn.load_file(tmp_file.name, self.s3_key, bucket_name=self.s3_bucket, replace=self.replace) os.unlink(tmp_file.name)
def __init__(self,
             mongo_conn_id,
             mongo_database='test',
             mongo_collection='colls',
             mongo_query=None,
             task=None,
             *args, **kwargs):
    super(ProcessTaskOperator, self).__init__(*args, **kwargs)
    # Conn Ids
    self.mongo_conn_id = mongo_conn_id
    self.mongo_database = mongo_database
    self.mongo_collection = mongo_collection
    # Avoid a mutable default argument for the query
    self.mongo_query = mongo_query or {}
    self.mongo_conn = MongoHook(self.mongo_conn_id).get_conn()

    # karakuri collections
    self.coll_queue = self.mongo_conn.get_database(
        'karakuri').get_collection('queue')
    self.coll_users = self.mongo_conn.get_database(
        'karakuri').get_collection('users')
    self.coll_issues = self.mongo_conn.get_database(
        'support').get_collection('issues')
    self.coll_workflows = self.mongo_conn.get_database(
        'karakuri').get_collection('workflows')
    # self.task = task
def execute(self, context): """ Executed by task_instance at runtime """ mongo_conn = MongoHook(self.mongo_conn_id).get_conn() # Grab collection and execute query according to whether or not it is a pipeline collection = mongo_conn.get_database(self.mongo_db).get_collection( self.mongo_collection) collection.aggregate(self.mongo_query, **self.mongo_kwargs)
def __init__(self,
             mongo_collection,
             s3_conn_id,
             s3_bucket,
             s3_key,
             mongo_conn_id='mongo_default',
             replace=False,
             mongo_query=None,
             mongo_fields=None,
             mongo_extra_params=None,
             xcom_push=False,
             env=None,
             output_encoding='utf-8',
             *args, **kwargs):
    mongo_uri = MongoHook(mongo_conn_id).get_uri()
    super(BashOperator, self).__init__(*args, **kwargs)
    self.mongo_uri = mongo_uri
    self.mongo_collection = mongo_collection
    self.mongo_fields = mongo_fields or []
    self.mongo_extra_params = mongo_extra_params or []
    self.mongo_query = mongo_query or {}
    self.env = env
    self.xcom_push_flag = xcom_push
    self.output_encoding = output_encoding

    # S3 Settings
    self.s3_bucket = s3_bucket
    self.s3_key = s3_key
    self.s3_conn_id = s3_conn_id
    self.replace = replace
def execute(self, context):
    s3 = S3Hook(self.s3_conn_id)
    mongo = MongoHook(conn_id=self.mongo_conn_id)

    data = (s3.get_key(self.s3_key,
                       bucket_name=self.s3_bucket).get_contents_as_string(
                           encoding='utf-8'))

    # Each line of the S3 object is a JSON document; skip blank lines so a
    # trailing newline does not break json.loads()
    docs = [json.loads(doc) for doc in data.split('\n') if doc]

    self.method_mapper(mongo, docs)
def subdag_tasks(parent_dag_name, child_dag_name, args):
    dag_subdag = DAG(
        dag_id='%s.%s' % (parent_dag_name, child_dag_name),
        default_args=args,
        schedule_interval="@daily",
    )

    mongo_conn = MongoHook('mongo_default').get_conn()
    workflow = mongo_conn.get_database('karakuri').get_collection(
        'workflows').find_one({"name": "SFSC review: new airflow"})
    workflow_id = workflow.get('_id')
    tasks = mongo_conn.get_database('karakuri').get_collection('queue').find({
        "active": True,
        "approved": True,
        "inProg": False,
        "done": False,
        "approvedBy": "karakuri",
        "workflow": workflow_id
    })

    # Static creation of a known number of tasks is an option: reading the DB
    # at parse time causes performance issues, which makes it impossible to
    # parallelise the subdag.
    # tasks = [{'key': '001234567'}, {'key': '001234567'}, {'key': '001234567'}]

    for task in tasks:
        ProcessTaskOperator(task_id='%s-task-%s' %
                            (child_dag_name, task.get('key', 'error')),
                            mongo_conn_id="mongo_default",
                            mongo_database="karakuri",
                            mongo_collection="workflows",
                            task=task,
                            dag=dag_subdag)

    return dag_subdag
def execute(self, context): """ Executed by task_instance at runtime """ mongo_conn = MongoHook(self.mongo_conn_id).get_conn() s3_conn = S3Hook(self.s3_conn_id) # Grab collection and execute query according to whether or not it is a pipeline collection = mongo_conn.get_database(self.mongo_db).get_collection( self.mongo_collection) results = collection.aggregate( self.mongo_query) if self.is_pipeline else collection.find( self.mongo_query) # Performs transform then stringifies the docs results into json format docs_str = self._stringify(self.transform(results)) s3_conn.load_string(docs_str, self.s3_key, bucket_name=self.s3_bucket, replace=self.replace)
class GetWorkflowOperator(BaseOperator):
    def __init__(self,
                 mongo_conn_id,
                 mongo_database='test',
                 mongo_collection='colls',
                 mongo_query=None,
                 *args, **kwargs):
        super(GetWorkflowOperator, self).__init__(*args, **kwargs)
        # Conn Ids
        self.mongo_conn_id = mongo_conn_id
        self.mongo_database = mongo_database
        self.mongo_collection = mongo_collection
        # Avoid a mutable default argument for the query
        self.mongo_query = mongo_query or {}
        self.mongo_conn = MongoHook(self.mongo_conn_id).get_conn()

    def execute(self, context):
        collection = self.mongo_conn.get_database(
            self.mongo_database).get_collection(self.mongo_collection)
        # The returned document is pushed to XCom so downstream tasks can pull it
        result = collection.find_one(self.mongo_query)
        return result
class GetWorkflowDocsOperator(BaseOperator):
    def __init__(self,
                 mongo_conn_id,
                 mongo_database='test',
                 mongo_collection='colls',
                 mongo_query=None,
                 *args, **kwargs):
        super(GetWorkflowDocsOperator, self).__init__(*args, **kwargs)
        # Conn Ids
        self.mongo_conn_id = mongo_conn_id
        self.mongo_database = mongo_database
        self.mongo_collection = mongo_collection
        # Avoid a mutable default argument for the query
        self.mongo_query = mongo_query or {}
        self.mongo_conn = MongoHook(self.mongo_conn_id).get_conn()

        # karakuri collections
        self.coll_queue = self.mongo_conn.get_database('karakuri').get_collection('queue')
        self.coll_users = self.mongo_conn.get_database('karakuri').get_collection('users')
        self.coll_issues = self.mongo_conn.get_database('support').get_collection('issues')
        self.coll_workflows = self.mongo_conn.get_database('karakuri').get_collection('workflows')

    def execute(self, context):
        print("CONTEXT: ", context)
        workflow = self.pull_workflow(context)
        args = self.create_args()

        # Initialize JIRA
        jira = Jirapp(args, self.mongo_conn)
        jira.set_live(args['live'])

        # Initialize SFDC++
        sfdc = Sfdcpp(args['sfdc_username'], args['sfdc_password'],
                      args['sfdc_server'], args['sfdc_schemaversion'])
        sfdc.set_live(args['live'])

        # Set the Issuer. There can be only one:
        # https://www.youtube.com/watch?v=sqcLjcSloXs
        issuer = jira

        user = {'sudoUser': '******'}

        print("CREATE KARAKURI")
        karakuri = Karakuri(args, jira, sfdc, issuer, self.mongo_conn)
        result = karakuri.findWorkflowDocs(workflow,
                                           sudoUser='******',
                                           userDoc=self.authenticate())
        print("RESULTS: ", result)
        return result

    def authenticate(self):
        try:
            user = self.coll_users.find_one({'user': '******'})
            return user
        except Exception:
            logger.error("Abort - Failed to read users collection")

    def pull_workflow(self, context):
        # Pull the workflow document pushed to XCom by the get_workflow_by_name task
        value = context['task_instance'].xcom_pull(task_ids='get_workflow_by_name')
        print("pull_workflow: ", value)
        return value

    def create_args(self):
        cert = bytes("RSA PRIVATE KEY", "utf-8").decode("unicode_escape")
        args = {'live': False,
                'jira_key_cert': cert,
                'jira_access_token': 'jira_access_token',
                'jira_access_token_secret': 'jira_access_token_secret',
                'jira_consumer_key': 'jira_consumer_key',
                'jira_server': "jira_server",
                'sfdc_username': '******',
                'sfdc_password': '******',
                'sfdc_server': "sfdc_server",
                'sfdc_schemaversion': "35.0",
                'log_level': 'DEBUG',
                'log': '/home/root/airflow/logs'}
        return args

    def transform(self, docs):
        """
        Processes a pymongo cursor and returns a single array in which each
        element is a JSON-serializable dictionary.

        MongoToS3Operator.transform() assumes no processing is needed, i.e.
        docs is a pymongo cursor of documents that just needs to be converted
        into an array.
        """
        return [doc for doc in docs]
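# A hedged usage sketch (not part of the original source): wiring the two
# operators above into a DAG so that GetWorkflowDocsOperator.pull_workflow()
# can find the XCom pushed under the task_id 'get_workflow_by_name'. The DAG
# id, schedule and query value are illustrative placeholders.
from airflow import DAG
from datetime import datetime

workflow_dag = DAG(dag_id='get_workflow_docs_example',
                   default_args={'owner': 'airflow',
                                 'start_date': datetime(2018, 9, 12)},
                   schedule_interval=None)

get_workflow = GetWorkflowOperator(
    task_id='get_workflow_by_name',   # the task_id pull_workflow() xcom_pulls from
    mongo_conn_id='mongo_default',
    mongo_database='karakuri',
    mongo_collection='workflows',
    mongo_query={'name': 'substituto_real_workflow_name'},
    dag=workflow_dag)

get_workflow_docs = GetWorkflowDocsOperator(
    task_id='get_workflow_docs',
    mongo_conn_id='mongo_default',
    dag=workflow_dag)

get_workflow.set_downstream(get_workflow_docs)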
# The DAG object; we'll need this to instantiate a DAG
from airflow import DAG
# Operators; we need this to operate!
from airflow.operators import (DummyOperator, SubDagOperator)

from datetime import datetime, timedelta

from mongo_plugin.hooks.mongo_hook import MongoHook
from subdags.subdag_task import subdag_tasks

default_args = {'owner': 'airflow', 'start_date': datetime(2018, 9, 12)}

mongo_conn = MongoHook('mongo_default').get_conn()
workflow = mongo_conn.get_database('karakuri').get_collection(
    'workflows').find_one({"name": "substituto_real_workflow_name"})
workflow_id = workflow.get('_id')
tasks = mongo_conn.get_database('karakuri').get_collection('queue').find({
    "active": True,
    "approved": True,
    "inProg": False,
    "done": False,
    "approvedBy": "karakuri",
    "workflow": workflow_id
})
print("TASKS: ", tasks)
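# SubDagOperator and subdag_tasks are imported above but the wiring is not
# shown in this excerpt. A hedged sketch of how the parent DAG might attach the
# subdag, assuming the dag_id 'process_tasks' and child name 'process' are
# placeholders (subdag_tasks builds its dag_id as '<parent>.<child>', so the
# SubDagOperator task_id must match the child name):
main_dag = DAG(dag_id='process_tasks',
               default_args=default_args,
               schedule_interval='@daily')

start = DummyOperator(task_id='start', dag=main_dag)

process_subdag = SubDagOperator(
    task_id='process',
    subdag=subdag_tasks('process_tasks', 'process', default_args),
    dag=main_dag)

start.set_downstream(process_subdag)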