Example #1
    def execute(self, context):
        """
        Executed by task_instance at runtime
        """
        mongo_conn = MongoHook(self.mongo_conn_id).get_conn()
        s3_conn = S3Hook(self.s3_conn_id)

        # Grab the collection and run the query, using an aggregation pipeline when one is supplied
        collection = mongo_conn.get_database(self.mongo_db).get_collection(
            self.mongo_collection)
        results = collection.aggregate(
            self.mongo_query) if self.is_pipeline else collection.find(
                self.mongo_query)

        # Stream the result documents to a temporary file as JSON lines;
        # the in-memory stringify approach is kept below for reference
        # docs_str = self._stringify(self.transform(results))
        tmp_file = NamedTemporaryFile(mode='w', delete=False)  # text mode for jsonlines; delete=False keeps the file after close() so it can be uploaded
        print("writing results to temp file")
        start = datetime.now()
        with jsonlines.Writer(tmp_file) as writer:
            writer.write_all(results)
        tmp_file.close()
        end = datetime.now()
        print("took %i seconds" % (end - start).total_seconds())

        s3_conn.load_file(tmp_file.name,
                          self.s3_key,
                          bucket_name=self.s3_bucket,
                          replace=self.replace)
        os.unlink(tmp_file.name)
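A minimal usage sketch for this execute() method, assuming the surrounding class is a Mongo-to-S3 file operator whose constructor accepts the fields referenced above; the operator name, connection ids, and parameter values are illustrative, not taken from the source:

# Hypothetical instantiation inside a DAG file; MongoToS3FileOperator and the
# connection/bucket names are assumptions for illustration only.
export_issues = MongoToS3FileOperator(
    task_id='export_issues',
    mongo_conn_id='mongo_default',
    s3_conn_id='s3_default',
    mongo_db='support',
    mongo_collection='issues',
    mongo_query={'active': True},  # a plain find() filter, so is_pipeline=False
    is_pipeline=False,
    s3_bucket='my-export-bucket',
    s3_key='exports/issues.jsonl',
    replace=True,
    dag=dag)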
Example #2
    def __init__(self,
                 mongo_conn_id,
                 mongo_database='test',
                 mongo_collection='colls',
                 mongo_query=None,
                 task=None,
                 *args,
                 **kwargs):
        super(ProcessTaskOperator, self).__init__(*args, **kwargs)
        # Conn Ids
        self.mongo_conn_id = mongo_conn_id
        self.mongo_database = mongo_database
        self.mongo_collection = mongo_collection
        self.mongo_query = mongo_query or {}  # avoid a mutable default argument
        self.mongo_conn = MongoHook(self.mongo_conn_id).get_conn()
        # karakuri collections
        self.coll_queue = self.mongo_conn.get_database(
            'karakuri').get_collection('queue')
        self.coll_users = self.mongo_conn.get_database(
            'karakuri').get_collection('users')
        self.coll_issues = self.mongo_conn.get_database(
            'support').get_collection('issues')
        self.coll_workflows = self.mongo_conn.get_database(
            'karakuri').get_collection('workflows')
        self.task = task
Example #3
    def execute(self, context):
        """
        Executed by task_instance at runtime
        """
        mongo_conn = MongoHook(self.mongo_conn_id).get_conn()

        # Grab the collection and run the aggregation pipeline
        collection = mongo_conn.get_database(self.mongo_db).get_collection(
            self.mongo_collection)
        collection.aggregate(self.mongo_query, **self.mongo_kwargs)
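The aggregation cursor is never consumed here, so the pipeline is presumably run for its side effects. A sketch of the kind of mongo_query and mongo_kwargs this supports, assuming a pipeline that materialises its output with an $out stage (an assumption, not shown in the source):

# Hypothetical pipeline: summarise active queue items into another collection
# via $out, so nothing needs to be read back from the returned cursor.
mongo_query = [
    {'$match': {'active': True}},
    {'$group': {'_id': '$workflow', 'count': {'$sum': 1}}},
    {'$out': 'queue_summary'},
]
mongo_kwargs = {'allowDiskUse': True}  # let MongoDB spill large groups to disk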
Example #4
    def __init__(self,
                 mongo_conn_id,
                 mongo_database='test',
                 mongo_collection='colls',
                 mongo_query=None,
                 *args,
                 **kwargs):
        super(GetWorkflowOperator, self).__init__(*args, **kwargs)
        # Conn Ids
        self.mongo_conn_id = mongo_conn_id
        self.mongo_database = mongo_database
        self.mongo_collection = mongo_collection
        self.mongo_query = mongo_query or {}  # avoid a mutable default argument
        self.mongo_conn = MongoHook(self.mongo_conn_id).get_conn()
Example #5
    def __init__(self,
                 mongo_collection,
                 s3_conn_id,
                 s3_bucket,
                 s3_key,
                 mongo_conn_id='mongo_default',
                 replace=False,
                 mongo_query=None,
                 mongo_fields=None,
                 mongo_extra_params=None,
                 xcom_push=False,
                 env=None,
                 output_encoding='utf-8',
                 *args,
                 **kwargs):

        mongo_uri = MongoHook(mongo_conn_id).get_uri()
        # Skips BashOperator.__init__ (no bash_command is passed here); the
        # export command is presumably assembled later in execute()
        super(BashOperator, self).__init__(*args, **kwargs)

        self.mongo_uri = mongo_uri
        self.mongo_collection = mongo_collection
        self.mongo_fields = mongo_fields or []
        self.mongo_extra_params = mongo_extra_params or []
        self.mongo_query = mongo_query or {}

        self.env = env
        self.xcom_push_flag = xcom_push
        self.output_encoding = output_encoding
        # S3 Settings
        self.s3_bucket = s3_bucket
        self.s3_key = s3_key
        self.s3_conn_id = s3_conn_id
        self.replace = replace
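The stored fields (mongo_uri, mongo_fields, mongo_extra_params) together with the BashOperator parent suggest the operator shells out to mongoexport at runtime. A hedged sketch of how such a bash_command might be assembled in execute(); none of this appears in the source, and the exact flag set is an assumption:

# Hypothetical command assembly, as it might appear in execute();
# mongoexport's --uri/--collection/--query/--fields flags are real, the
# wiring around them is illustrative.
import json

fields_flag = ("--fields '%s'" % ','.join(self.mongo_fields)
               if self.mongo_fields else '')
bash_command = "mongoexport --uri '%s' --collection '%s' --query '%s' %s %s" % (
    self.mongo_uri, self.mongo_collection, json.dumps(self.mongo_query),
    fields_flag, ' '.join(self.mongo_extra_params))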
Example #6
    def execute(self, context):
        s3 = S3Hook(self.s3_conn_id)
        mongo = MongoHook(conn_id=self.mongo_conn_id)

        data = s3.get_key(
            self.s3_key,
            bucket_name=self.s3_bucket).get_contents_as_string(encoding='utf-8')

        # Skip empty lines (e.g. a trailing newline) to avoid json.loads errors
        docs = [json.loads(doc) for doc in data.split('\n') if doc]

        self.method_mapper(mongo, docs)
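method_mapper() is not shown in any of these examples; a plausible sketch, assuming the class carries mongo_method, mongo_db, and mongo_collection attributes (all assumptions) and dispatches between inserting and upserting the documents:

# Hypothetical dispatcher for the call above; attribute names are assumed.
def method_mapper(self, mongo, docs):
    collection = mongo.get_conn().get_database(
        self.mongo_db).get_collection(self.mongo_collection)
    if self.mongo_method == 'insert':
        collection.insert_many(docs)
    else:
        # Upsert each document by _id so reruns stay idempotent
        for doc in docs:
            collection.replace_one({'_id': doc['_id']}, doc, upsert=True)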
Example #7
def subdag_tasks(parent_dag_name, child_dag_name, args):
    dag_subdag = DAG(
        dag_id='%s.%s' % (parent_dag_name, child_dag_name),
        default_args=args,
        schedule_interval="@daily",
    )

    mongo_conn = MongoHook('mongo_default').get_conn()

    workflow = mongo_conn.get_database('karakuri').get_collection(
        'workflows').find_one({"name": "SFSC review: new airflow"})
    workflow_id = workflow.get('_id')
    tasks = mongo_conn.get_database('karakuri').get_collection('queue').find({
        "active": True,
        "approved": True,
        "inProg": False,
        "done": False,
        "approvedBy": "karakuri",
        "workflow": workflow_id
    })

    # Option: statically create a known number of tasks. Reading the DB here
    # causes a performance issue that makes it impossible to parallelise the
    # subdag creation.
    # tasks = [{'key': '001234567'},{'key': '001234567'},{'key': '001234567'}]

    for task in tasks:
        ProcessTaskOperator(task_id='%s-task-%s' %
                            (child_dag_name, task.get('key', 'error')),
                            mongo_conn_id="mongo_default",
                            mongo_database="karakuri",
                            mongo_collection="workflows",
                            task=task,
                            dag=dag_subdag)

    return dag_subdag
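The factory above is meant to be wrapped in a SubDagOperator inside a parent DAG, as the imports in Example #11 suggest; a minimal wiring sketch in which the parent DAG id and task id are illustrative:

# Hypothetical parent DAG wiring for subdag_tasks(); note the subdag's dag_id
# must be '<parent>.<task_id>', which the factory's '%s.%s' format guarantees.
parent_dag = DAG('karakuri_parent', default_args=args,
                 schedule_interval='@daily')
process_tasks = SubDagOperator(
    task_id='process_tasks',
    subdag=subdag_tasks('karakuri_parent', 'process_tasks', args),
    dag=parent_dag)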
Example #8
    def execute(self, context):
        """
        Executed by task_instance at runtime
        """
        mongo_conn = MongoHook(self.mongo_conn_id).get_conn()
        s3_conn = S3Hook(self.s3_conn_id)

        # Grab the collection and run the query, using an aggregation pipeline when one is supplied
        collection = mongo_conn.get_database(self.mongo_db).get_collection(
            self.mongo_collection)
        results = collection.aggregate(
            self.mongo_query) if self.is_pipeline else collection.find(
                self.mongo_query)

        # Transform the documents, then stringify the results into JSON
        docs_str = self._stringify(self.transform(results))

        s3_conn.load_string(docs_str,
                            self.s3_key,
                            bucket_name=self.s3_bucket,
                            replace=self.replace)
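_stringify() is not shown in these examples; a plausible sketch, assuming it joins one JSON document per line, mirroring the jsonlines output of Example #1 (the default=str fallback for non-JSON types is an assumption):

# Hypothetical helper matching the call above; one JSON document per line.
import json

def _stringify(self, docs, joinable='\n'):
    # default=str makes non-JSON types such as ObjectId serializable
    return joinable.join(json.dumps(doc, default=str) for doc in docs)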
Example #9
class GetWorkflowOperator(BaseOperator):
    def __init__(self,
                 mongo_conn_id,
                 mongo_database='test',
                 mongo_collection='colls',
                 mongo_query=None,
                 *args,
                 **kwargs):
        super(GetWorkflowOperator, self).__init__(*args, **kwargs)
        # Conn Ids
        self.mongo_conn_id = mongo_conn_id
        self.mongo_database = mongo_database
        self.mongo_collection = mongo_collection
        self.mongo_query = mongo_query or {}  # avoid a mutable default argument
        self.mongo_conn = MongoHook(self.mongo_conn_id).get_conn()

    def execute(self, context):
        collection = self.mongo_conn.get_database(
            self.mongo_database).get_collection(self.mongo_collection)
        result = collection.find_one(self.mongo_query)
        # Returning the document pushes it to XCom for downstream tasks
        return result
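Because execute() returns the matched document, Airflow pushes it to XCom under this task's id; Example #10's pull_workflow() pulls it back from a task named 'get_workflow_by_name'. A usage sketch using that same task id (the surrounding DAG object is assumed):

# Hypothetical instantiation; the task_id matches the one Example #10 pulls.
get_workflow = GetWorkflowOperator(
    task_id='get_workflow_by_name',
    mongo_conn_id='mongo_default',
    mongo_database='karakuri',
    mongo_collection='workflows',
    mongo_query={'name': 'substituto_real_workflow_name'},
    dag=dag)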
Example #10
class GetWorkflowDocsOperator(BaseOperator):

    def __init__(self,
                 mongo_conn_id,
                 mongo_database='test',
                 mongo_collection='colls',
                 mongo_query=None,
                 *args, **kwargs):
        super(GetWorkflowDocsOperator, self).__init__(*args, **kwargs)
        # Conn Ids
        self.mongo_conn_id = mongo_conn_id
        self.mongo_database = mongo_database
        self.mongo_collection = mongo_collection
        self.mongo_query = mongo_query or {}  # avoid a mutable default argument
        self.mongo_conn = MongoHook(self.mongo_conn_id).get_conn()
        # karakuri collections
        self.coll_queue = self.mongo_conn.get_database('karakuri').get_collection('queue')
        self.coll_users = self.mongo_conn.get_database('karakuri').get_collection('users')
        self.coll_issues = self.mongo_conn.get_database('support').get_collection('issues')
        self.coll_workflows = self.mongo_conn.get_database('karakuri').get_collection('workflows')

    def execute(self, context):
        print("CONTEXT: ", context)
        workflow = self.pull_workflow(context)
        args = self.create_args()
        jira = Jirapp(args, self.mongo_conn)
        jira.set_live(args['live'])
        # Initialize SFDC++
        sfdc = Sfdcpp(args['sfdc_username'], args['sfdc_password'],
                      args['sfdc_server'], args['sfdc_schemaversion'])
        sfdc.set_live(args['live'])
        # Set the Issuer. There can be only one:
        # https://www.youtube.com/watch?v=sqcLjcSloXs
        issuer = jira
        user = {'sudoUser': '******'}

        print("CREATE KARAKURI")
        karakuri = Karakuri(args, jira, sfdc, issuer, self.mongo_conn)
        result = karakuri.findWorkflowDocs(workflow, sudoUser='******', userDoc=self.authenticate())
        print("RESULTS: ", result)
        return result

    def authenticate(self):
        try:
            user = self.coll_users.find_one({'user': '******'})
            return user
        except Exception:
            logger.error("Abort - Failed to read users collection")


    def pull_workflow(self, context):
        value = context['task_instance'].xcom_pull(task_ids='get_workflow_by_name')
        print("pull_workflow: ", value)
        return value

    def create_args(self):
        # Placeholder key material; a real deployment would load the actual cert
        cert = bytes("RSA PRIVATE KEY", "utf-8").decode("unicode_escape")
        args = {'live': False,
                'jira_key_cert': cert,
                'jira_access_token': 'jira_access_token',
                'jira_access_token_secret': 'jira_access_token_secret',
                'jira_consumer_key': 'jira_consumer_key',
                'jira_server': "jira_server",
                'sfdc_username': '******',
                'sfdc_password': '******',
                'sfdc_server': "sfdc_server",
                'sfdc_schemaversion': "35.0",
                'log_level': 'DEBUG',
                'log': '/home/root/airflow/logs'}
        return args

    def transform(self, docs):
        """
        Processes a pyMongo cursor and returns a single array in which each
        element is a JSON-serializable dictionary.

        MongoToS3Operator.transform() assumes no processing is needed, i.e.
        docs is a pyMongo cursor of documents that just needs to be converted
        into an array.
        """
        return [doc for doc in docs]
Example #11
# The DAG object; we'll need this to instantiate a DAG
from airflow import DAG
from datetime import datetime, timedelta
from mongo_plugin.hooks.mongo_hook import MongoHook
# Operators; we need these to operate!
from airflow.operators import (DummyOperator, SubDagOperator)
from subdags.subdag_task import subdag_tasks

default_args = {'owner': 'airflow', 'start_date': datetime(2018, 9, 12)}

mongo_conn = MongoHook('mongo_default').get_conn()

workflow = mongo_conn.get_database('karakuri').get_collection(
    'workflows').find_one({"name": "substituto_real_workflow_name"})
workflow_id = workflow.get('_id')
tasks = mongo_conn.get_database('karakuri').get_collection('queue').find({
    "active": True,
    "approved": True,
    "inProg": False,
    "done": False,
    "approvedBy": "karakuri",
    "workflow": workflow_id
})

print("TASKS: ", tasks)