    def requires(self):
        if self.required_tasks is None:
            self.required_tasks = {
                'credentials': ExternalURL(url=self.credentials),
                'insert_source_task': self.insert_source_task,
            }
        return self.required_tasks
def get_vertica_results(warehouse_name, credentials, query):
    """Run a single query in Vertica and return the results."""
    credentials_target = ExternalURL(url=credentials).output()
    cred = None
    with credentials_target.open('r') as credentials_file:
        cred = json.load(credentials_file)
    # Fail fast if the optional Vertica client library could not be imported.
    if not vertica_client_available:
        raise ImportError('Vertica client library not available')

    # TODO: externalize the autocommit and read_timeout settings.
    connection = vertica_python.connect(user=cred.get('username'),
                                        password=cred.get('password'),
                                        host=cred.get('host'),
                                        port=cred.get('port'),
                                        database=warehouse_name,
                                        autocommit=False,
                                        read_timeout=None)

    try:
        cursor = connection.cursor()
        cursor.execute(query)
        results = cursor.fetchall()
    finally:
        connection.close()

    return results
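A minimal usage sketch of the helper above; the warehouse name, credentials URL, and query are placeholders rather than values from the original pipeline:

# All values below are illustrative placeholders.
rows = get_vertica_results(
    warehouse_name='warehouse',
    credentials='s3://example-bucket/vertica_creds.json',
    query='SELECT COUNT(*) FROM my_schema.my_table',
)
# rows holds whatever cursor.fetchall() returned for the query.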
Example #3
    def requires_local(self):
        results = super(ObfuscateCourseEventsTask, self).requires_local()

        # If the whitelist is given as a path rather than a bare filename, require it as an external URL.
        if os.path.basename(self.explicit_event_whitelist) != self.explicit_event_whitelist:
            results['explicit_events'] = ExternalURL(url=self.explicit_event_whitelist)
        return results
    def requires(self):
        if self._required_tasks is None:
            self._required_tasks = {
                'credentials': ExternalURL(url=self.vertica_credentials),
                'sqoop_dump_vertica_table_task': self.sqoop_dump_vertica_table_task,
            }
        return self._required_tasks
Example #5
    def requires(self):
        if self.required_tasks is None:
            self.required_tasks = {
                'credentials': ExternalURL(url=self.credentials),
            }
            if not self.insert_source_task_dynamically:
                self.required_tasks['insert_source'] = self.insert_source_task

        return self.required_tasks
    def requires_local(self):
        """Adds geolocation_data as a local requirement."""
        result = super(GeolocationMixin, self).requires_local()
        # The default is an empty list, but assume that any real data is added as a dict.
        if not result:
            result = {}
        result['geolocation_data'] = ExternalURL(self.geolocation_data)
        return result
    def requires(self):
        return {
            'insert_source': LoadInternalReportingUserActivityToWarehouse(
                n_reduce_tasks=self.n_reduce_tasks,
                date=self.date,
                warehouse_path=self.warehouse_path,
                overwrite=self.overwrite,
                schema=self.schema,
                credentials=self.credentials,
            ),
            'credentials': ExternalURL(self.credentials),
        }
    def manifest_file_list(self):
        """Write each individual path to a manifest file and yield the path to that file."""
        manifest_target = get_target_from_url(self.manifest)
        if not manifest_target.exists():
            with manifest_target.open('w') as manifest_file:
                for external_url_task in self.generate_file_list():
                    manifest_file.write(external_url_task.url + '\n')

        yield ExternalURL(self.manifest)
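The resulting manifest is simply one input URL per line; for example (placeholder paths), it might contain:

s3://example-bucket/logs/tracking-2017-01-01.log
s3://example-bucket/logs/tracking-2017-01-02.log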
Example #9
    def requires(self):
        results = {
            'source': CourseEnrollmentChangesPerDay(
                name=self.name,
                src=self.src,
                dest=self.destination,
                include=self.include,
                manifest=self.manifest,
                mapreduce_engine=self.mapreduce_engine,
                lib_jar=self.lib_jar,
                n_reduce_tasks=self.n_reduce_tasks
            )
        }
        if self.offsets:
            results.update({'offsets': ExternalURL(self.offsets)})
        if self.statuses:
            results.update({'statuses': ExternalURL(self.statuses)})

        return results
Example #10
    def output(self):
        # TODO: Once VerticaCopyTask handles multiple input files update this
        # to use the outputs of the sub-jobs instead of always returning all
        # files.

        # Affiliate Window reports for each day are stored in dated directories.
        # We want to be able to load all of that data into Vertica in one go, hence the
        # wildcard ('*') here.
        url = url_path_join(self.warehouse_path, 'fees',
                            'affiliate_window') + '/dt=*/'
        return ExternalURL(url=url).output()
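For example, with a warehouse_path of s3://example-warehouse/ (a placeholder), the URL handed to ExternalURL would be s3://example-warehouse/fees/affiliate_window/dt=*/, which matches every dated partition directory at once.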
    def requires(self):
        return {
            'source': LoadWarehouseTask(
                date=self.date,
                schema=self.schema,
                credentials=self.credentials,
                marker_schema=self.marker_schema,
                overwrite=self.overwrite,
                n_reduce_tasks=self.n_reduce_tasks,
            ),
            'credentials': ExternalURL(self.credentials),
        }
Example #12
    def requires(self):
        yield ExternalURL(url=self.vertica_credentials)

        yield ExternalURL(url=self.gcp_credentials)

        if self.bigquery_dataset is None:
            self.bigquery_dataset = self.vertica_schema_name

        for table_name in self.get_table_list_for_schema():
            yield LoadVerticaTableFromS3ToBigQueryTask(
                date=self.date,
                overwrite=self.overwrite,
                intermediate_warehouse_path=self.intermediate_warehouse_path,
                dataset_id=self.bigquery_dataset,
                credentials=self.gcp_credentials,
                max_bad_records=self.max_bad_records,
                table_name=table_name,
                vertica_schema_name=self.vertica_schema_name,
                vertica_warehouse_name=self.vertica_warehouse_name,
                vertica_credentials=self.vertica_credentials,
            )
Example #13
    def insert_source_task(self):
        """
        We already export Vertica tables to S3 using SqoopImportFromVertica via the VerticaSchemaToBigQueryTask
        workflow, so we specify an ExternalURL here instead. In the future we can change this to a
        SqoopImportFromVertica task.
        """
        partition_path_spec = HivePartition('dt', self.date).path_spec
        intermediate_warehouse_path = url_path_join(self.warehouse_path,
                                                    'import/vertica/sqoop/')
        url = url_path_join(intermediate_warehouse_path,
                            self.vertica_warehouse_name,
                            self.vertica_schema_name, self.table_name,
                            partition_path_spec) + '/'

        return ExternalURL(url=url)
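For example, assuming a warehouse_path of s3://example-warehouse/, a Vertica warehouse named warehouse1, a schema named finance, a table named orders, and a date of 2017-01-01 (all placeholders), the returned ExternalURL would point at s3://example-warehouse/import/vertica/sqoop/warehouse1/finance/orders/dt=2017-01-01/.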
    def requires(self):
        yield ExternalURL(url=self.vertica_credentials)

        for table_name in self.get_table_list_for_schema():
            yield ExportVerticaTableToS3Task(
                date=self.date,
                overwrite=self.overwrite,
                table_name=table_name,
                intermediate_warehouse_path=self.intermediate_warehouse_path,
                vertica_schema_name=self.vertica_schema_name,
                vertica_warehouse_name=self.vertica_warehouse_name,
                vertica_credentials=self.vertica_credentials,
                sqoop_null_string=self.sqoop_null_string,
                sqoop_fields_terminated_by=self.sqoop_fields_terminated_by,
                sqoop_delimiter_replacement=self.sqoop_delimiter_replacement,
            )
Example #15
    def requires(self):
        config = get_config()
        for merchant_id in self.cybersource_merchant_ids:
            section_name = 'cybersource:' + merchant_id
            interval_start = luigi.DateParameter().parse(config.get(section_name, 'interval_start'))
            interval_end = self.import_date

            merchant_close_date = config.get(section_name, 'merchant_close_date', '')
            if merchant_close_date:
                parsed_date = luigi.DateParameter().parse(merchant_close_date)
                interval_end = min(self.import_date, parsed_date)

            cybersource_interval = date_interval.Custom(interval_start, interval_end)

            for date in cybersource_interval:
                filename = "cybersource_{}.tsv".format(merchant_id)
                url = url_path_join(self.warehouse_path, 'payments', 'dt=' + date.isoformat(), filename)
                yield ExternalURL(url=url)
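For instance, for a merchant id of 'acme' (a placeholder) and the date 2017-01-01, the yielded requirement would be an ExternalURL pointing at <warehouse_path>/payments/dt=2017-01-01/cybersource_acme.tsv.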
Example #16
    def get_downstream_task(self):
        # If no downstream task has been set, load our configuration and generate our tasks and dependency chain.
        if self.downstream_task is None:
            script_conf_target = ExternalURL(
                url=self.script_configuration).output()
            with script_conf_target.open('r') as script_conf_file:
                config = yaml.safe_load(script_conf_file)
                if config is not None and isinstance(config, dict):
                    previous_task = None

                    scripts = config.get('scripts', [])

                    # Iterate over the list of scripts in the configuration file, chaining each new task to the
                    # previous one via add_dependency().  This ensures the tasks run sequentially, in the intended
                    # order: from the top of the file, downwards.
                    for script in scripts:
                        if not self.validate_script_entry(script):
                            log.warn("encountered invalid script entry!")
                            continue

                        new_task = RunVerticaSqlScriptTask(
                            credentials=self.credentials,
                            schema=self.schema,
                            marker_schema=self.marker_schema,
                            date=self.date,
                            read_timeout=self.read_timeout,
                            source_script=path.join(self.script_root,
                                                    script['location']),
                            script_name=script.get('name'))

                        # If we previously configured a task, set it as a dependency of this one so that it runs first.
                        if previous_task is not None:
                            new_task.add_dependency(previous_task)

                        # Mark this as the previously-created task.
                        previous_task = new_task

                    self.downstream_task = previous_task

        # If a downstream task has been set, yield it, triggering Luigi to schedule our scripts.
        if self.downstream_task is not None:
            yield self.downstream_task
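For reference, the loop above expects the YAML file at script_configuration to parse into a mapping with a 'scripts' list whose entries carry a 'name' and a 'location'. A sketch of what yaml.safe_load() might return; the script names and paths are placeholders:

# Placeholder structure; only the 'scripts', 'name', and 'location' keys are taken from the code above.
config = {
    'scripts': [
        {'name': 'load dimensions', 'location': 'sql/load_dimensions.sql'},
        {'name': 'load facts', 'location': 'sql/load_facts.sql'},
    ],
}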
Example #17
    def requires(self):
        results = {
            'events': ProblemCheckEvent(
                mapreduce_engine=self.mapreduce_engine,
                input_format=self.base_input_format,
                lib_jar=self.lib_jar,
                n_reduce_tasks=self.n_reduce_tasks,
                name=self.name,
                src=self.src,
                dest=self.dest,
                include=self.include,
                manifest=self.manifest,
            ),
        }

        if self.answer_metadata:
            results.update({'answer_metadata': ExternalURL(self.answer_metadata)})

        return results
    def requires(self):
        credentials_target = ExternalURL(url=self.google_credentials).output()
        gs = create_google_spreadsheet_client(credentials_target)
        for spreadsheet_key, config in self.spreadsheets_config.items():
            schema = config['schema']
            column_types_row = config.get('column_types_row', False)

            spreadsheet = gs.open_by_key(spreadsheet_key)
            worksheets = spreadsheet.worksheets()

            for worksheet in worksheets:
                yield LoadWorksheetToVertica(
                    date=self.date,
                    schema=schema,
                    google_credentials=self.google_credentials,
                    spreadsheet_key=spreadsheet_key,
                    worksheet_name=worksheet.title,
                    column_types_row=column_types_row,
                    overwrite=self.overwrite,
                )
    def requires(self):
        end_date = self.date + timedelta(1)
        results = {
            'enrollments': CourseEnrollmentChangesPerDay(
                name=self.name,
                src=self.src,
                dest=self.destination,
                include=self.include,
                manifest=self.manifest,
                mapreduce_engine=self.mapreduce_engine,
                lib_jar=self.lib_jar,
                n_reduce_tasks=self.n_reduce_tasks,
            ),
            'registrations': UserRegistrationsPerDay(
                credentials=self.credentials,
                destination=self.destination,
                date_interval=Custom(MINIMUM_DATE, end_date),
            ),
        }
        if self.blacklist:
            results.update({'blacklist': ExternalURL(self.blacklist)})
        return results
    def requires(self):
        yield ExternalURL(url=self.vertica_credentials)

        for table_name in self.get_table_list_for_schema():
            yield LoadVerticaTableFromS3ToSnowflakeTask(
                date=self.date,
                overwrite=self.overwrite,
                intermediate_warehouse_path=self.intermediate_warehouse_path,
                credentials=self.credentials,
                warehouse=self.warehouse,
                role=self.role,
                sf_database=self.sf_database,
                schema=self.schema,
                scratch_schema=self.scratch_schema,
                run_id=self.run_id,
                table_name=table_name,
                vertica_schema_name=self.vertica_schema_name,
                vertica_warehouse_name=self.vertica_warehouse_name,
                vertica_credentials=self.vertica_credentials,
                sqoop_null_string=self.sqoop_null_string,
                sqoop_fields_terminated_by=self.sqoop_fields_terminated_by,
            )
Example #21
def get_mysql_query_results(credentials, database, query):
    """
    Executes a MySQL query on the provided database and returns the results.
    """

    credentials_target = ExternalURL(url=credentials).output()
    cred = None
    with credentials_target.open('r') as credentials_file:
        cred = json.load(credentials_file)

    connection = mysql.connector.connect(user=cred.get('username'),
                                         password=cred.get('password'),
                                         host=cred.get('host'),
                                         port=cred.get('port'),
                                         database=database)

    try:
        cursor = connection.cursor()
        cursor.execute(query)
        results = cursor.fetchall()
    finally:
        connection.close()

    return results
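Both get_vertica_results and get_mysql_query_results load the same kind of credentials file. A sketch of its expected contents, where only the key names are taken from the cred.get() calls above and every value is a placeholder:

{
    "username": "pipeline",
    "password": "********",
    "host": "db.example.com",
    "port": 3306
}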
    def __init__(self, *args, **kwargs):
        super(ImportMysqlDatabaseToBigQueryDatasetTask, self).__init__(*args, **kwargs)
        self.table_list = []
        self.is_complete = False
        self.required_tasks = None

        # If we are overwriting the database output, then delete the entire marker table.
        # That way, any future activity on it should only consist of inserts, rather than any deletes
        # of existing marker entries.  There are quotas on deletes and upserts on a table (no more
        # than 96 per day), and deleting the table up front lets us work around those limits.
        # Note that we have to do this early, before scheduling begins, so that no entries are present
        # when scheduling occurs (so everything gets properly scheduled).
        if self.overwrite:
            # First, create a BigQueryTarget object, so we can connect to BigQuery.  This is only
            # for the purpose of deleting the marker table, so use dummy values.
            credentials_target = ExternalURL(url=self.credentials).output()
            target = BigQueryTarget(
                credentials_target=credentials_target,
                dataset_id=self.dataset_id,
                table="dummy_table",
                update_id="dummy_id",
            )
            # Now ask it to delete the marker table completely.
            target.delete_marker_table()
Example #23
    def insert_source_task(self):
        hive_table = "internal_reporting_user_activity"
        partition_location = url_path_join(self.warehouse_path, hive_table, self.partition.path_spec) + '/'
        return ExternalURL(url=partition_location)
    def requires_local(self):
        return ExternalURL(url=self.events_list_file_path)
Example #25
    def requires(self):
        return {
            'credentials': ExternalURL(url=self.credentials),
        }
Example #26
    def requires_local(self):
        return ExternalURL(url=self.config)
Example #27
    def requires(self):
        yield self.hive_table_task

        yield ExternalURL(
            url=url_path_join(self.warehouse_path, 'course_enrollment_summary', self.partition.path_spec) + '/'
        )
    def insert_source_task(self):
        url = url_path_join(
            self.hive_partition_path('course_subject', self.date),
            'course_subject.tsv')
        return ExternalURL(url=url)
    def insert_source_task(self):
        return ExternalURL(url=self.hive_partition_path('internal_reporting_d_country', self.date))
    def insert_source_task(self):
        return ExternalURL(url=self.hive_partition_path('course_enrollment_summary', self.date))