def generate_file_list(self):
     """Yield each individual path given a source folder and a set of file-matching expressions."""
     for src in self.src:
         if src.startswith('s3'):
             # connect lazily as needed:
             if self.s3_conn is None:
                 self.s3_conn = ScalableS3Client().s3
             for _bucket, _root, path in generate_s3_sources(self.s3_conn, src, self.include, self.include_zero_length):
                 source = url_path_join(src, path)
                 yield ExternalURL(source)
         elif src.startswith('hdfs'):
             for source, size in luigi.hdfs.listdir(src, recursive=True, include_size=True):
                 if not self.include_zero_length and size == 0:
                     continue
                 elif any(fnmatch.fnmatch(source, include_val) for include_val in self.include):
                     yield ExternalURL(source)
         else:
             # Apply the include patterns to the relative path below the src directory.
             # TODO: implement include_zero_length handling to match the S3 case.
             for dirpath, _dirnames, files in os.walk(src):
                 for filename in files:
                     filepath = os.path.join(dirpath, filename)
                     relpath = os.path.relpath(filepath, src)
                     if any(fnmatch.fnmatch(relpath, include_val) for include_val in self.include):
                         yield ExternalURL(filepath)
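
A minimal, self-contained sketch of how the include patterns above are applied; the patterns and paths are hypothetical:

import fnmatch

include = ['*.log', 'tracking/*.gz']  # hypothetical include patterns
candidates = ['tracking/2016-01-01.gz', 'notes.txt']

# Keep a path when it matches any include pattern, mirroring the checks above.
matched = [path for path in candidates
           if any(fnmatch.fnmatch(path, pattern) for pattern in include)]
# matched == ['tracking/2016-01-01.gz']
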
    def requires(self):
        yield ExternalURL(url=self.vertica_credentials)
        yield ExternalURL(url=self.gcp_credentials)

        if self.bigquery_dataset is None:
            self.bigquery_dataset = self.vertica_schema_name

        intermediate_warehouse_path = url_path_join(self.s3_warehouse_path,
                                                    'import/vertica/sqoop/')

        query = "SELECT table_name FROM all_tables WHERE schema_name='{schema_name}' AND table_type='TABLE' " \
                "".format(schema_name=self.vertica_schema_name)
        table_list = [
            row[0]
            for row in get_vertica_results(self.vertica_credentials, query)
        ]

        for table_name in table_list:
            if not self.should_exclude_table(table_name):

                yield LoadVerticaTableToBigQuery(
                    date=self.date,
                    overwrite=self.overwrite,
                    intermediate_warehouse_path=intermediate_warehouse_path,
                    dataset_id=self.bigquery_dataset,
                    credentials=self.gcp_credentials,
                    max_bad_records=self.max_bad_records,
                    table_name=table_name,
                    vertica_schema_name=self.vertica_schema_name,
                    vertica_warehouse_name=self.vertica_warehouse_name,
                    vertica_credentials=self.vertica_credentials,
                    exclude=self.exclude,
                )
def get_vertica_results(credentials, query):
    """Run a single query in Vertica and return the results."""
    credentials_target = ExternalURL(url=credentials).output()
    cred = None
    with credentials_target.open('r') as credentials_file:
        cred = json.load(credentials_file)

    if not vertica_client_available:
        raise ImportError('Vertica client library not available')

    # Externalize autocommit and read timeout
    connection = vertica_python.connect(user=cred.get('username'),
                                        password=cred.get('password'),
                                        host=cred.get('host'),
                                        port=cred.get('port'),
                                        database='warehouse',
                                        autocommit=False,
                                        read_timeout=None)

    try:
        cursor = connection.cursor()
        cursor.execute(query)
        results = cursor.fetchall()
    finally:
        connection.close()

    return results
Example #4
    def requires(self):
        # The end date is not included in the result, so we have to add a day
        # to the provided date in order to ensure user registration data is
        # gathered for that date.
        end_date = self.date + timedelta(1)

        # In order to compute the cumulative sum of user registrations we need
        # all changes in registrations up to (and including) the provided date.
        registrations = UserRegistrationsPerDay(credentials=self.credentials,
                                                destination=self.destination,
                                                date_interval=Custom(
                                                    MINIMUM_DATE, end_date))

        results = {
            'enrollments':
            CourseEnrollmentChangesPerDay(
                name=self.name,
                src=self.src,
                dest=self.destination,
                include=self.include,
                manifest=self.manifest,
                mapreduce_engine=self.mapreduce_engine,
                lib_jar=self.lib_jar,
                n_reduce_tasks=self.n_reduce_tasks),
            'registrations':
            registrations
        }
        if self.offsets:
            results.update({'offsets': ExternalURL(self.offsets)})
        if self.history:
            results.update({'history': ExternalURL(self.history)})
        if self.blacklist:
            results.update({'blacklist': ExternalURL(self.blacklist)})

        return results
Example #5
    def requires(self):
        if self.required_tasks is None:
            self.required_tasks = {
                'credentials': ExternalURL(url=self.credentials),
                'source_script': ExternalURL(url=self.source_script),
            }

            if self.depends_on is not None:
                self.required_tasks['depends_on'] = self.depends_on

        return self.required_tasks
Example #6
 def requires_hadoop(self):
     # Check first if running locally with Sqoop output.
     target = get_target_from_url(self.source_dir)
     if isinstance(target, luigi.LocalTarget) and os.path.isdir(self.source_dir):
         files = [f for f in os.listdir(self.source_dir) if f.startswith("part")]
         for filename in files:
             yield ExternalURL(url_path_join(self.source_dir, filename))
     else:
         yield ExternalURL(self.source_dir)
    def __init__(self, *args, **kwargs):
        super(ImportMysqlDatabaseToBigQueryDatasetTask,
              self).__init__(*args, **kwargs)
        self.table_list = []
        self.is_complete = False
        self.required_tasks = None

        # If we are overwriting the database output, then delete the entire marker table.
        # That way, any future activity on it should only consist of inserts, rather than any deletes
        # of existing marker entries.  There are quotas on deletes and upserts on a table, of no more
        # than 96 per day.   This allows us to work around hitting those limits.
        # Note that we have to do this early, before scheduling begins, so that no entries are present
        # when scheduling occurs (so everything gets properly scheduled).
        if self.overwrite:
            # First, create a BigQueryTarget object, so we can connect to BigQuery.  This is only
            # for the purpose of deleting the marker table, so use dummy values.
            credentials_target = ExternalURL(url=self.credentials).output()
            target = BigQueryTarget(
                credentials_target=credentials_target,
                dataset_id=self.dataset_id,
                table="dummy_table",
                update_id="dummy_id",
            )
            # Now ask it to delete the marker table completely.
            target.delete_marker_table()
 def insert_source_task(self):
     partition_path_spec = HivePartition('dt',
                                         self.date.isoformat()).path_spec
     url_with_filename = url_path_join(self.warehouse_path,
                                       "course_catalog", "subjects",
                                       partition_path_spec, "subjects.tsv")
     return ExternalURL(url=url_with_filename)
Example #9
 def requires(self):
     """Require the external config if we are not using the default one"""
     reqs = super(XBlockConfigMixin, self).requires()
     if os.path.basename(self.xblock_obfuscation_config) != self.xblock_obfuscation_config:
         reqs['xblock_config'] = ExternalURL(self.xblock_obfuscation_config)
     return reqs
Example #10
 def requires(self):
     if self.required_tasks is None:
         self.required_tasks = {
             'credentials': ExternalURL(url=self.credentials),
             'insert_source': self.insert_source_task
         }
     return self.required_tasks
    def requires(self):
        credentials_target = ExternalURL(url=self.google_credentials).output()
        gs = create_google_spreadsheet_client(credentials_target)
        for spreadsheet_key, config in self.spreadsheets_config.items():
            schema = config['schema']
            scratch_schema = config['scratch_schema']
            database = config['database']
            column_types_row = config.get('column_types_row', False)

            spreadsheet = gs.open_by_key(spreadsheet_key)
            worksheets = spreadsheet.worksheets()

            for worksheet in worksheets:
                yield LoadWorksheetToSnowflake(
                    date=self.date,

                    # Snowflake-related params.
                    credentials=self.sf_credentials,
                    run_id=self.sf_run_id,
                    sf_database=database,
                    schema=schema,
                    scratch_schema=scratch_schema,
                    warehouse=self.sf_warehouse,
                    role=self.sf_role,
                    overwrite=self.overwrite,

                    # Google-related params.
                    google_credentials=self.google_credentials,
                    spreadsheet_key=spreadsheet_key,
                    worksheet_name=worksheet.title,
                    column_types_row=column_types_row,
                )
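
A minimal sketch of the spreadsheets_config structure the loop above expects: a dict keyed by spreadsheet key, whose values name the destination schema, scratch schema, and database, plus an optional column_types_row flag. The keys and names below are hypothetical:

spreadsheets_config = {
    'hypothetical-spreadsheet-key': {
        'schema': 'finance',                  # destination schema
        'scratch_schema': 'finance_scratch',  # scratch schema for staging
        'database': 'hypothetical_db',
        'column_types_row': True,             # optional; defaults to False above
    },
}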
Example #12
 def requires(self):
     paypal_interval = date_interval.Custom(self.paypal_interval_start,
                                            self.import_date)
     for date in paypal_interval:
         url = url_path_join(self.warehouse_path, 'payments',
                             'dt=' + date.isoformat(), 'paypal.tsv')
         yield ExternalURL(url=url)
 def insert_source_task(self):
     hive_table = "user_activity_by_user"
     # User activity data for each day is stored in a dated directory.
     # We want to be able to load all that data into Vertica in one go, hence we use
     # a wildcard('*') here.
     url = url_path_join(self.warehouse_path, hive_table) + '/dt=*/'
     return ExternalURL(url=url)
Example #14
    def requires_local(self):
        results = super(ObfuscateCourseEventsTask, self).requires_local()

        if os.path.basename(self.explicit_event_whitelist) != self.explicit_event_whitelist:
            results['explicit_events'] = ExternalURL(url=self.explicit_event_whitelist)
        return results
 def requires(self):
     return {
         'insert_source': LoadInternalReportingUserActivityToWarehouse(
             n_reduce_tasks=self.n_reduce_tasks,
             date=self.date,
             warehouse_path=self.warehouse_path,
             overwrite=self.overwrite,
             schema=self.schema,
             credentials=self.credentials,
         ),
         'credentials': ExternalURL(self.credentials),
     }
 def requires(self):
     if self._required_tasks is None:
         self._required_tasks = {
             'credentials':
             ExternalURL(url=self.vertica_credentials),
             'sqoop_dump_vertica_table_task':
             self.sqoop_dump_vertica_table_task,
         }
     return self._required_tasks
 def requires_local(self):
     """Adds geolocation_data as a local requirement."""
     result = super(GeolocationMixin, self).requires_local()
     # Default is an empty list, but assume that any real data added is done
     # so as a dict.
     if not result:
         result = {}
     result['geolocation_data'] = ExternalURL(self.geolocation_data)
     return result
Example #18
    def requires(self):
        if self.required_tasks is None:
            self.required_tasks = {
                'credentials': ExternalURL(url=self.credentials),
            }
            if not self.insert_source_task_dynamically:
                self.required_tasks['insert_source'] = self.insert_source_task

        return self.required_tasks
Example #19
    def manifest_file_list(self):
        """Write each individual path to a manifest file and yield the path to that file."""
        manifest_target = get_target_from_url(self.manifest)
        if not manifest_target.exists():
            with manifest_target.open('w') as manifest_file:
                for external_url_task in self.generate_file_list():
                    manifest_file.write(external_url_task.url + '\n')

        yield ExternalURL(self.manifest)
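
A minimal, self-contained sketch of the manifest format written above, one URL per line; the paths are hypothetical:

urls = [
    's3://hypothetical-bucket/logs/tracking-2016-01-01.gz',
    's3://hypothetical-bucket/logs/tracking-2016-01-02.gz',
]
with open('/tmp/example-manifest.txt', 'w') as manifest_file:
    for url in urls:
        manifest_file.write(url + '\n')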
Example #20
    def get_downstream_task(self):
        # If no downstream task has been set, load our configuration and generate our tasks and dependency chain.
        if self.downstream_task is None:
            script_conf_target = ExternalURL(
                url=self.script_configuration).output()
            with script_conf_target.open('r') as script_conf_file:
                config = yaml.safe_load(script_conf_file)
                if config is not None and isinstance(config, dict):
                    previous_task = None

                    scripts = config.get('scripts', [])

                    # Iterate over the scripts in the order they appear in the configuration file.
                    # Each new task depends on the previously-created task (via add_dependency()),
                    # so the scripts run sequentially from the top of the file downwards; the last
                    # task in the chain becomes the downstream task that gets yielded.
                    for script in scripts:
                        if not self.validate_script_entry(script):
                            log.warn("encountered invalid script entry!")
                            continue

                        new_task = RunVerticaSqlScriptTask(
                            credentials=self.credentials,
                            schema=self.schema,
                            marker_schema=self.marker_schema,
                            date=self.date,
                            read_timeout=self.read_timeout,
                            source_script=path.join(self.script_root,
                                                    script['location']),
                            script_name=script.get('name'))

                        # If we previously configured a task, set it as a dependency of this one, so it runs prior to.
                        if previous_task is not None:
                            new_task.add_dependency(previous_task)

                        # Mark this as the previously-created task.
                        previous_task = new_task

                    self.downstream_task = previous_task

        # If a downstream task has been set, yield it, triggering Luigi to schedule our scripts.
        if self.downstream_task is not None:
            yield self.downstream_task
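
A minimal sketch of the script configuration shape that get_downstream_task reads: a top-level 'scripts' list whose entries carry a 'location' relative to script_root and an optional 'name'. The entries below are hypothetical and assume PyYAML is installed:

import yaml

example_config = yaml.safe_load("""
scripts:
  - location: reports/build_summary.sql
    name: build summary
  - location: reports/cleanup.sql
""")
# example_config['scripts'][0]['location'] == 'reports/build_summary.sql'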
Example #21
    def requires(self):
        results = {
            'source': CourseEnrollmentChangesPerDay(
                name=self.name,
                src=self.src,
                dest=self.destination,
                include=self.include,
                manifest=self.manifest,
                mapreduce_engine=self.mapreduce_engine,
                lib_jar=self.lib_jar,
                n_reduce_tasks=self.n_reduce_tasks
            )
        }
        if self.offsets:
            results.update({'offsets': ExternalURL(self.offsets)})
        if self.statuses:
            results.update({'statuses': ExternalURL(self.statuses)})

        return results
Example #22
    def output(self):
        # TODO: Once VerticaCopyTask handles multiple input files update this
        # to use the outputs of the sub-jobs instead of always returning all
        # files.

        # Affiliate Window reports for each day are stored in dated directories.
        # We want to be able to load all that data into Vertica in one go, hence we use
        # a wildcard('*') here.
        url = url_path_join(self.warehouse_path, 'fees',
                            'affiliate_window') + '/dt=*/'
        return ExternalURL(url=url).output()
Example #23
    def requires(self):
        yield ExternalURL(url=self.vertica_credentials)

        yield ExternalURL(url=self.gcp_credentials)

        if self.bigquery_dataset is None:
            self.bigquery_dataset = self.vertica_schema_name

        for table_name in self.get_table_list_for_schema():
            yield LoadVerticaTableFromS3ToBigQueryTask(
                date=self.date,
                overwrite=self.overwrite,
                intermediate_warehouse_path=self.intermediate_warehouse_path,
                dataset_id=self.bigquery_dataset,
                credentials=self.gcp_credentials,
                max_bad_records=self.max_bad_records,
                table_name=table_name,
                vertica_schema_name=self.vertica_schema_name,
                vertica_warehouse_name=self.vertica_warehouse_name,
                vertica_credentials=self.vertica_credentials,
            )
 def requires(self):
     return {
         'source':
         LoadWarehouseTask(date=self.date,
                           schema=self.schema,
                           credentials=self.credentials,
                           marker_schema=self.marker_schema,
                           overwrite=self.overwrite,
                           n_reduce_tasks=self.n_reduce_tasks),
         'credentials':
         ExternalURL(self.credentials)
     }
Example #25
def get_vertica_results(credentials, query):
    """Run a single query in Vertica and return the results."""
    credentials_target = ExternalURL(url=credentials).output()
    cred = None
    with credentials_target.open('r') as credentials_file:
        cred = json.load(credentials_file)

    if not vertica_client_available:
        raise ImportError('Vertica client library not available')

    # Externalize autocommit and read timeout
    connection = vertica_python.connect(user=cred.get('username'), password=cred.get('password'), host=cred.get('host'),
                                        port=cred.get('port'), database='warehouse', autocommit=False,
                                        read_timeout=None)

    try:
        cursor = connection.cursor()
        cursor.execute(query)
        results = cursor.fetchall()
    finally:
        connection.close()

    return results
    def get_downstream_task(self):
        # If no downstream task has been set, load our configuration and generate our tasks and dependency chain.
        if self.downstream_task is None:
            script_conf_target = ExternalURL(url=self.script_configuration).output()
            with script_conf_target.open('r') as script_conf_file:
                config = yaml.safe_load(script_conf_file)
                if config is not None and isinstance(config, dict):
                    previous_task = None

                    scripts = config.get('scripts', [])

                    # Iterate over the scripts in the order they appear in the configuration file.
                    # Each new task depends on the previously-created task (via add_dependency()),
                    # so the scripts run sequentially from the top of the file downwards; the last
                    # task in the chain becomes the downstream task that gets yielded.
                    for script in scripts:
                        if not self.validate_script_entry(script):
                            log.warn("encountered invalid script entry!")
                            continue

                        new_task = RunVerticaSqlScriptTask(
                            credentials=self.credentials, schema=self.schema, marker_schema=self.marker_schema,
                            date=self.date, read_timeout=self.read_timeout, source_script=path.join(self.script_root, script['location']),
                            script_name=script.get('name'))

                        # If we previously configured a task, set it as a dependency of this one, so it runs prior to.
                        if previous_task is not None:
                            new_task.add_dependency(previous_task)

                        # Mark this as the previously-created task.
                        previous_task = new_task

                    self.downstream_task = previous_task

        # If a downstream task has been set, yield it, triggering Luigi to schedule our scripts.
        if self.downstream_task is not None:
            yield self.downstream_task
Example #27
def get_mysql_query_results(credentials, database, query):
    """
    Executes a mysql query on the provided database and returns the results.
    """

    credentials_target = ExternalURL(url=credentials).output()
    cred = None
    with credentials_target.open('r') as credentials_file:
        cred = json.load(credentials_file)

    connection = mysql.connector.connect(user=cred.get('username'),
                                         password=cred.get('password'),
                                         host=cred.get('host'),
                                         port=cred.get('port'),
                                         database=database)

    try:
        cursor = connection.cursor()
        cursor.execute(query)
        results = cursor.fetchall()
    finally:
        connection.close()

    return results
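
A minimal usage sketch for the helper above; the credentials URL, database, and query are hypothetical:

rows = get_mysql_query_results(
    credentials='s3://hypothetical-bucket/mysql_creds.json',
    database='hypothetical_db',
    query='SELECT table_name FROM information_schema.tables',
)
table_names = [row[0] for row in rows]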
Example #28
    def insert_source_task(self):
        """
        We are already exporting vertica tables to S3 using SqoopImportFromVertica through VerticaSchemaToBigQueryTask
        workflow, so we specify ExternalURL here instead. In the future we can change this to a
        SqoopImportFromVertica task.
        """
        partition_path_spec = HivePartition('dt', self.date).path_spec
        intermediate_warehouse_path = url_path_join(self.warehouse_path,
                                                    'import/vertica/sqoop/')
        url = url_path_join(intermediate_warehouse_path,
                            self.vertica_warehouse_name,
                            self.vertica_schema_name, self.table_name,
                            partition_path_spec) + '/'

        return ExternalURL(url=url)
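
A minimal, self-contained sketch of the URL shape this method returns; posixpath.join stands in for url_path_join, and the warehouse path, schema, table, and date are hypothetical:

import posixpath

url = posixpath.join(
    's3://hypothetical-warehouse', 'import/vertica/sqoop',
    'warehouse', 'finance', 'transactions', 'dt=2017-03-01',
) + '/'
# 's3://hypothetical-warehouse/import/vertica/sqoop/warehouse/finance/transactions/dt=2017-03-01/'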
Example #29
def get_mysql_query_results(credentials, database, query):
    """
    Executes a mysql query on the provided database and returns the results.
    """

    credentials_target = ExternalURL(url=credentials).output()
    cred = None
    with credentials_target.open('r') as credentials_file:
        cred = json.load(credentials_file)

    connection = mysql.connector.connect(user=cred.get('username'),
                                         password=cred.get('password'),
                                         host=cred.get('host'),
                                         port=cred.get('port'),
                                         database=database)

    try:
        cursor = connection.cursor()
        cursor.execute(query)
        results = cursor.fetchall()
    finally:
        connection.close()

    return results
    def requires(self):
        yield ExternalURL(url=self.vertica_credentials)

        for table_name in self.get_table_list_for_schema():
            yield ExportVerticaTableToS3Task(
                date=self.date,
                overwrite=self.overwrite,
                table_name=table_name,
                intermediate_warehouse_path=self.intermediate_warehouse_path,
                vertica_schema_name=self.vertica_schema_name,
                vertica_warehouse_name=self.vertica_warehouse_name,
                vertica_credentials=self.vertica_credentials,
                sqoop_null_string=self.sqoop_null_string,
                sqoop_fields_terminated_by=self.sqoop_fields_terminated_by,
                sqoop_delimiter_replacement=self.sqoop_delimiter_replacement,
            )
Example #31
    def requires(self):
        config = get_config()
        for merchant_id in self.cybersource_merchant_ids:
            section_name = 'cybersource:' + merchant_id
            interval_start = luigi.DateParameter().parse(config.get(section_name, 'interval_start'))
            interval_end = self.import_date

            merchant_close_date = config.get(section_name, 'merchant_close_date', '')
            if merchant_close_date:
                parsed_date = luigi.DateParameter().parse(merchant_close_date)
                interval_end = min(self.import_date, parsed_date)

            cybersource_interval = date_interval.Custom(interval_start, interval_end)

            for date in cybersource_interval:
                filename = "cybersource_{}.tsv".format(merchant_id)
                url = url_path_join(self.warehouse_path, 'payments', 'dt=' + date.isoformat(), filename)
                yield ExternalURL(url=url)
Example #32
    def requires(self):
        results = {
            'events': ProblemCheckEvent(
                mapreduce_engine=self.mapreduce_engine,
                input_format=self.base_input_format,
                lib_jar=self.lib_jar,
                n_reduce_tasks=self.n_reduce_tasks,
                name=self.name,
                src=self.src,
                dest=self.dest,
                include=self.include,
                manifest=self.manifest,
            ),
        }

        if self.answer_metadata:
            results.update({'answer_metadata': ExternalURL(self.answer_metadata)})

        return results
    def requires(self):
        credentials_target = ExternalURL(url=self.google_credentials).output()
        gs = create_google_spreadsheet_client(credentials_target)
        for spreadsheet_key, config in self.spreadsheets_config.items():
            schema = config['schema']
            column_types_row = config.get('column_types_row', False)

            spreadsheet = gs.open_by_key(spreadsheet_key)
            worksheets = spreadsheet.worksheets()

            for worksheet in worksheets:
                yield LoadWorksheetToVertica(
                    date=self.date,
                    schema=schema,
                    google_credentials=self.google_credentials,
                    spreadsheet_key=spreadsheet_key,
                    worksheet_name=worksheet.title,
                    column_types_row=column_types_row,
                    overwrite=self.overwrite,
                )