Exemplo n.º 1
0
    def run(self) -> bool:
        """
        Runs the shell command of this tap through the singer shell runner.

        When an in-memory tap config is set (``self._tap_config``), the value
        of ``self.tap_config`` is dumped as JSON into a temporary
        ``<config_file_name>.tmp`` file below ``config.config_dir()`` before
        the command is started; that file is deleted again after the command
        finished or raised. Without an in-memory config, the file at
        ``self.config_file_path()`` has to exist already.

        Returns:
            False when the tap config file does not exist, otherwise the
            result of ``shell.singer_run_shell_command``
        """
        from .. import shell
        command = self.shell_command()

        if not self._tap_config:
            # no in-memory config: the command relies on an existing file
            if not os.path.exists(self.config_file_path()):
                log(message=
                    f"The tap config '{self.config_file_path()}' does not exist.",
                    is_error=True)
                return False
        else:
            # create temp tap config file
            config_dir = pathlib.Path(config.config_dir())
            tmp_tap_config_path = config_dir / f'{self.config_file_name}.tmp'
            config_content = self.tap_config
            with open(tmp_tap_config_path, 'w') as tmp_file:
                json.dump(config_content, tmp_file)

        try:
            return shell.singer_run_shell_command(command)
        finally:
            # the temp config is only needed while the command is running
            if self._tap_config:
                os.remove(tmp_tap_config_path)
Exemplo n.º 2
0
 def _pre_run(self) -> bool:
     if not os.path.exists(self.destination_path()):
         log(message=
             f"The destination path '{self.destination_path()}' does not exist.",
             is_error=True)
         return False
     return True
Exemplo n.º 3
0
    def run(self) -> bool:
        """
        Reads the result of the SQL query and writes it to an AVRO file.

        The schema (``self.get_schema()``) and the query
        (``self.get_sql_query()``) are resolved first. The query is then read
        from ``self.db_alias`` and the result is written, together with the
        schema, to ``self.file_name`` below ``config.data_dir()``.

        Returns:
            True once the file has been written
        """
        avro_schema = self.get_schema()
        query = self.get_sql_query()

        # query data frame from db
        logger.log('Read data from SQL', format=logger.Format.ITALICS)
        data_frame = read_dataframe(self.db_alias, query)

        # write avro file
        target_path = str(pathlib.Path(config.data_dir()) / self.file_name)
        logger.log(f'Write to AVRO file {target_path}',
                   format=logger.Format.ITALICS)
        pdx.to_avro(target_path, data_frame, schema=avro_schema)

        return True
Exemplo n.º 4
0
 def get_schema(self):
     """
     Returns the AVRO schema to use.

     When ``self.schema_file_name`` is set, the schema is parsed as JSON from
     the file at ``self.schema_file_path()``; otherwise ``self.schema`` is
     returned unchanged.
     """
     if not self.schema_file_name:
         return self.schema

     schema_path = str(self.schema_file_path().absolute())
     logger.log(f'Load AVRO schema from file {schema_path}',
                format=logger.Format.ITALICS)
     with open(schema_path, 'r') as schema_file:
         return json.load(schema_file)
Exemplo n.º 5
0
    def run(self) -> bool:
        """
        Runs the command with a temporary catalog and target config file.

        When ``self.stream_selection`` is set, the catalog
        ``self.catalog_file_name`` is loaded, the requested streams are marked
        as selected and the result is saved to ``self.catalog_file_path()``.
        A list selects streams by name; a dict maps each stream name to the
        ``properties`` passed to ``mark_as_selected``. The target config built
        by ``self._create_target_config`` is written as JSON to
        ``self._target_config_path()``. After that ``self._pre_run()`` and the
        parent ``run()`` are executed.

        Every temporary file written by this method is removed again before
        it returns or raises, no matter at which step the run stopped.

        Returns:
            False when a requested stream is missing in the catalog, when
            ``self._pre_run()`` fails or when the parent ``run()`` fails;
            otherwise True

        Raises:
            Exception: when ``self.stream_selection`` is neither a list nor
                a dict
        """
        # Temp files of this run. A path is registered *before* the file is
        # written, so that a partially written file is cleaned up as well.
        tmp_file_paths = []

        try:
            # create temp catalog (if necessary)
            if self.stream_selection:
                tmp_catalog_file_path = self.catalog_file_path()
                catalog = SingerCatalog(self.catalog_file_name)

                # normalize both selection styles to (stream name, kwargs)
                if isinstance(self.stream_selection, list):
                    selection = [(stream_name, {})
                                 for stream_name in self.stream_selection]
                elif isinstance(self.stream_selection, dict):
                    selection = [
                        (stream_name, {'properties': properties})
                        for stream_name, properties in self.stream_selection.items()
                    ]
                else:
                    raise Exception(
                        f'Unexpected type of stream_selection: {self.stream_selection.__class__.__name__}'
                    )

                # report all unknown streams, not only the first one
                has_error = False
                for stream_name, select_args in selection:
                    if stream_name in catalog.streams:
                        catalog.streams[stream_name].mark_as_selected(
                            **select_args)
                    else:
                        log(message=
                            f"Could not find stream '{stream_name}' in catalog for selection",
                            is_error=True)
                        has_error = True

                if has_error:
                    return False

                tmp_file_paths.append(tmp_catalog_file_path)
                catalog.save(tmp_catalog_file_path)

            # create temp target config file
            target_config = {}
            self._create_target_config(target_config)
            tmp_target_config_path = self._target_config_path()
            tmp_file_paths.append(tmp_target_config_path)
            with open(tmp_target_config_path, 'w') as target_config_file:
                json.dump(target_config, target_config_file)

            # run pre-checks before calling run
            if not self._pre_run():
                return False

            # execute shell command
            if not super().run():
                return False
        finally:
            for tmp_file_path in tmp_file_paths:
                # a registered file may never have been created when the
                # write failed early, so only remove what is really there
                if os.path.exists(tmp_file_path):
                    os.remove(tmp_file_path)

        return True
Exemplo n.º 6
0
    def run(self):
        """
        Forwards the stderr output of ``self.process`` to the logger.

        Each line is split at its first space into a log level and a message:

        - ``INFO``: logged; messages starting with ``METRIC:`` in italics,
          everything else verbatim
        - ``WARNING`` and lines without any space: logged verbatim
        - ``DEBUG``: ignored
        - ``ERROR`` / ``CRITICAL``: logged as error and ``self._has_error``
          is set
        - anything else (e.g. traceback lines or indented continuation
          lines): the complete line is logged verbatim, so that no process
          output is lost

        ``self._has_error`` is reset to False when the method starts.
        """
        self._has_error = False

        for line in self.process.stderr:
            loglevel, separator, logmsg = line.partition(' ')
            if not separator:
                # no space in the line: there is no level prefix at all
                loglevel, logmsg = 'NOTSET', line

            if loglevel == 'INFO':
                if logmsg.startswith('METRIC:'):
                    # This data could be used for showing execution statistics; see also https://github.com/singer-io/getting-started/blob/96a0f7addec517fcf5155284744c648fe4f16902/docs/SYNC_MODE.md#metric-messages
                    logger.log(logmsg, format=logger.Format.ITALICS)
                else:
                    logger.log(logmsg, format=logger.Format.VERBATIM)
            elif loglevel in ('NOTSET', 'WARNING'):
                logger.log(logmsg, format=logger.Format.VERBATIM)
            elif loglevel == 'DEBUG':
                continue  # DEBUG messages are ignored
            elif loglevel in ('ERROR', 'CRITICAL'):
                self._has_error = True
                logger.log(logmsg, format=logger.Format.VERBATIM, is_error=True)
            else:
                # The first token is not a known log level, so it is part of
                # the message; log the untouched line instead of dropping it.
                logger.log(line, format=logger.Format.VERBATIM)
Exemplo n.º 7
0
def write_mondrian_schema():
    """
    Writes the Mondrian schema file `.mondrian-schema.xml`.

    Every data set from `mara_schema.config.data_sets()` is mapped to the
    tuple `('mondrian', <data set id>)`; personal data and high cardinality
    attributes are switched off.

    Returns:
        Always True
    """
    import mara_mondrian.schema_generation as schema_generation

    schema_path = pathlib.Path('.mondrian-schema.xml')
    logger.log(f'Writing {schema_path}', logger.Format.ITALICS)

    tables_by_data_set = {}
    for data_set in mara_schema.config.data_sets():
        tables_by_data_set[data_set] = ('mondrian', data_set.id())

    schema_generation.write_mondrian_schema(
        file_name=schema_path,
        data_set_tables=tables_by_data_set,
        personal_data=False,
        high_cardinality_attributes=False)

    return True
Exemplo n.º 8
0
    def get_sql_query(self):
        """
        Returns the SQL query to run, with all replacements applied.

        The query text is read from the file at ``self.sql_file_path()`` when
        ``self.sql_file_path`` is truthy. A truthy ``self.sql_query`` takes
        precedence over the file content. Afterwards every key of
        ``self.replace`` found in the query is replaced by its value;
        ``self.replace`` may be a dict or an iterable of ``(key, value)``
        pairs.

        Returns:
            The SQL query string, or None when no query could be resolved
        """
        sql_query = None

        # NOTE(review): if `sql_file_path` is a regular method, this check is
        # always true; confirm whether a file name attribute should be tested
        # here instead.
        if self.sql_file_path:
            sql_query_file_path = str(self.sql_file_path().absolute())
            logger.log(f'Read SQL query from file {sql_query_file_path}',
                       format=logger.Format.ITALICS)
            with open(sql_query_file_path, 'r') as f:
                sql_query = f.read()
        if self.sql_query:
            sql_query = self.sql_query

        # nothing to replace in when no query was resolved
        if self.replace and sql_query is not None:
            # iterating a dict directly yields only its keys, so the pairs
            # have to be taken from .items()
            if isinstance(self.replace, dict):
                replacements = self.replace.items()
            else:
                replacements = self.replace
            for key, value in replacements:
                sql_query = sql_query.replace(key, value)

        return sql_query
Exemplo n.º 9
0
 def run(self):
     """
     Checks that ``self.pattern`` is not used in any file of the pipeline.

     All files below the base path of the grandparent node
     (``self.parent.parent.base_path()``) are searched recursively with
     ``grep -E``; ``__init__.py``, ``*.md`` and ``*.pyc`` files are skipped.

     Returns:
         True when the shell command returned True (presumably: nothing was
         written to stdout, i.e. no match), otherwise False after logging
         the matching lines
     """
     from mara_pipelines import shell
     from mara_pipelines.logging import logger
     pipeline_base_directory = self.parent.parent.base_path()
     # Raw string: the backslashes have to reach the shell, where they keep
     # it from expanding the globs. In a normal string literal `\*` is an
     # invalid escape sequence.
     excludes = r' --exclude=__init__.py --exclude=\*.md --exclude=\*.pyc'
     # cd'ing && grepping in . allows us to show short filenames
     # The "(...) || true" will ensure that we do not get any output if nothing is found
     # `grep -E` replaces the obsolescent `egrep`, which prints a warning to
     # stderr in current GNU grep versions
     shell_command = f'(cd "{pipeline_base_directory}" && grep -E --recursive {excludes} "{self.pattern}" .) || true'
     lines_or_bool = shell.run_shell_command(shell_command)
     if lines_or_bool is True:
         return True

     # The || true makes sure we will not get any False
     logger.log(f"Please don't use the pattern '{self.pattern}' in this pipeline. Matching lines:",
                format=logger.Format.ITALICS)
     lines = '\n'.join(lines_or_bool)
     logger.log(f"{lines}", format=logger.Format.ITALICS)
     return False
 def run(self) -> bool:
     """
     Runs the parent command and logs the start and the outcome of the
     google analytics load.

     Returns:
         True when the parent ``run()`` succeeded, otherwise False
     """
     logger.log(
         f'Loading google analytics data {self.view_id} ({self.dimensions} {self.metrics}) into {self.target_db_alias}.{self.target_table_name}...')

     succeeded = super().run()
     if succeeded:
         logger.log('Finished loading google analytics data.')
         return True

     logger.log('Error while loading google analytics data.')
     return False
 def run(self) -> bool:
     """
     Runs the parent command and logs the start and the outcome of the
     google sheet load.

     Returns:
         True when the parent ``run()`` succeeded, otherwise False
     """
     logger.log(
         f'Loading google sheet {self.spreadsheet_key} into {self.target_db_alias}.{self.target_table_name}...'
     )

     succeeded = super().run()
     if succeeded:
         logger.log(f'Finished loading google sheet {self.spreadsheet_key}.')
         return True

     logger.log(
         f'Error while loading google sheet {self.spreadsheet_key}.')
     return False
Exemplo n.º 12
0
 def read_process_stdout():
     """
     Collects every stdout line of `process` in `output_lines` and logs it
     verbatim. Both names are taken from the enclosing scope.
     """
     stdout_stream = process.stdout
     for stdout_line in stdout_stream:
         output_lines.append(stdout_line)
         logger.log(stdout_line, format=logger.Format.VERBATIM)
Exemplo n.º 13
0
def singer_run_shell_command(command: str, log_command: bool = True):
    """
    Runs a command in a bash shell and logs the output of the command in (near)real-time according to the
    singer specification: https://github.com/singer-io/getting-started/blob/master/docs/SPEC.md#output

    The stdout of the command is collected and logged verbatim; its stderr
    is handed to a `SingerTapReadLogThread`, which logs it.

    Args:
        command: The command to run
        log_command: When true, then the command itself is logged before execution

    Returns:
        Either (in order)
        - False when the stderr reader thread flagged an error
        - False when the exit code of the command was not 0
        - True when there was no output to stdout
        - The output to stdout, as an array of lines
    """
    import shlex
    import subprocess
    import threading

    if log_command:
        logger.log(command, format=logger.Format.ITALICS)

    process = subprocess.Popen(shlex.split(config.bash_command_string()) +
                               ['-c', command],
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE,
                               universal_newlines=True)

    # keep stdout output
    output_lines = []

    # unfortunately, only file descriptors and the system stream can be passed to
    # subprocess.Popen(..) (and not custom streams without a file handle).
    # So in order to be able to log the output in real-time, we have to
    # query the output streams of the process from two separate threads
    def read_process_stdout():
        for line in process.stdout:
            output_lines.append(line)
            logger.log(line, format=logger.Format.VERBATIM)

    read_stdout_thread = threading.Thread(target=read_process_stdout)
    read_stdout_thread.start()
    read_singertaplog_thread = SingerTapReadLogThread(process=process)
    read_singertaplog_thread.start()

    # Block until the process finishes. Both pipes are drained by the reader
    # threads, so waiting here cannot dead-lock on a full pipe buffer.
    process.wait()

    read_stdout_thread.join()
    read_singertaplog_thread.join()

    # both readers are done with their stream; release the pipe file
    # descriptors right away instead of leaving them to garbage collection
    process.stdout.close()
    process.stderr.close()

    if read_singertaplog_thread.has_error:
        logger.log('Singer tap error occured',
                   is_error=True,
                   format=logger.Format.ITALICS)
        return False

    exitcode = process.returncode
    if exitcode != 0:
        logger.log(f'exit code {exitcode}',
                   is_error=True,
                   format=logger.Format.ITALICS)
        return False

    return output_lines or True