class Pipeline(object):
    """Main pipeline class

    The pipeline class binds together extractors, schema,
    and loaders and runs everything together. Almost all
    Pipeline methods return the pipeline object, allowing
    for methods to be chained together.
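
    A minimal chained-usage sketch (``FileConnector``, ``CSVExtractor``,
    ``MySchema``, and ``MyLoader`` are hypothetical placeholder classes,
    not names defined by this module)::

        pipeline = (Pipeline('sales', 'Sales Data')
                    .connect(FileConnector, 'sales.csv')
                    .extract(CSVExtractor)
                    .schema(MySchema)
                    .load(MyLoader))
        pipeline.run()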
    """

    def __init__(self, name, display_name, settings_file=None, log_status=True, conn=None):
        """
        Arguments:
            name: pipeline's name, passed to
                :py:class:`~purchasing.status.Status`
            display_name: display name, passed to
                :py:class:`~purchasing.status.Status`

        Keyword Arguments:
            settings_file: filepath to the configuration file
            log_status: boolean for whether or not to log the status
                of the pipeline. Useful to turn off for testing.
            conn: optionally passed sqlite3 connection object. If no
                connection is passed, one will be instantiated when the
                pipeline's ``run`` method is called.
        """
        self.data = []
        self._connector, self._extractor, self._schema, self._loader = None, None, None, None
        self.name = name
        self.display_name = display_name

        settings_file = settings_file if settings_file else os.path.join(PARENT, "settings.json")
        self.set_config_from_file(settings_file)
        self.log_status = log_status

        if conn:
            self.conn = conn
            self.passed_conn = True
        else:
            self.passed_conn = False

    def get_config(self):
        """Return the pipeline's full parsed configuration"""
        return self.config

    def set_config_from_file(self, file):
        """Sets the pipeline's configuration from file

        Arguments:
            file: Location of the configuration to load

        Raises:
            InvalidConfigException: if no configuration file is found or
                the found configuration is not valid JSON
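
        A minimal ``settings.json`` sketch, consistent with the keys
        this class reads elsewhere (the nested values shown are
        illustrative only)::

            {
                "general": {"statusdb": "status.db"},
                "connector": {},
                "loader": {}
            }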
        """
        try:
            with open(file) as f:
                self.config = json.loads(f.read())

        except (IOError, ValueError):
            raise InvalidConfigException("No config file found, or config not properly formatted")

    def parse_config_piece(self, pipeline_piece, config_piece_string):
        """Parse out a small piece of the overall pipeline configuration

        This is used to pass only the relevant parts of configuration
        to the relevant loaders. The structure allows configuration to
        grow larger without forcing implementing functions to know exactly
        how the configuration must be structured.

        Arguments:
            pipeline_piece: which part of the pipeline to use (for example,
                'loader'). This should not be modified by the user.
            config_piece_string: passed by the user, allows accessing
                a deeper nested part of the configuration

        Returns:
            Isolated configuration only for the specified piece

        Raises:
            InvalidConfigException: when the specified configuration
                piece cannot be found
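
        Example:
            Given a configuration of ``{"loader": {"ckan": {"key": "..."}}}``
            (an illustrative shape, not a required one),
            ``parse_config_piece("loader", "ckan")`` returns
            ``{"key": "..."}``.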
        """
        config_piece = self.config[pipeline_piece]

        if config_piece_string:
            for piece in config_piece_string.split("."):
                try:
                    config_piece = config_piece[piece]
                except KeyError:
                    raise InvalidConfigException

        return config_piece

    def connect(self, connector, target, config_string=None, *args, **kwargs):
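        """Set the connector class and related arguments

        Arguments:
            connector: Connector class, used to connect to the
                extraction target when ``run`` is called
            target: location of the extraction target (file, url, etc.)

        Keyword Arguments:
            config_string: passed through to
                :py:meth:`parse_config_piece` to isolate the relevant
                part of the connector configuration

        Returns:
            modified Pipeline object
        """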
        self._connector = connector
        self.connector_config = self.parse_config_piece("connector", config_string)
        self.target = target
        self.connector_args = list(args)
        self.connector_kwargs = dict(**kwargs)
        return self

    def extract(self, extractor, *args, **kwargs):
        """Set the extractor class and related arguments

        Arguments:
            extractor: Extractor class, see :ref:`built-in-extractors`

        Returns:
            modified Pipeline object
        """
        self._extractor = extractor
        self.extractor_args = list(args)
        self.extractor_kwargs = dict(**kwargs)
        return self

    def schema(self, schema):
        """Set the schema class

        Arguments:
            schema: Schema class

        Returns:
            modified Pipeline object
        """
        self._schema = schema
        return self

    def load(self, loader, config_string=None, *args, **kwargs):
        """Sets the loader class

        Arguments:
            loader: Loader class. See :ref:`built-in-loaders`

        Keyword Arguments:
            config_string: passed through to
                :py:meth:`parse_config_piece` to isolate the relevant
                part of the loader configuration

        Returns:
            modified Pipeline object
        """
        self._loader = loader
        self.loader_config = self.parse_config_piece("loader", config_string)
        self.loader_args = list(args)
        self.loader_kwargs = dict(**kwargs)
        return self

    def load_line(self, data):
        """Load a line into the pipeline's data or throw an error

        Arguments:
            data: A parsed line from an extractor's handle_line
                method
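
        The schema is assumed to expose a marshmallow-2.x-style API:
        ``load`` and ``dump`` return results with ``.data`` and
        ``.errors`` attributes, which is how this method consumes them.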
        """
        loaded = self.__schema.load(data)
        if loaded.errors:
            raise RuntimeError(
                "There were errors in the input data: {} (passed data: {})".format(loaded.errors, data)
            )
        else:
            self.data.append(self.__schema.dump(loaded.data).data)

    def enforce_full_pipeline(self):
        """Ensure that a pipeline has an extractor, schema, and loader

        Raises:
            RuntimeError: if an extractor, schema, and loader are not
                all specified
        """
        if not all([self._connector, self._extractor, self._schema, self._loader]):
            raise RuntimeError("You must specify connect, extract, schema, and load steps!")

    def get_last_run_checksum(self):
        """Get the input checksum from the pipeline's most recent run

        Returns:
            The ``input_checksum`` recorded for the most recent run of
            this pipeline, or None if status logging is disabled or no
            previous run exists.
        """
        if self.log_status:
            result = self.conn.execute(
                """
                SELECT input_checksum
                FROM status
                WHERE name = ?
                AND display_name = ?
                ORDER BY last_ran DESC
                LIMIT 1
            """,
                (self.name, self.display_name),
            ).fetchone()
            if result:
                return result[0]
        return None

    def pre_run(self):
        """Method to be run immediately before the pipeline runs

        Enforces that the pipeline is complete and, if no connection was
        passed in, connects to the status db

        Returns:
            A unix timestamp of the pipeline's start time.
        """
        start_time = time.time()

        self.enforce_full_pipeline()

        if not self.passed_conn:
            self.conn = sqlite3.connect(self.config["general"]["statusdb"])

        return start_time

    def run(self):
        """Main pipeline run method

        One of the main features is that the connector, extractor,
        schema, and loader are all instantiated here as opposed to
        when they are declared on pipeline instantiation. This delays
        opening connections until the last possible moment.

        The run method works essentially as follows:

        1. Run the ``pre_run`` method, which gives us the pipeline
           start time, ensures that our pipeline has all of the
           required component pieces, and connects to the status db.
        2. Boot up a new connector object, and get the checksum
           of the connected iterable.
        3. Check to make sure that the incoming checksum is different
           from the previous run's input_checksum.
        4. Instantiate our schema.
        5. Iterate through the iterable returned from the connector's
           connect method, handling each element with the extractor's
           ``handle_line`` method before passing the result to the
           ``load_line`` method to attach each row to the pipeline's
           data.
        6. After iteration, clean up the connector.
        7. Instantiate the loader and load the data.
        8. Finally, update the status to a successful run, and close
           down and clean up the pipeline.
        """
        try:
            start_time = self.pre_run()

            # instantiate a new connector based on the
            # passed connector class
            _connector = self._connector(self.connector_config, *(self.connector_args), **(self.connector_kwargs))

            input_checksum = _connector.checksum_contents(self.target)
            if input_checksum == self.get_last_run_checksum():
                raise DuplicateFileException

            if self.log_status:
                self.status = Status(self.conn, self.name, self.display_name, None, start_time, "new", None, None, None)

            # log the status
            if self.log_status:
                self.status.write()

            # TODO: connect is also called when running checksum_contents
            connection = _connector.connect(self.target)

            # instantiate a new extractor instance based on
            # the passed extractor class
            _extractor = self._extractor(connection, *(self.extractor_args), **(self.extractor_kwargs))

            # instantiate our schema
            self.__schema = self._schema()

            # build the data
            raw = _extractor.process_connection()

            try:
                for line in raw:
                    try:
                        data = _extractor.handle_line(line)
                        self.load_line(data)
                    except IsHeaderException:
                        continue
            finally:
                _connector.close()

            # load the data
            _loader = self._loader(self.loader_config, *(self.loader_args), **(self.loader_kwargs))
            _loader.load(self.data)

            if self.log_status:
                self.status.update(status="success", input_checksum=input_checksum)

        except Exception as e:
            if self.log_status and hasattr(self, "status"):
                self.status.update(status="error: {}".format(str(e)))
            raise

        finally:
            if self.log_status and hasattr(self, "status"):
                self.status.update(num_lines=len(self.data), last_ran=time.time())
            self.close()

        return self

    def close(self):
        """Close any open database connections.
        """
        if not self.passed_conn and hasattr(self, "conn"):
            self.conn.close()
        self.__schema = None