# Example 1
    def consume(self, **kwargs) -> dict:
        """
        Walk the source directory and yield the outcome of processing
        every file that is not already in the processed history.

        Args:
            **kwargs: Forwarded to ``_run`` after being enriched with
                ``file``, ``path`` and ``table_name``.
        """
        # TODO: Currently assuming directory only has CSV files
        # TODO: Add support for other files types later
        for root, dirs, files in os.walk(self.__source_path):
            seen = set(chain(*self._processed_history_list))
            for file in set(files) - seen:
                self._logger.debug("Yielding file: %s", file)
                to_process = str(os.path.join(root, file))
                self._logger.info("Processing %s...", to_process)

                if not os.path.isfile(to_process):
                    continue

                self._entity = file
                self._entity_filter = None

                # Folder path relative to the source root maps to a table name.
                relative = root.split(self.__source_path)[1][1:]
                kwargs['file'] = self._entity
                kwargs['path'] = root
                kwargs['table_name'] = GenericFunctions.folder_to_table(relative)

                try:
                    prefix = f'{ProjectConfig.file_prefix()}_'
                    self._correlation_id_in = file.split(prefix, 1)[1][0:-4]
                except Exception:
                    self._correlation_id_in = None

                yield self._run(**kwargs)
    def __init__(self, **kwargs):
        """
        Configure a CSV file sink.

        Keyword Args:
            directory (str): Base landing directory; must already exist.
            env: Connection/environment selector (required).
            table_name (str): Destination table name (required).
            chunk (int, optional): Chunk size hint.
            checksum_method (optional): Checksum strategy.
            file_format (str): Output format; only 'csv' is supported.
            watermark (list, optional): Watermark filters.
            file_name (str, optional): Explicit output file name.

        Raises:
            NotADirectoryError: If no directory was given or it does not exist.
            NotImplementedError: If ``file_format`` is not 'csv'.
        """
        super().__init__(**kwargs)
        directory = kwargs.get('directory')
        if not directory:
            # Fail with a clear message instead of os.path.abspath(None)'s
            # opaque TypeError.
            raise NotADirectoryError("No sink landing directory provided")
        self.__dest_path = os.path.abspath(directory)
        if not os.path.exists(self.__dest_path):
            # The directory must pre-exist; nothing is created here, so the
            # old "Unable to create Sink landing" message was misleading.
            raise NotADirectoryError(
                "Sink landing directory does not exist: %s" % self.__dest_path)
        # TODO - More file types for later
        self.__chunk = kwargs.get('chunk', None)
        self.__checksum = kwargs.get('checksum_method', None)
        self.__file_format = kwargs.get('file_format', 'csv')
        self.__connection_choice = kwargs['env']
        self.__table = kwargs['table_name']
        self.__watermark = kwargs.get('watermark', [])
        self.__unload_file_name = ""
        self.__external_table_name = f"{self.__table}_{ProjectConfig.file_prefix().upper()}_EXT"
        # IF EXISTS precedes the table name in standard SQL; the original
        # "DROP TABLE <name> IF EXISTS" ordering is rejected by mainstream
        # engines. NOTE(review): confirm against the target dialect.
        self.__drop_query = f"DROP TABLE IF EXISTS {self.__external_table_name};"
        self.__query = ""
        self._entity = self.__table
        self._entity_filter = self.__watermark
        self._correlation_id_out = uuid.uuid4().hex
        if self.__file_format == 'csv':
            self.__unload_file_name = kwargs.get(
                "file_name",
                f"{ProjectConfig.file_prefix()}_{self._correlation_id_out}.csv")
        else:
            raise NotImplementedError("Unknown output type: %s" % self.__file_format)
        self._sink_entity = self.__unload_file_name

        # Ensure the table-specific landing sub-folder exists.
        file_path = os.path.abspath(os.path.join(
            self.__dest_path, self._sink_name,
            GenericFunctions.table_to_folder(self.__table)))
        if not os.path.exists(file_path):
            os.makedirs(file_path)
        self.__unload_file = os.path.abspath(
            os.path.join(file_path, self.__unload_file_name))
        self.__driver_type = ""
        self.__external_table_query = ""
# Example 3
    def _set_data(self, **kwargs) -> dict:
        """
        Upload a dataframe to Azure blob storage as a CSV file.

        Args:
            **kwargs: Must contain ``data_frame``; may contain
                ``table_name`` and ``file_name``.

        Returns:
            dict: ``record_count`` — number of rows uploaded.
        """
        frame = kwargs['data_frame']
        table_name: str = kwargs.get("table_name")

        default_name = f"{ProjectConfig.file_prefix()}_{str(time.time_ns())}.csv"
        self._entity = kwargs.get('file_name', default_name)
        folder = GenericFunctions.table_to_folder(table_name)
        self._entity_filter = os.path.join(self._sink_name, folder)
        blob_path = os.path.join(self._sink_name, folder, self._entity)
        with self.__azure_dao.connection as client:
            self.__container = client.get_container_client(self.__container_name)
            blob_client = self.__container.get_blob_client(blob_path)
            self.__putFile(frame, blob_client)
        return dict(record_count=frame.shape[0])
# Example 4
    def __chunk_file(self, file_name, root, sink_name, table_name) -> list:
        """
        Split a CSV file into chunks of ``self.__chunk`` data rows, each
        carrying the original header, and write them to the sink folder.

        Args:
            file_name: Name of the CSV file to be chunked.
            root: Directory containing ``file_name``.
            sink_name: Sink sub-folder name.
            table_name: Destination table name (mapped to a folder).

        Returns:
            list: Names of the files placed in the sink folder. A file that
            fits in a single chunk is copied as-is and its own name is
            returned (the original returned an empty list here, which made
            iterating callers skip the file entirely). Returns an empty
            list on error instead of ``None`` so callers can iterate safely.
        """
        chunk_file_lst = []
        # Built outside the try so the except handler can always report it.
        to_process = os.path.abspath(os.path.join(root, file_name))
        try:
            with open(to_process, 'r') as file:
                rows = file.readlines()
            if not rows:
                # Empty file: nothing to chunk or copy.
                return chunk_file_lst
            header = rows[0]
            data_rows = rows[1:]

            target_directory = os.path.join(
                self.__source_base_path, sink_name,
                GenericFunctions.table_to_folder(table_name))
            if not os.path.exists(target_directory):
                os.makedirs(target_directory)

            if len(data_rows) > self.__chunk:
                # Step by chunk size instead of testing j % chunk on every row.
                for start in range(0, len(data_rows), self.__chunk):
                    chunk_rows = [header] + data_rows[start:start + self.__chunk]
                    chunked_file_name = f"{ProjectConfig.file_prefix()}_{uuid.uuid4().hex}.csv"
                    try:
                        out_path = os.path.abspath(
                            os.path.join(target_directory, chunked_file_name))
                        with open(out_path, 'w+') as out_file:
                            out_file.writelines(chunk_rows)
                        chunk_file_lst.append(chunked_file_name)
                    except Exception as e:
                        self._logger.error(
                            "Got exception: %s while creating %s", e, chunked_file_name)
            else:
                # Still copy the file to target_folder, as the next stage
                # picks files up from the sink_name folder.
                shutil.copy(to_process, target_directory)
                chunk_file_lst.append(file_name)

            return chunk_file_lst

        except Exception as e:
            self._logger.error("Got exception: %s in chunking %s", e, to_process)
            return chunk_file_lst
# Example 5
    def __write_file(self, file_name, df, table_name):
        """
        Write a dataframe to the sink as a CSV file.

        Args:
            file_name: Name of the output file.
            df: Dataframe to persist (must support ``to_csv``).
            table_name: Optional table name; when given, the file lands in
                the table's sub-folder, otherwise directly in the dest path.

        Returns:
            None
        """
        if table_name:
            destination_directory = os.path.join(
                self.__dest_path, self._sink_name,
                GenericFunctions.table_to_folder(table_name))
        else:
            destination_directory = self.__dest_path

        if not os.path.exists(destination_directory):
            os.makedirs(destination_directory)
        self._entity_filter = destination_directory
        csv_to_create = os.path.abspath(
            os.path.join(destination_directory, file_name))
        # Lazy %-args, consistent with the other log calls in this file
        # (the original eagerly formatted with "%" inside the call).
        self._logger.info("Writing file: %s", csv_to_create)
        df.to_csv(csv_to_create, index=False)
# Example 6
    def consume(self, **kwargs) -> dict:
        """
        Walk the source directory, chunk every unprocessed file and yield
        the result of processing each chunk.

        Args:
            **kwargs: Forwarded to ``_run`` after being enriched with the
                chunk file, source file/filter, table name and path.
        """
        # TODO: Currently assuming directory only has CSV files
        # TODO: Add support for other files types later
        for root, dirs, files in os.walk(self.__source_path):
            history = set(chain(*self._processed_history_list))
            for file in set(files) - history:
                self._logger.debug("Yielding file: %s", file)
                to_process = str(os.path.join(root, file))
                self._logger.info("Processing %s...", to_process)

                if not os.path.isfile(to_process):
                    continue

                # Get table name from file path
                table_name = GenericFunctions.folder_to_table(
                    root.split(self.__source_path)[1][1:])
                chunk_dir = os.path.join(
                    self.__source_base_path, self._sink_name,
                    GenericFunctions.table_to_folder(table_name))

                # Loop through each chunk file and generate dataframes for
                # fs_sink to consume.
                chunks = self.__chunk_file(file, root, self._sink_name, table_name)
                for seq, chunked_file in enumerate(chunks, start=1):
                    self._entity = file
                    if self.__chunk:
                        self._entity_filter = [
                            {'chunk': f'{self.__chunk}', 'seq': f'{seq}'}]

                    kwargs['file'] = chunked_file
                    kwargs['source_file'] = self._entity
                    kwargs['source_filter'] = self._entity_filter
                    kwargs['table_name'] = table_name
                    kwargs['path'] = chunk_dir

                    try:
                        prefix = f'{ProjectConfig.file_prefix()}_'
                        self._correlation_id_in = file.split(prefix, 1)[1][0:-4]
                        self._correlation_id_out = chunked_file.split(prefix, 1)[1][0:-4]
                    except Exception:
                        self._correlation_id_in = None
                        self._correlation_id_out = None

                    yield self._run(**kwargs)
# Example 7
    def consume(self, **kwargs) -> dict:
        """
        List blobs in the container and yield the result of processing
        each one.

        Args:
            **kwargs: Forwarded to ``_run`` after being enriched with
                ``file``, ``path`` and ``table_name``.
        """
        with self.__azure_dao.connection as client:
            self.__container = client.get_container_client(self.__container_name)
            for blob_prop in self.__container.list_blobs():
                # Path after "<source_name>/" appears to be
                # "<table-folder>/<file>" — derived from the splits below.
                relative = blob_prop.name.split(self._source_name + "/")[1]
                parts = relative.split("/")
                folder, blob_file = parts[0], parts[1]

                self._entity = blob_file
                self._entity_filter = None

                kwargs['file'] = self._entity
                kwargs['path'] = blob_prop
                kwargs['table_name'] = GenericFunctions.folder_to_table(folder)

                try:
                    prefix = f'{ProjectConfig.file_prefix()}_'
                    self._correlation_id_in = blob_file.split(prefix, 1)[1][0:-4]
                except Exception:
                    self._correlation_id_in = None

                yield self._run(**kwargs)