示例#1
0
    def _open_gcs_url(self, binary) -> object:
        """Open ``self.full_url`` on Google Cloud Storage through ``smart_open``.

        If the provider configuration holds a non-empty ``service_account_json``
        entry, it is parsed as JSON and used to build an authenticated client.
        Otherwise (entry missing, empty, or parsing to an empty value) an
        anonymous client is used.

        :param binary: truthy to open in binary mode ("rb"), falsy for text mode ("r")
        :return: the object returned by ``smart_open.open``; it is returned still open
        :raises ConfigurationError: if ``service_account_json`` is not valid JSON
        """
        mode = "rb" if binary else "r"
        service_account_json = self._provider.get("service_account_json")

        # Keep the parsed dict and the Credentials object under separate names.
        service_account_info = None
        if service_account_json:
            try:
                service_account_info = json.loads(service_account_json)
            except json.decoder.JSONDecodeError as err:
                error_msg = f"Failed to parse gcs service account json: {repr(err)}\n{traceback.format_exc()}"
                logger.error(error_msg)
                raise ConfigurationError(error_msg) from err

        if service_account_info:
            credentials = service_account.Credentials.from_service_account_info(
                service_account_info)
            # Use the public ``project_id`` property rather than the private
            # ``_project_id`` attribute of the credentials object.
            client = GCSClient(credentials=credentials,
                               project=credentials.project_id)
        else:
            client = GCSClient.create_anonymous_client()

        return smart_open.open(self.full_url,
                               transport_params={"client": client},
                               mode=mode)
示例#2
0
    def storage_scheme(self) -> str:
        """Map the configured storage name to its URL prefix.

        The provider's ``storage`` value is matched case-insensitively against
        the known storage names. If it is not one of them, the scheme parsed
        from ``self._url`` is returned as-is (note: without a trailing
        ``://``). If there is no scheme either, an error is logged and an
        empty string is returned.

        :return: the corresponding URL prefix / scheme, or "" when unknown
        """
        storage_name = self._provider["storage"].upper()
        parsed_url = urlparse(self._url)

        known_prefixes = {
            "GCS": "gs://",
            "S3": "s3://",
            "AZBLOB": "azure://",
            "HTTPS": "https://",
            "SSH": "scp://",
            "SCP": "scp://",
            "SFTP": "sftp://",
            "WEBHDFS": "webhdfs://",
            "LOCAL": "file://",
        }

        prefix = known_prefixes.get(storage_name)
        if prefix is not None:
            return prefix

        # Unknown storage name: fall back to whatever scheme the URL carries.
        if parsed_url.scheme:
            return parsed_url.scheme

        logger.error(f"Unknown Storage provider in: {self.full_url}")
        return ""
示例#3
0
    def health_check(self) -> Tuple[bool, str]:
        """Check connectivity by looking up the configured account.

        Calls ``self._find_account`` with ``self._account_id``.

        :return: ``(True, None)`` when the lookup succeeds; ``(False, message)``
            when it raises ``FacebookAPIException``, where ``message`` is the
            string form of that exception
        """
        try:
            self._find_account(self._account_id)
        except FacebookAPIException as exc:
            message = str(exc)
            # we might need some extra details, so log original exception here
            logger.error(message)
            return False, message

        return True, None
示例#4
0
 def __init__(self,
              dataset_name: str,
              url: str,
              provider: dict,
              format: str = None,
              reader_options: str = None):
     """Store the dataset configuration and parse the reader options.

     :param dataset_name: stored as ``self._dataset_name``
     :param url: stored as ``self._url``
     :param provider: stored as ``self._provider``
     :param format: reader format name; any falsy value falls back to "csv"
     :param reader_options: JSON string of reader options; a falsy value
         leaves the options as an empty dict
     :raises ConfigurationError: if ``reader_options`` is not valid JSON
     """
     self._dataset_name = dataset_name
     self._url = url
     self._provider = provider
     self._reader_format = format if format else "csv"

     # Start from empty options; they are only replaced after a successful parse.
     self._reader_options = {}
     if not reader_options:
         return

     try:
         parsed_options = json.loads(reader_options)
     except json.decoder.JSONDecodeError as err:
         error_msg = f"Failed to parse reader options {repr(err)}\n{reader_options}\n{traceback.format_exc()}"
         logger.error(error_msg)
         raise ConfigurationError(error_msg) from err
     self._reader_options = parsed_options
示例#5
0
    def load_dataframes(self, fp, skip_data=False) -> Iterable:
        """Lazily read ``fp`` with the pandas reader matching the configured format.

        This is a generator: nothing is read until it is iterated. For the
        "csv" format the file is read with ``chunksize=10000`` and every chunk
        produced by the reader is yielded in turn. For any other supported
        format the reader's return value is yielded as a single item.

        :param fp: file-like object to read from
        :param skip_data: for "csv" only, adds ``nrows=0`` and ``index_col=0``
            to the reader options to limit reading data; has no effect on
            other formats
        :return: an iterator over what the selected pandas reader produces
        :raises ConfigurationError: on first iteration, if the configured
            reader format is not one of the supported names
        """
        supported_readers = {
            # pandas.read_csv additional arguments can be passed to customize how to parse csv.
            # see https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html
            "csv": pd.read_csv,
            # We can add option to call to pd.normalize_json to normalize semi-structured JSON data into a flat table
            # by asking user to specify how to flatten the nested columns
            "flat_json": pd.read_json,
            "html": pd.read_html,
            "excel": pd.read_excel,
            "feather": pd.read_feather,
            "parquet": pd.read_parquet,
            "orc": pd.read_orc,
            "pickle": pd.read_pickle,
        }

        try:
            read = supported_readers[self._reader_format]
        except KeyError as err:
            error_msg = f"Reader {self._reader_format} is not supported\n{traceback.format_exc()}"
            logger.error(error_msg)
            raise ConfigurationError(error_msg) from err

        # Work on a copy so the stored options are never modified.
        options = {**self._reader_options}

        if self._reader_format != "csv":
            yield read(fp, **options)
            return

        options["chunksize"] = 10000
        if skip_data:
            options.update(nrows=0, index_col=0)

        yield from read(fp, **options)
示例#6
0
 def log_giveup(_details):
     """Log an error stating that the maximum retry limit was reached.

     :param _details: not used by this function
         (presumably supplied by a retry helper's give-up hook; confirm against the caller)
     """
     logger.error("Max retry limit reached")