Example #1
    def load_csv(self,
                 path=None,
                 provider=None,
                 *args,
                 sep=None,
                 header=None,
                 **kwargs):

        obj = None

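        # resolve path and provider into resource metadata (url, service, options)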
        md = Resource(path, provider, sep=sep, header=header, **kwargs)

        # download if necessary
        md = get_local(md)

        options = md['options']

        # after collecting from metadata or method call, apply csv defaults;
        # test for None explicitly so an explicit False is not clobbered
        if options.get('header') is None:
            options['header'] = True
        if options.get('inferSchema') is None:
            options['inferSchema'] = True
        if options.get('sep') is None:
            options['sep'] = ','

        local = self.is_spark_local()

        # start the timer for logging
        ts_start = timer()
        try:
            # three approaches: local, cluster, and service
            if md['service'] == 'file' and local:
                obj = self.context.read.options(**options).csv(md['url'])
            elif md['service'] == 'file':
                logging.warning(
                    'local file + spark cluster: loading using pandas reader',
                    extra={'md': to_dict(md)})

                # fall back to the pandas reader, then convert to spark;
                # pandas expects header=0/None rather than a spark-style boolean
                df = pd.read_csv(md['url'],
                                 sep=options['sep'],
                                 header=0 if options['header'] else None)
                obj = self.context.createDataFrame(df)
            elif md['service'] in ['hdfs', 's3a']:
                obj = self.context.read.options(**options).csv(md['url'])
            else:
                logging.error(f'Unknown resource service "{md["service"]}"',
                              extra={'md': to_dict(md)})

        except AnalysisException as e:
            logging.error(str(e), extra={'md': to_dict(md)})
        except Exception as e:
            logging.error(str(e), extra={'md': to_dict(md)})

        self.load_log(md, options, ts_start)
        return obj
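
A minimal usage sketch, assuming the method lives on a Spark engine object (here called `engine`) and that a provider alias named 'local' is configured; both names are illustrative:

    # csv defaults apply: header=True, inferSchema=True, sep=','
    df = engine.load_csv('data/sales.csv', provider='local')

    # override the separator and header handling per call
    df = engine.load_csv('data/sales.tsv', provider='local', sep='\t', header=False)

    # the method logs and returns None on failure instead of raising
    if df is not None:
        df.printSchema()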
Example #2
    def load_parquet(self,
                     path=None,
                     provider=None,
                     *args,
                     mergeSchema=None,
                     **kwargs):
        obj = None

        md = Resource(path,
                      provider,
                      format='parquet',
                      mergeSchema=mergeSchema,
                      **kwargs)

        # download if necessary
        md = get_local(md)

        options = md['options']

        # after collecting from metadata or method call, apply parquet defaults;
        # test for None explicitly so an explicit False is not clobbered
        if options.get('mergeSchema') is None:
            options['mergeSchema'] = True

        local = self.is_spark_local()

        try:
            # three approaches: local, cluster, and service
            if md['service'] == 'file' and local:
                obj = self.context.read.options(**options).parquet(md['url'])
            elif md['service'] == 'file':
                logging.warning(
                    'local file + spark cluster: loading using pandas reader',
                    extra={'md': to_dict(md)})
                # fall back to the pandas reader (requires pyarrow or
                # fastparquet on the driver), then convert to spark
                df = pd.read_parquet(md['url'])
                obj = self.context.createDataFrame(df)
            elif md['service'] in ['hdfs', 's3a']:
                obj = self.context.read.options(**options).parquet(md['url'])
            else:
                logging.error(f'Unknown resource service "{md["service"]}"',
                              extra={'md': to_dict(md)})

        except AnalysisException as e:
            logging.error(str(e), extra={'md': to_dict(md)})
        except Exception as e:
            logging.error(str(e), extra={'md': to_dict(md)})

        return obj
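
As above, a minimal usage sketch; the engine object and provider alias are illustrative assumptions:

    # parquet default applies: mergeSchema=True
    df = engine.load_parquet('data/events.parquet', provider='local')

    # disable schema merging for a faster read of a uniformly-schemed dataset
    df = engine.load_parquet('data/events.parquet', provider='local', mergeSchema=False)

Note that Spark's mergeSchema option reconciles differing schemas across parquet part-files at read time; Spark itself leaves it off by default because of the extra footer reads, so defaulting it to True here trades read speed for convenience.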