Example #1
    print("Did you remember to build the project by running 'make'?",
          file=sys.stderr)
    print(
        "Alternatively perhaps you tried to copy this program out without it's adjacent libraries?",
        file=sys.stderr)
    sys.exit(4)
# com.databricks:spark-avro_2.10:2.0.1 - 2.0.1 is for Spark 1.4+
# you can edit this bit if you need to run it on Spark 1.3:
# https://github.com/databricks/spark-avro#linking
# Must set spark-avro package to 3.0.0+ if using Spark 2.0
# for Spark < 2.0 it results in Exception:
# Caused by: java.lang.ClassNotFoundException: org.apache.spark.sql.execution.datasources.FileFormat
#os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.databricks:spark-avro_2.10:3.0.0 %s' \
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.databricks:spark-avro_2.10:2.0.1 %s' \
                                    % os.getenv('PYSPARK_SUBMIT_ARGS', '')
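# Not in the original script: a hedged sketch of switching the spark-avro coordinate
# by Spark major version instead of hardcoding it. SPARK_MAJOR_VERSION is a
# hypothetical env var; PYSPARK_SUBMIT_ARGS has to be finalised before the pyspark
# imports below run.
if int(os.getenv('SPARK_MAJOR_VERSION', '1')) >= 2:
    # Spark 2.0+ needs spark-avro 3.0.0+ (see the comments above)
    os.environ['PYSPARK_SUBMIT_ARGS'] = \
        os.environ['PYSPARK_SUBMIT_ARGS'].replace(':2.0.1', ':3.0.0')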
pyspark_path()
from pyspark import SparkContext  # pylint: disable=wrong-import-position,import-error
from pyspark import SparkConf  # pylint: disable=wrong-import-position,import-error
from pyspark.sql import SQLContext  # pylint: disable=wrong-import-position,import-error

__author__ = 'Hari Sekhon'
__version__ = '0.8.0'


class SparkAvroToParquet(CLI):
    def __init__(self):
        # Python 2.x
        super(SparkAvroToParquet, self).__init__()
        # Python 3.x
        # super().__init__()
        # logging.config.fileConfig(os.path.join(libdir, 'resources', 'logging.conf'))
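
The example stops short of the method that does the actual conversion. As a minimal sketch, assuming the spark-avro 2.x data source API on Spark 1.x and hypothetical input/output paths (the option parsing inherited from the harisekhon CLI base class is omitted), the Avro-to-Parquet step would look something like:

conf = SparkConf().setAppName('Spark Avro => Parquet')
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)
avro_dir = '/tmp/data.avro'        # hypothetical input path
parquet_dir = '/tmp/data.parquet'  # hypothetical output path
# load Avro via the databricks spark-avro data source pulled in by --packages
df = sqlContext.read.format('com.databricks.spark.avro').load(avro_dir)
# write the same rows back out as Parquet
df.write.parquet(parquet_dir)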
Example #2
import os
import sys

# assumption: libdir points at the project's bundled 'pylib' directory
libdir = os.path.join(os.path.dirname(__file__), 'pylib')
sys.path.append(libdir)
try:
    # pylint: disable=wrong-import-position
    from harisekhon.utils import log, isMinVersion, support_msg, isVersionLax, die, getenv, pyspark_path
    from harisekhon import CLI
except ImportError as _:
    print('module import failed: %s' % _, file=sys.stderr)
    print("Did you remember to build the project by running 'make'?", file=sys.stderr)
    print("Alternatively perhaps you tried to copy this program out without it's adjacent libraries?", file=sys.stderr)
    sys.exit(4)
# com.databricks:spark-avro_2.10:2.0.1 - 2.0.1 is for Spark 1.4+
# you can edit this bit if you need to run it on Spark 1.3:
# https://github.com/databricks/spark-avro#linking
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.databricks:spark-avro_2.10:2.0.1 %s' \
                                    % os.getenv('PYSPARK_SUBMIT_ARGS', '')
pyspark_path()
from pyspark import SparkContext    # pylint: disable=wrong-import-position,import-error
from pyspark import SparkConf       # pylint: disable=wrong-import-position,import-error
from pyspark.sql import SQLContext  # pylint: disable=wrong-import-position,import-error

__author__ = 'Hari Sekhon'
__version__ = '0.7.0'


class SparkJSONToAvro(CLI):
    def __init__(self):
        # Python 2.x
        super(SparkJSONToAvro, self).__init__()
        # Python 3.x
        # super().__init__()
        # logging.config.fileConfig(os.path.join(libdir, 'resources', 'logging.conf'))
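
As with Example #1, the class is truncated before the conversion itself. A minimal sketch of the JSON-to-Avro step under the same assumptions (spark-avro 2.x data source, hypothetical paths):

conf = SparkConf().setAppName('Spark JSON => Avro')
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)
json_dir = '/tmp/data.json'   # hypothetical input path
avro_dir = '/tmp/data.avro'   # hypothetical output path
# infer the schema from the JSON records, then write them back out as Avro
sqlContext.read.json(json_dir) \
    .write.format('com.databricks.spark.avro').save(avro_dir)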