Пример #1
0
def run_tests():
    """
    Set up a local Spark context, run the doc tests and report the result.

    The doctests are run with a copy of this module's globals plus a ``psc``
    entry holding a ``PSparkContext``. A coloured summary line is printed
    (via ``termcolor`` when it is installed, raw ANSI escapes otherwise) and
    the process exits with status -1 when at least one doctest failed.
    """
    import doctest
    import sys
    from sparklingpandas.pcontext import PSparkContext

    globs = globals().copy()
    # The small batch size here ensures that we see multiple batches,
    # even in these small test examples:
    globs['psc'] = PSparkContext.simple('local[4]', 'PythonTest', batchSize=2)
    try:
        (failure_count, test_count) = doctest.testmod(
            globs=globs, optionflags=doctest.ELLIPSIS)
    finally:
        # Always release the Spark context, even if the doctest run itself
        # raises, so a crashed run does not leave a context behind.
        globs['psc'].sc.stop()
    msg = "{0} test ran {1} failures".format(test_count, failure_count)
    try:
        # My kingdom for the letter u
        from termcolor import colored
    except ImportError:
        # termcolor is optional: fall back to raw ANSI colour escapes
        # (91 = bright red, 92 = bright green, 0 = reset).
        prefix = '\033[91m' if failure_count else '\033[92m'
        msg = prefix + msg + '\033[0m'
    else:
        msg = colored(msg, 'red' if failure_count else 'green')
    # Parenthesised single-argument form works as both the Python 2 print
    # statement and the Python 3 print function.
    print(msg)
    if failure_count:
        sys.exit(-1)
Пример #2
0
 def setUp(self):
     """Prepare the fixtures shared by the panda spark test cases.

     Creates a PSparkContext named after the concrete test class and stores
     several small inputs on the instance. Each input is kept as the raw
     list of rows, as a frame built through the PSparkContext and as the
     equivalent pandas DataFrame.
     """
     self._old_sys_path = list(sys.path)
     self.psc = PSparkContext.simple(
         'local[4]', self.__class__.__name__, batchSize=2)
     spark_frame = self.psc.DataFrame

     # Two string columns; rows, column names and frames are kept for reuse
     # in testing.
     self.basicinput = [
         ("tea", "happy"),
         ("water", "sad"),
         ("coffee", "happiest"),
         ("tea", "water"),
     ]
     self.basiccolumns = ['magic', 'thing']
     self.basicpframe = spark_frame(
         self.basicinput, columns=self.basiccolumns)
     self.basicframe = pandas.DataFrame(
         self.basicinput, columns=self.basiccolumns)

     # Two numeric columns.
     self.numericinput = [
         (1, 2), (3, 4), (1, 3), (2, 6), (3, 100), (3, 20), (8, 9),
     ]
     self.numericpframe = spark_frame(
         self.numericinput, columns=['a', 'b'])
     self.numericframe = pandas.DataFrame(
         self.numericinput, columns=['a', 'b'])

     # Three numeric columns, the third mixing ints and floats.
     self.numericthreeinput = [
         (1, 2, -100.5),
         (3, 4, 93),
         (1, 3, 100.2),
         (2, 6, 0.5),
         (3, 100, 1.5),
         (3, 20, 80),
         (8, 9, 20),
     ]
     self.numericthreepframe = spark_frame(
         self.numericthreeinput, columns=['a', 'b', 'c'])
     self.numericthreeframe = pandas.DataFrame(
         self.numericthreeinput, columns=['a', 'b', 'c'])

     # Two numeric columns followed by a string column.
     self.mixedinput = [(1, 2, "coffee"), (4, 5, "cheese")]
     self.mixedpframe = spark_frame(
         self.mixedinput, columns=['a', 'b', 'c'])
     self.mixedframe = pandas.DataFrame(
         self.mixedinput, columns=['a', 'b', 'c'])
 def setUp(self):
     """
     Prepare a panda spark test case.

     At the moment this only snapshots ``sys.path`` and creates a
     PSparkContext whose application name is the test class name.
     """
     self._old_sys_path = list(sys.path)
     self.psc = PSparkContext.simple(
         'local[4]', self.__class__.__name__, batchSize=2)
Пример #4
0
 def setUp(self):
     """Set up the basic panda spark test case.

     Builds a SparkConf (4 cores, local[4] master, application named after
     the test class, multiple contexts allowed), creates a PSparkContext
     from it and stores a set of small inputs on the instance. Each input
     is kept as the raw list of rows, as a frame built through the
     PSparkContext and as the equivalent pandas DataFrame.
     """
     logging.info("Setting up spark context")
     self._old_sys_path = list(sys.path)
     class_name = self.__class__.__name__
     conf = SparkConf()
     conf.set("spark.cores.max", "4")
     conf.set("spark.master", "local[4]")
     # The Spark property for the application name is "spark.app.name";
     # the previous "spark.app-name" key is not a Spark property, so the
     # class name was never applied.
     conf.set("spark.app.name", class_name)
     conf.set("spark.driver.allowMultipleContexts", "true")
     self.psc = PSparkContext.simple(conf=conf)
     # Add a common basic input and basicpframe we can reuse in testing
     self.basicinput = [
         ("tea", "happy"),
         ("water", "sad"),
         ("coffee", "happiest"),
         ("tea", "water")]
     self.basiccolumns = ['magic', 'thing']
     self.basicpframe = self.psc.DataFrame(
         self.basicinput, columns=self.basiccolumns)
     self.basicframe = pandas.DataFrame(
         self.basicinput, columns=self.basiccolumns)
     # Add a numeric frame
     self.numericinput = [
         (1, 2), (3, 4), (1, 3), (2, 6), (3, 100), (3, 20), (8, 9)]
     self.numericpframe = self.psc.DataFrame(
         self.numericinput, columns=['a', 'b'])
     self.numericframe = pandas.DataFrame(
         self.numericinput, columns=['a', 'b'])
     # A three column numeric frame
     self.numericthreeinput = [
         (1, 2, -100.5),
         (3, 4, 93),
         (1, 3, 100.2),
         (2, 6, 0.5),
         (3, 100, 1.5),
         (3, 20, 80),
         (8, 9, 20)]
     self.numericthreepframe = self.psc.DataFrame(
         self.numericthreeinput, columns=['a', 'b', 'c'])
     self.numericthreeframe = pandas.DataFrame(
         self.numericthreeinput, columns=['a', 'b', 'c'])
     # A frame mixing numeric and string columns
     self.mixedinput = [(1, 2, "coffee"), (4, 5, "cheese")]
     self.mixedpframe = self.psc.DataFrame(self.mixedinput,
                                           columns=['a', 'b', 'c'])
     self.mixedframe = pandas.DataFrame(self.mixedinput,
                                        columns=['a', 'b', 'c'])
     # Mixed NA frame: same as above plus a column that is entirely None
     self.mixednainput = [(1, 2, "coffee", None), (4, 5, "cheese", None)]
     self.mixednapframe = self.psc.DataFrame(self.mixednainput,
                                             columns=['a', 'b', 'c', 'd'])
     self.mixednaframe = pandas.DataFrame(self.mixednainput,
                                          columns=['a', 'b', 'c', 'd'])
 def setUp(self):
     """Create the PSparkContext and the shared test inputs.

     The context is named after the concrete test class. For every input
     the raw rows, a frame built through the PSparkContext and the matching
     pandas DataFrame are stored on the instance.
     """
     self._old_sys_path = list(sys.path)
     test_name = self.__class__.__name__
     self.psc = PSparkContext.simple('local[4]', test_name, batchSize=2)

     # Basic string input and frames that tests can reuse.
     self.basicinput = [("tea", "happy"), ("water", "sad"),
                        ("coffee", "happiest"), ("tea", "water")]
     self.basiccolumns = ['magic', 'thing']
     self.basicpframe = self.psc.DataFrame(self.basicinput,
                                           columns=self.basiccolumns)
     self.basicframe = pandas.DataFrame(self.basicinput,
                                        columns=self.basiccolumns)

     # Two-column numeric input.
     self.numericinput = [(1, 2), (3, 4), (1, 3), (2, 6), (3, 100), (3, 20),
                          (8, 9)]
     self.numericpframe = self.psc.DataFrame(self.numericinput,
                                             columns=['a', 'b'])
     self.numericframe = pandas.DataFrame(self.numericinput,
                                          columns=['a', 'b'])

     # Three-column numeric input.
     self.numericthreeinput = [(1, 2, -100.5), (3, 4, 93), (1, 3, 100.2),
                               (2, 6, 0.5), (3, 100, 1.5), (3, 20, 80),
                               (8, 9, 20)]
     self.numericthreepframe = self.psc.DataFrame(self.numericthreeinput,
                                                  columns=['a', 'b', 'c'])
     self.numericthreeframe = pandas.DataFrame(self.numericthreeinput,
                                               columns=['a', 'b', 'c'])

     # Numeric columns combined with a string column.
     self.mixedinput = [(1, 2, "coffee"), (4, 5, "cheese")]
     self.mixedpframe = self.psc.DataFrame(
         self.mixedinput, columns=['a', 'b', 'c'])
     self.mixedframe = pandas.DataFrame(
         self.mixedinput, columns=['a', 'b', 'c'])

     # Same as the mixed input plus a fourth column holding only None.
     self.mixednainput = [(1, 2, "coffee", None), (4, 5, "cheese", None)]
     self.mixednapframe = self.psc.DataFrame(
         self.mixednainput, columns=['a', 'b', 'c', 'd'])
     self.mixednaframe = pandas.DataFrame(
         self.mixednainput, columns=['a', 'b', 'c', 'd'])
Пример #6
0
 def setUp(self):
     """Set up the basic panda spark test case.

     Builds a SparkConf (4 cores, local[4] master, application named after
     the test class, multiple contexts allowed), creates a PSparkContext
     from it and stores a set of small inputs on the instance. Each input
     is kept as the raw list of rows, as a frame built through the
     PSparkContext and as the equivalent pandas DataFrame. The ``merge``
     callable is also exposed as ``self.merge``.
     """
     logging.info("Setting up spark context")
     self._old_sys_path = list(sys.path)
     class_name = self.__class__.__name__
     conf = SparkConf()
     conf.set("spark.cores.max", "4")
     conf.set("spark.master", "local[4]")
     # The Spark property for the application name is "spark.app.name";
     # the previous "spark.app-name" key is not a Spark property, so the
     # class name was never applied.
     conf.set("spark.app.name", class_name)
     conf.set("spark.driver.allowMultipleContexts", "true")
     self.psc = PSparkContext.simple(conf=conf)
     # Add a common basic input and basicpframe we can reuse in testing
     self.basicinput = [("tea", "happy"), ("water", "sad"),
                        ("coffee", "happiest"), ("tea", "water")]
     self.basiccolumns = ['magic', 'thing']
     self.basicpframe = self.psc.DataFrame(self.basicinput,
                                           columns=self.basiccolumns)
     self.basicframe = pandas.DataFrame(self.basicinput,
                                        columns=self.basiccolumns)
     # Add a numeric frame
     self.numericinput = [(1, 2), (3, 4), (1, 3), (2, 6), (3, 100), (3, 20),
                          (8, 9)]
     self.numericpframe = self.psc.DataFrame(self.numericinput,
                                             columns=['a', 'b'])
     self.numericframe = pandas.DataFrame(self.numericinput,
                                          columns=['a', 'b'])
     # A three column numeric frame
     self.numericthreeinput = [(1, 2, -100.5), (3, 4, 93), (1, 3, 100.2),
                               (2, 6, 0.5), (3, 100, 1.5), (3, 20, 80),
                               (8, 9, 20)]
     self.numericthreepframe = self.psc.DataFrame(self.numericthreeinput,
                                                  columns=['a', 'b', 'c'])
     self.numericthreeframe = pandas.DataFrame(self.numericthreeinput,
                                               columns=['a', 'b', 'c'])
     # A frame mixing numeric and string columns
     self.mixedinput = [(1, 2, "coffee"), (4, 5, "cheese")]
     self.mixedpframe = self.psc.DataFrame(self.mixedinput,
                                           columns=['a', 'b', 'c'])
     self.mixedframe = pandas.DataFrame(self.mixedinput,
                                        columns=['a', 'b', 'c'])
     # Mixed NA frame: same as above plus a column that is entirely None
     self.mixednainput = [(1, 2, "coffee", None), (4, 5, "cheese", None)]
     self.mixednapframe = self.psc.DataFrame(self.mixednainput,
                                             columns=['a', 'b', 'c', 'd'])
     self.mixednaframe = pandas.DataFrame(self.mixednainput,
                                          columns=['a', 'b', 'c', 'd'])
     # `merge` comes from the enclosing module's scope; keep a handle on it
     # so tests can call it through the instance.
     self.merge = merge
Пример #7
0
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""
An interactive shell.

This file is designed to be launched by bin/pyspark
"""
import sparklingpandas
from sparklingpandas.pcontext import PSparkContext
from pyspark.sql import SQLContext, HiveContext
from pyspark import SparkContext

# Build the contexts exposed by this shell start-up module. The top-level
# names are left unchanged since the module describes itself as an
# interactive shell and they are presumably what the user works with.
spark_ctx = SparkContext()
sqlCtx = SQLContext(spark_ctx)
# HiveContext, like SQLContext, is constructed from the SparkContext itself,
# not from another SQL context.
hiveCtx = HiveContext(spark_ctx)
# Second name bound to the same SQLContext object.
sqlContext = sqlCtx
# Row is not used in the visible code; presumably imported so it is
# available from the shell.
from pyspark.sql import Row
psc = PSparkContext(spark_ctx, sqlCtx)
print("Sparkling Pandas context is available as psc\n")
Пример #8
0
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""
An interactive shell.

This file is designed to be launched by bin/pyspark
"""

from sparklingpandas.pcontext import PSparkContext
# Wrap the Spark context in a sparklingpandas context and announce it.
# NOTE(review): `sc` is not defined in the visible part of this file; it is
# presumably provided by the pyspark shell that launches it — confirm.
psc = PSparkContext(sc)
print("Sparkling Pandas context is available as psc\n")
Пример #9
0
from pyspark import SparkContext
from sparklingpandas.pcontext import PSparkContext

# Start a local Spark context for this example.
sc = SparkContext("local", "example iris data")

# Wrap it in a sparklingpandas context.
psc = PSparkContext(sc)

# Column names handed to read_csv for the iris file.
COL_NAMES = [
    'sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'class',
]

# Load the CSV through the sparklingpandas context.
iris_df = psc.read_csv("ex-data/iris.csv", names=COL_NAMES)

# Group the rows by the 'class' column.
iris_classes = iris_df.groupby('class')
Пример #10
0
from pyspark import SparkContext
from sparklingpandas.pcontext import PSparkContext

# Start a local Spark context for this example.
sc = SparkContext("local", "example sfpd data")

# Wrap it in a sparklingpandas context.
psc = PSparkContext(sc)

# Load the incidents CSV through the sparklingpandas context.
spDF = psc.read_csv("ex-data/SFPD_Incidents.csv")

# Count rows per Category, collect the result and write it out as CSV.
category_counts = spDF.groupby("Category").count()
collected_counts = category_counts.collect()
collected_counts.to_csv(
    path_or_buf="/home/juliet/src/sparklingpandas-ex/my_csv.csv")