def run_tests():
    """Set up and run the doc tests."""
    import doctest
    import sys
    from sparklingpandas.pcontext import PSparkContext
    globs = globals().copy()
    # The small batch size here ensures that we see multiple batches,
    # even in these small test examples:
    globs['psc'] = PSparkContext.simple('local[4]', 'PythonTest',
                                        batchSize=2)
    (failure_count, test_count) = doctest.testmod(
        globs=globs, optionflags=doctest.ELLIPSIS)
    globs['psc'].sc.stop()
    msg = "{0} tests ran, {1} failures".format(test_count, failure_count)
    try:
        # My kingdom for the letter u
        from termcolor import colored
        if failure_count:
            msg = colored(msg, 'red')
        else:
            msg = colored(msg, 'green')
        print(msg)
    except ImportError:
        # Fall back to raw ANSI escape codes when termcolor is unavailable.
        if failure_count:
            msg = '\033[91m' + msg
        else:
            msg = '\033[92m' + msg
        print(msg + '\033[0m')
    if failure_count:
        sys.exit(-1)
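# A minimal sketch of how a test module might invoke the runner above,
# assuming run_tests() is defined at module scope (this guard is an
# illustrative addition, not part of the original source):
if __name__ == "__main__":
    run_tests()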
def setUp(self):
    """Set up the basic panda spark test case. This right now just
    creates a PSparkContext."""
    self._old_sys_path = list(sys.path)
    class_name = self.__class__.__name__
    self.psc = PSparkContext.simple('local[4]', class_name, batchSize=2)
    # Add a common basic input and basicpframe we can reuse in testing
    self.basicinput = [("tea", "happy"), ("water", "sad"),
                       ("coffee", "happiest"), ("tea", "water")]
    self.basiccolumns = ['magic', 'thing']
    self.basicpframe = self.psc.DataFrame(self.basicinput,
                                          columns=self.basiccolumns)
    self.basicframe = pandas.DataFrame(self.basicinput,
                                       columns=self.basiccolumns)
    # Add a numeric frame
    self.numericinput = [(1, 2), (3, 4), (1, 3), (2, 6), (3, 100),
                         (3, 20), (8, 9)]
    self.numericpframe = self.psc.DataFrame(self.numericinput,
                                            columns=['a', 'b'])
    self.numericframe = pandas.DataFrame(self.numericinput,
                                         columns=['a', 'b'])
    # A three column numeric frame
    self.numericthreeinput = [(1, 2, -100.5), (3, 4, 93), (1, 3, 100.2),
                              (2, 6, 0.5), (3, 100, 1.5), (3, 20, 80),
                              (8, 9, 20)]
    self.numericthreepframe = self.psc.DataFrame(self.numericthreeinput,
                                                 columns=['a', 'b', 'c'])
    self.numericthreeframe = pandas.DataFrame(self.numericthreeinput,
                                              columns=['a', 'b', 'c'])
    self.mixedinput = [(1, 2, "coffee"), (4, 5, "cheese")]
    self.mixedpframe = self.psc.DataFrame(self.mixedinput,
                                          columns=['a', 'b', 'c'])
    self.mixedframe = pandas.DataFrame(self.mixedinput,
                                       columns=['a', 'b', 'c'])
def setUp(self):
    """Set up the basic panda spark test case. This right now just
    creates a PSparkContext."""
    self._old_sys_path = list(sys.path)
    class_name = self.__class__.__name__
    self.psc = PSparkContext.simple('local[4]', class_name, batchSize=2)
def setUp(self):
    """Set up the basic panda spark test case. This right now just
    creates a PSparkContext."""
    logging.info("Setting up spark context")
    self._old_sys_path = list(sys.path)
    class_name = self.__class__.__name__
    conf = SparkConf()
    conf.set("spark.cores.max", "4")
    conf.set("spark.master", "local[4]")
    conf.set("spark.app.name", class_name)
    conf.set("spark.driver.allowMultipleContexts", "true")
    self.psc = PSparkContext.simple(conf=conf)
    # Add a common basic input and basicpframe we can reuse in testing
    self.basicinput = [
        ("tea", "happy"),
        ("water", "sad"),
        ("coffee", "happiest"),
        ("tea", "water")]
    self.basiccolumns = ['magic', 'thing']
    self.basicpframe = self.psc.DataFrame(
        self.basicinput, columns=self.basiccolumns)
    self.basicframe = pandas.DataFrame(
        self.basicinput, columns=self.basiccolumns)
    # Add a numeric frame
    self.numericinput = [
        (1, 2), (3, 4), (1, 3), (2, 6), (3, 100), (3, 20), (8, 9)]
    self.numericpframe = self.psc.DataFrame(
        self.numericinput, columns=['a', 'b'])
    self.numericframe = pandas.DataFrame(
        self.numericinput, columns=['a', 'b'])
    # A three column numeric frame
    self.numericthreeinput = [
        (1, 2, -100.5), (3, 4, 93), (1, 3, 100.2), (2, 6, 0.5),
        (3, 100, 1.5), (3, 20, 80), (8, 9, 20)]
    self.numericthreepframe = self.psc.DataFrame(
        self.numericthreeinput, columns=['a', 'b', 'c'])
    self.numericthreeframe = pandas.DataFrame(
        self.numericthreeinput, columns=['a', 'b', 'c'])
    self.mixedinput = [(1, 2, "coffee"), (4, 5, "cheese")]
    self.mixedpframe = self.psc.DataFrame(self.mixedinput,
                                          columns=['a', 'b', 'c'])
    self.mixedframe = pandas.DataFrame(self.mixedinput,
                                       columns=['a', 'b', 'c'])
    # Mixed NA frame
    self.mixednainput = [(1, 2, "coffee", None), (4, 5, "cheese", None)]
    self.mixednapframe = self.psc.DataFrame(self.mixednainput,
                                            columns=['a', 'b', 'c', 'd'])
    self.mixednaframe = pandas.DataFrame(self.mixednainput,
                                         columns=['a', 'b', 'c', 'd'])
def setUp(self):
    """Set up the basic panda spark test case. This right now just
    creates a PSparkContext."""
    self._old_sys_path = list(sys.path)
    class_name = self.__class__.__name__
    self.psc = PSparkContext.simple('local[4]', class_name, batchSize=2)
    # Add a common basic input and basicpframe we can reuse in testing
    self.basicinput = [
        ("tea", "happy"),
        ("water", "sad"),
        ("coffee", "happiest"),
        ("tea", "water")]
    self.basiccolumns = ['magic', 'thing']
    self.basicpframe = self.psc.DataFrame(
        self.basicinput, columns=self.basiccolumns)
    self.basicframe = pandas.DataFrame(
        self.basicinput, columns=self.basiccolumns)
    # Add a numeric frame
    self.numericinput = [
        (1, 2), (3, 4), (1, 3), (2, 6), (3, 100), (3, 20), (8, 9)]
    self.numericpframe = self.psc.DataFrame(
        self.numericinput, columns=['a', 'b'])
    self.numericframe = pandas.DataFrame(
        self.numericinput, columns=['a', 'b'])
    # A three column numeric frame
    self.numericthreeinput = [
        (1, 2, -100.5), (3, 4, 93), (1, 3, 100.2), (2, 6, 0.5),
        (3, 100, 1.5), (3, 20, 80), (8, 9, 20)]
    self.numericthreepframe = self.psc.DataFrame(
        self.numericthreeinput, columns=['a', 'b', 'c'])
    self.numericthreeframe = pandas.DataFrame(
        self.numericthreeinput, columns=['a', 'b', 'c'])
    self.mixedinput = [(1, 2, "coffee"), (4, 5, "cheese")]
    self.mixedpframe = self.psc.DataFrame(self.mixedinput,
                                          columns=['a', 'b', 'c'])
    self.mixedframe = pandas.DataFrame(self.mixedinput,
                                       columns=['a', 'b', 'c'])
    # Mixed NA frame
    self.mixednainput = [(1, 2, "coffee", None), (4, 5, "cheese", None)]
    self.mixednapframe = self.psc.DataFrame(self.mixednainput,
                                            columns=['a', 'b', 'c', 'd'])
    self.mixednaframe = pandas.DataFrame(self.mixednainput,
                                         columns=['a', 'b', 'c', 'd'])
def setUp(self):
    """Set up the basic panda spark test case. This right now just
    creates a PSparkContext."""
    logging.info("Setting up spark context")
    self._old_sys_path = list(sys.path)
    class_name = self.__class__.__name__
    conf = SparkConf()
    conf.set("spark.cores.max", "4")
    conf.set("spark.master", "local[4]")
    conf.set("spark.app.name", class_name)
    conf.set("spark.driver.allowMultipleContexts", "true")
    self.psc = PSparkContext.simple(conf=conf)
    # Add a common basic input and basicpframe we can reuse in testing
    self.basicinput = [("tea", "happy"), ("water", "sad"),
                       ("coffee", "happiest"), ("tea", "water")]
    self.basiccolumns = ['magic', 'thing']
    self.basicpframe = self.psc.DataFrame(self.basicinput,
                                          columns=self.basiccolumns)
    self.basicframe = pandas.DataFrame(self.basicinput,
                                       columns=self.basiccolumns)
    # Add a numeric frame
    self.numericinput = [(1, 2), (3, 4), (1, 3), (2, 6), (3, 100),
                         (3, 20), (8, 9)]
    self.numericpframe = self.psc.DataFrame(self.numericinput,
                                            columns=['a', 'b'])
    self.numericframe = pandas.DataFrame(self.numericinput,
                                         columns=['a', 'b'])
    # A three column numeric frame
    self.numericthreeinput = [(1, 2, -100.5), (3, 4, 93), (1, 3, 100.2),
                              (2, 6, 0.5), (3, 100, 1.5), (3, 20, 80),
                              (8, 9, 20)]
    self.numericthreepframe = self.psc.DataFrame(self.numericthreeinput,
                                                 columns=['a', 'b', 'c'])
    self.numericthreeframe = pandas.DataFrame(self.numericthreeinput,
                                              columns=['a', 'b', 'c'])
    self.mixedinput = [(1, 2, "coffee"), (4, 5, "cheese")]
    self.mixedpframe = self.psc.DataFrame(self.mixedinput,
                                          columns=['a', 'b', 'c'])
    self.mixedframe = pandas.DataFrame(self.mixedinput,
                                       columns=['a', 'b', 'c'])
    # Mixed NA frame
    self.mixednainput = [(1, 2, "coffee", None), (4, 5, "cheese", None)]
    self.mixednapframe = self.psc.DataFrame(self.mixednainput,
                                            columns=['a', 'b', 'c', 'd'])
    self.mixednaframe = pandas.DataFrame(self.mixednainput,
                                         columns=['a', 'b', 'c', 'd'])
    self.merge = merge
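# The setUp variants above save sys.path and spin up a PSparkContext per
# test class; a hedged sketch of the matching tearDown, assuming the
# standard unittest lifecycle and that psc.sc exposes the underlying
# SparkContext as it does in run_tests above:
def tearDown(self):
    """Tear down the test case: stop the Spark context and restore
    sys.path to its saved state."""
    self.psc.sc.stop()
    sys.path = self._old_sys_path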
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""
An interactive shell.

This file is designed to be launched by bin/pyspark
"""

import sparklingpandas
from sparklingpandas.pcontext import PSparkContext
from pyspark import SparkContext
from pyspark.sql import SQLContext, HiveContext, Row

spark_ctx = SparkContext()
sqlCtx = SQLContext(spark_ctx)
# HiveContext wraps a SparkContext, not a SQLContext.
hiveCtx = HiveContext(spark_ctx)
sqlContext = sqlCtx
psc = PSparkContext(spark_ctx, sqlCtx)
print("Sparkling Pandas context is available as psc\n")
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""
An interactive shell.

This file is designed to be launched by bin/pyspark
"""

from sparklingpandas.pcontext import PSparkContext

psc = PSparkContext(sc)
print("Sparkling Pandas context is available as psc\n")
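# Once the shell is up, psc can be used directly. A hedged example of
# building a distributed frame and pulling it back as a local pandas
# DataFrame (the column names are illustrative; DataFrame() and collect()
# follow their usage elsewhere in this codebase):
df = psc.DataFrame([("tea", 2), ("coffee", 1)], columns=['drink', 'cups'])
local_df = df.collect()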
from pyspark import SparkContext
from sparklingpandas.pcontext import PSparkContext

# Create a Spark context.
sc = SparkContext("local", "example iris data")
# Create a SparklingPandas context.
psc = PSparkContext(sc)
# Read the CSV file with the SparklingPandas context.
COL_NAMES = ['sepal_length', 'sepal_width', 'petal_length',
             'petal_width', 'class']
iris_df = psc.read_csv("ex-data/iris.csv", names=COL_NAMES)
# Group the rows by the 'class' column.
iris_classes = iris_df.groupby('class')
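# A hedged follow-on to the grouping above: aggregating and collecting the
# result locally. mean() on the grouped object is an assumption about the
# SparklingPandas groupby API, by analogy with the count() call used in
# the SFPD example below:
class_means = iris_classes.mean().collect()
print(class_means)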
from pyspark import SparkContext
from sparklingpandas.pcontext import PSparkContext

# Create a Spark context.
sc = SparkContext("local", "example sfpd data")
# Create a SparklingPandas context.
psc = PSparkContext(sc)
# Read the CSV file with the SparklingPandas context.
spDF = psc.read_csv("ex-data/SFPD_Incidents.csv")
# Count incidents per category, collect the result to a local pandas
# DataFrame, and write it out as CSV.
spDF.groupby("Category").count().collect().to_csv(
    path_or_buf="/home/juliet/src/sparklingpandas-ex/my_csv.csv")