import pandas as pd
import numpy as np

from pyspark.shell import spark

# Create a Spark DataFrame (or Dataset)
myRange = spark.range(1000).toDF("number")
myRange.show()

# Keep only the rows whose number is divisible by 2
divisBy2 = myRange.where("number % 2 = 0")
divisBy2.show()

# Default RDD partition creation
print('Number of partitions created: {}'.format(
    myRange.rdd.getNumPartitions()))
print('Partitioner: {}'.format(myRange.rdd.partitioner))
print('Partitions structure: {}'.format(myRange.rdd.glom().collect()))
print('Total rows in the RDD: {}'.format(myRange.rdd.count()))

# Re-partition: increase the partition count to 8
repartitioned = myRange.repartition(8)
print('Number of partitions: {}'.format(repartitioned.rdd.getNumPartitions()))
print('Partitions structure: {}'.format(repartitioned.rdd.glom().collect()))
print('Total rows in the repartitioned RDD: {}'.format(
    repartitioned.rdd.count()))
repartitioned.show()

# Re-partitioning by specifying the column
repartitioned = myRange.repartition('number')
print('Number of partitions: {}'.format(repartitioned.rdd.getNumPartitions()))
print('Partitions structure: {}'.format(repartitioned.rdd.glom().collect()))
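# A minimal follow-up sketch, not part of the original example: coalesce()
# reduces the number of partitions without a full shuffle, which is usually
# cheaper than repartition() when only shrinking the partition count.
# The variable name 'coalesced' is an illustrative assumption.
coalesced = repartitioned.coalesce(2)
print('Number of partitions after coalesce: {}'.format(
    coalesced.rdd.getNumPartitions()))
print('Partitions structure after coalesce: {}'.format(
    coalesced.rdd.glom().collect()))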
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import time

from pyspark.shell import spark
from pyspark.sql.functions import col, udf

numRows = 1000000000
batchSize = 10000
spark.conf.set('spark.sql.execution.arrow.maxRecordsPerBatch', str(batchSize))

# Single-partition DataFrame with one column 'a' holding 1..numRows
df = spark.range(1, numRows + 1, numPartitions=1).select(col('id').alias('a'))


# Row-at-a-time Python UDF: called once per row
@udf("int")
def inc(x):
    return x + 1


beg_time = time.time()
df = df.select(inc('a').alias('a'))
# head() triggers evaluation; the filter lets Spark stop after the first match
result = df.select('a').filter(df.a < 3).head()
print("PySpark Python UDF inc() elapsed time: " + str(time.time() - beg_time))
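# A hedged companion sketch, not from the original file: the same increment
# written as a vectorized pandas UDF so its Arrow-based timing can be compared
# against the row-at-a-time Python UDF above. It assumes Spark 3.0+ (type-hint
# style pandas UDFs); the names 'inc_pandas', 'df2', 'result2' are illustrative.
import pandas as pd
from pyspark.sql.functions import pandas_udf


@pandas_udf('int')
def inc_pandas(x: pd.Series) -> pd.Series:
    # Operates on a whole Arrow batch at once instead of one row at a time
    return x + 1


df2 = spark.range(1, numRows + 1, numPartitions=1).select(col('id').alias('a'))
beg_time = time.time()
df2 = df2.select(inc_pandas('a').alias('a'))
result2 = df2.select('a').filter(df2.a < 3).head()
print("PySpark Pandas UDF inc_pandas() elapsed time: " +
      str(time.time() - beg_time))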
import sys

from pyspark.shell import spark

print(sys.version)
print("Arguments: \n" + str(sys.argv))

try:
    num = int(sys.argv[1])
    print("Custom number passed in args: " + str(num))
except (ValueError, IndexError):
    # No argument given, or it is not a valid integer: fall back to the default
    num = 1000
    print("No valid number passed in args, defaulting to: " + str(num))

# Check that f-strings are available (Python >= 3.6)
print(f"Will raise {num} to the power of 3...")
cube = spark.range(num * num * num).count()
print(f"{num} ^ 3 = {cube}")
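# Usage note (an assumption, not from the original file): the script is meant
# to be launched with an optional integer argument, e.g.
#   spark-submit cube_from_args.py 50
# where 'cube_from_args.py' is a hypothetical file name. The sketch below
# cross-checks the Spark-computed cube against plain Python arithmetic.
expected = num ** 3
assert cube == expected, f"Expected {expected}, got {cube}"
print(f"Verified: spark.range({num} ** 3).count() matches num ** 3")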