Example #1
import pandas as pd
import numpy as np
from pyspark.shell import spark

# Create a Spark DataFrame with a single 'number' column (1000 rows)
myRange = spark.range(1000).toDF("number")
myRange.show()

# Keep only the rows whose number is divisible by 2
divisBy2 = myRange.where("number % 2 = 0")
divisBy2.show()

# Inspect the default partitioning of the underlying RDD
print('Number of partitions created: {}'.format(
    myRange.rdd.getNumPartitions()))
print('Partitioner: {}'.format(myRange.rdd.partitioner))
print('Partitions structure: {}'.format(myRange.rdd.glom().collect()))
print('Total rows in the RDD: {}'.format(myRange.rdd.count()))

# Repartition to increase the number of partitions to 8 (triggers a full shuffle)
repartitioned = myRange.repartition(8)
print('Number of partitions: {}'.format(repartitioned.rdd.getNumPartitions()))
print('Partitions structure: {}'.format(repartitioned.rdd.glom().collect()))
print('Total rows in the repartitioned RDD: {}'.format(
    repartitioned.rdd.count()))
repartitioned.show()

# Repartition by column: rows are hash-partitioned on the 'number' value
repartitioned = myRange.repartition('number')
print('Number of partitions: {}'.format(repartitioned.rdd.getNumPartitions()))
print('Partitions structure: {}'.format(repartitioned.rdd.glom().collect()))
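A minimal follow-up sketch, reusing the repartitioned DataFrame from above: coalesce() reduces the partition count by merging existing partitions, without the full shuffle that repartition() performs.

# Merge down to 2 partitions without reshuffling every row
coalesced = repartitioned.coalesce(2)
print('Number of partitions after coalesce: {}'.format(
    coalesced.rdd.getNumPartitions()))
print('Partitions structure: {}'.format(coalesced.rdd.glom().collect()))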
Example #2

import time

from pyspark.shell import spark
from pyspark.sql.functions import col, udf

# Benchmark parameters: one billion input rows, Arrow batches capped at 10,000
numRows = 1000000000
batchSize = 10000

# maxRecordsPerBatch limits how many rows go into each Arrow record batch
spark.conf.set('spark.sql.execution.arrow.maxRecordsPerBatch', str(batchSize))
# Single-partition DataFrame with one integer column 'a'
df = spark.range(1, numRows + 1, numPartitions=1).select(col('id').alias('a'))


@udf("int")
def inc(x):
    return x + 1


# Time the UDF: head() only needs the first row matching a < 3, so Spark can
# stop early instead of evaluating inc() over all numRows rows
beg_time = time.time()
df = df.select(inc('a').alias('a'))
result = df.select('a').filter(df.a < 3).head()
print("PySpark Python UDF inc() elapsed time: " + str(time.time() - beg_time))

Example #3

import sys

from pyspark.shell import spark

print(sys.version)
print("Arguments: \n" + str(sys.argv))

try:
    num = int(sys.argv[1])
    print("Custom number passed in args: " + str(num))
except (ValueError, IndexError):
    # Fall back to a default when no argument is given or it is not a number
    num = 1000
    print("No valid number passed in args, using default: " + str(num))

# Check that f-strings are available (Python >= 3.6)
print(f"Will raise {num} to the power of 3...")

# spark.range(num * num * num) contains exactly num^3 rows, so count() returns the cube
cube = spark.range(num * num * num).count()
print(f"{num} ^ 3 = {cube}")