from typing import List

million_ints: List[int] = []
for number in range(0, 1000000):
    million_ints.append(-1)

from pympler import asizeof
print(asizeof.asizeof(million_ints))

from Chapter02.utilities02_py.domain_objects import WarcRecord
million_warcs = list()
for index in range(0, 1000000):
    million_warcs.append(WarcRecord.create_dummy())
print(asizeof.asizeof(million_warcs))

##########################################################################

from Chapter01.utilities01_py.helper_python import create_session
spark = create_session(2, "Collection Sizes")  ## (B)

million_ints_rdd = spark.sparkContext.range(0, 1000000)
million_ints_rdd.cache()
million_ints_rdd.count()
million_ints_rdd.unpersist()

million_warcs_rdd = million_ints_rdd.map(lambda number: WarcRecord.create_dummy())
million_warcs_rdd.cache()
million_warcs_rdd.count()

from pyspark.sql import DataFrame
million_ints_df: DataFrame = spark.range(0, 1000000)
million_ints_df.cache()
million_ints_df.count()
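
# Optional follow-up sketch: after cache() and a materializing count(), the
# storage level can be read back from the DataFrame, and the Storage tab of
# the Web UI lists the in-memory size of each cached collection.
print(million_ints_df.storageLevel)
million_warcs_rdd.unpersist()   # cached RDDs are released individually
spark.catalog.clearCache()      # drops all cached DataFrames/tables at once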
from Chapter01.utilities01_py.helper_python import create_session
from Chapter02.utilities02_py.helper_python import sample_wet_loc, extract_raw_records, parse_raw_wet

if __name__ == "__main__":
    session = create_session(3, 'WET Parser')
    session.sparkContext.setLogLevel('ERROR')  # avoids printing of INFO messages

    raw_records = extract_raw_records(sample_wet_loc, session)
    wet_records = raw_records.flatMap(lambda record: parse_raw_wet(record))

    wet_records.toDF().printSchema()
    print('Total # of records: ' + str(wet_records.count()))
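
    # A quick inspection sketch: sample a few parsed records. Assumes WET
    # records expose target_uri and plain_text, as they do in the activities
    # that follow.
    for wet_record in wet_records.take(3):
        print(wet_record.target_uri, len(wet_record.plain_text))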
import time
from pyspark import RDD
from pyspark.sql import SparkSession, DataFrame
from Chapter01.utilities01_py.helper_python import create_session
from Chapter02.utilities02_py.helper_python import extract_raw_records, parse_raw_warc, parse_raw_wet

if __name__ == "__main__":
    input_loc_warc = '/Users/a/CC-MAIN-20191013195541-20191013222541-00000.warc'  # ToDo: Modify path
    input_loc_wet = '/Users/a/CC-MAIN-20191013195541-20191013222541-00000.warc.wet'  # ToDo: Modify path
    session: SparkSession = create_session(3, 'Activity 4.2 DataFrame')

    warc_records: RDD = extract_raw_records(input_loc_warc, session).flatMap(lambda record: parse_raw_warc(record))
    wet_records: RDD = extract_raw_records(input_loc_wet, session).flatMap(lambda record: parse_raw_wet(record))

    from pyspark.sql.functions import col
    warc_records_df: DataFrame = warc_records.toDF().select(col('target_uri'), col('language'))
    wet_records_df: DataFrame = wet_records.toDF().select(col('target_uri'), col('plain_text'))

    joined_df = warc_records_df.join(wet_records_df, ['target_uri'])
    spanish_records = joined_df.filter(col('language') == 'es')

    time.sleep(10 * 60)  # For exploring WebUI
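
    # Optional sketch (runs once the Web UI pause above elapses): explain()
    # prints the physical plan, showing which join strategy Spark chose for
    # the target_uri equi-join; count() forces execution.
    spanish_records.explain()
    print(spanish_records.count())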
import time
from pyspark import RDD
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import count, sum, col
from Chapter01.utilities01_py.helper_python import create_session
from Chapter02.utilities02_py.helper_python import extract_raw_records, parse_raw_warc, sample_warc_loc

if __name__ == "__main__":
    session: SparkSession = create_session(3, "Query Plans")
    session.sparkContext.setLogLevel("TRACE")

    lang_tag_mapping = [
        ('en', 'english'),
        ('pt-pt', 'portuguese'),
        ('cs', 'czech'),
        ('de', 'german'),
        ('es', 'spanish'),
        ('eu', 'basque'),
        ('it', 'italian'),
        ('hu', 'hungarian'),
        ('pt-br', 'portuguese'),
        ('fr', 'french'),
        ('en-US', 'english'),
        ('zh-TW', 'chinese'),
    ]
    lang_tag_df = session.createDataFrame(lang_tag_mapping, ['tag', 'language'])
    session.createDataFrame(lang_tag_mapping).show()  # default column names (_1, _2)

    raw_records = extract_raw_records(sample_warc_loc, session)
    warc_records_rdd: RDD = raw_records.flatMap(parse_raw_warc)
    warc_records_df: DataFrame = warc_records_rdd.toDF()\
        .select(col('target_uri'), col('language'))\
        .filter(col('language') != '')

    aggregated = warc_records_df\
        .groupBy(col('language'))\
        .agg(count(col('target_uri')))\
        .withColumnRenamed('language', 'tag')

    joined_df = aggregated.join(lang_tag_df, ['tag'])
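
    # Optional sketch: since this script is about query plans, print them
    # directly as well; explain(True) shows the parsed, analyzed, optimized,
    # and physical plans for the aggregation and join, and show() forces
    # execution.
    joined_df.explain(True)
    joined_df.show()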
from Chapter01.utilities01_py.helper_python import create_session
from Chapter02.utilities02_py.helper_python import sample_warc_loc, extract_raw_records, parse_raw_warc

if __name__ == "__main__":
    session = create_session(3, 'Spark-Submit')
    session.sparkContext.setLogLevel('ERROR')  # avoids printing of INFO messages

    raw_records = extract_raw_records(sample_warc_loc, session)
    warc_records = raw_records.flatMap(lambda record: parse_raw_warc(record))

    warc_records.toDF().printSchema()
    print('Total # of records: ' + str(warc_records.count()))
from pyspark.sql import SparkSession
from pyspark.ml.common import _java2py
from Chapter01.utilities01_py.helper_python import create_session

# ~/spark-2.4.6-bin-hadoop2.7/bin/spark-submit --driver-class-path ~/IdeaProjects/The-Spark-Workshop/target/packt-uber-jar.jar ~/IdeaProjects/The-Spark-Workshop/Chapter04/Exercise4_06/Exercise4_06.py

if __name__ == "__main__":
    session: SparkSession = create_session(2, "PySpark <> JVM")
    session.sparkContext.setLogLevel('ERROR')

    python_rdd = session.sparkContext.range(0, 5)
    java_rdd = session.sparkContext._jvm.SerDe.pythonToJava(python_rdd._jrdd, True)
    mapped_java_rdd = session.sparkContext._jvm.Exercise4_06.ScalaObject.executeInScala(java_rdd)
    mapped_python_rdd = _java2py(session.sparkContext, mapped_java_rdd)

    print(mapped_python_rdd.collect())
import time
from pyspark import RDD
from pyspark.sql import SparkSession, DataFrame
from Chapter01.utilities01_py.helper_python import create_session
from Chapter02.utilities02_py.helper_python import extract_raw_records, parse_raw_warc, parse_raw_wet

if __name__ == "__main__":
    input_loc_warc = '/Users/a/CC-MAIN-20191013195541-20191013222541-00000.warc'  # ToDo: Modify path
    input_loc_wet = '/Users/a/CC-MAIN-20191013195541-20191013222541-00000.warc.wet'  # ToDo: Modify path
    session: SparkSession = create_session(3, 'Activity 4.2 RDD')

    warc_records: RDD = extract_raw_records(input_loc_warc, session).flatMap(lambda record: parse_raw_warc(record))
    wet_records: RDD = extract_raw_records(input_loc_wet, session).flatMap(lambda record: parse_raw_wet(record))

    pair_warc: RDD = warc_records.map(lambda warc: (warc.target_uri, warc.language))
    pair_wet: RDD = wet_records.map(lambda wet: (wet.target_uri, wet.plain_text))

    joined = pair_warc.join(pair_wet)
    spanish_records = joined.filter(lambda triple: triple[1][0] == 'es')

    print(spanish_records.count())  # 133
    time.sleep(10 * 60)  # For exploring WebUI
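
    # Optional sketch (runs after the Web UI pause): each joined element has
    # the shape (target_uri, (language, plain_text)), which is what
    # triple[1][0] indexes into above.
    print(joined.getNumPartitions())
    uri, (language, text) = joined.first()
    print(uri, language, len(text))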
from sys import argv
from pyspark.ml.common import _java2py
from Chapter01.utilities01_py.helper_python import create_session
from Chapter02.utilities02_py.helper_python import sample_warc_loc, extract_raw_records, parse_raw_warc

# ~/spark-2.4.6-bin-hadoop2.7/bin/spark-submit --driver-class-path ~/IdeaProjects/The-Spark-Workshop/target/packt-uber-jar.jar:/Users/a/.m2/repository/com/google/guava/guava/28.2-jre/guava-28.2-jre.jar:/Users/a/.m2/repository/org/apache/commons/commons-compress/1.20/commons-compress-1.20.jar ~/IdeaProjects/The-Spark-Workshop/Chapter04/Activity4_03/Activity4_03.py ~/Output_Act4_3

if __name__ == "__main__":
    output_dir = argv[1]
    session = create_session(3, 'WARC Parser')

    warc_records = extract_raw_records(sample_warc_loc, session) \
        .flatMap(lambda record: parse_raw_warc(record)) \
        .filter(lambda record: record.warc_type == 'response')
    plaintexts_rdd = warc_records.map(lambda record: record.html_source)

    java_rdd = session.sparkContext._jvm.SerDe.pythonToJava(plaintexts_rdd._jrdd, True)
    tagged_java_rdd = session.sparkContext._jvm.Activity4_03.Activity4_03.tagJavaRDD(java_rdd)

    tagged_python_rdd = _java2py(session.sparkContext, tagged_java_rdd)
    tagged_python_rdd.saveAsTextFile(output_dir)
import time
import sys
from pyspark.sql import SparkSession
from Chapter01.utilities01_py.helper_python import create_session
from Chapter02.utilities02_py.helper_python import extract_raw_records, parse_raw_warc, sample_warc_loc

if __name__ == "__main__":
    threads = int(sys.argv[1])
    warc_loc = sample_warc_loc
    session: SparkSession = create_session(threads, "PySpark Design")

    raw_records = extract_raw_records(warc_loc, session)
    warc_records = raw_records \
        .flatMap(lambda record: parse_raw_warc(record))

    print(warc_records.getNumPartitions())
    warc_records.cache()
    print(warc_records.count())
    time.sleep(60 * 10)
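
    # Optional timing sketch (runs after the Web UI pause; assumes the cache
    # fits in memory): a count() on the cached RDD avoids re-parsing the WARC
    # file and should be markedly faster than the first count() above.
    start = time.perf_counter()
    warc_records.count()
    print('cached count took {:.3f}s'.format(time.perf_counter() - start))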
import time
from Chapter01.utilities01_py.helper_python import create_session
from Chapter02.utilities02_py.helper_python import extract_raw_records, parse_raw_warc, parse_raw_wet

if __name__ == "__main__":
    input_loc_warc = '/Users/a/CC-MAIN-20191013195541-20191013222541-00000.warc'  # ToDo: Modify path
    input_loc_wet = '/Users/a/CC-MAIN-20191013195541-20191013222541-00000.warc.wet'  # ToDo: Modify path
    session = create_session(3, 'Activity 3.1')

    raw_records_warc = extract_raw_records(input_loc_warc, session)
    warc_records = raw_records_warc.flatMap(lambda record: parse_raw_warc(record))
    raw_records_wet = extract_raw_records(input_loc_wet, session)
    wet_records = raw_records_wet.flatMap(lambda record: parse_raw_wet(record))

    pair_warc = warc_records.map(lambda warc: (warc.target_uri, (
        warc.warc_type, warc.record_id, warc.content_type, warc.block_digest,
        warc.date_s, warc.content_length, warc.info_id, warc.concurrent_to,
        warc.ip, warc.payload_digest, warc.payload_type, warc.html_content_type,
        warc.language, warc.html_length, warc.html_source)))
    pair_wet = wet_records.map(lambda wet: (wet.target_uri, wet.plain_text))

    joined = pair_warc.join(pair_wet, numPartitions=7)
    print(joined.count())
    time.sleep(10 * 60)  # For exploring WebUI
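
    # Optional check (runs after the Web UI pause): join(..., numPartitions=7)
    # fixes the partitioning of the result, which the stage view also shows.
    print(joined.getNumPartitions())  # expected: 7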
from pyspark.sql import SparkSession
from Chapter01.utilities01_py.helper_python import create_session
from pyspark.ml.common import _java2py

if __name__ == "__main__":
    threads = 2
    session: SparkSession = create_session(threads, "PySpark <> JVM")

    python_rdd = session.sparkContext.range(0, 5)
    java_rdd = session.sparkContext._jvm.SerDe.pythonToJava(python_rdd._jrdd, True)
    mapped_java_rdd = session.sparkContext._jvm.Exercise4_06.ScalaObject.executeInScala(java_rdd)
    mapped_python_rdd = _java2py(session.sparkContext, mapped_java_rdd)

    print(mapped_python_rdd.collect())
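
    # Optional bridge check, a sketch that bypasses the Scala step entirely:
    # converting to the JVM and straight back should reproduce the original
    # values, isolating serialization issues from issues in ScalaObject.
    roundtrip_rdd = _java2py(session.sparkContext,
                             session.sparkContext._jvm.SerDe.pythonToJava(python_rdd._jrdd, True))
    print(roundtrip_rdd.collect())  # expected: [0, 1, 2, 3, 4]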
import time
from datetime import datetime
from os import getpid
from typing import List, Tuple

from pyspark.sql import SparkSession
from Chapter01.utilities01_py.helper_python import create_session
from Chapter02.utilities02_py.helper_python import extract_raw_records, parse_raw_warc, sample_warc_loc


# The head of this file was truncated; fall_asleep is reconstructed (an
# assumption) from its surviving return statement and the pattern of the other
# stage functions: print a numbered marker with process id and timestamp,
# pause, then return.
def fall_asleep(warc) -> Tuple[int, str]:
    process_id = getpid()
    current_uri = warc.target_uri
    print('1@ map1 in process {} at {} processing {}'.format(process_id, str(datetime.now()), current_uri))
    time.sleep(1)  # assumption: the original slept here, as the name suggests
    print('2@ map1 in process {} at {} processing {}'.format(process_id, str(datetime.now()), current_uri))
    return process_id, current_uri


def trivial_filter(processid_uri: Tuple[int, str]) -> bool:
    new_process_id = str(getpid())
    timepoint = str(datetime.now())
    print('3@ filter in process {} at {} processing {}'.format(new_process_id, timepoint, processid_uri[1]))
    return True


def quick_print(processid_uri: Tuple[int, str]) -> Tuple[int, str]:
    new_process_id = str(getpid())
    timepoint = str(datetime.now())
    print('4@ map2 in process {} at {} processing {}'.format(new_process_id, timepoint, processid_uri[1]))
    return processid_uri[0], new_process_id


if __name__ == "__main__":
    session: SparkSession = create_session(3, "Wave exploration")

    raw_records = extract_raw_records(sample_warc_loc, session)
    warc_records = raw_records \
        .flatMap(parse_raw_warc)

    process_ids_rdd = warc_records\
        .map(fall_asleep)\
        .filter(trivial_filter)\
        .map(quick_print)
    distinct_process_ids: List[Tuple[int, str]] = process_ids_rdd.distinct().collect()
    print(distinct_process_ids)