def test_perrecord_vs_perpartition(self):
    raw_records = extract_raw_records(sample_warc_loc, self.spark)
    warc_records = raw_records \
        .flatMap(lambda record: parse_raw_warc(record))

    def map_function(_):
        # One HeavyObject is constructed for every single record
        new_heavy_object = HeavyObject('map')
        object_id = new_heavy_object.get_id()
        return object_id

    ids_after_map: RDD = warc_records.map(map_function)

    def partition_function(partition):
        # One HeavyObject is constructed per partition and shared by
        # all records in that partition
        new_heavy_object = HeavyObject('mapPartition')
        object_id = new_heavy_object.get_id()
        for _ in partition:
            yield object_id

    ids_after_mappartitions: RDD = warc_records.mapPartitions(partition_function)

    unique_ids_map: List[int] = ids_after_map.distinct().collect()
    unique_ids_mappartitions: List[int] = ids_after_mappartitions.distinct().collect()

    print('@' * 50)
    number_of_records: int = warc_records.count()
    number_of_partitions: int = warc_records.getNumPartitions()
    print('@@ Number of records: {}'.format(number_of_records))
    print('@@ Number of partitions: {}'.format(number_of_partitions))

    # map() instantiates one object per record, mapPartitions() at most
    # one object per partition
    self.assertGreater(len(unique_ids_map), len(unique_ids_mappartitions))
    self.assertGreaterEqual(number_of_partitions, len(unique_ids_mappartitions))
    print(unique_ids_map)
    print(unique_ids_mappartitions)
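# The test above relies on a HeavyObject helper from the workshop's
# utilities. A minimal sketch of the idea, assuming only that construction
# is expensive and that get_id() is unique per instance (the names and
# behavior here are assumptions, not the book's implementation):
import time
from uuid import uuid4


class HeavyObject:
    def __init__(self, label: str):
        time.sleep(0.1)  # simulate an expensive constructor, e.g. loading a model
        self._id = uuid4().int  # globally unique, even across worker processes
        print('Instantiating HeavyObject via {}'.format(label))

    def get_id(self) -> int:
        return self._id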
import time

from pyspark import RDD
from pyspark.sql import SparkSession
from Chapter02.utilities02_py.helper_python import extract_raw_records, parse_raw_warc, parse_raw_wet

if __name__ == "__main__":
    session: SparkSession = SparkSession.builder \
        .master('local[{}]'.format(3)) \
        .appName('Caching & Eviction') \
        .getOrCreate()
    session.sparkContext.setLogLevel('DEBUG')

    input_loc_warc = '/Users/a/Desktop/Buch/CC-MAIN-20191013195541-20191013222541-00000.warc'
    input_loc_wet = '/Users/a/Desktop/Buch/CC-MAIN-20191013195541-20191013222541-00000.warc.wet'

    raw_records_warc: RDD = extract_raw_records(input_loc_warc, session)
    warc_records: RDD = raw_records_warc \
        .flatMap(lambda record: parse_raw_warc(record))

    raw_records_wet: RDD = extract_raw_records(input_loc_wet, session)
    wet_records: RDD = raw_records_wet \
        .flatMap(lambda record: parse_raw_wet(record))

    warc_records.cache()
    wet_records.cache()

    uri_keyed_warc = warc_records.map(lambda record: (record.target_uri, record))
    uri_keyed_wet = wet_records.map(lambda record: (record.target_uri, record))
    joined = uri_keyed_warc.join(uri_keyed_wet)
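    # Note that the script above never runs an action, so no job is
    # triggered and neither cached RDD is materialized. Forcing evaluation
    # (an addition, not part of the original listing) makes both RDDs
    # appear under the WebUI's Storage tab:
    print('Joined record count: {}'.format(joined.count()))
    time.sleep(10 * 60)  # keep the driver alive while inspecting the WebUI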
import time

from pyspark import RDD
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import col
from Chapter01.utilities01_py.helper_python import create_session
from Chapter02.utilities02_py.helper_python import extract_raw_records, parse_raw_warc, parse_raw_wet

if __name__ == "__main__":
    input_loc_warc = '/Users/a/CC-MAIN-20191013195541-20191013222541-00000.warc'  # ToDo: Modify path
    input_loc_wet = '/Users/a/CC-MAIN-20191013195541-20191013222541-00000.warc.wet'  # ToDo: Modify path
    session: SparkSession = create_session(3, 'Activity 4.2 DataFrame')

    warc_records: RDD = extract_raw_records(input_loc_warc, session) \
        .flatMap(lambda record: parse_raw_warc(record))
    wet_records: RDD = extract_raw_records(input_loc_wet, session) \
        .flatMap(lambda record: parse_raw_wet(record))

    warc_records_df: DataFrame = warc_records.toDF() \
        .select(col('target_uri'), col('language'))
    wet_records_df: DataFrame = wet_records.toDF() \
        .select(col('target_uri'), col('plain_text'))
    joined_df = warc_records_df.join(wet_records_df, ['target_uri'])
    spanish_records = joined_df.filter(col('language') == 'es')

    time.sleep(10 * 60)  # For exploring WebUI
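    # A transformation such as the filter above schedules no work by
    # itself, so the WebUI would sit idle during the sleep. An action
    # placed before the time.sleep call (an addition, not in the original
    # listing) populates the SQL and Stages tabs:
    print('Spanish record count: {}'.format(spanish_records.count()))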
import pycld2
from re import sub
from string import printable
from sys import argv

from bs4 import BeautifulSoup
from pyspark.sql import SparkSession
from Chapter02.utilities02_py.helper_python import extract_raw_records, parse_raw_warc


def tag_records(partition):
    # Runs once per partition; extracts plain text from each record's HTML
    # and tags it with the most likely language
    for warc_record in partition:
        parser = BeautifulSoup(warc_record.html_source, 'html.parser')
        plaintext = ' '.join(parser.stripped_strings)
        plaintext_stripped = sub('\\s+', ' ', plaintext)
        if plaintext_stripped is None or plaintext_stripped == '':
            yield ()  # empty tuple
        else:
            cleaned_text = ''.join(x for x in plaintext_stripped if x in printable)
            _, _, details = pycld2.detect(cleaned_text)
            (languageName, languageCode, percent, score) = details[0]
            yield warc_record.target_uri, languageCode, str(score)


if __name__ == "__main__":
    session: SparkSession = SparkSession.builder \
        .appName('Improved Crawl Tagger') \
        .getOrCreate()
    input_file = argv[1]
    output_dir = argv[2]

    warc_records = extract_raw_records(input_file, session) \
        .flatMap(lambda record: parse_raw_warc(record)) \
        .filter(lambda record: record.warc_type == 'response')

    tagged_texts_rdd = warc_records \
        .mapPartitions(tag_records) \
        .filter(lambda record: record != ())
    tagged_texts_rdd.saveAsTextFile(output_dir)
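# For reference: pycld2.detect returns (isReliable, textBytesFound, details),
# where details holds up to three (languageName, languageCode, percent,
# score) tuples ordered by confidence, which is why details[0] above is the
# top match. A quick standalone check (the printed output is indicative only):
import pycld2

_, _, details = pycld2.detect('Ceci est un petit texte en francais.')
print(details[0])  # e.g. ('FRENCH', 'fr', ...)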
from Chapter01.utilities01_py.helper_python import create_session
from Chapter02.utilities02_py.helper_python import sample_wet_loc, extract_raw_records, parse_raw_wet

if __name__ == "__main__":
    session = create_session(3, 'WET Parser')
    session.sparkContext.setLogLevel('ERROR')  # avoids printing of info messages

    raw_records = extract_raw_records(sample_wet_loc, session)
    wet_records = raw_records.flatMap(lambda record: parse_raw_wet(record))

    wet_records.toDF().printSchema()
    print('Total # of records: ' + str(wet_records.count()))
from typing import Tuple

from py4j.java_gateway import launch_gateway, JavaGateway, GatewayParameters, CallbackServerParameters
from pyspark.sql import SparkSession
from Chapter02.utilities02_py.helper_python import extract_raw_records, parse_raw_warc
# WarcRecord and RetryListener ship with the workshop's utility modules;
# import them from wherever they live in your project layout.


def parse_method(text: str) -> Tuple[WarcRecord]:
    parsed_raw_warc = parse_raw_warc(text)
    # crasher = 5 / 0  # ToDo: Uncomment
    # print(crasher)  # ToDo: Uncomment
    return parsed_raw_warc


if __name__ == "__main__":
    session: SparkSession = SparkSession.builder \
        .master('local[3, 3]') \
        .appName('Failure Exploration') \
        .getOrCreate()  # 'local[3, 3]': 3 threads, up to 3 task failures tolerated
    session.sparkContext.setLogLevel('ERROR')
    session.sparkContext._gateway.start_callback_server()

    gateway_port = launch_gateway()  # launch_gateway() returns the port of the new JVM
    gateway = JavaGateway(
        gateway_parameters=GatewayParameters(port=gateway_port),
        callback_server_parameters=CallbackServerParameters(port=0))
    listener = RetryListener()
    session.sparkContext._jsc.sc().addSparkListener(listener)

    input_warc = '/Users/a/CC-MAIN-20191013195541-20191013222541-00000.warc'  # ToDo: Change path
    raw_records = extract_raw_records(input_warc, session)
    warc_records = raw_records.flatMap(parse_method)
    print(warc_records.count())

    gateway.shutdown_callback_server()
from sys import argv

from pyspark.ml.common import _java2py
from Chapter01.utilities01_py.helper_python import create_session
from Chapter02.utilities02_py.helper_python import sample_warc_loc, extract_raw_records, parse_raw_warc

# Submit with the uber JAR and its dependencies on the driver classpath, e.g.:
# ~/spark-2.4.6-bin-hadoop2.7/bin/spark-submit \
#   --driver-class-path ~/IdeaProjects/The-Spark-Workshop/target/packt-uber-jar.jar:/Users/a/.m2/repository/com/google/guava/guava/28.2-jre/guava-28.2-jre.jar:/Users/a/.m2/repository/org/apache/commons/commons-compress/1.20/commons-compress-1.20.jar \
#   ~/IdeaProjects/The-Spark-Workshop/Chapter04/Activity4_03/Activity4_03.py ~/Output_Act4_3

if __name__ == "__main__":
    output_dir = argv[1]
    session = create_session(3, 'WARC Parser')

    warc_records = extract_raw_records(sample_warc_loc, session) \
        .flatMap(lambda record: parse_raw_warc(record)) \
        .filter(lambda record: record.warc_type == 'response')
    plaintexts_rdd = warc_records.map(lambda record: record.html_source)

    # Hand the Python RDD over to the JVM, tag it there, then convert it back
    java_rdd = session.sparkContext._jvm.SerDe.pythonToJava(plaintexts_rdd._jrdd, True)
    tagged_java_rdd = session.sparkContext._jvm.Activity4_03.Activity4_03.tagJavaRDD(java_rdd)
    tagged_python_rdd = _java2py(session.sparkContext, tagged_java_rdd)
    tagged_python_rdd.saveAsTextFile(output_dir)
import sys
import time

from pyspark.sql import SparkSession
from Chapter01.utilities01_py.helper_python import create_session
from Chapter02.utilities02_py.helper_python import extract_raw_records, parse_raw_warc, sample_warc_loc

if __name__ == "__main__":
    threads = int(sys.argv[1])
    warc_loc = sample_warc_loc
    session: SparkSession = create_session(threads, "PySpark Design")

    raw_records = extract_raw_records(warc_loc, session)
    warc_records = raw_records \
        .flatMap(lambda record: parse_raw_warc(record))
    print(warc_records.getNumPartitions())

    warc_records.cache()
    print(warc_records.count())
    time.sleep(60 * 10)  # keep the application alive for WebUI inspection
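    # As a supplementary check (not part of the original listing), PySpark
    # can report whether and how the RDD is persisted:
    print(warc_records.is_cached)          # True once cache() has been called
    print(warc_records.getStorageLevel())  # e.g. Memory Serialized 1x Replicated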
from operator import add
from typing import Dict

from pyspark.sql import SparkSession
from Chapter02.utilities02_py.helper_python import extract_raw_records, parse_raw_warc

if __name__ == "__main__":
    spark: SparkSession = SparkSession.builder \
        .appName('Activity 2.1') \
        .getOrCreate()
    spark.sparkContext.setLogLevel('ERROR')  # avoids printing of info messages

    # input_loc = sample_warc_loc
    input_loc = "/Users/a/CC-MAIN-20191013195541-20191013222541-00000.warc"
    warc_records = extract_raw_records(input_loc, spark) \
        .flatMap(lambda record: parse_raw_warc(record))
    # print(warc_records.count())

    keyed_by_language = warc_records \
        .filter(lambda rec: rec.language != '') \
        .map(lambda rec: (rec.language, 1))
    language_map: Dict[str, int] = keyed_by_language.reduceByKey(add).collectAsMap()

    # Equivalent driver-side aggregation:
    # from collections import defaultdict
    # language_list = keyed_by_language.reduceByKey(add).collect()
    # language_map: Dict[str, int] = defaultdict(int)
    # for key, value in language_list:
    #     language_map[key] += value
    # Or, in a single step on the RDD:
    # warc_records.filter(lambda rec: rec.language != '') \
    #     .map(lambda rec: rec.language).countByValue()

    # The original listing is truncated at this point; a plausible
    # completion that sorts languages by descending record count:
    sorted_language_list = [(count, language) for language, count in language_map.items()]
    sorted_language_list.sort(reverse=True)
    print(sorted_language_list)
from pyspark.sql import SparkSession
from Chapter02.utilities02_py.helper_python import sample_warc_loc, extract_raw_records, parse_raw_warc
from Chapter02.Exercise2_06.Exercise2_06 import heavy_computation

spark: SparkSession = SparkSession.builder \
    .appName('SubmitWithMaster') \
    .getOrCreate()

raw_records = extract_raw_records(sample_warc_loc, spark)
warc_records = raw_records.flatMap(parse_raw_warc)
invoked_heavy_rdd = warc_records.map(lambda record: heavy_computation())
print(invoked_heavy_rdd.collect())
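# heavy_computation comes from the book's Exercise 2.06. If that module is
# unavailable, a hypothetical stand-in with the same call shape might look
# like this (the sleep and return value are assumptions for illustration):
import os
import time


def heavy_computation():
    time.sleep(1)        # simulate expensive per-record work
    return os.getpid()   # collect() then reveals which worker processes ran it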