Exemplo n.º 1
0
import time

if __name__ == "__main__":
    # Script entry point: parse a WARC file and its WET counterpart into RDDs,
    # mark both parsed RDDs as cached, join them on each record's target URI
    # and print how many joined pairs there are.
    worker_threads = 3
    session: SparkSession = (
        SparkSession.builder
        .master('local[{}]'.format(worker_threads))
        .appName('Caching & Eviction')
        .getOrCreate()
    )
    session.sparkContext.setLogLevel('DEBUG')

    input_loc_warc = '/Users/a/Desktop/Buch/CC-MAIN-20191013195541-20191013222541-00000.warc'
    input_loc_wet = '/Users/a/Desktop/Buch/CC-MAIN-20191013195541-20191013222541-00000.warc.wet'

    # Raw records are read per input file and expanded by the matching parser.
    warc_records: RDD = extract_raw_records(input_loc_warc, session).flatMap(
        lambda raw: parse_raw_warc(raw))
    wet_records: RDD = extract_raw_records(input_loc_wet, session).flatMap(
        lambda raw: parse_raw_wet(raw))

    # Request caching on both parsed RDDs before the join below uses them.
    for parsed in (warc_records, wet_records):
        parsed.cache()

    def key_by_uri(record):
        # Pair a record with its target URI so the URI serves as the join key.
        return record.target_uri, record

    joined = warc_records.map(key_by_uri).join(wet_records.map(key_by_uri))

    print(joined.count())
    # Block for ten minutes before the script ends
    # (presumably to leave time to inspect the Spark web UI -- TODO confirm).
    time.sleep(60 * 10)
import time
from pyspark import RDD
from pyspark.sql import SparkSession, DataFrame
from Chapter01.utilities01_py.helper_python import create_session
from Chapter02.utilities02_py.helper_python import extract_raw_records, parse_raw_warc, parse_raw_wet

if __name__ == "__main__":
    # Script entry point: parse a WARC file and its WET counterpart, convert
    # both to DataFrames restricted to the columns of interest, join them on
    # 'target_uri' and define a filter for rows whose language is 'es'.
    from pyspark.sql.functions import col

    input_loc_warc = '/Users/a/CC-MAIN-20191013195541-20191013222541-00000.warc'  # ToDo: Modify path
    input_loc_wet = '/Users/a/CC-MAIN-20191013195541-20191013222541-00000.warc.wet'  # ToDo: Modify path
    session: SparkSession = create_session(3, 'Activity 4.2 DataFrame')

    raw_warc = extract_raw_records(input_loc_warc, session)
    warc_records: RDD = raw_warc.flatMap(lambda raw: parse_raw_warc(raw))
    raw_wet = extract_raw_records(input_loc_wet, session)
    wet_records: RDD = raw_wet.flatMap(lambda raw: parse_raw_wet(raw))

    # Keep only the join key plus the single payload column needed per side.
    warc_columns = [col('target_uri'), col('language')]
    wet_columns = [col('target_uri'), col('plain_text')]
    warc_records_df: DataFrame = warc_records.toDF().select(*warc_columns)
    wet_records_df: DataFrame = wet_records.toDF().select(*wet_columns)

    joined_df = warc_records_df.join(wet_records_df, on=['target_uri'])
    is_spanish = col('language') == 'es'
    # NOTE(review): no action (count/show/collect) is called on
    # spanish_records in this block -- confirm that this is intended.
    spanish_records = joined_df.filter(is_spanish)

    time.sleep(10 * 60)  # For exploring WebUI
from Chapter01.utilities01_py.helper_python import create_session
from Chapter02.utilities02_py.helper_python import sample_wet_loc, extract_raw_records, parse_raw_wet

if __name__ == "__main__":
    # Script entry point: parse the sample WET file, print the schema of the
    # parsed records as a DataFrame, then print the total record count.
    session = create_session(3, 'WET Parser')
    spark_context = session.sparkContext
    spark_context.setLogLevel('ERROR')  # avoids printing of info messages

    wet_records = extract_raw_records(sample_wet_loc, session).flatMap(
        lambda raw: parse_raw_wet(raw))

    records_df = wet_records.toDF()
    records_df.printSchema()

    record_total = wet_records.count()
    print('Total # of records: ' + str(record_total))