Example #1
# Shared imports for all of the examples below; FuncUtils, word_split,
# WORD_FILTER, the create_*_entity helpers, mysql_conn_param and
# SPARK_MASTER_ADDR / SPARK_MASTER_PORT are project-level definitions assumed
# to be in scope.
from pyspark.sql import SparkSession, Row


def main(argv):

    # Driver/executor resources must be configured on the builder, before the
    # session (and the driver JVM) exists; setting them with spark.conf.set
    # afterwards has no effect. Runtime SQL options such as Arrow can still be
    # changed later.
    spark = SparkSession.builder \
        .appName('AUTHOR_STATISTIC') \
        .master('spark://{}:{}'.format(SPARK_MASTER_ADDR, SPARK_MASTER_PORT)) \
        .config('spark.driver.maxResultSize', '0') \
        .config('spark.driver.cores', '4') \
        .config('spark.driver.memory', '4g') \
        .config('spark.executor.memory', '4g') \
        .getOrCreate()
    spark.conf.set('spark.sql.execution.arrow.enabled', 'true')

    video_type_code = FuncUtils.handle_params(argv)

    filepath = FuncUtils.create_filepath(video_type_code,
                                         'video_details/video_info_type')
    video_df = spark.read.format('csv').options(header='true').load(filepath)

    video_df = video_df.select('av号', '是否收费')
    video_df = video_df.withColumnRenamed('av号', 'av')
    video_df = video_df.withColumnRenamed('是否收费', 'is_pay')

    # Keep only rows with a numeric is_pay flag (guard against missing values),
    # then count videos per flag and build one Row entity per (flag, count) pair.
    video_rdd = video_df.rdd.filter(
        lambda x: x['is_pay'] is not None and x['is_pay'].isdigit())
    pair_rdd = video_rdd.map(lambda x: (x['is_pay'], 1))
    pair_rdd = pair_rdd.reduceByKey(lambda a, b: a + b)
    pair_df = pair_rdd.map(lambda pair: Row(**create_is_pay_entity(
        pair, video_type_code))).toDF()

    pair_df.show()
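
A minimal sketch of what create_is_pay_entity might return for each
(is_pay flag, count) pair; the field names here are assumptions, not the
project's actual schema:

def create_is_pay_entity(pair, video_type_code):
    # Hypothetical entity builder: turns one (flag, count) pair into Row kwargs.
    is_pay, count = pair
    return {
        'video_type_code': video_type_code,
        'is_pay': int(is_pay),
        'video_count': count,
    }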
Example #2
def main(argv):

    # Resource settings go on the builder, not on spark.conf.set (see Example #1).
    spark = SparkSession.builder \
        .appName('AUTHOR_STATISTIC') \
        .master('spark://{}:{}'.format(SPARK_MASTER_ADDR, SPARK_MASTER_PORT)) \
        .config('spark.driver.maxResultSize', '0') \
        .config('spark.driver.cores', '4') \
        .config('spark.driver.memory', '4g') \
        .config('spark.executor.memory', '4g') \
        .getOrCreate()
    spark.conf.set('spark.sql.execution.arrow.enabled', 'true')

    video_type_code = FuncUtils.handle_params(argv)

    filepath = FuncUtils.create_filepath(video_type_code, 'video_details/video_info_type')

    video_df = spark.read.format('csv').options(header='true').load(filepath)
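    # Alternative (commented out below): load and union the files for all 14 video types.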
    # video_df = None
    # for i in range(1, 15):
    #     filepath = FuncUtils.create_filepath(i, 'video_details/video_info_type')
    #     df = spark.read.format('csv').options(header='true').load(filepath)
    #     if video_df is None:
    #         video_df = df
    #     else:
    #         video_df = video_df.union(df)

    # Project the uploader ('Up主') and play-count ('播放量') columns, keep rows
    # with a numeric play count, then aggregate (total plays, video count) per
    # uploader and compute the average play count.
    author_df = video_df.select('Up主', '播放量')
    author_rdd = author_df.rdd
    author_rdd = author_rdd.filter(
        lambda x: x['播放量'] is not None and x['播放量'].isdigit())
    pair_rdd = author_rdd.map(lambda x: (x['Up主'], (int(x['播放量']), 1)))
    pair_rdd = pair_rdd.reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1]))
    pair_rdd = pair_rdd.filter(lambda x: x[1][0] != 0)
    pair_rdd = pair_rdd.map(lambda x: (x[0], x[1][0] / x[1][1]))
    pair_df = pair_rdd.map(
        lambda pair:
        Row(**create_author_entity(pair, video_type_code))
    ).toDF()
    pair_df = pair_df.sort('avg_play_count', ascending=False)

    pair_df = pair_df.limit(200)
    pair_df.show()
    pair_df.write.jdbc('jdbc:mysql://192.168.174.133:3306/big_data',
                       'AUTHOR_STATISTIC', 'append', mysql_conn_param)
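
The mysql_conn_param passed to DataFrame.write.jdbc above is the JDBC properties
dict. A minimal sketch of what it might contain, assuming MySQL Connector/J; the
credentials are placeholders, not taken from the project:

# Hypothetical JDBC connection properties for write.jdbc(url, table, mode, properties).
mysql_conn_param = {
    'user': 'big_data_user',          # placeholder
    'password': 'big_data_password',  # placeholder
    'driver': 'com.mysql.cj.jdbc.Driver',
}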
Example #3
def main(argv):
    # --- TODO ---

    global video_type_code

    # Resource settings go on the builder, not on spark.conf.set (see Example #1).
    spark = SparkSession.builder \
        .appName('BARRAGE_WORD_COUNT') \
        .master('spark://{}:{}'.format(SPARK_MASTER_ADDR, SPARK_MASTER_PORT)) \
        .config('spark.driver.maxResultSize', '0') \
        .config('spark.driver.cores', '4') \
        .config('spark.driver.memory', '4g') \
        .config('spark.executor.memory', '4g') \
        .getOrCreate()
    spark.conf.set('spark.sql.execution.arrow.enabled', 'true')


    video_type_code = FuncUtils.handle_params(argv)

    filepath = FuncUtils.create_filepath(video_type_code, 'barrage_data/barrage_type')

    video_df = spark.read.format('csv').options(header='true').load(filepath)

    text_data = video_df.select('弹幕内容')
    text_rdd = text_data.rdd
    text_rdd = text_rdd.filter(lambda x: x['弹幕内容'] is not None)
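    # Segment each barrage line into words, drop stop words, then count occurrences.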
    word_data = text_rdd.flatMap(word_split)
    word_data = word_data.filter(lambda word: word not in WORD_FILTER)
    pair_data = word_data.map(lambda word: (word, 1))
    pair_data = pair_data.reduceByKey(lambda a, b: a + b)

    word_df = pair_data.map(
        lambda pair:
        Row(**create_word_entity(pair, 'barrage',  video_type_code))
    ).toDF()
    word_df.createOrReplaceTempView('WORD')
    word_df = word_df.sort('count', ascending=False)
    word_df.show()

    word_df = word_df.limit(500)
    word_df.write.jdbc('jdbc:mysql://192.168.174.133:3306/big_data',
                       'WORD_FREQ', 'append', mysql_conn_param)
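
word_split above turns one Row into a flat list of tokens for flatMap. A minimal
sketch of the barrage version, assuming jieba for Chinese word segmentation (the
project's actual helper is not shown; the title job presumably uses a variant
that reads the 'title' field instead):

import jieba

def word_split(row):
    # Hypothetical tokenizer: segment the barrage text of one Row into words.
    text = row['弹幕内容']
    return [word.strip() for word in jieba.cut(text) if word.strip()]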
Example #4
def main(argv):
    global video_type_code

    # Resource settings go on the builder, not on spark.conf.set (see Example #1).
    spark = SparkSession.builder \
        .appName('VIDEO_TITLE_WORD_COUNT') \
        .master('spark://{}:{}'.format(SPARK_MASTER_ADDR, SPARK_MASTER_PORT)) \
        .config('spark.driver.maxResultSize', '0') \
        .config('spark.driver.cores', '4') \
        .config('spark.driver.memory', '4g') \
        .config('spark.executor.memory', '4g') \
        .getOrCreate()
    spark.conf.set('spark.sql.execution.arrow.enabled', 'true')


    video_type_code = FuncUtils.handle_params(argv)

    filepath = FuncUtils.create_filepath(video_type_code, 'video_details/video_info_type')
    video_df = spark.read.format('csv').options(header='true').load(filepath)
    video_info_df = video_df.select('av号', '标题')
    video_info_df = video_info_df.withColumnRenamed('av号', 'av')
    video_info_df = video_info_df.withColumnRenamed('标题', 'title')

    # Tokenize non-null titles and drop stop words before counting word frequencies.
    title_word_data = video_info_df.rdd \
        .filter(lambda x: x['title'] is not None) \
        .flatMap(word_split)
    title_word_data = title_word_data.filter(lambda word: word not in WORD_FILTER)
    pair_data = title_word_data.map(lambda word: (word, 1))
    pair_data = pair_data.reduceByKey(lambda a, b: a + b)

    word_df = pair_data.map(
        lambda pair:
        Row(**create_word_entity(pair, 'title', video_type_code))
    ).toDF()

    word_df.show()

    word_df = word_df.sort('count', ascending=False).limit(500)

    word_df.write.jdbc('jdbc:mysql://192.168.174.133:3306/big_data',
                       'WORD_FREQ', 'append', mysql_conn_param)
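
Both word-count jobs sort on a 'count' column and append to the WORD_FREQ table,
so create_word_entity presumably emits at least the word, its count, a source tag
('barrage' or 'title') and the video type. A minimal sketch under those assumptions:

def create_word_entity(pair, source, video_type_code):
    # Hypothetical entity builder; only the 'count' field name is implied by the
    # sort('count', ...) calls above, the rest are assumptions.
    word, count = pair
    return {
        'word': word,
        'count': count,
        'source': source,
        'video_type_code': video_type_code,
    }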