# NOTE(review): this chunk is a whole job script collapsed onto one physical
# line and is TRUNCATED mid-definition -- the `try:` block is never closed and
# lineParse() ends at its first guard (`return None`), so the real parse logic
# and the except/finally handlers must live outside this view. Code left
# byte-identical; do not reformat until the full script is visible.
# What the visible part does: builds a Spark/Hive context, reads one day of the
# raw app_picserversweibof6vwt_wapvideodownload log from HDFS (day directory
# derived from sys.argv[1] via dip.util.timetool.getHDFSDayDir), and compiles a
# backtick-delimited line pattern for lineParse.
# presumably SparkConf/SparkContext/HiveContext are imported earlier in the
# file (they are not imported here) -- TODO confirm against the full source.
from dip.util import timetool import sys import random reload(sys) sys.setdefaultencoding("utf-8") import re from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType, ArrayType import json import time conf = SparkConf().setAppName( "app_picserversweibof6vwt_wapvideodownload_to_hdfs") sc = SparkContext(conf=conf) hc = HiveContext(sc) try: source = sc.textFile( "/user/hdfs/rawlog/app_picserversweibof6vwt_wapvideodownload/" + timetool.getHDFSDayDir(sys.argv[1])) pattern = re.compile("^([^`]*)`([^`]*)") def lineParse(line): matcher = pattern.match(line) if not matcher: return None
from pyspark import SparkConf from dip.spark import SparkContext from pyspark.sql import HiveContext conf = SparkConf().setAppName("spark_parse") sc = SparkContext(conf=conf) hc = HiveContext(sc) def printRows(rows): for row in rows: print row rows = sc.extract_text_to_arr("hdfs://dip.cdh5.dev:8020/user/yurun/text", "delimiter", " ", [str, int, str, str, str], lambda words: words[0] == "1").collect() printRows(rows) sc.extract_text_to_arr("hdfs://dip.cdh5.dev:8020/user/yurun/text", "regex", "(.*) (.*) (.*) (.*) (.*)", filter=lambda words: True).transform_arr(lambda words: [words[0].upper()], [int], lambda words: words[0] == 1).load_arr_to_table(hc, "temp_table", [("first", int, False)]) rows = hc.sql("select * from temp_table").collect() printRows(rows) sc.stop()
from pyspark import SparkConf from dip.spark import SparkContext conf = SparkConf().setAppName("spark_textFiles_test") sc = SparkContext(conf=conf) dirs = ["hdfs://dip.cdh5.dev:8020/user/yurun/dir1", "hdfs://dip.cdh5.dev:8020/user/yurun/dir2"] def printLines(lines): if lines: for line in lines: print line lines = sc.textFiles(dirs).collect() printLines(lines) sc.stop()