# cluster_seeds = ['199.60.17.32']  # for loading to the cluster, in any case
from pyspark.sql import SparkSession, types  # imports assumed by this excerpt

cluster_seeds = ['127.0.0.1']
spark = SparkSession.builder \
    .appName('Data going to Cassandra') \
    .config('spark.cassandra.connection.host', ','.join(cluster_seeds)) \
    .getOrCreate()
assert spark.version >= '2.4'
spark.sparkContext.setLogLevel('WARN')
sc = spark.sparkContext
spark.conf.set("spark.sql.session.timeZone", "UTC")

craigslist_schema = types.StructType([
    types.StructField('posted', types.TimestampType()),
    types.StructField('region', types.StringType()),
    types.StructField('postingid', types.StringType()),
    types.StructField('image', types.StringType()),
    types.StructField('url', types.StringType()),
    types.StructField('labels', types.ArrayType(types.StringType())),
    types.StructField('beds', types.FloatType()),
    types.StructField('baths', types.FloatType()),
    types.StructField('city', types.StringType()),
    types.StructField('latitude', types.FloatType()),
    types.StructField('longitude', types.FloatType()),
    types.StructField('title', types.StringType()),
    types.StructField('price', types.FloatType()),
])


def transform(input_json):
    # labels: convert each label to lower case and store back as a list
    input_json['labels'] = [label.lower() for label in input_json['labels']]
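# A minimal sketch (not in the original excerpt) of writing a DataFrame out
# through the spark-cassandra-connector configured above; the keyspace and
# table names are assumptions:
# df.write.format("org.apache.spark.sql.cassandra") \
#     .options(table="craigslist", keyspace="housing") \
#     .mode("append").save()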
import sys
from pyspark.sql import SparkSession, types  # imports assumed by this excerpt

spark = SparkSession.builder.getOrCreate()  # session assumed to exist above this excerpt

assert sys.version_info >= (3, 5)  # make sure we have Python 3.5+
assert spark.version >= '2.3'  # make sure we have Spark 2.3+

comments_schema = types.StructType([
    types.StructField('archived', types.BooleanType()),
    types.StructField('author', types.StringType()),
    types.StructField('author_flair_css_class', types.StringType()),
    types.StructField('author_flair_text', types.StringType()),
    types.StructField('body', types.StringType()),
    types.StructField('controversiality', types.LongType()),
    types.StructField('created_utc', types.StringType()),
    types.StructField('distinguished', types.StringType()),
    types.StructField('downs', types.LongType()),
    types.StructField('edited', types.StringType()),
    types.StructField('gilded', types.LongType()),
    types.StructField('id', types.StringType()),
    types.StructField('link_id', types.StringType()),
    types.StructField('name', types.StringType()),
    types.StructField('parent_id', types.StringType()),
    types.StructField('retrieved_on', types.LongType()),
    types.StructField('score', types.LongType()),
    types.StructField('score_hidden', types.BooleanType()),
    types.StructField('subreddit', types.StringType()),
    types.StructField('subreddit_id', types.StringType()),
    types.StructField('ups', types.LongType()),
    # types.StructField('year', types.IntegerType()),
    # types.StructField('month', types.IntegerType()),
])


def main(in_directory, out_directory):
import sys
from pyspark.sql import SparkSession, types  # imports assumed by this excerpt

spark = SparkSession.builder.getOrCreate()  # session assumed to exist above this excerpt

assert sys.version_info >= (3, 4)  # make sure we have Python 3.4+
assert spark.version >= '2.1'  # make sure we have Spark 2.1+

schema = types.StructType([
    # commented-out fields won't be read
    # types.StructField('archived', types.BooleanType(), False),
    # types.StructField('author', types.StringType(), False),
    # types.StructField('author_flair_css_class', types.StringType(), False),
    # types.StructField('author_flair_text', types.StringType(), False),
    # types.StructField('body', types.StringType(), False),
    # types.StructField('controversiality', types.LongType(), False),
    # types.StructField('created_utc', types.StringType(), False),
    # types.StructField('distinguished', types.StringType(), False),
    # types.StructField('downs', types.LongType(), False),
    # types.StructField('edited', types.StringType(), False),
    # types.StructField('gilded', types.LongType(), False),
    # types.StructField('id', types.StringType(), False),
    # types.StructField('link_id', types.StringType(), False),
    # types.StructField('name', types.StringType(), False),
    # types.StructField('parent_id', types.StringType(), True),
    # types.StructField('retrieved_on', types.LongType(), False),
    types.StructField('score', types.LongType(), False),
    # types.StructField('score_hidden', types.BooleanType(), False),
    types.StructField('subreddit', types.StringType(), False),
    # types.StructField('subreddit_id', types.StringType(), False),
    # types.StructField('ups', types.LongType(), False),
])


def main(in_directory, out_directory):
    comments = spark.read.json(in_directory, schema=schema)
import pyspark.sql.functions as psf
import pyspark.sql.types as pst

from streaming.spark import get_spark_context, enable_auto_compact, DELTA_FORMAT

schema = (
    pst.StructType()
    .add("match_id", pst.IntegerType())
    .add("price", pst.DoubleType())
    .add("ts", pst.TimestampType())
    .add("score", pst.ArrayType(pst.IntegerType()))
)

spark = get_spark_context("consumePrices")
# enable_auto_compact(spark)

(
    spark.readStream.format("kafka")
    .option("kafka.bootstrap.servers", "localhost:9092")
    .option("subscribe", "prices")
    .option("startingOffsets", "earliest")
    .load()
    .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
    .withColumn("value", psf.from_json("value", schema))
    .selectExpr(
        "value.match_id AS match_id",
        "value.price AS price",
        "value.ts AS ts",
        "value.score[0] AS home_score",
        "value.score[1] AS away_score",
    )
    .withColumn("exec_date", psf.to_date("ts"))
    .writeStream.format("delta")
    .outputMode("append")
    .option("checkpointLocation", "./_checkpoints/streaming")
    # .option("mergeSchema", "true")
    .partitionBy("exec_date")
    # .trigger(processingTime='5 minute')
)
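# Note (not in the original excerpt): the chained query above is built but
# never started. A minimal sketch of the missing step, assuming the chain is
# bound to a variable `writer` and a Delta path "./delta/prices" (both names
# are assumptions):
# query = writer.start("./delta/prices")
# query.awaitTermination()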
# MAGIC Print the contents of `README.txt`.

# COMMAND ----------

print(dbutils.fs.head('dbfs:/movielens/README.txt'))

# COMMAND ----------

# MAGIC %md
# MAGIC Load movies from `movies.csv`.

# COMMAND ----------

MovieType = T.StructType([
    T.StructField('movieId', T.IntegerType()),
    T.StructField('title', T.StringType()),
    T.StructField('genres', T.StringType()),
])

movies = (spark.read.option('header', True)
          .csv(movielensLocation + 'movies.csv', schema=MovieType))

# COMMAND ----------

display(movies)

# COMMAND ----------

# MAGIC %md
# MAGIC Set `genres` to missing when equal to '(no genres listed)', otherwise split into a string array.
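# COMMAND ----------

# A hedged sketch of the cell described above (the original code is cut off
# here); it assumes `from pyspark.sql import functions as F`:
# movies = movies.withColumn(
#     'genres',
#     F.when(movies['genres'] == '(no genres listed)', None)
#      .otherwise(F.split(movies['genres'], r'\|')))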
from pyspark.sql import SparkSession, functions, types
from io import *
import csv
import pandas as pd
from urllib.request import *
import getCodeSets as codesets

spark = SparkSession.builder.appName('Load Mortgage Data').getOrCreate()

mortage_schema = types.StructType([
    types.StructField('date', types.StringType(), True),
    types.StructField('1y_fixed_posted', types.StringType(), True),
    types.StructField('2y_bond', types.StringType(), True),
    types.StructField('3y_bond', types.StringType(), True),
    types.StructField('3y_fixed_posted', types.StringType(), True),
    types.StructField('5y_bond', types.StringType(), True),
    types.StructField('5y_fixed_posted', types.StringType(), True),
    types.StructField('7y_bond', types.StringType(), True),
    types.StructField('10y_bond', types.StringType(), True),
    types.StructField('bank', types.StringType(), True),
    types.StructField('overnight', types.StringType(), True),
    types.StructField('overnight_target', types.StringType(), True),
    types.StructField('prime', types.StringType(), True),
])


def loadMortageInfo():
    # read the CSV, then register it as a temp view; createOrReplaceTempView
    # returns None, so keep the DataFrame in its own variable
    mortage = spark.read.csv("Other_sources/mortgage rate since 1935.csv",
                             schema=mortage_schema)
    mortage.createOrReplaceTempView("mortage")
    transf_year_month = spark.sql(
        "SELECT *, substr(m.date, 1, instr(m.date, '-') + 2) AS year_month "
        "FROM mortage m")
import collections
import random

from pyspark.sql.window import Window
from pyspark.sql import Row
from pyspark.sql import types as DataTypes  # alias assumed by this excerpt
from pyspark.sql.functions import pandas_udf, PandasUDFType
import pandas as pd
import numpy as np
from numba import vectorize, jit, njit, prange, cuda
from numba import float64 as numba_float64

DataPoint = collections.namedtuple(
    "DataPoint", ["id", "grp", "subgrp", "A", "B", "C", "D", "E", "F"])

DataPointSchema = DataTypes.StructType([
    DataTypes.StructField('id', DataTypes.LongType(), False),
    DataTypes.StructField('grp', DataTypes.LongType(), False),
    DataTypes.StructField('subgrp', DataTypes.LongType(), False),
    DataTypes.StructField('A', DataTypes.LongType(), False),
    DataTypes.StructField('B', DataTypes.LongType(), False),
    DataTypes.StructField('C', DataTypes.DoubleType(), False),
    DataTypes.StructField('D', DataTypes.DoubleType(), False),
    DataTypes.StructField('E', DataTypes.DoubleType(), False),
    DataTypes.StructField('F', DataTypes.DoubleType(), False)
])


def generateData(numGrp1=3, numGrp2=3, repetition=1000):
    return [
        DataPoint(id=i,
                  grp=(i // numGrp2) % numGrp1,
                  subgrp=i % numGrp2,
                  A=random.randint(1, repetition),
                  B=random.randint(1, repetition),
                  C=random.uniform(1, 10),
import sys
assert sys.version_info >= (3, 5)  # make sure we have Python 3.5+

import psycopg2
from pyspark.sql import SparkSession, functions as sf, types

spark = SparkSession.builder.appName('Cycle Data Load').config(
    'spark.driver.extraClassPath', 'postgresql-42.2.8.jar').getOrCreate()
spark.sparkContext.setLogLevel('WARN')
assert spark.version >= '2.4'  # make sure we have Spark 2.4+

cycles = types.StructType([
    types.StructField('Rental Id', types.IntegerType()),
    types.StructField('Duration', types.IntegerType()),
    types.StructField('Bike Id', types.IntegerType()),
    types.StructField('End Date', types.StringType()),
    types.StructField('EndStation Id', types.IntegerType()),
    types.StructField('EndStation Name', types.StringType()),
    types.StructField('Start Date', types.StringType()),
    types.StructField('StartStation Id', types.IntegerType()),
    types.StructField('StartStation Name', types.StringType()),
])

remfiles2019 = spark.read.option("header", "true") \
    .schema(cycles).csv("data/cycling/rem2019/*.csv")
allfiles2016 = spark.read.option("header", "true") \
    .schema(cycles).csv("data/cycling/2016TripDataZip/*.csv")
allfiles2015 = spark.read.option("header", "true") \
    .schema(cycles).csv("data/cycling/2015TripDataZip/*.csv")
allfiles2014 = spark.read.option("header", "true") \
    .schema(cycles).csv("data/cycling/cyclehireusagestats-2014/*.csv")
allfiles2013 = spark.read.option(
import sys
from pyspark.sql import SparkSession, functions, types
# from pyspark.sql.functions import col

spark = SparkSession.builder.appName('wikipedia popular').getOrCreate()
spark.sparkContext.setLogLevel('WARN')
assert sys.version_info >= (3, 5)  # make sure we have Python 3.5+
assert spark.version >= '2.3'  # make sure we have Spark 2.3+

schema1 = types.StructType([
    types.StructField('lang', types.StringType()),
    types.StructField('content', types.StringType()),
    types.StructField('times', types.IntegerType()),
    types.StructField('bytes', types.IntegerType()),
])


def input_file_name(in_directory):
    return spark.read.csv(in_directory, schema=schema1, sep=' ') \
        .withColumn('filename', functions.input_file_name())


# udf: user-defined function
# the slice [11:22] pulls the YYYYMMDD-HH date-hour out of a
# pagecounts-YYYYMMDD-HHMMSS filename
def path_to_hour(path):
    filename = path.split('/')[-1]
    return filename[11:22]
from pyspark import SparkContext
from pyspark.sql import SparkSession, types

sparkSess = SparkSession.builder.appName('badges').getOrCreate()
sc = sparkSess.sparkContext

bdschema = types.StructType([
    types.StructField('id', types.IntegerType()),
    types.StructField('name', types.StringType()),
    types.StructField('date', types.StringType()),
    types.StructField('user_id', types.IntegerType()),
    types.StructField('class', types.IntegerType()),
    types.StructField('tag_based', types.BooleanType())
])

sbad = sparkSess.read.format("s3selectCSV").schema(bdschema).options(
    header="true").load("s3://bigdata-4/badges.csv").select("id", "name")
sbad.write.mode("append").parquet("s3://bigdata-4/badges/")
def __init__(self, tpe):
    # It seems we cannot specify field names here, so default names
    # `c0, c1, ..., cn` are generated instead.
    self.tpe = types.StructType([
        types.StructField("c%s" % i, tpe[i]) for i in range(len(tpe))
    ])  # type: types.StructType
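# A hedged usage sketch (the class name and values here are assumptions, not
# from the original): a list of Spark types yields auto-named fields c0, c1, ...
# holder = SomeTypeHolder([types.LongType(), types.StringType()])
# print(holder.tpe)
# # StructType([StructField('c0', LongType(), True), StructField('c1', StringType(), True)])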
def get_dataset(
    dataset_type,
    data,
    schemas=None,
    profiler=ColumnsExistProfiler,
    caching=True,
    table_name=None,
    sqlite_db_path=None,
):
    """Utility to create datasets for json-formatted tests."""
    df = pd.DataFrame(data)
    if dataset_type == "PandasDataset":
        if schemas and "pandas" in schemas:
            schema = schemas["pandas"]
            pandas_schema = {}
            for (key, value) in schema.items():
                # Note, these are just names used in our internal schemas to build datasets *for internal tests*
                # Further, some changes in pandas internals about how datetimes are created mean that, to support
                # pandas pre-0.25, we need to explicitly specify when we want timezone.
                # We will use timestamp for timezone-aware (UTC only) dates in our tests
                if value.lower() in ["timestamp", "datetime64[ns, tz]"]:
                    df[key] = pd.to_datetime(df[key], utc=True)
                    continue
                elif value.lower() in ["datetime", "datetime64", "datetime64[ns]"]:
                    df[key] = pd.to_datetime(df[key])
                    continue
                try:
                    type_ = np.dtype(value)
                except TypeError:
                    type_ = getattr(pd.core.dtypes.dtypes, value)
                    # If this raises AttributeError it's okay: it means someone built a bad test
                pandas_schema[key] = type_
            # pandas_schema = {key: np.dtype(value) for (key, value) in schemas["pandas"].items()}
            df = df.astype(pandas_schema)
        return PandasDataset(df, profiler=profiler, caching=caching)

    elif dataset_type == "sqlite":
        if not create_engine:
            return None
        if sqlite_db_path is not None:
            engine = create_engine(f"sqlite:////{sqlite_db_path}")
        else:
            engine = create_engine("sqlite://")
        conn = engine.connect()
        # Add the data to the database as a new table
        sql_dtypes = {}
        if (schemas and "sqlite" in schemas
                and isinstance(engine.dialect, sqlitetypes.dialect)):
            schema = schemas["sqlite"]
            sql_dtypes = {
                col: SQLITE_TYPES[dtype] for (col, dtype) in schema.items()
            }
            for col in schema:
                type_ = schema[col]
                if type_ in ["INTEGER", "SMALLINT", "BIGINT"]:
                    df[col] = pd.to_numeric(df[col], downcast="signed")
                elif type_ in ["FLOAT", "DOUBLE", "DOUBLE_PRECISION"]:
                    df[col] = pd.to_numeric(df[col])
                    min_value_dbms = get_sql_dialect_floating_point_infinity_value(
                        schema=dataset_type, negative=True)
                    max_value_dbms = get_sql_dialect_floating_point_infinity_value(
                        schema=dataset_type, negative=False)
                    for api_schema_type in ["api_np", "api_cast"]:
                        min_value_api = get_sql_dialect_floating_point_infinity_value(
                            schema=api_schema_type, negative=True)
                        max_value_api = get_sql_dialect_floating_point_infinity_value(
                            schema=api_schema_type, negative=False)
                        df.replace(
                            to_replace=[min_value_api, max_value_api],
                            value=[min_value_dbms, max_value_dbms],
                            inplace=True,
                        )
                elif type_ in ["DATETIME", "TIMESTAMP"]:
                    df[col] = pd.to_datetime(df[col])
        if table_name is None:
            table_name = "test_data_" + "".join([
                random.choice(string.ascii_letters + string.digits)
                for _ in range(8)
            ])
        df.to_sql(
            name=table_name,
            con=conn,
            index=False,
            dtype=sql_dtypes,
            if_exists="replace",
        )
        # Build a SqlAlchemyDataset using that database
        return SqlAlchemyDataset(
            table_name, engine=conn, profiler=profiler, caching=caching)

    elif dataset_type == "postgresql":
        if not create_engine:
            return None
        # Create a new database
        engine = create_engine("postgresql://postgres@localhost/test_ci")
        conn = engine.connect()
        sql_dtypes = {}
        if (schemas and "postgresql" in schemas
                and isinstance(engine.dialect, postgresqltypes.dialect)):
            schema = schemas["postgresql"]
            sql_dtypes = {
                col: POSTGRESQL_TYPES[dtype] for (col, dtype) in schema.items()
            }
            for col in schema:
                type_ = schema[col]
                if type_ in ["INTEGER", "SMALLINT", "BIGINT"]:
                    df[col] = pd.to_numeric(df[col], downcast="signed")
                elif type_ in ["FLOAT", "DOUBLE", "DOUBLE_PRECISION"]:
                    df[col] = pd.to_numeric(df[col])
                    min_value_dbms = get_sql_dialect_floating_point_infinity_value(
                        schema=dataset_type, negative=True)
                    max_value_dbms = get_sql_dialect_floating_point_infinity_value(
                        schema=dataset_type, negative=False)
                    for api_schema_type in ["api_np", "api_cast"]:
                        min_value_api = get_sql_dialect_floating_point_infinity_value(
                            schema=api_schema_type, negative=True)
                        max_value_api = get_sql_dialect_floating_point_infinity_value(
                            schema=api_schema_type, negative=False)
                        df.replace(
                            to_replace=[min_value_api, max_value_api],
                            value=[min_value_dbms, max_value_dbms],
                            inplace=True,
                        )
                elif type_ in ["DATETIME", "TIMESTAMP"]:
                    df[col] = pd.to_datetime(df[col])
        if table_name is None:
            table_name = "test_data_" + "".join([
                random.choice(string.ascii_letters + string.digits)
                for _ in range(8)
            ])
        df.to_sql(
            name=table_name,
            con=conn,
            index=False,
            dtype=sql_dtypes,
            if_exists="replace",
        )
        # Build a SqlAlchemyDataset using that database
        return SqlAlchemyDataset(
            table_name, engine=conn, profiler=profiler, caching=caching)

    elif dataset_type == "mysql":
        if not create_engine:
            return None
        engine = create_engine("mysql+pymysql://root@localhost/test_ci")
        conn = engine.connect()
        sql_dtypes = {}
        if (schemas and "mysql" in schemas
                and isinstance(engine.dialect, mysqltypes.dialect)):
            schema = schemas["mysql"]
            sql_dtypes = {
                col: MYSQL_TYPES[dtype] for (col, dtype) in schema.items()
            }
            for col in schema:
                type_ = schema[col]
                if type_ in ["INTEGER", "SMALLINT", "BIGINT"]:
                    df[col] = pd.to_numeric(df[col], downcast="signed")
                elif type_ in ["FLOAT", "DOUBLE", "DOUBLE_PRECISION"]:
                    df[col] = pd.to_numeric(df[col])
                    min_value_dbms = get_sql_dialect_floating_point_infinity_value(
                        schema=dataset_type, negative=True)
                    max_value_dbms = get_sql_dialect_floating_point_infinity_value(
                        schema=dataset_type, negative=False)
                    for api_schema_type in ["api_np", "api_cast"]:
                        min_value_api = get_sql_dialect_floating_point_infinity_value(
                            schema=api_schema_type, negative=True)
                        max_value_api = get_sql_dialect_floating_point_infinity_value(
                            schema=api_schema_type, negative=False)
                        df.replace(
                            to_replace=[min_value_api, max_value_api],
                            value=[min_value_dbms, max_value_dbms],
                            inplace=True,
                        )
                elif type_ in ["DATETIME", "TIMESTAMP"]:
                    df[col] = pd.to_datetime(df[col])
        if table_name is None:
            table_name = "test_data_" + "".join([
                random.choice(string.ascii_letters + string.digits)
                for _ in range(8)
            ])
        df.to_sql(
            name=table_name,
            con=conn,
            index=False,
            dtype=sql_dtypes,
            if_exists="replace",
        )
        # Build a SqlAlchemyDataset using that database
        return SqlAlchemyDataset(
            table_name, engine=conn, profiler=profiler, caching=caching)

    elif dataset_type == "mssql":
        if not create_engine:
            return None
        engine = create_engine(
            "mssql+pyodbc://sa:ReallyStrongPwd1234%^&*@localhost:1433/test_ci?driver=ODBC Driver 17 for SQL Server&charset=utf8&autocommit=true",
            # echo=True,
        )
        # If "autocommit" is not desired to be on by default, then use the following pattern when explicit "autocommit"
        # is desired (e.g., for temporary tables, "autocommit" is off by default, so the override option may be useful).
        # engine.execute(sa.text(sql_query_string).execution_options(autocommit=True))
        conn = engine.connect()
        sql_dtypes = {}
        if (schemas and dataset_type in schemas
                and isinstance(engine.dialect, mssqltypes.dialect)):
            schema = schemas[dataset_type]
            sql_dtypes = {
                col: MSSQL_TYPES[dtype] for (col, dtype) in schema.items()
            }
            for col in schema:
                type_ = schema[col]
                if type_ in ["INTEGER", "SMALLINT", "BIGINT"]:
                    df[col] = pd.to_numeric(df[col], downcast="signed")
                elif type_ in ["FLOAT"]:
                    df[col] = pd.to_numeric(df[col])
                    min_value_dbms = get_sql_dialect_floating_point_infinity_value(
                        schema=dataset_type, negative=True)
                    max_value_dbms = get_sql_dialect_floating_point_infinity_value(
                        schema=dataset_type, negative=False)
                    for api_schema_type in ["api_np", "api_cast"]:
                        min_value_api = get_sql_dialect_floating_point_infinity_value(
                            schema=api_schema_type, negative=True)
                        max_value_api = get_sql_dialect_floating_point_infinity_value(
                            schema=api_schema_type, negative=False)
                        df.replace(
                            to_replace=[min_value_api, max_value_api],
                            value=[min_value_dbms, max_value_dbms],
                            inplace=True,
                        )
                elif type_ in ["DATETIME", "TIMESTAMP"]:
                    df[col] = pd.to_datetime(df[col])
        if table_name is None:
            table_name = "test_data_" + "".join([
                random.choice(string.ascii_letters + string.digits)
                for _ in range(8)
            ])
        df.to_sql(
            name=table_name,
            con=conn,
            index=False,
            dtype=sql_dtypes,
            if_exists="replace",
        )
        # Build a SqlAlchemyDataset using that database
        return SqlAlchemyDataset(
            table_name, engine=conn, profiler=profiler, caching=caching)

    elif dataset_type == "SparkDFDataset":
        from pyspark.sql import SparkSession
        import pyspark.sql.types as sparktypes

        SPARK_TYPES = {
            "StringType": sparktypes.StringType,
            "IntegerType": sparktypes.IntegerType,
            "LongType": sparktypes.LongType,
            "DateType": sparktypes.DateType,
            "TimestampType": sparktypes.TimestampType,
            "FloatType": sparktypes.FloatType,
            "DoubleType": sparktypes.DoubleType,
            "BooleanType": sparktypes.BooleanType,
            "DataType": sparktypes.DataType,
            "NullType": sparktypes.NullType,
        }
        spark = SparkSession.builder.getOrCreate()
        # We need to allow null values in some column types that do not support them natively, so we skip
        # use of df in this case.
        data_reshaped = list(
            zip(*[v for _, v in data.items()]))  # create a list of rows
        if schemas and "spark" in schemas:
            schema = schemas["spark"]
            # sometimes the first method causes Spark to throw a TypeError
            try:
                spark_schema = sparktypes.StructType([
                    sparktypes.StructField(
                        column, SPARK_TYPES[schema[column]](), True)
                    for column in schema
                ])
                # We create these every time, which is painful for testing
                # However nuance around null treatment as well as the desire
                # for real datetime support in tests makes this necessary
                data = copy.deepcopy(data)
                if "ts" in data:
                    print(data)
                    print(schema)
                for col in schema:
                    type_ = schema[col]
                    if type_ in ["IntegerType", "LongType"]:
                        # Ints cannot be None...but None can be valid in Spark (as Null)
                        vals = []
                        for val in data[col]:
                            if val is None:
                                vals.append(val)
                            else:
                                vals.append(int(val))
                        data[col] = vals
                    elif type_ in ["FloatType", "DoubleType"]:
                        vals = []
                        for val in data[col]:
                            if val is None:
                                vals.append(val)
                            else:
                                vals.append(float(val))
                        data[col] = vals
                    elif type_ in ["DateType", "TimestampType"]:
                        vals = []
                        for val in data[col]:
                            if val is None:
                                vals.append(val)
                            else:
                                vals.append(parse(val))
                        data[col] = vals
                # Do this again, now that we have done type conversion using the provided schema
                data_reshaped = list(
                    zip(*[v for _, v in data.items()]))  # create a list of rows
                spark_df = spark.createDataFrame(data_reshaped, schema=spark_schema)
            except TypeError:
                string_schema = sparktypes.StructType([
                    sparktypes.StructField(column, sparktypes.StringType())
                    for column in schema
                ])
                spark_df = spark.createDataFrame(data_reshaped, string_schema)
                for c in spark_df.columns:
                    spark_df = spark_df.withColumn(
                        c, spark_df[c].cast(SPARK_TYPES[schema[c]]()))
        elif len(data_reshaped) == 0:
            # if we have an empty dataset and no schema, need to assign an arbitrary type
            columns = list(data.keys())
            spark_schema = sparktypes.StructType([
                sparktypes.StructField(column, sparktypes.StringType())
                for column in columns
            ])
            spark_df = spark.createDataFrame(data_reshaped, spark_schema)
        else:
            # if no schema provided, uses Spark's schema inference
            columns = list(data.keys())
            spark_df = spark.createDataFrame(data_reshaped, columns)
        return SparkDFDataset(spark_df, profiler=profiler, caching=caching)

    else:
        raise ValueError("Unknown dataset_type " + str(dataset_type))
pathFile = "D:/douyinData/train_1w.txt" # pathFile="D:/douyinData/final_track2_train.txt" rawRdd_train = sc.textFile(pathFile).map(lambda line: line.split('\t')) print('finish read rdd, start to init action log rdd:') actionLogRdd_train = rawRdd_train.map(lambda x: (int(x[0]), int(x[1]), int(x[ 2]), int(x[3]), int(x[4]), int(x[5]), int(x[6]), int(x[7]), int(x[ 8]), int(x[9]), int(x[10]), int(x[11]))) sqlContext = SQLContext(sc) labels = [('uid', typ.IntegerType()), ('user_city', typ.IntegerType()), ('item_id', typ.IntegerType()), ('author_id', typ.IntegerType()), ('item_city', typ.IntegerType()), ('channel', typ.IntegerType()), ('finish', typ.IntegerType()), ('like', typ.IntegerType()), ('music_id', typ.IntegerType()), ('device', typ.IntegerType()), ('time', typ.LongType()), ('duration_time', typ.IntegerType())] actionLogSchema = typ.StructType( [typ.StructField(e[0], e[1], True) for e in labels]) df_train = sqlContext.createDataFrame(actionLogRdd_train, actionLogSchema) feature_group = ['uid', 'author_id'] df_tmp = df_train.select(feature_group) df1 = df_tmp.groupby(feature_group).count() df1.show(5) df2 = df_tmp.groupby(feature_group[0]).count().withColumnRenamed( 'count', feature_group[0] + '_count') df2.show(5) df1 = df1.join(df2, feature_group[0], 'left') df1.show(5) df1 = df1.withColumn( feature_group[1] + '_' + feature_group[0] + "_condition_ratio",
def spark_type(self) -> types.StructType:
    return types.StructType([field.struct_field for field in self.fields])
from pyspark.sql import SparkSession, types as tp  # imports assumed by this excerpt

# builder options are truncated in the original excerpt
spark_session = SparkSession.builder \
    .getOrCreate()

spark_session.sparkContext.addFile('parse_tool.py')
from parse_tool import parse_logs, parse_geoinfo

# User logs collection
user_logs = spark_session.sparkContext.textFile("/data/access_logs/big_log/")
parsed_logs = user_logs.map(parse_logs) \
    .map(lambda parse_res: [parse_res[0], parse_res[0] + parse_res[7]])
schema = tp.StructType().add("ip", tp.StringType()) \
    .add("user_id", tp.StringType())
user_log_df = spark_session.createDataFrame(parsed_logs, schema)

# Geo info collection
geoip = spark_session.sparkContext.textFile("/data/access_logs/geoiplookup/")
parsed_geoip = geoip.map(parse_geoinfo) \
    .map(lambda parse_res: [parse_res[0], parse_res[1]])
schema = tp.StructType().add("ip", tp.StringType()) \
    .add("location", tp.StringType())
import math
import re
import sys
assert sys.version_info >= (3, 5)  # make sure we have Python 3.5+

from pyspark.sql import SparkSession, functions, types, Row

spark = SparkSession.builder.appName('example').getOrCreate()
sc = spark.sparkContext

log_schema = types.StructType([
    types.StructField('hostname', types.StringType(), False),
    types.StructField('num_bytes', types.IntegerType(), False),
])

# compile the pattern once at module level instead of on every call
line_dissemble = re.compile(
    r'^(\S+) - - \[(\S+) [+-]\d+\] \"[A-Z]+ (\S+) HTTP/\d\.\d\" \d+ (\d+)$')


def get_row(line):
    # reuse a single match instead of matching the line twice
    m = line_dissemble.match(line)
    if m:
        host = m.group(1)
        num_bytes = int(m.group(4))
        return Row(host, num_bytes)
    return None


def create_row_rdd(in_directory):
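# A hedged sketch (not from the original file) of how get_row might feed the
# schema above: parse each line, drop non-matches, build a DataFrame.
# lines = sc.textFile(in_directory)
# rows = lines.map(get_row).filter(lambda r: r is not None)
# logs = spark.createDataFrame(rows, schema=log_schema)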
storage_account_name = "" storage_account_access_key = "" container = "" # Set configuration to allow acces to the blob storage inside the storage account file_location = f"wasbs://{container}@{storage_account_name}.blob.core.windows.net/" file_type = "csv" spark.conf.set( "fs.azure.account.key."+storage_account_name+".blob.core.windows.net", storage_account_access_key) # Define schema and retrieve data from the blob storage schema = t.StructType() \ .add("time", t.StringType(), True) \ .add("open", t.DoubleType(), True) \ .add("close", t.DoubleType(), True) \ .add("high", t.DoubleType(), True) \ .add("low", t.DoubleType(), True) \ .add("volume", t.DoubleType(), True) \ .add("input_file", t.StringType(), True) df = spark.read.format(file_type).options(header="true",inferSchema="true").schema(schema).load(file_location).withColumn("input_file", input_file_name()) # Get and split file name to create a column with the coin pair corresponding for each row split_col = split(df['input_file'], '/') df = df.withColumn('coin_pair', split(split_col.getItem(3),'\.').getItem(0)) df = df.drop("input_file") # We have a timestamp and we want a date df = df.withColumn('Date', from_unixtime((col('time')/1000))) # Agregate data to have a daily result, ready to insert into the database df = df.groupBy("coin_pair",window("Date","1 day")) \
import sys
from pyspark.sql import SparkSession, functions, types

spark = SparkSession.builder.appName('first Spark app').getOrCreate()
spark.sparkContext.setLogLevel('WARN')
assert sys.version_info >= (3, 5)  # make sure we have Python 3.5+
assert spark.version >= '2.3'  # make sure we have Spark 2.3+

schema = types.StructType([
    types.StructField('id', types.IntegerType()),
    types.StructField('x', types.FloatType()),
    types.StructField('y', types.FloatType()),
    types.StructField('z', types.FloatType()),
])


def main(in_directory, out_directory):
    # Read the data from the JSON files
    xyz = spark.read.json(in_directory, schema=schema)
    # xyz.show(); return

    # Create a DF with what we need: x, (soon y,) and id%10 which we'll aggregate by.
    with_bins = xyz.select(
        xyz['x'],
        # TODO: also the y values
        xyz['y'],
        (xyz['id'] % 10).alias('bin'),
    )
    # with_bins.show(); return
from pyspark.sql import SQLContext
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import types as sparksqltypes  # import assumed by this excerpt
import pandas as pd

sc = SparkContext.getOrCreate()
spark = SparkSession.builder.getOrCreate()
sqlContext = SQLContext(sc)

schema = sparksqltypes.StructType([
    sparksqltypes.StructField("PassengerId", sparksqltypes.DoubleType(), True),
    sparksqltypes.StructField("Survived", sparksqltypes.DoubleType(), True),
    sparksqltypes.StructField("Pclass", sparksqltypes.DoubleType(), True),
    sparksqltypes.StructField("Name", sparksqltypes.StringType(), True),
    sparksqltypes.StructField("Sex", sparksqltypes.StringType(), True),
    sparksqltypes.StructField("Age", sparksqltypes.DoubleType(), True),
    sparksqltypes.StructField("SibSp", sparksqltypes.DoubleType(), True),
    sparksqltypes.StructField("Parch", sparksqltypes.DoubleType(), True),
    sparksqltypes.StructField("Ticket", sparksqltypes.StringType(), True),
    sparksqltypes.StructField("Fare", sparksqltypes.DoubleType(), True),
    sparksqltypes.StructField("Cabin", sparksqltypes.StringType(), True),
    sparksqltypes.StructField("Embarked", sparksqltypes.StringType(), True),
])

titanic = spark.read.csv(
    'file:///C:/Users/Thibaut/Documents/ML/titanic_pyspark/titanic.csv',
    schema, header=True)

# ----------------------------------------------------------------------------


def my_compute_function(titanic):
    # first step: feature engineering
    titanic = feature_engineering(titanic)
log_data_path = os.path.join(config["DATA"]["input_path"], "log_data/*.json")
output_path = config["DATA"]["output_path"]
tables = ["songs", "artists", "users", "time", "songplays"]

# Schemas for log_data and song_data
schema = {
    "log_data": T.StructType()
        .add("artist", T.StringType())
        .add("auth", T.StringType())
        .add("firstName", T.StringType())
        .add("gender", T.StringType())
        .add("itemInSession", T.IntegerType())
        .add("lastName", T.StringType())
        .add("length", T.FloatType())
        .add("level", T.StringType())
        .add("location", T.StringType())
        .add("method", T.StringType())
        .add("page", T.StringType())
        .add("registration", T.FloatType())
        .add("sessionId", T.IntegerType())
        .add("song", T.StringType())
        .add("status", T.IntegerType())
        .add("ts", T.StringType())
        .add("userAgent", T.StringType())
        .add("userId", T.StringType()),
    "song_data": T.StructType()
        .add("artist_id", T.StringType())
        .add("artist_latitude", T.FloatType())
        .add("artist_location", T.StringType())
        .add("artist_longitude", T.FloatType())
        .add("artist_name", T.StringType())
from pyspark.sql import SparkSession, types

spark = SparkSession.builder.appName('Train Data Analysis').config(
    'spark.driver.extraClassPath', 'postgresql-42.2.8.jar').getOrCreate()
assert spark.version >= '2.4'  # make sure we have Spark 2.4+
spark.sparkContext.setLogLevel('WARN')
sc = spark.sparkContext

schema_counts = types.StructType([
    types.StructField('locationcode', types.IntegerType()),
    types.StructField('station', types.StringType()),
    types.StructField('borough', types.StringType()),
    types.StructField('note', types.StringType()),
    types.StructField('entryweekday', types.LongType()),
    types.StructField('entrysaturday', types.LongType()),
    types.StructField('entrysunday', types.LongType()),
    types.StructField('exitweekday', types.LongType()),
    types.StructField('exitsaturday', types.LongType()),
    types.StructField('exitsunday', types.LongType()),
    types.StructField('entryexitinmillion', types.DoubleType()),
])


def main():
    counts()


def counts():
    file_counts_17 = '/home/anuj/Desktop/732project/data/counts/2017entryexit.csv'
    data_counts_17 = spark.read.csv(file_counts_17,
import sys
import re
import numpy as np
from pyspark.sql import SparkSession, functions, types, Row

spark = SparkSession.builder.appName('read txt').getOrCreate()
assert sys.version_info >= (3, 4)  # make sure we have Python 3.4+
assert spark.version >= '2.1'  # make sure we have Spark 2.1+

schema = types.StructType([
    # commented-out fields won't be read
    types.StructField('r', types.IntegerType(), False),
    types.StructField('g', types.IntegerType(), False),
    types.StructField('b', types.IntegerType(), False),
])


def some_function(path):
    return path[0:11]


def split_func(string):
    return [int(x) for x in string.split(",")]


path_to_hour = functions.udf(some_function, returnType=types.StringType())


def main(in_directory, out_directory):
    ###
    sc = spark.sparkContext
    ('INFANT_ASSIST_VENTI_6HRS', typ.StringType()),
    ('INFANT_NICU_ADMISSION', typ.StringType()),
    ('INFANT_SURFACANT', typ.StringType()),
    ('INFANT_ANTIBIOTICS', typ.StringType()),
    ('INFANT_SEIZURES', typ.StringType()),
    ('INFANT_NO_ABNORMALITIES', typ.StringType()),
    ('INFANT_ANCEPHALY', typ.StringType()),
    ('INFANT_MENINGOMYELOCELE', typ.StringType()),
    ('INFANT_LIMB_REDUCTION', typ.StringType()),
    ('INFANT_DOWN_SYNDROME', typ.StringType()),
    ('INFANT_SUSPECTED_CHROMOSOMAL_DISORDER', typ.StringType()),
    ('INFANT_NO_CONGENITAL_ANOMALIES_CHECKED', typ.StringType()),
    ('INFANT_BREASTFED', typ.StringType())
]

schema = typ.StructType([typ.StructField(e[0], e[1], False) for e in labels])

# spark = SparkSession.builder.config('spark.debug.maxToStringFields', '100').config('spark.io.compression.codec', 'snappy').appName("test").getOrCreate()
# births = spark.read.csv(datafile, header=True, schema=schema, encoding='ISO-8859-1')  # schema bound to the CSV
spark = SparkSession.builder.config(
    'spark.debug.maxToStringFields', '100').config(
    'spark.io.compression.codec', 'snappy').appName("test").getOrCreate()

# read method
# births = spark.read.format("csv") \
#     .option("header", "true") \
#     .load("births_train.csv")
births = spark.read.csv(datafile, header=True, schema=schema)
# COMMAND ----------

nrmDssSchema = types.StructType([
    types.StructField("country", types.StringType()),
    types.StructField("calendar_year", types.StringType()),
    types.StructField("calendar_month", types.StringType()),
    types.StructField("calendar_day", types.StringType()),
    types.StructField("distributor", types.StringType()),
    types.StructField("site", types.StringType()),
    types.StructField("outlet", types.StringType()),
    types.StructField("billing_document", types.StringType()),
    types.StructField("billing_type", types.StringType()),
    types.StructField("billing_item", types.StringType()),
    types.StructField("product", types.StringType()),
    types.StructField("promotion_id", types.StringType()),
    types.StructField("promotion_desc1", types.StringType()),
    types.StructField("promo_start_date", types.StringType()),
    types.StructField("promo_end_date", types.StringType()),
    types.StructField("promotion_type", types.StringType()),
    types.StructField("value_based_promo_disc", types.DoubleType()),
    types.StructField("header_lvl_disc", types.DoubleType()),
    types.StructField("free_qty_in_cs", types.DoubleType()),
    types.StructField("free_qty_in_pc", types.DoubleType()),
    types.StructField("free_qty_val_in_cs", types.DoubleType()),
    types.StructField("free_qty_val_in_pc", types.DoubleType()),
    types.StructField("free_qty_retail_price_pc", types.DoubleType()),
    types.StructField("free_qty_retail_price_cs", types.DoubleType())
])

nrmRawDTRDISDF = spark.createDataFrame(nrmRawDTRDIS, schema=nrmDssSchema)
nrmRawDTRDISDF.createOrReplaceTempView("raw_nrm_data_dtrdis")
import sys
import re
from pyspark.sql import SparkSession, functions, types

spark = SparkSession.builder.appName('reddit averages').getOrCreate()
spark.sparkContext.setLogLevel('WARN')
assert sys.version_info >= (3, 5)  # make sure we have Python 3.5+
assert spark.version >= '2.3'  # make sure we have Spark 2.3+

wiki_schema = types.StructType([
    types.StructField('language', types.StringType()),
    types.StructField('title', types.StringType()),
    types.StructField('request', types.IntegerType()),
    types.StructField('bytes', types.IntegerType())
])


def find_path(path):
    return re.search(r"\d{8}-\d{2}", path)[0]


def main(in_directory, out_directory):
    data = spark.read.csv(in_directory, schema=wiki_schema, sep=' ') \
        .withColumn('filename', functions.input_file_name())
    data = data.filter(data['language'] == 'en')
    data = data.filter(data['title'] != 'Main_Page')
    # drop() removes columns, not rows: filter out the 'Special:' pages instead
    data = data.filter(~data['title'].startswith('Special:'))

    path_to_hour = functions.udf(find_path, returnType=types.StringType())
    data = data.withColumn('date', path_to_hour(data['filename'])).cache()
    # data = data.cache()
    group_data = data.groupby('date').max('request')
from io import *
import pandas as pd
from urllib.request import *
from pyspark.sql import SparkSession, types  # imports assumed by this excerpt

spark = SparkSession.builder.appName('Load Crime Data').getOrCreate()

# Schema for crime data
crime_schema = types.StructType([
    types.StructField('REF_DATE', types.StringType(), True),
    types.StructField('GEO', types.StringType(), True),
    types.StructField('DGUID', types.StringType(), True),
    types.StructField('Violations', types.StringType(), True),
    types.StructField('Statistics', types.StringType(), True),
    types.StructField('UOM', types.StringType(), True),
    types.StructField('UOM_ID', types.StringType(), True),
    types.StructField('SCALAR_FACTOR', types.StringType(), True),
    types.StructField('SCALAR_ID', types.StringType(), True),
    types.StructField('VECTOR', types.StringType(), True),
    types.StructField('COORDINATE', types.StringType(), True),
    types.StructField('VALUE', types.StringType(), True),
    types.StructField('STATUS', types.StringType(), True),
    types.StructField('SYMBOL', types.StringType(), True),
    types.StructField('TERMINATE', types.StringType(), True),
    types.StructField('DECIMALS', types.StringType(), True),
])

'''
* Description: This method is used to download and extract the zip file contents in memory.
* Input: String -> URL of the response.
* Output: Pandas DataFrame -> file contents.
'''
import random

from pyspark.sql import SparkSession, DataFrame
from pyspark.sql import types as st

output_path = "data1.csv"

spark = SparkSession.builder.master("local").getOrCreate()

generator = random.Random()
generator.seed(2077)

schema = st.StructType([
    st.StructField("user", st.StringType(), True),
    st.StructField("value", st.IntegerType(), True),
    st.StructField("time", st.IntegerType(), True),
])

data = [(
    generator.choice(["a", "b", "c", "d"]),
    generator.randint(0, 100),
    generator.randint(0, 1000),
) for _ in range(0, 100)]

dataframe = spark.createDataFrame(data, schema)
dataframe.write.mode("overwrite").csv(output_path, header=True)
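# A quick round-trip check (assumed, not part of the original script): read the
# CSV back with the same schema and inspect a few rows.
# check = spark.read.csv(output_path, header=True, schema=schema)
# check.show(5)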
import sys
from pyspark.sql import SparkSession, types  # imports assumed by this excerpt

spark = SparkSession.builder.appName('Grape Resilience').getOrCreate()
spark.sparkContext.setLogLevel('WARN')
assert sys.version_info >= (3, 5)  # make sure we have Python 3.5+
assert spark.version >= '2.4'  # make sure we have Spark 2.4+

data_schema = types.StructType([
    types.StructField("FullName", types.StringType()),           # Winery + wine name + year
    types.StructField("Winery", types.StringType()),
    types.StructField("WineName", types.StringType()),
    types.StructField("Year", types.IntegerType()),
    types.StructField("Region", types.StringType()),
    types.StructField("RegionalVariety", types.StringType()),    # Varietal?
    types.StructField("VintageRating", types.FloatType()),       # Average rating for vintage
    types.StructField("VintageRatingCount", types.IntegerType()),
    types.StructField("WineRating", types.FloatType()),          # Average rating across vintages
    types.StructField("WineRatingCount", types.IntegerType()),
    types.StructField("VintagePrice", types.FloatType()),        # Same as below
    types.StructField("WinePrice", types.FloatType()),           # GBP/750ml
    types.StructField("VintageRatingPrice", types.FloatType()),  # rating/price
    types.StructField("WineRatingPrice", types.FloatType())      # rating/price
])


def main():
    data = spark.read.csv("white-wine-price-rating.csv", header=True,
                          schema=data_schema)
import sys
from pyspark import SparkConf
from pyspark.sql import SparkSession, functions, types  # imports assumed by this excerpt

cluster_seeds = ['199.60.17.171', '199.60.17.188']
conf = SparkConf().setAppName('example code') \
    .set('spark.cassandra.connection.host', ','.join(cluster_seeds))
# pass conf to the builder; otherwise the Cassandra host setting is never applied
spark = SparkSession.builder.appName('Big Data Project') \
    .config(conf=conf).getOrCreate()
sc = spark.sparkContext
assert sys.version_info >= (3, 4)  # make sure we have Python 3.4+
assert spark.version >= '2.2'  # make sure we have Spark 2.2+

schema = types.StructType([
    types.StructField('state_code', types.IntegerType(), True),
    types.StructField('month', types.IntegerType(), True),
    types.StructField('year', types.IntegerType(), True),
    types.StructField('am_rh', types.DoubleType(), True)
])

train_final = spark.createDataFrame(sc.emptyRDD(), schema=schema)

for year in range(2013, 2018):
    support = spark.read.csv(
        "/home/ldua/Desktop/BigDataProject/support/daily_RH_DP_" + str(year) +
        ".csv", header=True)
    support_f = support.select('State Code', 'Date Local', 'Arithmetic Mean')
    split_col = functions.split(support_f['Date Local'], '-')
    support_f = support_f.withColumn('Year', split_col.getItem(0))
    support_f = support_f.withColumn('Month', split_col.getItem(1))
import gc
import json
import os

import jsonpath
from pyspark.sql import SQLContext
import pyspark.sql.types as typ
import pyspark.sql.functions as fn
# the imports above are assumed by this excerpt


def data_describe(self):
    print('start to read data for rdd:')
    rawRdd_face = self.read_rdd('track2_face_attrs.txt').map(
        lambda line: json.loads(line))
    # rawRdd_face.cache()
    global keys
    keys = ['item_id', 'gender', 'beauty', 'relative_position']
    rawRdd_face2 = rawRdd_face.map(
        lambda dic: {key: jsonpath.jsonpath(dic, '$..' + key)[0]
                     if jsonpath.jsonpath(dic, '$..' + key) else None
                     for key in keys})
    print(rawRdd_face2.take(10))

    # convert to a DataFrame; without an explicit schema Spark would infer one
    sqlContext = SQLContext(self.sc)
    labels = [('item_id', typ.IntegerType()),
              ('gender', typ.IntegerType()),
              ('beauty', typ.FloatType()),
              ('relative_position', typ.ArrayType(typ.FloatType()))]
    Schema = typ.StructType([typ.StructField(e[0], e[1], True) for e in labels])
    df = sqlContext.createDataFrame(rawRdd_face2, Schema)

    attrs = self.sc.parallelize(
        ["relative_position_" + str(i) for i in range(4)]).zipWithIndex().collect()
    print("column names:", attrs)
    for name, index in attrs:
        df = df.withColumn(
            name, fn.bround(df['relative_position'].getItem(index), scale=3))
    # drop relative_position
    df_face = df.drop('relative_position')
    del df
    gc.collect()

    # print('------- saving df_face data -------')
    # file_path = self.parser.get("hdfs_path", "hdfs_data_path") + 'face_feature'
    # os.system("hadoop fs -rm -r {}".format(file_path))  # os.system(command) runs the given shell command
    # df_face.rdd.map(tuple).saveAsPickleFile(file_path)
    # print('finished saving data')

    print('start to read act data only for uid and item_id:')
    rawRdd_train = self.read_rdd('final_track2_train.txt').map(
        lambda line: line.split('\t'))
    rawRdd_test = self.read_rdd('final_track2_test_no_anwser.txt').map(
        lambda line: line.split('\t'))
    actionLogRdd_train = rawRdd_train.map(lambda x: (int(x[0]), int(x[2])))
    # total = actionLogRdd_train.count()
    # print('total: ' + str(total))
    actionLogRdd_test = rawRdd_test.map(lambda x: (int(x[0]), int(x[2])))

    sqlContext = SQLContext(self.sc)
    labels = [('uid', typ.IntegerType()),
              ('item_id', typ.IntegerType())]
    actionLogSchema = typ.StructType(
        [typ.StructField(e[0], e[1], True) for e in labels])
    dfactionLog_train = sqlContext.createDataFrame(
        actionLogRdd_train, actionLogSchema)
    dfactionLog_test = sqlContext.createDataFrame(
        actionLogRdd_test, actionLogSchema)

    # join on item_id
    df_face = df_face.select(["item_id", "gender", "beauty"])
    df_uid_face_test = dfactionLog_test.select(["uid", "item_id"]) \
        .join(df_face, 'item_id', 'left').drop("item_id")
    df_uid_face_train = dfactionLog_train.select(["uid", "item_id"]) \
        .join(df_face, 'item_id', 'left').drop("item_id")
    del dfactionLog_test
    del dfactionLog_train
    gc.collect()

    # only aggregate over uids from the training set
    gdf = df_uid_face_train.groupby("uid")
    df1 = gdf.agg(
        fn.max("beauty").alias("uid_max_beauty"),
        fn.bround(fn.avg("beauty"), scale=3).alias("uid_avg_beauty"),
        fn.bround(fn.sum("gender") / fn.count("gender"),
                  scale=3).alias("uid_male_ratio"))
    df1.show(1, truncate=False)
    # in the end only df1 needs to be kept

    print('------- saving uid_face data -------')
    file_path = self.parser.get("hdfs_path", "hdfs_data_path") + 'uid_face_train'
    os.system("hadoop fs -rm -r {}".format(file_path))  # remove any previous output on HDFS
    df1.rdd.map(tuple).saveAsPickleFile(file_path)
    print('finished saving data')