def sqlType(self):
    """Return the Spark SQL type used to store this value.

    The result is an ``ArrayType`` whose element type is ``DoubleType`` and
    whose second positional argument (``containsNull`` in PySpark's
    ``ArrayType`` signature) is ``False``.
    """
    element_type = DoubleType()
    return ArrayType(element_type, False)
F.udf(str2datestr)( F.col('dt'))) #.cast(DateType())) data_df = logs_df.select(useful_columns) data_df = preprocessing(data_df) data_df.write.csv('../data/round1_train/logs_66.csv', header=True, sep=',', mode='overwrite') else: data_df = spark.read.csv('../data/round1_train/logs_66.csv', header=True) smart_columns = [ column for column in data_df.columns if 'smart' in column ] + ['anomaly_sum'] for col in smart_columns: data_df = data_df.withColumn(col, F.col(col).cast(DoubleType())) #data_df=data_df.withColumn('dt',F.col('dt').cast(DateType())) # feature cross data_df = data_df.withColumn('smart_4raw', F.col('smart_4raw') / 12) data_df = data_df.withColumn('smart_5raw', F.col('smart_5raw') / 16) data_df = data_df.withColumn('smart_191raw', F.col('smart_191raw') / 18) data_df = data_df.withColumn('smart_198raw', F.col('smart_198raw') / 18) data_df = data_df.withColumn('smart_197raw', F.col('smart_197raw') / 18) data_df = data_df.withColumn('smart_187raw', F.col('smart_187raw') / 15) cross_columns = [ 'smart_4raw', 'smart_5raw', 'smart_187raw', 'smart_191raw', 'smart_197raw', 'smart_198raw' ] for i in range(len(cross_columns)):
def align_diff_frames(
    resolve_func,
    this: "DataFrame",
    that: "DataFrame",
    fillna: bool = True,
    how: str = "full",
    preserve_order_column: bool = False,
) -> "DataFrame":
    """
    This method aligns two different DataFrames with a given `func`. Columns are resolved and
    handled within the given `func`.
    To use this, `compute.ops_on_diff_frames` should be True, for now.

    :param resolve_func: Takes aligned (joined) DataFrame, the column of the current DataFrame, and
        the column of another DataFrame. It returns an iterable that produces Series.

        >>> from pyspark.pandas.config import set_option, reset_option
        >>>
        >>> set_option("compute.ops_on_diff_frames", True)
        >>>
        >>> kdf1 = ps.DataFrame({'a': [9, 8, 7, 6, 5, 4, 3, 2, 1]})
        >>> kdf2 = ps.DataFrame({'a': [9, 8, 7, 6, 5, 4, 3, 2, 1]})
        >>>
        >>> def func(kdf, this_column_labels, that_column_labels):
        ...    kdf  # conceptually this is A + B.
        ...
        ...    # Within this function, Series from A or B can be performed against `kdf`.
        ...    this_label = this_column_labels[0]  # this is ('a',) from kdf1.
        ...    that_label = that_column_labels[0]  # this is ('a',) from kdf2.
        ...    new_series = (kdf[this_label] - kdf[that_label]).rename(str(this_label))
        ...
        ...    # This new series will be placed in new DataFrame.
        ...    yield (new_series, this_label)
        >>>
        >>>
        >>> align_diff_frames(func, kdf1, kdf2).sort_index()
           a
        0  0
        1  0
        2  0
        3  0
        4  0
        5  0
        6  0
        7  0
        8  0
        >>> reset_option("compute.ops_on_diff_frames")

    :param this: a DataFrame to align
    :param that: another DataFrame to align
    :param fillna: If True, it fills missing values in non-common columns in both `this` and `that`.
        Otherwise, it returns as are.
    :param how: join way. In addition, it affects how `resolve_func` resolves the column conflict.
        - full: `resolve_func` should resolve only common columns from 'this' and 'that' DataFrames.
            For instance, if 'this' has columns A, B, C and that has B, C, D, `this_columns` and
            'that_columns' in this function are B, C and B, C.
        - left: `resolve_func` should resolve columns including that columns.
            For instance, if 'this' has columns A, B, C and that has B, C, D, `this_columns` is
            B, C but `that_columns` are B, C, D.
        - inner: Same as 'full' mode; however, internally performs inner join instead.
    :param preserve_order_column: forwarded unchanged to `combine_frames`.
    :return: Aligned DataFrame
    """
    from pyspark.pandas.frame import DataFrame

    assert how == "full" or how == "left" or how == "inner"

    this_column_labels = this._internal.column_labels
    that_column_labels = that._internal.column_labels
    common_column_labels = set(this_column_labels).intersection(that_column_labels)

    # 1. Perform the join given two dataframes.
    combined = combine_frames(this, that, how=how, preserve_order_column=preserve_order_column)

    # 2. Apply the given function to transform the columns in a batch and keep the new columns.
    combined_column_labels = combined._internal.column_labels

    # The prefixed label sets do not depend on the column being classified, so
    # build them once and use O(1) membership tests instead of rescanning the
    # common labels (and rebuilding the "that" list) for every combined column.
    common_this_labels = {("this", *label) for label in common_column_labels}
    common_that_labels = {("that", *label) for label in common_column_labels}
    all_that_labels = {("that", *label) for label in that_column_labels}

    that_columns_to_apply = []
    this_columns_to_apply = []
    additional_that_columns = []
    columns_to_keep = []
    column_labels_to_keep = []

    for combined_label in combined_column_labels:
        if combined_label in common_this_labels:
            this_columns_to_apply.append(combined_label)
        elif combined_label in common_that_labels:
            that_columns_to_apply.append(combined_label)
        elif how == "left" and combined_label in all_that_labels:
            # In this case, we will drop `that_columns` in `columns_to_keep` but passes
            # it later to `func`. `func` should resolve it.
            # Note that adding this into a separate list (`additional_that_columns`)
            # is intentional so that `this_columns` and `that_columns` can be paired.
            additional_that_columns.append(combined_label)
        elif fillna:
            columns_to_keep.append(F.lit(None).cast(DoubleType()).alias(str(combined_label)))
            column_labels_to_keep.append(combined_label)
        else:
            columns_to_keep.append(combined._kser_for(combined_label))
            column_labels_to_keep.append(combined_label)

    that_columns_to_apply += additional_that_columns

    # Should extract columns to apply and do it in a batch in case
    # it adds new columns for example.
    if len(this_columns_to_apply) > 0 or len(that_columns_to_apply) > 0:
        kser_set, column_labels_applied = zip(
            *resolve_func(combined, this_columns_to_apply, that_columns_to_apply)
        )
        columns_applied = list(kser_set)
        column_labels_applied = list(column_labels_applied)
    else:
        columns_applied = []
        column_labels_applied = []

    applied = DataFrame(
        combined._internal.with_new_columns(
            columns_applied + columns_to_keep,
            column_labels=column_labels_applied + column_labels_to_keep,
        )
    )  # type: DataFrame

    # 3. Restore the names back and deduplicate columns.
    applied_labels = applied._internal.column_labels
    this_labels = OrderedDict()
    # Add columns in an order of its original frame.
    for this_label in this_column_labels:
        for new_label in applied_labels:
            # new_label[1:] strips the leading "this"/"that" level.
            if new_label[1:] not in this_labels and this_label == new_label[1:]:
                this_labels[new_label[1:]] = new_label

    # After that, we will add the rest columns.
    other_labels = OrderedDict()
    for new_label in applied_labels:
        if new_label[1:] not in this_labels:
            other_labels[new_label[1:]] = new_label

    kdf = applied[list(this_labels.values()) + list(other_labels.values())]
    kdf.columns = kdf.columns.droplevel()
    return kdf
def test_supported_types(self):
    """Run three equivalent grouped-map pandas UDFs over a one-row frame.

    The frame has one column per listed Spark type. Each UDF form (frame only,
    ignored key + frame, key + frame) must produce the same result as applying
    the first UDF's plain Python function through pandas ``groupby().apply``.
    """
    row = [
        1, 2, 3, 4, 5, 1.1, 2.2,
        Decimal(1.123), [1, 2, 2], True, 'hello',
        bytearray([0x01, 0x02]),
    ]
    output_schema = StructType([
        StructField('id', IntegerType()),
        StructField('byte', ByteType()),
        StructField('short', ShortType()),
        StructField('int', IntegerType()),
        StructField('long', LongType()),
        StructField('float', FloatType()),
        StructField('double', DoubleType()),
        StructField('decim', DecimalType(10, 3)),
        StructField('array', ArrayType(IntegerType())),
        StructField('bool', BooleanType()),
        StructField('str', StringType()),
        StructField('bin', BinaryType()),
    ])
    df = self.spark.createDataFrame([row], schema=output_schema)

    def transformed(pdf, **extra):
        # Shared column transformation used by all three UDF forms.
        return pdf.assign(
            byte=pdf.byte * 2,
            short=pdf.short * 2,
            int=pdf.int * 2,
            long=pdf.long * 2,
            float=pdf.float * 2,
            double=pdf.double * 2,
            decim=pdf.decim * 2,
            bool=not pdf.bool,
            str=pdf.str + 'there',
            array=pdf.array,
            bin=pdf.bin,
            **extra
        )

    # Different forms of group map pandas UDF, results of these are the same
    udf1 = pandas_udf(
        lambda pdf: transformed(pdf),
        output_schema, PandasUDFType.GROUPED_MAP)
    udf2 = pandas_udf(
        lambda _, pdf: transformed(pdf),
        output_schema, PandasUDFType.GROUPED_MAP)
    udf3 = pandas_udf(
        lambda key, pdf: transformed(pdf, id=key[0]),
        output_schema, PandasUDFType.GROUPED_MAP)

    def run(grouped_udf):
        return df.groupby('id').apply(grouped_udf).sort('id').toPandas()

    result1 = run(udf1)
    expected = df.toPandas().groupby('id').apply(udf1.func).reset_index(drop=True)
    result2 = run(udf2)
    result3 = run(udf3)

    for result in (result1, result2, result3):
        assert_frame_equal(expected, result)
def read_data(self):
    """Read the fare and trip Kafka topics, join them, and stream the result.

    Sets ``self.fare`` / ``self.trip`` (raw Kafka streams), ``self.df_fare`` /
    ``self.df_trip`` (parsed JSON payloads) and ``self.df`` (trips joined to
    fares on medallion with pickup times within one hour of each other).
    The joined stream is written to the console sink and this call blocks
    until that query terminates; afterwards ``self.windowedCounts`` is written
    to Elasticsearch.
    """

    def kafka_source(topic):
        # Both topics are read with identical connection and offset options.
        return (
            self.spark.readStream
            .format("kafka")
            .option("kafka.bootstrap.servers", "localhost:9092")
            .option("subscribe", topic)
            .option("startingOffsets", "earliest")
            .option('failOnDataLoss', 'false')
            .option("maxOffsetsPerTrigger", 1000)
            .load()
        )

    def parsed(stream, schema):
        # The Kafka value is decoded as a string and parsed as JSON into `data`.
        return (
            stream.selectExpr("CAST(value as STRING) as json")
            .select(from_json("json", schema).alias('data'))
        )

    fare_schema = StructType([
        StructField('medallion', StringType()),
        StructField('pickup_time', TimestampType()),
        StructField('total_amount', DoubleType()),
    ])
    self.fare = kafka_source("nycfare1")
    self.df_fare = parsed(self.fare, fare_schema).selectExpr(
        "data.medallion as medallion_fare",
        "cast (data.pickup_time as timestamp) as pickup_time_fare",
        "cast (data.total_amount as float)",
    )

    trip_schema = StructType([
        StructField('medallion', StringType()),
        StructField('pickup_time', TimestampType()),
        StructField('dropoff_time', TimestampType()),
        StructField('passenger_count', IntegerType()),
        StructField('trip_time', IntegerType()),
        StructField('trip_distance', DoubleType()),
        StructField('pickup_loc', MapType(StringType(), DoubleType())),
        StructField('dropoff_loc', MapType(StringType(), DoubleType())),
    ])
    self.trip = kafka_source("nycspeed9")
    self.df_trip = parsed(self.trip, trip_schema).selectExpr(
        "data.medallion as medallion_trip",
        "cast (data.pickup_time as timestamp) as pickup_time_trip",
        "cast (data.dropoff_time as timestamp)",
        "cast (data.passenger_count as integer)",
        "cast (data.trip_time as integer)",
        "cast (data.trip_distance as float)",
        "cast (data.pickup_loc.lat as float) as pickup_loc_lat",
        "cast (data.pickup_loc.lon as float) as pickup_loc_lon",
        "cast (data.dropoff_loc.lat as float) as dropoff_loc_lat",
        "cast (data.dropoff_loc.lon as float) as dropoff_loc_lon",
    )

    # printSchema() prints the schema itself and returns None, so wrapping it
    # in print() only emitted an extra "None" line.
    self.df_trip.printSchema()

    self.df = self.df_trip.join(
        self.df_fare,
        expr("""
            medallion_trip = medallion_fare AND
            pickup_time_trip >= pickup_time_fare - interval 1 hour AND
            pickup_time_trip <= pickup_time_fare + interval 1 hour
            """)
    )

    # Blocks until the console query stops. The result of awaitTermination()
    # is not printed any more: without a timeout it is just None.
    (
        self.df.writeStream
        .outputMode("append")
        .format("console")
        .option('truncate', 'false')
        .option('numRows', 20)
        .start()
        .awaitTermination()
    )

    # NOTE(review): this sink is only started after the console query above
    # has terminated, and `self.windowedCounts` is not assigned in this
    # method -- confirm it is set elsewhere and that the ordering is intended.
    query = (
        self.windowedCounts.writeStream
        .outputMode("append")
        .queryName("writing_to_es")
        .format("org.elasticsearch.spark.sql")
        .option("checkpointLocation", "/tmp/1")
        .option("es.nodes", "localhost")
        .option("es.port", "9200")
        .option("es.resource", "nycfare2/_doc")
    )
    query.start().awaitTermination()
def test_infer_schema_from_pandas_instances(self):
    """Check `infer_return_type` on unnamed pandas Series/DataFrame hints.

    Covers both evaluated annotations and string annotations, plus dtypes
    taken from an existing pandas frame (including a categorical column).

    The removed NumPy aliases ``np.float`` and ``np.unicode_`` are spelled as
    ``float`` and ``np.str_`` -- the very objects those aliases pointed to --
    so the test also runs on NumPy releases that no longer ship the aliases.
    """

    def func() -> pd.Series[int]:
        pass

    inferred = infer_return_type(func)
    self.assertEqual(inferred.dtype, np.int64)
    self.assertEqual(inferred.spark_type, LongType())

    def func() -> pd.Series[float]:
        pass

    inferred = infer_return_type(func)
    self.assertEqual(inferred.dtype, np.float64)
    self.assertEqual(inferred.spark_type, DoubleType())

    def func() -> "pd.DataFrame[float, str]":
        pass

    expected = StructType([StructField("c0", DoubleType()), StructField("c1", StringType())])
    inferred = infer_return_type(func)
    self.assertEqual(inferred.dtypes, [np.float64, np.str_])
    self.assertEqual(inferred.spark_type, expected)

    def func() -> "pandas.DataFrame[float]":
        pass

    expected = StructType([StructField("c0", DoubleType())])
    inferred = infer_return_type(func)
    self.assertEqual(inferred.dtypes, [np.float64])
    self.assertEqual(inferred.spark_type, expected)

    def func() -> "pd.Series[int]":
        pass

    inferred = infer_return_type(func)
    self.assertEqual(inferred.dtype, np.int64)
    self.assertEqual(inferred.spark_type, LongType())

    def func() -> pd.DataFrame[float, str]:
        pass

    expected = StructType([StructField("c0", DoubleType()), StructField("c1", StringType())])
    inferred = infer_return_type(func)
    self.assertEqual(inferred.dtypes, [np.float64, np.str_])
    self.assertEqual(inferred.spark_type, expected)

    def func() -> pd.DataFrame[float]:
        pass

    expected = StructType([StructField("c0", DoubleType())])
    inferred = infer_return_type(func)
    self.assertEqual(inferred.dtypes, [np.float64])
    self.assertEqual(inferred.spark_type, expected)

    pdf = pd.DataFrame({"a": [1, 2, 3], "b": [3, 4, 5]})

    def func() -> pd.DataFrame[pdf.dtypes]:  # type: ignore
        pass

    expected = StructType([StructField("c0", LongType()), StructField("c1", LongType())])
    inferred = infer_return_type(func)
    self.assertEqual(inferred.dtypes, [np.int64, np.int64])
    self.assertEqual(inferred.spark_type, expected)

    pdf = pd.DataFrame({"a": [1, 2, 3], "b": pd.Categorical(["a", "b", "c"])})

    def func() -> pd.Series[pdf.b.dtype]:  # type: ignore
        pass

    inferred = infer_return_type(func)
    self.assertEqual(inferred.dtype, CategoricalDtype(categories=["a", "b", "c"]))
    self.assertEqual(inferred.spark_type, LongType())

    def func() -> pd.DataFrame[pdf.dtypes]:  # type: ignore
        pass

    expected = StructType([StructField("c0", LongType()), StructField("c1", LongType())])
    inferred = infer_return_type(func)
    self.assertEqual(
        inferred.dtypes, [np.int64, CategoricalDtype(categories=["a", "b", "c"])])
    self.assertEqual(inferred.spark_type, expected)
# Split the MNIST frame into training and validation parts.
mnistDF = spark.createDataFrame(pd_df)
trainingDF, validationDF = mnistDF.randomSplit([0.8, 0.2])
trainingDF.show()


# define loss with Pytorch API
def lossFunc(input, target):
    """Cross-entropy loss of `input` against `target` flattened and cast to long."""
    flat_target = target.flatten().long()
    return nn.CrossEntropyLoss().forward(input, flat_target)


torch_model = LeNet()
model = TorchNet.from_pytorch(torch_model, [1, 1, 28, 28])
criterion = TorchCriterion.from_pytorch(lossFunc, [1, 10],
                                        torch.LongTensor([5]))
classifier = (
    NNClassifier(model, criterion, SeqToTensor([1, 28, 28]))
    .setBatchSize(256)
    .setOptimMethod(Adam())
    .setLearningRate(0.001)
    .setMaxEpoch(2)
)

nnClassifierModel = classifier.fit(trainingDF)

print("After training: ")
# The prediction column is decremented by one before it is compared with the label.
shift = udf(lambda predicted: predicted - 1, DoubleType())
res = nnClassifierModel.transform(validationDF).withColumn(
    "prediction", shift(col('prediction')))
res.show(100)

correct = res.filter("label=prediction").count()
overall = res.count()
accuracy = correct * 1.0 / overall
print("Validation accuracy = %g " % accuracy)
# MAGIC %md
# MAGIC ### Step 1: Declare the schema.
# MAGIC
# MAGIC This is a list of field names and data types.

# COMMAND ----------

from pyspark.sql.types import DoubleType, IntegerType, StringType, StructField, StructType

# One (column name, Spark type) pair per CSV column, in file order.
csvSchema = StructType([
    StructField(field_name, field_type)
    for field_name, field_type in [
        ("ProductID", IntegerType()),
        ("Name", StringType()),
        ("ProductNumber", StringType()),
        ("Color", StringType()),
        ("StandardCost", DoubleType()),
        ("ListPrice", DoubleType()),
        ("Size", StringType()),
        ("Weight", StringType()),
        ("ProductCategoryID", IntegerType()),
        ("ProductModelID", IntegerType()),
        ("SellStartDate", StringType()),
        ("SellEndDate", StringType()),
        ("DiscountedDate", StringType()),
        ("ThumbNailPhoto", StringType()),
        ("ThumbnailPhotoFileName", StringType()),
        ("rowguid", StringType()),
        ("ModifiedDate", StringType()),
    ]
])

# COMMAND ----------
StructField("ID", StringType(), True), StructField("CaseNumber", StringType(), True), StructField("Date", StringType(), True), StructField("Block", StringType(), True), StructField("IUCR", StringType(), True), StructField("PrimaryType", StringType(), True), StructField("Description", StringType(), True), StructField("LocationDescription", StringType(), True), StructField("Arrest", BooleanType(), True), StructField("Domestic", BooleanType(), True), StructField("Beat", StringType(), True), StructField("District", StringType(), True), StructField("Ward", StringType(), True), StructField("CommunityArea", StringType(), True), StructField("FBICode", StringType(), True), StructField("XCoordinate", DoubleType(), True), StructField("YCoordinate", DoubleType(), True), StructField("Year", IntegerType(), True), StructField("UpdatedOn", DateType(), True), StructField("Latitude", DoubleType(), True), StructField("Longitude", DoubleType(), True), StructField("Location", StringType(), True) ]) #crimes = spark.read.csv("gs://chic_crime/version1/ccd_sample.csv",header = True,schema = crimes_schema) crimes = spark.read.csv("Chicago_Crimes_2012_to_2017.csv", header=True, schema=crimes_schema) print(" The crimes dataframe has {} records".format(crimes.count())) print(crimes.select("PrimaryType").distinct().show(n=5))
# def close(self, error): # # Close the connection. This method in optional in Python. # pass streamingDF = spark \ .readStream \ .format("kafka") \ .option("kafka.bootstrap.servers", "localhost:9092") \ .option("subscribe", "time-series") \ .option('includeTimestamp', 'true') \ .load() deserialize_row_udf = udf(lambda x: deserialize_avro_column_row(x), DoubleType()) deserialized_value_dataframe = streamingDF.withColumn( 'deserialized_value', deserialize_row_udf("value")) deserialized_value_dataframe = deserialized_value_dataframe.select( ['key', 'timestamp', 'deserialized_value']) deserialized_value_dataframe.drop('value') deserialized_value_dataframe = deserialized_value_dataframe.withColumnRenamed( 'deserialized_value', 'value') # .outputMode("append")\ class ForeachWriter: def open(self, partition_id, epoch_id):
# The first column of df_s is the label; every remaining column is a feature.
schemaNames = df_s.columns
labels, feature_names = schemaNames[0], schemaNames[1:]

column_names = [
    'labels',
    'lepton_pT', 'lepton_eta', 'lepton_phi',
    'missing_energy_magnitude', 'missing_energy_phi',
    'jet_1_pt', 'jet_1_eta', 'jet_1_phi', 'jet_1_b_tag',
    'jet_2_pt', 'jet_2_eta', 'jet_2_phi', 'jet_2_b-tag',
    'jet_3_pt', 'jet_3_eta', 'jet_3_phi', 'jet_3_b-tag',
    'jet_4_pt', 'jet_4_eta', 'jet_4_phi', 'jet_4_b-tag',
    'm_jj', 'm_jjj', 'm_lv', 'm_jlv', 'm_bb', 'm_wbb', 'm_wwbb',
]

# Changing labels types
label_as_int = df_s[labels].cast(DoubleType()).cast(IntegerType())
df_s = df_s.withColumn(labels, label_as_int)

# Changing types of data in columns
for column in feature_names:
    df_s = df_s.withColumn(column, df_s[column].cast(DoubleType()))

print("Split Train/Test data...")
trainingData, testData = df_s.randomSplit([0.7, 0.3], 123)
trainingData.cache()
testData.cache()

# Assemble the feature columns into one 'features' vector column and keep
# only that column and the label.
assembler = VectorAssembler(inputCols=feature_names, outputCol='features')
trainingData, testData = (
    assembler.transform(split).select("features", labels)
    for split in (trainingData, testData)
)

trainingData.cache()
testData.cache()
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField
from pyspark.sql.types import DoubleType, IntegerType, StringType

spark = (
    SparkSession.builder
    .appName("Test_number_of_stages")
    .getOrCreate()
)

# Explicit schemas for the two CSV files loaded below.
schema_ratings = StructType([
    StructField(*spec)
    for spec in [
        ("userId", IntegerType()),
        ("movieId", IntegerType()),
        ("rating", DoubleType()),
        ("timestamp", StringType()),
    ]
])

schema_movies = StructType([
    StructField(*spec)
    for spec in [
        ("ID_movie", IntegerType()),
        ("Name_movie", StringType()),
        ("Stype_movie", StringType()),
    ]
])

# stage load data
df_ratings = (
    spark.read.format("csv")
    .schema(schema_ratings)
    .load("/nhatthanh/data/ml-20m/ratings.csv")
)
df_movies = (
    spark.read.format("csv")
    .schema(schema_movies)
    .load("/nhatthanh/data/ml-20m/movies.csv")
)

df_ratings.show()
df_movies.show()

# stage join
# ra = df_ratings.alias('ra')
def setUpClass(cls):
    """Prepare the shared Spark session, time zone and fixtures for the Arrow tests.

    Pins the process and session time zone to America/Los_Angeles (keeping the
    previous ``TZ`` in ``cls.tz_prev``), checks that the legacy Arrow config
    keys are reflected by their ``pyspark``-prefixed counterparts, enables
    Arrow with fallback disabled, and builds ``schema``/``data`` fixtures with
    and without a trailing null column.
    """
    from datetime import date, datetime
    from decimal import Decimal

    super(ArrowTests, cls).setUpClass()
    cls.warnings_lock = threading.Lock()

    # Synchronize default timezone between Python and Java
    cls.tz_prev = os.environ.get("TZ", None)  # save current tz if set
    tz = "America/Los_Angeles"
    os.environ["TZ"] = tz
    time.tzset()

    conf = cls.spark.conf
    conf.set("spark.sql.session.timeZone", tz)

    # Test fallback: writing a legacy key must be visible through the new key.
    alias_checks = (
        ("spark.sql.execution.arrow.enabled",
         "spark.sql.execution.arrow.pyspark.enabled",
         ("false", "true")),
        ("spark.sql.execution.arrow.fallback.enabled",
         "spark.sql.execution.arrow.pyspark.fallback.enabled",
         ("true", "false")),
    )
    for legacy_key, current_key, values in alias_checks:
        for value in values:
            conf.set(legacy_key, value)
            assert conf.get(current_key) == value

    # Enable Arrow optimization in this tests.
    conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
    # Disable fallback by default to easily detect the failures.
    conf.set("spark.sql.execution.arrow.pyspark.fallback.enabled", "false")

    cls.schema_wo_null = StructType([
        StructField(name, data_type, True)
        for name, data_type in [
            ("1_str_t", StringType()),
            ("2_int_t", IntegerType()),
            ("3_long_t", LongType()),
            ("4_float_t", FloatType()),
            ("5_double_t", DoubleType()),
            ("6_decimal_t", DecimalType(38, 18)),
            ("7_date_t", DateType()),
            ("8_timestamp_t", TimestampType()),
            ("9_binary_t", BinaryType()),
        ]
    ])
    # NOTE(review): StructType.add looks like it appends to the receiver and
    # returns it -- confirm whether schema_wo_null is meant to share the
    # extra null field with schema.
    cls.schema = cls.schema_wo_null.add("10_null_t", NullType(), True)

    cls.data_wo_null = [
        ("a", 1, 10, 0.2, 2.0, Decimal("2.0"),
         date(1969, 1, 1), datetime(1969, 1, 1, 1, 1, 1), bytearray(b"a")),
        ("b", 2, 20, 0.4, 4.0, Decimal("4.0"),
         date(2012, 2, 2), datetime(2012, 2, 2, 2, 2, 2), bytearray(b"bb")),
        ("c", 3, 30, 0.8, 6.0, Decimal("6.0"),
         date(2100, 3, 3), datetime(2100, 3, 3, 3, 3, 3), bytearray(b"ccc")),
        ("d", 4, 40, 1.0, 8.0, Decimal("8.0"),
         date(2262, 4, 12), datetime(2262, 3, 3, 3, 3, 3), bytearray(b"dddd")),
    ]
    # Same rows with a trailing None for the null-typed column.
    cls.data = [row + (None,) for row in cls.data_wo_null]
def test_wrong_args(self):
    """A zero-argument function passed to cogrouped applyInPandas must raise ValueError."""
    left, right = self.data1, self.data2
    result_schema = StructType([StructField("d", DoubleType())])

    with self.assertRaisesRegex(ValueError, "Invalid function"):
        cogrouped = left.groupby("id").cogroup(right.groupby("id"))
        cogrouped.applyInPandas(lambda: 1, result_schema)
def test_infer_schema_with_names_pandas_instances(self):
    """Check `infer_return_type` on pandas DataFrame hints with named columns.

    Covers string annotations with ``name: type`` slices and annotations built
    from ``zip(pdf.columns, pdf.dtypes)`` for plain, multi-level and
    categorical columns.

    The removed NumPy aliases ``np.float`` and ``np.unicode_`` are spelled as
    ``float`` and ``np.str_`` -- the very objects those aliases pointed to --
    so the test also runs on NumPy releases that no longer ship the aliases.
    """

    def func() -> 'pd.DataFrame["a" : float, "b":str]':  # noqa: F821
        pass

    expected = StructType([StructField("a", DoubleType()), StructField("b", StringType())])
    inferred = infer_return_type(func)
    self.assertEqual(inferred.dtypes, [np.float64, np.str_])
    self.assertEqual(inferred.spark_type, expected)

    def func() -> "pd.DataFrame['a': float, 'b': int]":  # noqa: F821
        pass

    expected = StructType([StructField("a", DoubleType()), StructField("b", LongType())])
    inferred = infer_return_type(func)
    self.assertEqual(inferred.dtypes, [np.float64, np.int64])
    self.assertEqual(inferred.spark_type, expected)

    pdf = pd.DataFrame({"a": [1, 2, 3], "b": [3, 4, 5]})

    def func() -> pd.DataFrame[zip(pdf.columns, pdf.dtypes)]:
        pass

    expected = StructType([StructField("a", LongType()), StructField("b", LongType())])
    inferred = infer_return_type(func)
    self.assertEqual(inferred.dtypes, [np.int64, np.int64])
    self.assertEqual(inferred.spark_type, expected)

    pdf = pd.DataFrame({("x", "a"): [1, 2, 3], ("y", "b"): [3, 4, 5]})

    def func() -> pd.DataFrame[zip(pdf.columns, pdf.dtypes)]:
        pass

    expected = StructType(
        [StructField("(x, a)", LongType()), StructField("(y, b)", LongType())]
    )
    inferred = infer_return_type(func)
    self.assertEqual(inferred.dtypes, [np.int64, np.int64])
    self.assertEqual(inferred.spark_type, expected)

    pdf = pd.DataFrame({"a": [1, 2, 3], "b": pd.Categorical(["a", "b", "c"])})

    def func() -> pd.DataFrame[zip(pdf.columns, pdf.dtypes)]:
        pass

    expected = StructType([StructField("a", LongType()), StructField("b", LongType())])
    inferred = infer_return_type(func)
    self.assertEqual(
        inferred.dtypes, [np.int64, CategoricalDtype(categories=["a", "b", "c"])])
    self.assertEqual(inferred.spark_type, expected)
def _udf(f, returnType=DoubleType(), arg_type="pandas"):
    """Wrap `f` in a FlintUserDefinedFunction.

    :param f: the user function to wrap
    :param returnType: return type handed to the wrapper; a DoubleType
        instance by default
    :param arg_type: forwarded as the ``arg_type`` keyword; "pandas" by default
    :return: the constructed FlintUserDefinedFunction
    """
    wrapped = FlintUserDefinedFunction(f, returnType, arg_type=arg_type)
    return wrapped
def test_as_spark_type_koalas_dtype(self):
    """Check `as_spark_type` / `koalas_dtype` for every supported Python/NumPy type.

    Each key of ``type_mapper`` maps to the expected ``(dtype, spark_type)``
    pair. Unsupported dtypes (``uint64`` and ``object``) must raise TypeError.

    The removed NumPy aliases (``np.string_``, ``np.int``, ``np.float``,
    ``np.str``, ``np.unicode_``, ``np.bool``) are not used: each was the same
    object as another key already present (``np.bytes_``, ``int``, ``float``,
    ``str``, ``np.str_``, ``bool``), so the dictionary is unchanged while the
    test also runs on NumPy releases that no longer ship the aliases.
    """
    type_mapper = {
        # binary
        np.character: (np.character, BinaryType()),
        np.bytes_: (np.bytes_, BinaryType()),
        bytes: (np.bytes_, BinaryType()),
        # integer
        np.int8: (np.int8, ByteType()),
        np.byte: (np.int8, ByteType()),
        np.int16: (np.int16, ShortType()),
        np.int32: (np.int32, IntegerType()),
        np.int64: (np.int64, LongType()),
        int: (np.int64, LongType()),
        # floating
        np.float32: (np.float32, FloatType()),
        np.float64: (np.float64, DoubleType()),
        float: (np.float64, DoubleType()),
        # string
        np.str_: (np.str_, StringType()),
        str: (np.str_, StringType()),
        # bool
        bool: (bool, BooleanType()),
        # datetime
        np.datetime64: (np.datetime64, TimestampType()),
        datetime.datetime: (np.dtype("datetime64[ns]"), TimestampType()),
        # DateType
        datetime.date: (np.dtype("object"), DateType()),
        # DecimalType
        decimal.Decimal: (np.dtype("object"), DecimalType(38, 18)),
        # ArrayType
        np.ndarray: (np.dtype("object"), ArrayType(StringType())),
        List[bytes]: (np.dtype("object"), ArrayType(BinaryType())),
        List[np.character]: (np.dtype("object"), ArrayType(BinaryType())),
        List[np.bytes_]: (np.dtype("object"), ArrayType(BinaryType())),
        List[bool]: (np.dtype("object"), ArrayType(BooleanType())),
        List[datetime.date]: (np.dtype("object"), ArrayType(DateType())),
        List[np.int8]: (np.dtype("object"), ArrayType(ByteType())),
        List[np.byte]: (np.dtype("object"), ArrayType(ByteType())),
        List[decimal.Decimal]: (np.dtype("object"), ArrayType(DecimalType(38, 18))),
        List[float]: (np.dtype("object"), ArrayType(DoubleType())),
        List[np.float64]: (np.dtype("object"), ArrayType(DoubleType())),
        List[np.float32]: (np.dtype("object"), ArrayType(FloatType())),
        List[np.int32]: (np.dtype("object"), ArrayType(IntegerType())),
        List[int]: (np.dtype("object"), ArrayType(LongType())),
        List[np.int64]: (np.dtype("object"), ArrayType(LongType())),
        List[np.int16]: (np.dtype("object"), ArrayType(ShortType())),
        List[str]: (np.dtype("object"), ArrayType(StringType())),
        List[np.str_]: (np.dtype("object"), ArrayType(StringType())),
        List[datetime.datetime]: (np.dtype("object"), ArrayType(TimestampType())),
        List[np.datetime64]: (np.dtype("object"), ArrayType(TimestampType())),
        # CategoricalDtype
        CategoricalDtype(categories=["a", "b", "c"]): (
            CategoricalDtype(categories=["a", "b", "c"]),
            LongType(),
        ),
    }

    for numpy_or_python_type, (dtype, spark_type) in type_mapper.items():
        self.assertEqual(as_spark_type(numpy_or_python_type), spark_type)
        self.assertEqual(koalas_dtype(numpy_or_python_type), (dtype, spark_type))

    with self.assertRaisesRegex(TypeError, "Type uint64 was not understood."):
        as_spark_type(np.dtype("uint64"))

    with self.assertRaisesRegex(TypeError, "Type object was not understood."):
        as_spark_type(np.dtype("object"))

    with self.assertRaisesRegex(TypeError, "Type uint64 was not understood."):
        koalas_dtype(np.dtype("uint64"))

    with self.assertRaisesRegex(TypeError, "Type object was not understood."):
        koalas_dtype(np.dtype("object"))
def udf(f=None, returnType=DoubleType(), arg_type="pandas"):
    # Modified from
    # https://github.com/apache/spark/blob/master/python/pyspark/sql/functions.py
    # to add additional supports for Flint
    '''Creates a column expression representing a user defined function
    (UDF).

    This behaves the same as :meth:`~pyspark.sql.functions.udf` when
    used with a PySpark function, such as
    :meth:`~pyspark.sql.DataFrame.withColumn`.

    This can also be used with Flint functions, such as
    :meth:`ts.flint.TimeSeriesDataFrame.summarizeCycles`.

    This can be used to define a row user defined function or
    a columnar user defined function:

    1. Row udf

       A row udf takes one or more scalar values for each row, and
       returns a scalar value for that row.

       A :class:`~pyspark.sql.Column` object is needed to specify
       the input, for instance, ``df['v']``.

       Example:

           >>> @udf(DoubleType())
           >>> def plus_one(v):
           ...     return v+1
           >>> col = plus_one(df['v'])

    2. Pandas Columnar udf

       A pandas columnar udf takes one or more :class:`pandas.Series` or
       :class:`pandas.DataFrame` as input, and returns either a scalar
       value or a :class:`pandas.Series` as output.

       If the user function takes :class:`pandas.Series`, a
       :class:`~pyspark.sql.Column` is needed to specify the input,
       for instance, ``df['v']``.

       If the user function takes a :class:`pandas.DataFrame`, a
       :class:`~pyspark.sql.DataFrame` is needed to specify the input,
       for instance, ``df[['v', 'w']]``.

       Default return type is DoubleType.

       Example:

           Takes :class:`pandas.Series`, returns a scalar

           >>> @udf(DoubleType())
           >>> def weighted_mean(v, w):
           ...     return numpy.average(v, weights=w)
           >>> col = weighted_mean(df['v'], df['w'])

           Takes a :class:`pandas.DataFrame`, returns a scalar

           >>> @udf(DoubleType())
           >>> def weighted_mean(df):
           ...     return numpy.average(df.v, weights=df.w)
           >>> col = weighted_mean(df[['v', 'w']])

           Takes a :class:`pandas.Series`, returns a
           :class:`pandas.Series`

           >>> @udf(DoubleType())
           >>> def percent_rank(v):
           ...     return v.rank(pct=True)
           >>> col = percent_rank(df['v'])

       Different functions take different types of udf. For instance,

       * :meth:`pyspark.sql.DataFrame.withColumn` takes a row udf
       * :meth:`ts.flint.TimeSeriesDataFrame.summarizeCycles` takes
         a columnar udf that returns a scalar value.

    3. Numpy Columnar udf

       Numpy columnar udf is similar to pandas columnar udf. The main
       difference is numpy udf expects the function input to be numpy
       data structure and types, i.e., numpy.ndarray or numpy.float64.
       When a named input is expected, the input to the udf would be a
       python ordered dict from str to numpy.ndarray or numpy primitive
       type.

       Numpy columnar udf is faster than pandas columnar udf,
       particularly in summarizeWindows, where the overhead of creating
       pandas.Series and pandas.DataFrame for each window can be large.
       Therefore, user should try to use numpy columnar udf with
       summarizeWindows.

       Examples:

           >>> @udf(DoubleType(), arg_type='numpy')
           >>> def mean_udf(v):
           ...     # v is numpy.ndarray
           ...     return v.mean()
           >>> col = mean_udf(df['v'])

    .. seealso::

       :meth:`ts.flint.TimeSeriesDataFrame.summarizeCycles`
       :meth:`ts.flint.TimeSeriesDataFrame.addColumnsForCycles`
       :meth:`ts.flint.TimeSeriesDataFrame.summarizeIntervals`
       :meth:`ts.flint.TimeSeriesDataFrame.summarizeWindows`

    '''
    def _build(f, returnType=DoubleType(), arg_type="pandas"):
        return FlintUserDefinedFunction(f, returnType, arg_type=arg_type)

    # Called directly with the function to wrap: build the UDF right away.
    called_with_function = f is not None and not isinstance(f, (str, tuple, DataType))
    if called_with_function:
        return _build(f=f, returnType=_wrap_data_types(returnType), arg_type=arg_type)

    # decorator @udf, @udf(), @udf(dataType()) or @udf((dataType(), dataType()))
    # If DataType has been passed as a positional argument
    # for decorator use it as a returnType
    declared_type = _wrap_data_types(f or returnType)
    return functools.partial(_build, returnType=declared_type, arg_type=arg_type)
# structure from tweet
# Each entry is (column name, Spark type, nullable).
dtypes = StructType([
    StructField(*spec)
    for spec in [
        ("created_at", TimestampType(), True),
        ("tweet_id", StringType(), False),
        ("tweet", StringType(), False),
        ("likes", DecimalType(38, 0), False),
        ("retweet_count", DecimalType(38, 0), False),
        ("source", StringType(), True),
        ("user_id", DecimalType(38, 0), False),
        ("user_name", StringType(), True),
        ("user_screen_name", StringType(), False),
        ("user_description", StringType(), True),
        ("user_join_date", TimestampType(), True),
        ("user_followers_count", DecimalType(38, 0), False),
        ("user_location", StringType(), True),
        ("lat", DoubleType(), True),
        ("long", DoubleType(), True),
        ("city", StringType(), True),
        ("country", StringType(), True),
        ("continent", StringType(), True),
        ("state", StringType(), True),
        ("state_code", StringType(), True),
        ("collected_at", TimestampType(), False),
    ]
])

if __name__ == "__main__":
    filePaths = [
        "Resources/hashtag_donaldtrump.csv",
        "Resources/hashtag_joebiden.csv",
    ]
    # Both input files share the same schema object.
    schemes = [dtypes] * 2
    spark = createSpark("tweet creator")
#---------------------------------------------------------------------------- ## Main functionality if __name__ == "__main__": main_config_file_filter = None errorCount = 0 workflowStartTime = datetime.datetime.now() if len(sys.argv) > 1: main_config_file = sys.argv[1] if len(sys.argv) > 2: main_config_file_filter = sys.argv[2] spark.udf.register('udfConvertInt', convertInt, IntegerType()) spark.udf.register('udfConvertDouble', convertDouble, DoubleType()) spark.udf.register('udfConvertDatetime', convertDatetime, TimestampType()) mainConfig = spark.read.load(main_config_file, format="csv", delimiter="|", header=True) #Opretaion|LoadType|threads|Server|Database|t|WhereClause|DeltaColumn|UniqueIdentifiers|PartitionColumn|TargetLocationRaw|TargetLocationCooked|TargetLocationTableSchema|HiveDatabase|HiveTable|Comments if (main_config_file_filter is not None): mainConfig = mainConfig.filter(main_config_file_filter) for row in mainConfig.collect(): try: print(
return False return True begin_re_pubmed = re.compile("^====") def is_text_pubmed(line): line = line.strip() if not line or begin_re_pubmed.match(line): return False return True schema = StructType([ StructField("fullText", StringType(), True), StructField("category", DoubleType(), False) ]) def load_article_wiki(category_name, category_id): text_file = spark.sparkContext.textFile("{}/*".format(category_name)) return text_file.filter(is_text_wiki).map( lambda l: (l, float(category_id))).toDF(schema) def load_article_pubmed(category_name, category_id): text_file = spark.sparkContext.textFile("{}/*".format(category_name)) return text_file.filter(is_text_wiki).map( lambda l: (l, float(category_id))).toDF(schema) bio_articles = load_article_pubmed("pubmed-AF-combine", 0) other_articles = load_article_wiki("enwiki", 1)
def noaa_s3_to_postgres(year):
    """Average one year of NOAA TMAX observations by month and location and
    load the result into the PostgreSQL table ``noaa_<year>_avg``.

    The station file and the yearly observation file are read from S3. The
    observations are filtered to TMAX rows whose station id contains 'US'
    and whose quality flag is null, joined to the station coordinates and
    averaged per (month, longitude, latitude). The table is dropped and
    recreated, filled, then given a PostGIS geography column and an index
    on it.

    :param year: year of the file to process. It is interpolated into the
        S3 path and into SQL identifiers, so it must come from a trusted
        caller. Any value ``str.format`` can render is accepted.
    """
    # first, handle the station info file
    station_schema = StructType([
        StructField('stationid', StringType(), True),
        StructField('latitude', DoubleType(), True),
        StructField('longitude', DoubleType(), True),
        StructField('elevation', DoubleType(), True),
        StructField('state', StringType(), True)])

    spark = SparkSession.builder.appName("Spark").config(
        "spark.driver.extraClassPath",
        "/home/ubuntu").config('spark.executor.extraClassPath',
                               '/home/ubuntu').getOrCreate()

    station_data = spark.read.csv(
        "s3a://yearly-weather-data/ghcnd-stations.csv",
        header=False,
        schema=station_schema)

    # now, deal with the annual file
    noaa_data_schema = StructType([
        StructField('stationid', StringType(), True),
        StructField('obsdate', StringType(), True),
        StructField('element', StringType(), True),
        StructField('dataval', StringType(), True),
        StructField('mflag', StringType(), True),
        StructField('qflag', StringType(), True),
        StructField('sflag', StringType(), True),
        StructField('obstime', StringType(), True)])

    # first, load file from S3
    file_path = "s3a://yearly-weather-data/{}.csv".format(year)
    noaa_data = spark.read.csv(file_path,
                               header=False,
                               schema=noaa_data_schema)

    # clean data
    noaa_data = noaa_data.filter(noaa_data['element'].contains('TMAX'))
    noaa_data = noaa_data.filter(noaa_data['stationid'].contains('US'))
    noaa_data = noaa_data.filter(noaa_data['qflag'].isNull())

    # add month column: characters 5-6 of obsdate
    noaa_data = noaa_data.withColumn(
        'month',
        substring('obsdate', 5, 2).cast(IntegerType()))

    # join to station data
    noaa_data = noaa_data.join(station_data, 'stationid',
                               'inner').drop('elevation', 'state', 'qflag',
                                             'obstime')

    # group data by month, longitude, latitude. then do average
    monthly_noaa_data = noaa_data.groupBy(
        'month', 'longitude',
        'latitude').agg(avg(col('dataval')).alias('dataval'))

    # now load noaa data to PostgreSQL
    newConnection = get_connection_by_config('database.ini',
                                             'postgresql_conn_data')
    # try/finally guarantees the cursor and connection are released even when
    # one of the SQL steps fails; nothing is committed in that case.
    try:
        cursor = newConnection.cursor()
        try:
            # NOTE(review): dataval is declared INTEGER although the column
            # is filled from an average - confirm the rounding is intended.
            noaa_table = '''
                DROP TABLE IF EXISTS noaa_{0}_avg;
                CREATE TABLE noaa_{0}_avg (
                    month INTEGER,
                    dataval INTEGER,
                    latitude REAL,
                    longitude REAL
                );
                '''.format(year)
            cursor.execute(noaa_table)

            # add selected cols from dataframe to database
            insert_command = '''
                INSERT INTO noaa_{}_avg(month, dataval, latitude, longitude)
                VALUES %s
                '''.format(year)
            noaa_arr = get_noaa_simple_array(monthly_noaa_data)
            execute_values(cursor, insert_command, noaa_arr, page_size=500)

            # add column with postgis point
            collComand = 'ALTER TABLE noaa_{}_avg ADD COLUMN geogcol geography(Point, 4326);'.format(
                year)
            cursor.execute(collComand)
            updateComand = 'UPDATE noaa_{}_avg SET geogcol = ST_SetSRID(ST_MakePoint(longitude, latitude), 4326);'.format(
                year)
            cursor.execute(updateComand)

            # add index
            indexComand = 'CREATE INDEX noaa_{0}_geog_index ON noaa_{0}_avg (geogcol) ;'.format(
                year)
            cursor.execute(indexComand)

            # commit changes to database
            newConnection.commit()
        finally:
            cursor.close()
    finally:
        newConnection.close()

    # format() instead of '+' so an int year does not raise TypeError
    print('Finished processing NOAA data year {}'.format(year))
from zoo.pipeline.api.keras.layers import Dense, Input, Flatten from zoo.pipeline.api.keras.models import * from zoo.pipeline.api.net import * from zoo.pipeline.nnframes import * sc = init_nncontext("ImageTransferLearningExample") model_path = "hdfs:///user/leelau/zoo/demo/bigdl_inception-v1_imagenet_0.4.0.model" image_path = "hdfs:///user/leelau/zoo/demo/*/*" imageDF = NNImageReader.readImages(image_path, sc) getName = udf( lambda row: re.search(r'(cat|dog)\.([\d]*)\.jpg', row[0], re.IGNORECASE). group(0), StringType()) getLabel = udf(lambda name: 1.0 if name.startswith('cat') else 2.0, DoubleType()) labelDF = imageDF.withColumn("name", getName(col("image"))) \ .withColumn("label", getLabel(col('name'))) (trainingDF, validationDF) = labelDF.randomSplit([0.9, 0.1]) labelDF.select("name", "label").show(10) # Fine-tune a pre-trained model # We fine-tune a pre-trained model by removing the last few layers, freezing the first few layers, and adding some new layers. transformer = ChainedPreprocessing([ RowToImageFeature(), ImageResize(256, 256), ImageCenterCrop(224, 224), ImageChannelNormalize(123.0, 117.0, 104.0), ImageMatToTensor(), ImageFeatureToTensor()
def epa_s3_to_postgres(pollutant_name, year):
    """Average one year of EPA daily data for a pollutant by month and
    location and load it into the PostgreSQL table ``<pollutant>_<year>_avg``.

    The daily CSV is read from S3, a month column is derived from
    ``obsdate``, and ``dataval`` is averaged per (month, longitude,
    latitude). The table is dropped and recreated, filled, then given a
    PostGIS geography column and an index on it.

    :param pollutant_name: one of 'ozone', 'pm25', 'no2'.
    :param year: year of the file to process. It is interpolated into the
        S3 path and into SQL identifiers, so it must come from a trusted
        caller.
    :raises KeyError: if ``pollutant_name`` is not a known pollutant.
    """
    pollutant_name_to_code = {'ozone': 44201, 'pm25': 88101, 'no2': 42602}

    epa_data_schema = StructType([
        StructField('statecode', StringType(), True),
        StructField('countycode', StringType(), True),
        StructField('sitecode', StringType(), True),
        StructField('parameter_code', IntegerType(), True),
        StructField('poc', IntegerType(), True),
        StructField('latitude', DoubleType(), True),
        StructField('longitude', DoubleType(), True),
        StructField('datum', StringType(), True),
        StructField('pollutantname', StringType(), True),
        StructField('sample_duration', StringType(), True),
        StructField('pollutant_standard', StringType(), True),
        StructField('obsdate', StringType(), True),
        StructField('unit', StringType(), True),
        StructField('event_type', StringType(), True),
        StructField('observation_count', IntegerType(), True),
        StructField('observation_percent', DoubleType(), True),
        StructField('dataval', DoubleType(), True),
        StructField('first_max_value', DoubleType(), True),
        StructField('first_max_hour', IntegerType(), True),
        StructField('aqi', IntegerType(), True),
        StructField('method_code', StringType(), True),
        StructField('method_name', StringType(), True),
        StructField('local_site_name', StringType(), True),
        StructField('address', StringType(), True),
        StructField('state_name', StringType(), True),
        StructField('county_name', StringType(), True),
        StructField('city_name', StringType(), True),
        StructField('cbsa_name', StringType(), True),
        StructField('date_of_last_change', DateType(), True)])

    spark = SparkSession.builder.appName("Spark").config(
        "spark.driver.extraClassPath",
        "/home/ubuntu").config('spark.executor.extraClassPath',
                               '/home/ubuntu').getOrCreate()

    # first, load file from S3
    file_path = 's3a://epa-aq-data/daily_{0}_{1}.csv'.format(
        pollutant_name_to_code[pollutant_name], year)
    epa_data = spark.read.csv(file_path, header=True, schema=epa_data_schema)
    print('got the epa spark data frame!')

    # add month (characters 6-7 of obsdate), do averaging by month
    epa_data = epa_data.withColumn(
        'month',
        substring('obsdate', 6, 2).cast(IntegerType()))
    monthly_epa_data = epa_data.groupBy('month', 'longitude', 'latitude').agg(
        avg(col('dataval')).alias('dataval'))

    # now load epa data to PostgreSQL
    newConnection = get_connection_by_config('database.ini',
                                             'postgresql_conn_data')
    # try/finally guarantees the cursor and connection are released even when
    # one of the SQL steps fails; nothing is committed in that case.
    try:
        cursor = newConnection.cursor()
        try:
            pollutantTable = '''
                DROP TABLE IF EXISTS {0}_{1}_avg;
                CREATE TABLE {0}_{1}_avg (
                    month INTEGER,
                    dataval REAL,
                    latitude REAL,
                    longitude REAL
                );
                '''.format(pollutant_name, year)
            cursor.execute(pollutantTable)
            print("made pollutant table in epageo database")

            # add selected cols from dataframe to database
            insert_command = '''
                INSERT INTO {0}_{1}_avg(month, dataval, latitude, longitude)
                VALUES %s
                '''.format(pollutant_name, year)
            epa_arr = get_epa_simple_array(monthly_epa_data)
            execute_values(cursor, insert_command, epa_arr, page_size=500)

            # add column with postgis point
            collComand = 'ALTER TABLE {0}_{1}_avg ADD COLUMN geogcol geography(Point, 4326);'.format(
                pollutant_name, year)
            cursor.execute(collComand)
            updateComand = 'UPDATE {0}_{1}_avg SET geogcol = ST_SetSRID(ST_MakePoint(longitude, latitude), 4326);'.format(
                pollutant_name, year)
            cursor.execute(updateComand)

            # add index
            indexComand = 'CREATE INDEX {0}_{1}_geog_index ON {0}_{1}_avg (geogcol) ;'.format(
                pollutant_name, year)
            cursor.execute(indexComand)

            # commit changes to database
            newConnection.commit()
        finally:
            cursor.close()
    finally:
        newConnection.close()

    print('Finished processing epa data pollutant {0} year {1}'.format(
        pollutant_name, year))
if __name__ == "__main__":
    sc = SparkContext()
    spark = SparkSession.builder.master("local").appName("Word Count").config(
        "spark.some.config.option", "some-value").getOrCreate()

    # Hyperparameters passed to the random forest below.
    num_trees = 50
    max_depth = 25

    df_train, df_test = get_dataframe(NUM_FEA)

    # Random Forest Classification
    ##########################################################################
    rf = RandomForestClassifier(numTrees=num_trees, maxDepth=max_depth)
    # The label column is cast to double before the model is fitted.
    model = rf.fit(
        df_train.withColumn("label", df_train["label"].cast(DoubleType())))
    pred = model.transform(df_test)
    pred = pred.withColumn("prediction", pred["prediction"].cast("int"))
    # Collect the predictions ordered by docid: the (docid, prediction) pairs
    # are sorted by key and only the prediction is kept.
    y_test = pred.select(
        "docid",
        "prediction").rdd.map(tuple).sortByKey().map(lambda x: x[1]).collect()

    # Accuracy
    # rdd_ytest = sc.textFile('gs://uga-dsp/project1/files/y_small_test.txt')
    # accuracy = cal_accuracy(rdd_ytest.collect(), y_test)
    # print('Testing Accuracy: %.2f %%' % (accuracy*100))
    # print('**********************************************')

    # Output file
print("Fitting for Submittal")
predictions = pipeline_model.transform(df)
predictions.select("MachineIdentifier", "probability",
                   "prediction").show(truncate=False)

print("Creating CSV for Submittal")


# Silly workaround for extracting an element from a dense or sparse vector. Probability column is a vector, with probs for each label
# https://stackoverflow.com/questions/39555864/how-to-access-element-of-a-vectorudt-column-in-a-spark-dataframe
def vector_item_(vector_column, index):
    """Return element ``index`` of ``vector_column`` as a float.

    Returns None when the element cannot be produced: the index is out of
    range (IndexError), the vector is None or not indexable (TypeError), or
    the element cannot be converted to float (ValueError).
    """
    try:
        return float(vector_column[index])
    except (ValueError, IndexError, TypeError):
        # Indexing failures raise IndexError/TypeError, not ValueError, so
        # they are caught here as well instead of failing the whole job.
        return None


vector_item = F.udf(vector_item_, DoubleType())

df_submit = predictions.withColumn("Label_0",
                                   vector_item("probability", F.lit(0)))
df_submit = df_submit.withColumn("Label_1",
                                 vector_item("probability", F.lit(1)))
# The submitted score is the probability stored at index 1.
df_submit = df_submit.withColumn("HasDetections", df_submit.Label_1)
df_submit = df_submit.select("MachineIdentifier", "HasDetections")

# Yet another workaround to write to a CSV file
submit_pdf = df_submit.coalesce(1).toPandas()
submit_pdf.to_csv(csv_path, header=True, index=False)
# Count the rows of the frame that was written rather than calling
# df_submit.count(), which would run the whole pipeline a second time.
print("Total rows written to file: {0}".format(len(submit_pdf)))
# Restrict the data to the selected columns and to pass/run plays, then drop
# every row that still contains a null.
nflSubset = nflDF[columns]
nflSubset = nflSubset.where((nflSubset['PlayType'] == 'Pass') |
                            (nflSubset['PlayType'] == 'Run'))
nflClean = nflSubset.dropna()

# COMMAND ----------

# Despite the list's name, these columns are cast to DoubleType below.
intColumns = [
    "down", "TimeSecs", "PlayTimeDiff", "yrdln", "yrdline100", "PosTeamScore",
    "DefTeamScore", "AbsScoreDiff"
]
# NOTE(review): if `col` is also imported from pyspark.sql.functions in this
# file, this loop variable shadows it from here on - confirm.
for col in intColumns:
    nflClean = nflClean.withColumn(col, nflClean[col].cast(DoubleType()))
# Nulls that remain after the cast are replaced with 0.
nflClean = nflClean.na.fill(0)

# COMMAND ----------

# The label indexer is fitted on the cleaned data right away; the team
# indexers below are only configured here.
labelIndexer = StringIndexer(inputCol="PlayType",
                             outputCol="indexedLabel").fit(nflClean)

# Converting all categorical variables into factor indexes
# All string values must be in a numerical format, unlike R, you are not able to create STRING "Factor" levels
PosTeamIndexer = StringIndexer(inputCol="posteam", outputCol="indexedPosTeam")
DefTeamIndexer = StringIndexer(inputCol="DefensiveTeam",
                               outputCol="indexedDefTeam")
HomeTeamIndexer = StringIndexer(inputCol="HomeTeam",
                                outputCol="indexedHomeTeam")
CREATE TABLE customer_counts USING DELTA LOCATION '{}' """.format(CustomerCountsPath)) # COMMAND ---------- from pyspark.sql.types import StructType, StructField, DoubleType, IntegerType, StringType inputSchema = StructType([ StructField("InvoiceNo", IntegerType(), True), StructField("StockCode", StringType(), True), StructField("Description", StringType(), True), StructField("Quantity", IntegerType(), True), StructField("InvoiceDate", StringType(), True), StructField("UnitPrice", DoubleType(), True), StructField("CustomerID", IntegerType(), True), StructField("Country", StringType(), True) ]) # COMMAND ---------- newDataPath = "/mnt/training/online_retail/outdoor-products/outdoor-products-small.csv" spark.sql("DROP TABLE IF EXISTS new_customer_counts") newDataDF = (spark.read.option("header", "true").schema(inputSchema).csv(newDataPath)) (newDataDF.groupBy("CustomerID", "Country").count().withColumnRenamed( "count", "total_orders").write.saveAsTable("new_customer_counts"))
# Report the row count and schema of the climate DataFrame.
print('QUANTIDADE DE REGISTROS: {}'.format(df_clima.count()))
df_clima.printSchema()

# Read the CSV of Ra parameters (semicolon-separated, with a header row)
print('EFETUANDO LEITURA DO CSV DE PARAMETROS DE Ra')
df_parametro_ra = spark.read.csv(
    'C:/projeto/TCC-PUPUNHA/datasets/parametroRa.csv', header=True, sep=';')
print(df_parametro_ra.columns)
print('QUANTIDADE DE REGISTROS: {}'.format(df_parametro_ra.count()))
df_parametro_ra.printSchema()

# Cast the listed climate columns to double.
fields_list = ['prcp', 'temp', 'tmax', 'tmin']
print('ALTERANDO AS VARIAVEIS {} DO DATAFRAME DF_CLIMA PARA DOUBLE'.format(
    fields_list))
for name in fields_list:
    df_clima = df_clima.withColumn(name, df_clima[name].cast(DoubleType()))

# Cast every column of the Ra parameter DataFrame to double.
print('ALTERANDO AS VARIAVEIS {} DO DATAFRAME DF_PARAMETRO_RA PARA DOUBLE'.
      format(df_parametro_ra.columns))
for name in df_parametro_ra.columns:
    df_parametro_ra = df_parametro_ra.withColumn(
        name, df_parametro_ra[name].cast(DoubleType()))

cidades_list = [
    'Pariquera-Açu', 'Barra do Turvo', 'Itariri', 'Cananéia',
    'Pedro de Toledo', 'Iporanga', 'Eldorado', 'Miracatu', 'Cajati',
    'Sete Barras', 'Juquiá', 'Jacupiranga', 'Ilha Comprida', 'Registro',
    'Iguape'
]

# select the cities present in the 'cidades' list
print('SELECIONANDO CIDADES PRESENTES NO VALE DO RIBEIRA')
def __init__(self):
    """
    Set up the type-mapping tables and the per-dataset table registry.

    Each key of ``self.attributes`` names a piece of data that holoclean
    needs to create:

    id : unique id of the dataset, used when registering and retrieving
        data
    Init : initial data loaded into the database from the file supplied by
        the user
    C_clean : table with the indices of the clean cells
    C_dk : table with the indices of the cells for which it is not known
        whether they are noisy or clean
    C_dk_temp : table with the indices of the cells that are not known to
        be noisy or clean, based on the dcs
    C_dk_temp_null : table with the indices of the cells that are not known
        to be noisy or clean, based on the null cells
    T1_attributes : attributes of the first tuple in dc grounding
    T2_attributes : attributes of the second tuple in dc grounding
    Possible_values : table of all possible values for the don't-know cells
    Observed_Possible_values_clean : table with the observed values for the
        clean cells
    Observed_Possible_values_dk : table with the observed values for the
        don't-know cells
    C_clean_flat : table of the clean cells flattened into three columns
        (index, attribute, and value)
    C_dk_flat : table of the don't-know cells flattened into three columns
        (index, attribute, and value)
    Kij_lookup : table with the cardinality of the domain for each cell
    Init_join : self join of the Init table
    Map_schema : table with the schema of the Init table
    Init_flat_join : self join of the C_clean_flat table
    Init_flat_join_dk : self join of the C_dk_flat table
    Feature_id_map : table that maps each feature to a number
    Sources : table that maps each source to a number
    Sources_temp : temporary table for saving the sources
    Attribute_temp : temporary table for saving the attributes
    Dimensions_clean : table with the dimensions of the X tensor for
        training
    Dimensions_dk : table with the dimensions of the X tensor for learning
    Inferred_values : table with the inferred values
    Repaired_dataset : dataset table after the repairs are applied to the
        initial data
    Correct : table with the correct values for the dataset
    Correct_flat : table with the correct data flattened into three columns
        (index, attribute, and value)
    Feature : table with the feature value for each random variable and
        assigned value
    """

    # holds casting information to cast from pyspark datatype to python types
    # NOTE(review): `unicode` is a Python 2 builtin; this line raises
    # NameError on Python 3 - confirm the target interpreter.
    self.type_dict = {
        IntegerType().simpleString(): int,
        StringType().simpleString(): unicode,
        DoubleType().simpleString(): float,
        LongType().simpleString(): int,
        FloatType().simpleString(): float
    }
    # maps the same pyspark type names to SQL column type names
    self.sql_type_dict = {
        IntegerType().simpleString(): 'INTEGER',
        StringType().simpleString(): 'VARCHAR(255)',
        DoubleType().simpleString(): 'DOUBLE PRECISION',
        LongType().simpleString(): 'BIGINT',
        FloatType().simpleString(): 'REAL'
    }
    self.attribute = {}
    self.schema = ""
    self.dataset_tables_specific_name = []
    self.dataset_id = self._id_generator()
    # Entries with a fixed layout carry a StructType schema; every other
    # entry starts as an empty list.
    self.attributes = {
        'id': [],
        'Init': [],
        'C_clean': [],
        'C_dk': [],
        'C_dk_temp': [],
        'C_dk_temp_null': [],
        'T1_attributes': [],
        'T2_attributes': [],
        'Possible_values':
        StructType([
            StructField("vid", IntegerType(), True),
            StructField("tid", IntegerType(), False),
            StructField("attr_name", StringType(), False),
            StructField("attr_val", StringType(), False),
            StructField("observed", IntegerType(), False),
            StructField("domain_id", IntegerType(), True)
        ]),
        'Observed_Possible_values_clean': [],
        'Observed_Possible_values_dk': [],
        'C_clean_flat':
        StructType([
            StructField("tid", IntegerType(), False),
            StructField("attribute", StringType(), False),
            StructField("value", StringType(), True)
        ]),
        'C_dk_flat':
        StructType([
            StructField("tid", IntegerType(), False),
            StructField("attribute", StringType(), False),
            StructField("value", StringType(), True)
        ]),
        'Kij_lookup':
        StructType([
            StructField("vid", IntegerType(), True),
            StructField("tid", IntegerType(), False),
            StructField("attr_name", StringType(), False),
            StructField("k_ij", IntegerType(), False),
        ]),
        'Init_join': [],
        'Map_schema':
        StructType([
            StructField("attr_id", IntegerType(), False),
            StructField("attribute", StringType(), True)
        ]),
        'Init_flat_join_dk': [],
        'Init_flat_join': [],
        'Feature_id_map':
        StructType([
            StructField("feature_ind", IntegerType(), True),
            StructField("attribute", StringType(), False),
            StructField("value", StringType(), False),
            StructField("Type", StringType(), False),
        ]),
        'Sources': [],
        'Sources_temp': [],
        'Attribute_temp': [],
        'Dimensions_clean': [],
        'Dimensions_dk': [],
        'Inferred_values':
        StructType([
            StructField("probability", DoubleType(), False),
            StructField("vid", IntegerType(), False),
            StructField("attr_name", StringType(), False),
            StructField("attr_val", StringType(), False),
            StructField("tid", IntegerType(), False),
            StructField("domain_id", IntegerType(), False)
        ]),
        'Repaired_dataset': [],
        'Correct': [],
        'Correct_flat': [],
        'Feature':
        StructType([
            StructField("vid", IntegerType(), False),
            StructField("assigned_val", IntegerType(), False),
            StructField("feature", IntegerType(), False),
            StructField("count", IntegerType(), False)
        ])}