model_path = "hdfs://VM10-1-0-14:9000/classifier/"+model_name+str(i) updated_model = model_name+str(i) data_path = model_path+"/data/part-r*" metadata_path = model_path+"/metadata/part-00000" if(patherror(data_path) == False and patherror(metadata_path) == False): break #load model classifier model = NaiveBayesModel.load(sc, model_path) start = time.time() reviews_label = reviews.map(lambda x: 0.0 if x[0] > 3.0 else 1.0) Words = Row('label', 'words') words = reviews.map(lambda r: Words(*r)) words_df = spark.createDataFrame(words) #review tokenization token = RegexTokenizer(minTokenLength=2, pattern="[^A-Za-z]+", inputCol="words", outputCol="token", toLowercase=True) token_filtered = token.transform(words_df) #stopwords elimination remover = StopWordsRemover(inputCol="token", outputCol="stopwords", caseSensitive=False) stopwords_filtered = remover.transform(token_filtered) prep_filtered = (stopwords_filtered.select('stopwords').rdd).map(lambda x: x[0]) #tf-idf calculation tf = HashingTF(numFeatures=numFeatures).transform(prep_filtered.map(porter_stem, preservesPartitioning=True)) idf = IDF().fit(tf) tfidf = idf.transform(tf)
def convert(
    spark: SparkSession,
    dataset_root: str,
    limit: int = 0,
    asset_dir: Optional[str] = None,
) -> DataFrame:
    """Convert a COCO dataset into a Rikai dataset.

    This function expects the COCO dataset to be stored in a directory
    with the following structure:

    - dataset
        - annotations
            - captions_train2017.json
            - instances_train2017.json
            - ...
        - train2017
        - val2017
        - test2017

    Parameters
    ----------
    spark : SparkSession
        A live spark session
    dataset_root : str
        The directory of the dataset
    limit : int, optional
        The number of images of each split to be converted.
    asset_dir : str, optional
        The asset directory to store images; can be an S3 directory.

    Returns
    -------
    DataFrame
        Returns a Spark DataFrame
    """
    train_json = os.path.join(dataset_root, "annotations", "instances_train2017.json")
    val_json = os.path.join(dataset_root, "annotations", "instances_val2017.json")

    categories = load_categories(train_json)
    examples = []
    for split, anno_file in zip(["train", "val"], [train_json, val_json]):
        coco = COCO(annotation_file=anno_file)
        # COCO has native dependencies, so we do not distribute it
        # to the workers.
        image_ids = coco.imgs
        if limit > 0:
            image_ids = islice(image_ids, limit)
        for image_id in image_ids:
            ann_id = coco.getAnnIds(imgIds=image_id)
            annotations = coco.loadAnns(ann_id)
            annos = []
            for ann in annotations:
                bbox = Box2d.from_top_left(*ann["bbox"])
                annos.append({
                    "category_id": ann["category_id"],
                    "category_text": categories[ann["category_id"]]["name"],
                    "bbox": bbox,
                    "area": float(ann["area"]),
                })
            image_payload = coco.loadImgs(ids=image_id)[0]
            example = {
                "image_id": image_id,
                "annotations": annos,
                "image": Image(
                    os.path.abspath(
                        os.path.join(
                            dataset_root,
                            "{}2017".format(split),
                            image_payload["file_name"],
                        ))),
                "split": split,
            }
            examples.append(example)

    schema = StructType([
        StructField("image_id", LongType(), False),
        StructField(
            "annotations",
            ArrayType(
                StructType([
                    StructField("category_id", IntegerType()),
                    StructField("category_text", StringType()),
                    StructField("area", FloatType()),
                    StructField("bbox", Box2dType()),
                ])),
            False,
        ),
        StructField("image", ImageType(), False),
        StructField("split", StringType(), False),
    ])
    df = spark.createDataFrame(examples, schema=schema)
    if asset_dir:
        asset_dir = asset_dir if asset_dir.endswith("/") else asset_dir + "/"
        print("ASSET DIR: ", asset_dir)
        df = df.withColumn("image", image_copy(col("image"), lit(asset_dir)))
    return df
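A minimal usage sketch for convert(); the session setup, dataset location, and bucket path are illustrative assumptions, not values from the original.

spark = SparkSession.builder.appName("coco-to-rikai").getOrCreate()
df = convert(spark, "/data/coco", limit=100, asset_dir="s3://my-bucket/rikai/images/")
df.printSchema()
df.select("image_id", "split").show(5)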
class DeltaTableTests(PySparkTestCase):

    def setUp(self):
        super(DeltaTableTests, self).setUp()
        self.sqlContext = SQLContext(self.sc)
        self.spark = SparkSession(self.sc)
        self.tempPath = tempfile.mkdtemp()
        self.tempFile = os.path.join(self.tempPath, "tempFile")

    def tearDown(self):
        self.spark.stop()
        shutil.rmtree(self.tempPath)
        super(DeltaTableTests, self).tearDown()

    def test_forPath(self):
        self.__writeDeltaTable([('a', 1), ('b', 2), ('c', 3)])
        dt = DeltaTable.forPath(self.spark, self.tempFile).toDF()
        self.__checkAnswer(dt, [('a', 1), ('b', 2), ('c', 3)])

    def test_alias_and_toDF(self):
        self.__writeDeltaTable([('a', 1), ('b', 2), ('c', 3)])
        dt = DeltaTable.forPath(self.spark, self.tempFile).toDF()
        self.__checkAnswer(
            dt.alias("myTable").select('myTable.key', 'myTable.value'),
            [('a', 1), ('b', 2), ('c', 3)])

    def test_history(self):
        self.__writeDeltaTable([('a', 1), ('b', 2), ('c', 3)])
        self.__overwriteDeltaTable([('a', 3), ('b', 2), ('c', 1)])
        dt = DeltaTable.forPath(self.spark, self.tempFile)
        operations = dt.history().select('operation')
        self.__checkAnswer(
            operations,
            [Row("WRITE"), Row("WRITE")],
            StructType([StructField("operation", StringType(), True)]))

        lastMode = dt.history(1).select('operationParameters.mode')
        self.__checkAnswer(
            lastMode,
            [Row("Overwrite")],
            StructType([StructField("operationParameters.mode", StringType(), True)]))

    def test_vacuum(self):
        self.__writeDeltaTable([('a', 1), ('b', 2), ('c', 3)])
        dt = DeltaTable.forPath(self.spark, self.tempFile)
        self.__createFile('abc.txt', 'abcde')
        self.__createFile('bac.txt', 'abcdf')
        self.assertEqual(True, self.__checkFileExists('abc.txt'))
        dt.vacuum()  # will not delete files as default retention is used.
        self.assertEqual(True, self.__checkFileExists('bac.txt'))

        retentionConf = "spark.databricks.delta.retentionDurationCheck.enabled"
        self.spark.conf.set(retentionConf, "false")
        dt.vacuum(0.0)
        self.spark.conf.set(retentionConf, "true")
        self.assertEqual(False, self.__checkFileExists('bac.txt'))
        self.assertEqual(False, self.__checkFileExists('abc.txt'))

    def test_convertToDelta(self):
        df = self.spark.createDataFrame([('a', 1), ('b', 2), ('c', 3)], ["key", "value"])
        df.write.format("parquet").save(self.tempFile)
        self.tempFile2 = self.tempFile + "_"
        dt = DeltaTable.convertToDelta(self.spark, "parquet.`" + self.tempFile + "`")
        self.__checkAnswer(
            self.spark.read.format("delta").load(self.tempFile),
            [('a', 1), ('b', 2), ('c', 3)])

        # test if convert to delta with partition columns works
        df.write.partitionBy("value").format("parquet").save(self.tempFile2)
        schema = StructType()
        schema.add("value", IntegerType(), True)
        dt = DeltaTable.convertToDelta(
            self.spark, "parquet.`" + self.tempFile2 + "`", schema)
        self.__checkAnswer(
            self.spark.read.format("delta").load(self.tempFile2),
            [('a', 1), ('b', 2), ('c', 3)])

    def __checkAnswer(self, df, expectedAnswer, schema=["key", "value"]):
        if not expectedAnswer:
            self.assertEqual(df.count(), 0)
            return
        expectedDF = self.spark.createDataFrame(expectedAnswer, schema)
        self.assertEqual(df.count(), expectedDF.count())
        self.assertEqual(len(df.columns), len(expectedDF.columns))
        self.assertEqual([], df.subtract(expectedDF).take(1))
        self.assertEqual([], expectedDF.subtract(df).take(1))

    def __writeDeltaTable(self, datalist):
        df = self.spark.createDataFrame(datalist, ["key", "value"])
        df.write.format("delta").save(self.tempFile)

    def __overwriteDeltaTable(self, datalist):
        df = self.spark.createDataFrame(datalist, ["key", "value"])
        df.write.format("delta").mode("overwrite").save(self.tempFile)

    def __createFile(self, fileName, content):
        with open(os.path.join(self.tempFile, fileName), 'w') as f:
            f.write(content)

    def __checkFileExists(self, fileName):
        return os.path.exists(os.path.join(self.tempFile, fileName))
def getKeyValue(line):
    # the original snippet was missing this def line; the name is taken
    # from the map(getKeyValue) call below
    line = [int(ele) for ele in line]
    return (line[0], line[1:])

# item_user_mat = sc.textFile("/Users/sohinimitra/Documents/itemusermat").map(lambda x: x.split(" ")).map(lambda x: [[x[0], x[1:]] for y in x])
item_user_mat = sc.textFile("/Users/sohinimitra/Documents/itemusermat").map(
    lambda x: x.split(" "))
item_user_mat = item_user_mat.map(getKeyValue)

# (index, movieId) pairs, used later to map predictions back to movie ids
ratings = item_user_mat.map(lambda x: x[0]).zipWithIndex().map(lambda x: (x[1], x[0]))

data = [(Vectors.dense(x[1]),) for x in item_user_mat.collect()]
item_user_mat_df = spark.createDataFrame(data, ["features"])

kmeans = KMeans(k=10, seed=1)
model = kmeans.fit(item_user_mat_df)
transformed = model.transform(item_user_mat_df).select("features", "prediction")

transformed_with_index = transformed.rdd.zipWithIndex()
rows = transformed_with_index.collect()
prediction_with_index = sc.parallelize(rows).map(lambda x: (x[1], x[0].prediction))

ratingsPrediction = ratings.join(prediction_with_index).map(lambda x: x[1])

movie = sc.textFile("/Users/sohinimitra/Documents/movies.dat").map(
    lambda x: x.split("::")).map(lambda x: (int(x[0]), (x[1], x[2])))
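The snippet stops right after loading movies.dat; a hedged continuation, assuming the goal is to list a few movies per cluster (a common follow-up for this kind of exercise, not part of the original):

movieClusters = movie.join(ratingsPrediction)  # (movieId, ((title, genre), cluster))
for cluster_id in range(10):
    sample = movieClusters.filter(lambda x: x[1][1] == cluster_id).take(5)
    print("cluster", cluster_id)
    for movie_id, ((title, genre), _) in sample:
        print("  ", movie_id, title, genre)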
spark = SparkSession(sc)

# Load dataset file as RDD
rdd = sc.textFile("/user/spark/airfoil.txt")
rdd = rdd.map(lambda x: x.split('\t'))
rdd = rdd.map(lambda x: [
    float(x[0]), float(x[1]), float(x[2]), float(x[3]), float(x[4]), float(x[5])
])

# Create dataframe for ML model
df = spark.createDataFrame(
    rdd, ["frequency", "angle", "chord", "velocity", "suction", "pressure"])
data = df.rdd.map(lambda x: (DenseVector(x[:-1]), x[-1]))
df = spark.createDataFrame(data, ["features", "label"])

# Feature scaling
standardScaler = StandardScaler(inputCol="features", outputCol="features_scaled")
scaler = standardScaler.fit(df)
scaled_df = scaler.transform(df)

# Split data into training and test sets
train_data, test_data = scaled_df.randomSplit([.7, .3], seed=1234)
train_data = train_data.select("features_scaled", "label")
test_data = test_data.select("features_scaled", "label")
train_data = train_data.withColumnRenamed("features_scaled", "features")
test_data = test_data.withColumnRenamed("features_scaled", "features")
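The preparation stops before any model is trained; a minimal continuation sketch, assuming linear regression is the intended model for this airfoil dataset (an assumption, not stated in the original):

from pyspark.ml.regression import LinearRegression

lr = LinearRegression(featuresCol="features", labelCol="label")
lr_model = lr.fit(train_data)
predictions = lr_model.transform(test_data)
print("RMSE on training data:", lr_model.summary.rootMeanSquaredError)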
# In[116]:

from pyspark.sql.window import Window
from pyspark.sql.functions import rank, col


def toCSV(_, records):
    for x in records:
        # Assumed record layout: ((product, year), (total, companies, top_percent));
        # the original left these names unbound.
        (product, year), (total, companies, top_percent) = x
        if x[0][1] == 0:
            product = '"{}"'.format(product)
        yield ','.join((product, year, str(total), str(companies), str(top_percent)))


rdd = sc.parallelize(countsPerNeighborhood)
df = spark.createDataFrame(rdd)
window_spec = Window.partitionBy(df[1][1]).orderBy(df[1][0].desc())
df = df.select('*', rank().over(window_spec).alias('rank')).filter(col('rank') <= 3)

# In[122]:

from pyspark.sql import functions as f
from pyspark.sql import types as t


def newCols(x):
    return names[x]


finaldf = f.udf(newCols, t.StringType())
def test_multiple_join(
    spark: SparkSession,
    composite_entity_schema: StructType,
    customer_feature_schema: StructType,
    driver_feature_schema: StructType,
):
    entity_data = [
        (1001, 8001, datetime(year=2020, month=9, day=2)),
        (1001, 8002, datetime(year=2020, month=9, day=2)),
        (2001, 8002, datetime(year=2020, month=9, day=3)),
    ]
    entity_df = spark.createDataFrame(
        spark.sparkContext.parallelize(entity_data), composite_entity_schema)

    customer_table_data = [
        (1001, datetime(year=2020, month=9, day=1),
         datetime(year=2020, month=9, day=1), 100.0),
        (2001, datetime(year=2020, month=9, day=1),
         datetime(year=2020, month=9, day=1), 200.0),
    ]
    customer_table_df = spark.createDataFrame(
        spark.sparkContext.parallelize(customer_table_data), customer_feature_schema)
    customer_table = FeatureTable(
        name="transactions",
        features=[Field("daily_transactions", "double")],
        entities=[Field("customer_id", "int32")],
        max_age=86400,
    )
    customer_table_df = filter_feature_table_by_time_range(
        customer_table_df,
        customer_table,
        "event_timestamp",
        entity_df,
        "event_timestamp",
    )

    driver_table_data = [
        (8001, datetime(year=2020, month=8, day=31),
         datetime(year=2020, month=8, day=31), 200),
        (8001, datetime(year=2020, month=9, day=1),
         datetime(year=2020, month=9, day=1), 300),
        (8002, datetime(year=2020, month=9, day=1),
         datetime(year=2020, month=9, day=1), 600),
        (8002, datetime(year=2020, month=9, day=1),
         datetime(year=2020, month=9, day=2), 500),
    ]
    driver_table_df = spark.createDataFrame(
        spark.sparkContext.parallelize(driver_table_data), driver_feature_schema)
    driver_table = FeatureTable(
        name="bookings",
        features=[Field("completed_bookings", "int32")],
        entities=[Field("driver_id", "int32")],
        max_age=7 * 86400,
    )
    driver_table_df = filter_feature_table_by_time_range(
        driver_table_df,
        driver_table,
        "event_timestamp",
        entity_df,
        "event_timestamp",
    )

    joined_df = join_entity_to_feature_tables(
        entity_df,
        "event_timestamp",
        [customer_table_df, driver_table_df],
        [customer_table, driver_table],
    )

    expected_joined_schema = StructType([
        StructField("customer_id", IntegerType()),
        StructField("driver_id", IntegerType()),
        StructField("event_timestamp", TimestampType()),
        StructField("transactions__daily_transactions", FloatType()),
        StructField("bookings__completed_bookings", IntegerType()),
    ])
    expected_joined_data = [
        (1001, 8001, datetime(year=2020, month=9, day=2), 100.0, 300),
        (1001, 8002, datetime(year=2020, month=9, day=2), 100.0, 500),
        (2001, 8002, datetime(year=2020, month=9, day=3), None, 500),
    ]
    expected_joined_df = spark.createDataFrame(
        spark.sparkContext.parallelize(expected_joined_data), expected_joined_schema)

    assert_dataframe_equal(joined_df, expected_joined_df)
import findspark
findspark.init()

from pyspark import SparkContext
from pyspark.ml.feature import OneHotEncoder, StringIndexer
from pyspark.sql import SparkSession

sc = SparkContext("local", "Features - OneHotEncoder")
spark = SparkSession(sc)

df = spark.createDataFrame([
    (0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")
], ["id", "category"])

stringIndexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
model = stringIndexer.fit(df)
indexed = model.transform(df)

# Note: this targets Spark 2.x, where OneHotEncoder is a Transformer.
# In Spark 3.x it is an Estimator, so it would be
# encoder.fit(indexed).transform(indexed) instead.
encoder = OneHotEncoder(inputCol="categoryIndex", outputCol="categoryVec")
# encoder.setDropLast(False)
encoded = encoder.transform(indexed)
encoded.show()

spark.stop()
def test_join_with_composite_entity(
    spark: SparkSession,
    composite_entity_schema: StructType,
    rating_feature_schema: StructType,
):
    entity_data = [
        (1001, 8001, datetime(year=2020, month=9, day=1)),
        (1001, 8002, datetime(year=2020, month=9, day=3)),
        (1001, 8003, datetime(year=2020, month=9, day=1)),
        (2001, 8001, datetime(year=2020, month=9, day=2)),
    ]
    entity_df = spark.createDataFrame(
        spark.sparkContext.parallelize(entity_data), composite_entity_schema)

    feature_table_data = [
        (1001, 8001, datetime(year=2020, month=9, day=1),
         datetime(year=2020, month=9, day=1), 3.0, 5.0),
        (1001, 8002, datetime(year=2020, month=9, day=1),
         datetime(year=2020, month=9, day=1), 4.0, 3.0),
        (2001, 8001, datetime(year=2020, month=9, day=1),
         datetime(year=2020, month=9, day=1), 4.0, 4.5),
    ]
    feature_table_df = spark.createDataFrame(
        spark.sparkContext.parallelize(feature_table_data),
        rating_feature_schema,
    )
    feature_table = FeatureTable(
        name="ratings",
        features=[
            Field("customer_rating", "double"),
            Field("driver_rating", "double"),
        ],
        entities=[Field("customer_id", "int32"), Field("driver_id", "int32")],
        max_age=86400,
    )
    feature_table_df = filter_feature_table_by_time_range(
        feature_table_df,
        feature_table,
        "event_timestamp",
        entity_df,
        "event_timestamp",
    )

    joined_df = as_of_join(
        entity_df,
        "event_timestamp",
        feature_table_df,
        feature_table,
    )

    expected_joined_schema = StructType([
        StructField("customer_id", IntegerType()),
        StructField("driver_id", IntegerType()),
        StructField("event_timestamp", TimestampType()),
        StructField("ratings__customer_rating", FloatType()),
        StructField("ratings__driver_rating", FloatType()),
    ])
    expected_joined_data = [
        (1001, 8001, datetime(year=2020, month=9, day=1), 3.0, 5.0),
        (1001, 8002, datetime(year=2020, month=9, day=3), None, None),
        (1001, 8003, datetime(year=2020, month=9, day=1), None, None),
        (2001, 8001, datetime(year=2020, month=9, day=2), 4.0, 4.5),
    ]
    expected_joined_df = spark.createDataFrame(
        spark.sparkContext.parallelize(expected_joined_data), expected_joined_schema)

    assert_dataframe_equal(joined_df, expected_joined_df)
def test_select_subset_of_columns_as_entity_primary_keys(
    spark: SparkSession,
    composite_entity_schema: StructType,
    customer_feature_schema: StructType,
):
    entity_data = [
        (1001, 8001, datetime(year=2020, month=9, day=2)),
        (2001, 8002, datetime(year=2020, month=9, day=2)),
    ]
    entity_df = spark.createDataFrame(
        spark.sparkContext.parallelize(entity_data), composite_entity_schema)

    feature_table_data = [
        (1001, datetime(year=2020, month=9, day=1),
         datetime(year=2020, month=9, day=2), 100.0),
        (2001, datetime(year=2020, month=9, day=1),
         datetime(year=2020, month=9, day=1), 400.0),
    ]
    feature_table_df = spark.createDataFrame(
        spark.sparkContext.parallelize(feature_table_data), customer_feature_schema)
    feature_table = FeatureTable(
        name="transactions",
        features=[Field("daily_transactions", "double")],
        entities=[Field("customer_id", "int32")],
        max_age=86400,
    )
    feature_table_df = filter_feature_table_by_time_range(
        feature_table_df,
        feature_table,
        "event_timestamp",
        entity_df,
        "event_timestamp",
    )

    joined_df = as_of_join(
        entity_df,
        "event_timestamp",
        feature_table_df,
        feature_table,
    )

    expected_joined_schema = StructType([
        StructField("customer_id", IntegerType()),
        StructField("driver_id", IntegerType()),
        StructField("event_timestamp", TimestampType()),
        StructField("transactions__daily_transactions", FloatType()),
    ])
    expected_joined_data = [
        (1001, 8001, datetime(year=2020, month=9, day=2), 100.0),
        (2001, 8002, datetime(year=2020, month=9, day=2), 400.0),
    ]
    expected_joined_df = spark.createDataFrame(
        spark.sparkContext.parallelize(expected_joined_data), expected_joined_schema)

    assert_dataframe_equal(joined_df, expected_joined_df)
                                 encoding='gb18030')
        yjjd = yjjd.join(app, yjjd.realAppID == app.app_id,
                         how='inner').select('sn', 'realAppID', 'ch_name', 'group_ch')
        yjjd = yjjd.distinct().dropna()
        yjjd_count = yjjd.count()
        # counts per app category: video streaming (影音试听),
        # online shopping (网络购物), online gaming (网络游戏)
        shipin = yjjd.filter(yjjd.group_ch == '影音试听').count()
        gouwu = yjjd.filter(yjjd.group_ch == '网络购物').count()
        youxi = yjjd.filter(yjjd.group_ch == '网络游戏').count()
        # 'qingxu' = sentiment score: share of activity outside the three categories
        ddd = [{
            'sn': '%s' % (n),
            'qingxu': 0 if yjjd_count == 0 else
                      (yjjd_count - shipin - gouwu - youxi) / yjjd_count
        }]
        ddd = spark.createDataFrame(ddd).select(
            'sn', bround('qingxu', 2).alias('qingxu'))
        ddd = ddd.withColumnRenamed('sn', 'sn1')
        df = df.join(ddd, df.sn == ddd.sn1, how='inner')
        df = df.select('time', 'sn', 'count', 'avg_ht', 'avg_bi',
                       'std_ht', 'std_bi', 'qingxu')
        df = df.repartition(1)
        df.write.csv('/user/maxnet/ian/corp_index_1/%s_%s' % (n, i),
                     header=True, compression='gzip', mode='overwrite')
    except:
        pass
def test_auto_mapper_concat_multiple_items_structs_different_elements_with_schema(
    spark_session: SparkSession,
) -> None:
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran"),
            (2, None, "Michael"),
        ],
        ["member_id", "last_name", "first_name"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")
    df: DataFrame = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    schema: StructType = StructType([
        StructField("id", StringType(), True),
        StructField("c", StringType(), True),
        StructField("b", StringType(), True),
    ])

    # Act
    mapper = AutoMapper(
        view="members",
        source_view="patients",
        keys=["member_id"],
        drop_key_columns=False,
    ).columns(dst2=AutoMapperList(
        [
            AutoMapperDataTypeComplexBase(id_=A.column("first_name"),
                                          b=A.column("last_name")),
        ],
        children_schema=schema,
    ).concat(
        AutoMapperList(
            [
                AutoMapperDataTypeComplexBase(id_=A.column("first_name"),
                                              c=A.column("last_name")),
            ],
            children_schema=schema,
        )))

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    struct1 = struct(
        col("b.first_name").alias("id"),
        lit(None).alias("c"),
        col("b.last_name").alias("b"),
    )
    struct2 = struct(
        col("b.first_name").alias("id"),
        col("b.last_name").alias("c"),
        lit(None).alias("b"),
    )
    array1 = when(
        array(struct1).isNotNull(),
        filter(coalesce(array(struct1), array()), lambda x: x.isNotNull()),
    )
    array2 = when(
        array(struct2).isNotNull(),
        filter(coalesce(array(struct2), array()), lambda x: x.isNotNull()),
    )
    assert_compare_expressions(sql_expressions["dst2"],
                               concat(array1, array2).alias("dst2"))

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()

    assert (result_df.where("member_id == 1").select("dst2").collect()[0][0][0][0]
            == "Imran")
    assert (result_df.where("member_id == 1").select("dst2").collect()[0][0][0][2]
            == "Qureshi")
    assert (result_df.where("member_id == 2").select("dst2").collect()[0][0][0][0]
            == "Michael")
    assert (result_df.where("member_id == 2").select("dst2").collect()[0][0][0][1]
            is None)
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.ml.tuning import CrossValidatorModel
from pyspark.ml import PipelineModel
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

sc = SparkContext(appName="MyFirstApp4_Task_task2")
spark = SparkSession(sc)

df_node18 = spark.read.format("parquet").load(
    path="hdfs://namenode:9000/example4/test.parquet")
model_node21 = CrossValidatorModel.load("hdfs://namenode:9000/example4/model_2/")
model_node19 = PipelineModel.load("hdfs://namenode:9000/example4/model_1/")

df_node20 = model_node19.transform(df_node18)
df_node22 = model_node21.transform(df_node20)

evaluator_node23 = MulticlassClassificationEvaluator(
    labelCol="indexedSurvived", predictionCol="prediction", metricName="accuracy")
score_node23 = evaluator_node23.evaluate(df_node22)

df_node23 = spark.createDataFrame([(score_node23,)], ["score"])
df_node23.write.format("csv").save(path="hdfs://namenode:9000/example4/EvalResult3.csv")
sqlContext = SQLContext(sc)

df = spark.read.csv('file:////home/ubuntu/ys-180326/Dataset75.csv', header=True)
data = df.rdd.map(list)
print(data.first())

score = data.map(lambda s: 1.0 if s[1].isdigit() and float(s[1]) == 1.0 else 0.0)
comment = data.map(lambda s: s[3])
split_neg_data2 = score.zip(comment)
transform_data = split_neg_data2.map(
    lambda p: (p[0], p[1]))  # .toDF().withColumnRenamed('_1', 'label')
# transform_data.show()
# sentenceData = spark.createDataFrame([(0, "I heard about Spark and I love Spark"), (0, "I wish Java could use case classes"), (1, "Logistic regression models are neat")]).toDF("label", "sentence")
sentenceData = spark.createDataFrame(transform_data, ["label", "sentence"])

tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
wordsData = tokenizer.transform(sentenceData)

# compute TF-IDF
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=3000)
featurizedData = hashingTF.transform(wordsData)
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)
rescaledData.select("label", "features").show()

forData = StringIndexer().setInputCol("label").setOutputCol("indexed").fit(
    rescaledData).transform(rescaledData)
(trainingData, testData) = forData.randomSplit([0.8, 0.2], seed=0)
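The split is the last step shown; a hedged continuation, assuming a classifier is trained next on the indexed labels (Naive Bayes is one plausible choice for TF-IDF features, not stated in the original):

from pyspark.ml.classification import NaiveBayes

nb = NaiveBayes(labelCol="indexed", featuresCol="features")
nbModel = nb.fit(trainingData)
predictions = nbModel.transform(testData)
predictions.select("indexed", "prediction").show(5)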
def test_historical_feature_retrieval_with_mapping(spark: SparkSession):
    test_data_dir = path.join(pathlib.Path(__file__).parent.absolute(), "data")
    entity_source = {
        "file": {
            "format": {"json_class": "CSVFormat"},
            "path": f"file://{path.join(test_data_dir, 'column_mapping_test_entity.csv')}",
            "event_timestamp_column": "event_timestamp",
            "field_mapping": {"customer_id": "id"},
            "options": {"inferSchema": "true", "header": "true"},
        }
    }
    booking_source = {
        "file": {
            "format": {"json_class": "CSVFormat"},
            "path": f"file://{path.join(test_data_dir, 'column_mapping_test_feature.csv')}",
            "event_timestamp_column": "datetime",
            "created_timestamp_column": "created_datetime",
            "options": {"inferSchema": "true", "header": "true"},
        }
    }
    booking_table = {
        "name": "bookings",
        "entities": [{"name": "customer_id", "type": "int32"}],
        "features": [{"name": "total_bookings", "type": "int32"}],
        "max_age": 86400,
    }

    joined_df = retrieve_historical_features(
        spark,
        entity_source,
        [booking_source],
        [booking_table],
    )

    expected_joined_schema = StructType([
        StructField("customer_id", IntegerType()),
        StructField("event_timestamp", TimestampType()),
        StructField("bookings__total_bookings", IntegerType()),
    ])
    expected_joined_data = [
        (1001, datetime(year=2020, month=9, day=2), 200),
        (1001, datetime(year=2020, month=9, day=3), 200),
        (2001, datetime(year=2020, month=9, day=4), 600),
        (2001, datetime(year=2020, month=9, day=4), 600),
        (3001, datetime(year=2020, month=9, day=4), 700),
    ]
    expected_joined_df = spark.createDataFrame(
        spark.sparkContext.parallelize(expected_joined_data), expected_joined_schema)

    assert_dataframe_equal(joined_df, expected_joined_df)
default="predictions") args = parser.parse_args() print("args:", args) print("{0} ===== Start".format(datetime.now().isoformat())) if args.format == "tfr": df = dfutil.loadTFRecords(sc, args.images) elif args.format == "csv": images = sc.textFile( args.images).map(lambda ln: [int(x) for x in ln.split(',')]) labels = sc.textFile( args.labels).map(lambda ln: [int(float(x)) for x in ln.split(',')]) dataRDD = images.zip(labels) df = spark.createDataFrame(dataRDD, ['image', 'label']) else: raise Exception("Unsupported format: {}".format(args.format)) # Pipeline API if args.train: # train a model using Spark Estimator fitted to a DataFrame print("{0} ===== Estimator.fit()".format(datetime.now().isoformat())) # dummy tf args (from imagenet/inception example) tf_args = { 'initial_learning_rate': 0.045, 'num_epochs_per_decay': 2.0, 'learning_rate_decay_factor': 0.94 } estimator = TFEstimator(mnist_dist_pipeline.map_fun, args, export_fn=mnist_dist_pipeline.export_fun) \
def test_large_historical_feature_retrieval(spark: SparkSession,
                                            large_entity_csv_file: str,
                                            large_feature_csv_file: str):
    nr_rows = 1000
    start_datetime = datetime(year=2020, month=8, day=31)
    expected_join_data = [
        (1000 + i, start_datetime + timedelta(days=i), i * 10) for i in range(nr_rows)
    ]
    expected_join_data_schema = StructType([
        StructField("customer_id", IntegerType()),
        StructField("event_timestamp", TimestampType()),
        StructField("feature__total_bookings", IntegerType()),
    ])
    expected_join_data_df = spark.createDataFrame(
        spark.sparkContext.parallelize(expected_join_data), expected_join_data_schema)

    entity_source = {
        "file": {
            "format": {"json_class": "CSVFormat"},
            "path": f"file://{large_entity_csv_file}",
            "event_timestamp_column": "event_timestamp",
            "field_mapping": {"customer_id": "id"},
            "options": {"inferSchema": "true", "header": "true"},
        }
    }
    feature_source = {
        "file": {
            "format": {"json_class": "CSVFormat"},
            "path": f"file://{large_feature_csv_file}",
            "event_timestamp_column": "event_timestamp",
            "created_timestamp_column": "created_timestamp",
            "options": {"inferSchema": "true", "header": "true"},
        }
    }
    feature_table = {
        "name": "feature",
        "entities": [{"name": "customer_id", "type": "int32"}],
        "features": [{"name": "total_bookings", "type": "int32"}],
        "max_age": 86400,
    }

    joined_df = retrieve_historical_features(spark, entity_source,
                                             [feature_source], [feature_table])
    assert_dataframe_equal(joined_df, expected_join_data_df)
            LRModel.save("GoogleStockModel")
            return print("\nModel saved successfully!")
        elif X.lower() == "n":
            return print("\nModel not saved!")
    except:
        print("Invalid Input! Try Again!")


try:
    print('\nStarting PySpark...')
    pdDataFrame = dataCleaner.df
    sc = SparkContext()
    sparkSession = SparkSession(sc)

    print('\nConverting Pandas DataFrame to PySpark DataFrame:')
    stockData = sparkSession.createDataFrame(pdDataFrame)
    print(stockData)

    print('\nPrinting Schema of PySpark DataFrame:')
    print(stockData.printSchema())

    print("\nPerforming Descriptive Analytics Operations:")
    print(stockData.describe().toPandas().transpose())

    print("\nSeparating the Open, High and Low:")
    featureAssembler = VectorAssembler(inputCols=["Open", "High", "Low"],
                                       outputCol="Features")
    output = featureAssembler.transform(stockData)
    print(output.show())

    print("\nChecking the Vectorized Feature:")
    print(output.select("Features").show())
def test_implicit_type_conversion(spark: SparkSession):
    test_data_dir = path.join(pathlib.Path(__file__).parent.absolute(), "data")
    entity_source = {
        "file": {
            "format": {"json_class": "CSVFormat"},
            "path": f"file://{path.join(test_data_dir, 'single_customer.csv')}",
            "event_timestamp_column": "event_timestamp",
            "options": {"inferSchema": "true", "header": "true"},
        }
    }
    transaction_source = {
        "file": {
            "format": {"json_class": "CSVFormat"},
            "path": f"file://{path.join(test_data_dir, 'transactions.csv')}",
            "event_timestamp_column": "event_timestamp",
            "created_timestamp_column": "created_timestamp",
            "options": {"inferSchema": "true", "header": "true"},
        }
    }
    transaction_table = {
        "name": "transactions",
        "entities": [{"name": "customer_id", "type": "int32"}],
        "features": [{"name": "daily_transactions", "type": "float"}],
        "max_age": 86400,
    }

    joined_df = retrieve_historical_features(
        spark,
        entity_source,
        [transaction_source],
        [transaction_table],
    )

    expected_joined_schema = StructType([
        StructField("customer_id", IntegerType()),
        StructField("event_timestamp", TimestampType()),
        StructField("transactions__daily_transactions", FloatType()),
    ])
    expected_joined_data = [
        (1001, datetime(year=2020, month=9, day=2), 100.0),
    ]
    expected_joined_df = spark.createDataFrame(
        spark.sparkContext.parallelize(expected_joined_data), expected_joined_schema)

    assert_dataframe_equal(joined_df, expected_joined_df)
def test_automapper_nested_array_filter_with_parent_column(
    spark_session: SparkSession,
) -> None:
    schema = StructType(
        [
            StructField("row_id", dataType=IntegerType(), nullable=False),
            StructField(
                "location",
                dataType=ArrayType(
                    StructType([StructField("name", StringType(), True)])
                ),
            ),
            StructField(
                "schedule",
                dataType=ArrayType(
                    StructType(
                        [
                            StructField("name", StringType(), True),
                            StructField(
                                "actor",
                                ArrayType(
                                    StructType(
                                        [StructField("reference", StringType(), True)]
                                    ),
                                    True,
                                ),
                            ),
                        ]
                    )
                ),
            ),
            StructField(
                "single_level",
                dataType=ArrayType(
                    StructType([StructField("reference", StringType(), True)])
                ),
            ),
        ]
    )
    spark_session.createDataFrame(
        [
            (
                1,
                [{"name": "location-100"}, {"name": "location-200"}],
                [
                    {
                        "name": "schedule-1",
                        "actor": [
                            {"reference": "location-100"},
                            {"reference": "practitioner-role-100"},
                        ],
                    },
                    {
                        "name": "schedule-2",
                        "actor": [
                            {"reference": "location-200"},
                            {"reference": "practitioner-role-200"},
                        ],
                    },
                ],
                [{"reference": "location-100"}, {"reference": "location-200"}],
            )
        ],
        schema,
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    mapper = AutoMapper(
        view="schedule", source_view="patients", keys=["row_id"]
    ).columns(
        location=A.column("location").select(
            AutoMapperElasticSearchLocation(
                name=A.field("name"),
                scheduling=A.nested_array_filter(
                    array_field=A.column("schedule"),
                    inner_array_field=A.field("actor"),
                    match_property="reference",
                    match_value=A.field("{parent}.name"),
                ).select_one(AutoMapperElasticSearchSchedule(name=A.field("name"))),
            )
        )
    )

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    print("------COLUMN SPECS------")
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert_compare_expressions(
        sql_expressions["location"],
        transform(
            col("b.location"),
            lambda l: (
                struct(
                    l["name"].alias("name"),
                    transform(
                        filter(
                            col("b.schedule"),
                            lambda s: exists(
                                s["actor"],
                                lambda a: a["reference"] == l["name"],  # type: ignore
                            ),
                        ),
                        lambda s: struct(s["name"].alias("name")),
                    )[0].alias("scheduling"),
                )
            ),
        ).alias("___location"),
    )
    result_df: DataFrame = mapper.transform(df=source_df)

    # Assert
    # result_df.printSchema()
    # result_df.show(truncate=False)

    location_row = result_df.collect()[0].location
    for index, location in enumerate(location_row):
        location_name = location.name
        location_scheduling = location.scheduling
        assert location_name == f"location-{index + 1}00"
        assert len(location_scheduling) == 1
        assert location_scheduling.name == f"schedule-{index + 1}"
def empty_integer_df(spark_session: SparkSession):
    return spark_session.createDataFrame([], schema=single_integer_column_schema)
parser.add_argument("--tensorboard", help="launch tensorboard process", action="store_true") args = parser.parse_args() print("args:", args) if args.format == 'tfr': # load TFRecords as a DataFrame df = dfutil.loadTFRecords(sc, args.images_labels) else: # args.format == 'csv': # create RDD of input data def parse(ln): vec = [int(x) for x in ln.split(',')] return (vec[1:], vec[0]) images_labels = sc.textFile(args.images_labels).map(parse) df = spark.createDataFrame(images_labels, ['image', 'label']) df.show() if args.mode == 'train': estimator = TFEstimator(main_fun, args) \ .setInputMapping({'image': 'image', 'label': 'label'}) \ .setModelDir(args.model_dir) \ .setExportDir(args.export_dir) \ .setClusterSize(args.cluster_size) \ .setTensorboard(args.tensorboard) \ .setEpochs(args.epochs) \ .setBatchSize(args.batch_size) \ .setGraceSecs(60) model = estimator.fit(df) else: # args.mode == 'inference':
user_index = create_user_index(ui_mat_rdd)
doc_index = create_doc_index(ui_mat_rdd)
b_uidx = sc.broadcast(user_index)
b_didx = sc.broadcast(doc_index)

# tuple parameter unpacking in lambdas is Python 2 only; index instead
ui_mat_rdd = ui_mat_rdd.map(
    lambda row: (b_uidx.value[row[0]], b_didx.value[row[1]], row[2]))

num_users = ui_mat_rdd.map(lambda row: row[0]).distinct().count()
num_movies = ui_mat_rdd.map(lambda row: row[1]).distinct().count()
print('users:', num_users, 'products:', num_movies)

df = spark.createDataFrame(ui_mat_rdd, ['userId', 'movieId', 'value'])
ui_mat_rdd.unpersist()

print('Splitting data set...')
df = df.orderBy(F.rand())
train_df, test_df = df.randomSplit([0.9, 0.1], seed=45)
train_df, val_df = train_df.randomSplit([0.95, 0.05], seed=45)

train_df = train_df.withColumn('flag', F.lit(0))
val_df = val_df.withColumn('flag', F.lit(1))
val_df = val_df.union(train_df)
test_df = test_df.withColumn('flag', F.lit(2))
test_df = test_df.union(train_df)
test_df = test_df.union(val_df)
spark_session = SparkSession(sc)
connector = pshc.PSHC(sc, sqlContext)

catalog = {
    "table": {"namespace": "default", "name": "hb_text"},
    "rowkey": "id",
    "columns": {
        "id": {"cf": "rowkey", "col": "key", "type": "string"},
        "text_file": {"cf": "data", "col": "text_file", "type": "string"},
    }
}

rdd = connector.get_df_from_hbase(catalog, repartition_num=1000).rdd.cache()
print('======load file count=====', rdd.count())

result_rdd = rdd.mapPartitions(lambda x: save_to_hive(x))
result_df = spark_session.createDataFrame(result_rdd, ['id', 'text', 'index'])
result_df.write.saveAsTable('abc.hb_text_oss_file', mode='overwrite')
# print('======count=====', spark_session.table('abc.hb_text_oss_file').count())
import json

import requests
from pyspark.sql import SparkSession


def get_embryo_data(spark: SparkSession):
    # the URL is elided in the original
    r = requests.get("")
    df = spark.createDataFrame([json.loads(line) for line in r.iter_lines()])
    return df
def get_table_snapshot(spark: SparkSession,
                       s3_bucket: str,
                       table: str,
                       source: dict,
                       processing_date: str,
                       date_partition: bool = False) -> DataFrame:
    """
    :param spark: existing Spark session
    :param s3_bucket: S3 bucket name
    :param table: name of the table to be created in the catalog
    :param source: dictionary with CDC source settings
    :param processing_date: string with the date to generate the snapshot for
    :param date_partition: specify if the output should be a date partition
    :returns DataFrame:
    """
    processing_date = parse_date(processing_date).date()
    if processing_date < source["cdc_start_date"]:
        raise ValueError("processing_date must be after the source cdc_start_date")

    last_date = None
    table_exists = catalog_table_exists(spark, table)
    if table_exists:
        if date_partition:
            last_date = parse_date(
                get_table_partitions(spark, table)[-1].split("=")[1]).date()
        else:
            last_date = parse_date(get_current_version(spark, table)).date()
        if processing_date < last_date:
            raise ValueError("processing_date must be after last_partition")

    # Define a new schema with DMS-specific columns from the source schema
    updates_schema = T.StructType().add("Op", "string").add("cdc_timestamp", "string")
    for column in source["schema"]:
        updates_schema.add(column)

    # Clean the temporary updates directory
    spark.createDataFrame([], updates_schema) \
        .withColumn("file_number", F.lit(0)) \
        .withColumn("increasing_id", F.monotonically_increasing_id()) \
        .write.mode("overwrite").parquet(f"s3://{s3_bucket}/tmp/{table}_updates/")

    # Index update files into a temp folder to avoid losing the
    # order of the records when Spark partitions the DataFrame
    def index_update_files(file_number: int, file_path: str, is_full_load: bool = False):
        df = spark.read.schema(updates_schema).parquet(file_path) \
            .withColumn("increasing_id", F.monotonically_increasing_id()) \
            .withColumn("file_number", F.lit(file_number))
        if is_full_load:
            df = df.withColumn(
                "cdc_timestamp", F.lit(f"{str(processing_date)} 00:00:00.000000"))
        df.write.mode("append").parquet(f"s3://{s3_bucket}/tmp/{table}_updates/")

    # Index the initial full-load files if the processing_date is equal to the
    # CDC start date, or the table does not exist.
    # Note: the cdc_timestamp is reset to avoid having CDC update files with
    # earlier timestamps
    if processing_date == source["cdc_start_date"] or not table_exists:
        full_load_files = get_cdc_files(processing_date, source["path"], full_load=True)
        run_multi_threaded_map(
            mapped_function=lambda args: index_update_files(*args),
            args_mapped_function=[
                (number, path, True) for number, path in enumerate(full_load_files)
            ],
            thread_number=8)

    # Index the CDC update files between the last partition
    # created (or the CDC start date) and the processing_date.
    updates_files = get_cdc_files(processing_date, source["path"],
                                  last_date or source["cdc_start_date"])
    run_multi_threaded_map(
        mapped_function=lambda args: index_update_files(*args),
        args_mapped_function=[(number, path)
                              for number, path in enumerate(updates_files)],
        thread_number=64)

    # Generate a new snapshot from the indexed files
    window = Window.partitionBy(*source["primary_keys"]) \
        .orderBy("cdc_timestamp", "file_number", "increasing_id")
    index_window = Window.partitionBy(*source["primary_keys"]) \
        .orderBy(F.col("ordered_index").desc())
    new_snapshot = spark.read.parquet(f"s3://{s3_bucket}/tmp/{table}_updates/") \
        .withColumn("ordered_index", F.row_number().over(window)) \
        .filter(F.to_date(F.col("cdc_timestamp")) <= str(processing_date)) \
        .filter(F.to_date(F.col("cdc_timestamp")) >= str(last_date or source["cdc_start_date"])) \
        .withColumn("row_number", F.row_number().over(index_window)) \
        .filter(F.col("row_number") == 1)

    if date_partition:
        new_snapshot = new_snapshot.withColumn("dt", F.to_date(F.col(source['partition_date']))) \
            .filter(F.col("dt") == str(processing_date))

    # Generate an old_snapshot from the partitions that have to be updated,
    # or get an empty DataFrame if the table does not exist
    final_snapshot_schema = T.StructType()
    for column in source["schema"]:
        final_snapshot_schema.add(column)
    if date_partition:
        final_snapshot_schema.add("dt", "date")
    old_snapshot = spark.createDataFrame([], final_snapshot_schema)
    if table_exists:
        if date_partition:
            old_snapshot = spark.read.table(table).filter(
                F.col("dt") == str(processing_date))
        else:
            last_table_location = f"s3://{s3_bucket}/{table}/version={str(last_date)}/"
            old_snapshot = load_dataframe(spark, last_table_location, source["schema"])

    # Merge both old and new snapshots to create a new snapshot
    # of the partition to be overwritten into the final table
    conditions = [
        old_snapshot[name] == new_snapshot[name] for name in source["primary_keys"]
    ]
    fields = map(
        lambda field: F.coalesce(new_snapshot[field.name],
                                 old_snapshot[field.name]).alias(field.name),
        final_snapshot_schema.fields)
    final_snapshot = old_snapshot.join(new_snapshot, conditions, how="outer") \
        .filter(new_snapshot.Op.isNull() | (new_snapshot.Op != 'D')) \
        .select(*fields)
    if date_partition:
        final_snapshot = final_snapshot.repartition("dt")
    return final_snapshot
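A hypothetical invocation sketch for get_table_snapshot(); the bucket, table name, and source settings below are illustrative assumptions, not values from the original:

from datetime import date
import pyspark.sql.types as T

source = {
    "cdc_start_date": date(2021, 1, 1),
    "path": "s3://my-bucket/cdc/orders/",
    "schema": [
        T.StructField("order_id", T.LongType()),
        T.StructField("status", T.StringType()),
        T.StructField("updated_at", T.TimestampType()),
    ],
    "primary_keys": ["order_id"],
    "partition_date": "updated_at",
}
snapshot = get_table_snapshot(spark, "my-bucket", "orders_snapshot",
                              source, "2021-02-01", date_partition=True)
snapshot.write.mode("overwrite").partitionBy("dt").parquet(
    "s3://my-bucket/orders_snapshot/")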
class SCDHTest(testBase):

    def setUp(self):
        StockCustReturnByPrdInd.logLevel = 'debug'
        self.scdh = StockCustReturnByPrdInd(None)
        os.environ['SPARK_HOME'] = "/usr/local/Cellar/apache-spark/2.2.0/libexec"
        sys.path.append("/usr/local/Cellar/apache-spark/2.2.0/libexec/python")
        conf = SparkConf().setMaster("local").setAppName("hello")
        self.spark = SparkSession(SparkContext(conf=conf))

    def tearDown(self):
        self.spark.stop()

    def test_local_spark(self):
        doc = self.spark.createDataFrame([['a', 'b', 'c'], ['b', 'd', 'd']])
        doc.show()
        print("successful!")

    def test_get_base_data(self):
        self.scdh._get_base_data("2017-03-16", "2017-03-18", 1, 5)

    def test_init_data(self):
        self.scdh.init_data()

    def test_daily_compute(self):
        self.scdh.daily_compute("2017-03-16", "2017-03-16")

    def test_check_1(self):
        sql = """
        SELECT * from adatatest.stock_cust_daily_return
        where short_return_rate>1 or long_return_rate>1 or total_return_rate>1
        """
        self.spark.sql(sql)

    def test_travel_row(self):
        # """
        # stock_cust_return_by_prd_ind.prd_ind unknown
        # stock_cust_return_by_prd_ind.return -44623.789999999964
        # stock_cust_return_by_prd_ind.return_rate -0.006018969744297111
        # stock_cust_return_by_prd_ind.trade_id 12466
        # stock_cust_return_by_prd_ind.return_ratio 0.4610100448676952
        # stock_cust_return_by_prd_ind.return_rank 2
        # stock_cust_return_by_prd_ind.return_rate_rank 1
        # stock_cust_return_by_prd_ind.busi_date 2017-03-23
        # stock_cust_return_by_prd_ind.compute 7
        # """
        # spark.sql("""
        # select trade_id,prd_ind,collect_list(detail_item) detail_list from (
        #     select trade_id,trim(prd_ind) prd_ind,
        #     (str_to_map(concat(
        #         'pre_mkt_val:',pre_mkt_val,
        #         ',now_mkt_val:',now_mkt_val,
        #         ',pos_cash_flow:',pos_cash_flow,
        #         ',neg_cash_flow:',pos_cash_flow,
        #         ',exception_label:',exception_label,
        #         ',trd_type:',trd_type,
        #         ',return:',return,
        #         ',busi_date:',busi_date),",",":")) detail_item
        #     from adatatest.stock_cust_daily_holding
        #     where busi_date<='2017-03-23' and trade_id='12466' and prd_ind='unknown'
        # ) a
        # GROUP by trade_id,prd_ind
        # """)
        r = Row(trade_id=u'12466', prd_ind=u'unknown', detail_list=[
            {u'return': u'-13008.0', u'trd_type': u'long_related', u'pos_cash_flow': u'0.0', u'exception_label': u'0', u'pre_mkt_val': u'1263402.0', u'now_mkt_val': u'1250394.0', u'busi_date': u'2017-03-23', u'neg_cash_flow': u'0.0'},
            {u'return': u'6344.0', u'trd_type': u'long_related', u'pos_cash_flow': u'0.0', u'exception_label': u'0', u'pre_mkt_val': u'135176.0', u'now_mkt_val': u'141520.0', u'busi_date': u'2017-03-23', u'neg_cash_flow': u'0.0'},
            {u'return': u'-12803.0', u'trd_type': u'long_related', u'pos_cash_flow': u'0.0', u'exception_label': u'0', u'pre_mkt_val': u'1308384.0', u'now_mkt_val': u'1295581.0', u'busi_date': u'2017-03-23', u'neg_cash_flow': u'0.0'},
            {u'return': u'-4.229999999999563', u'trd_type': u'long_related', u'pos_cash_flow': u'16940.23', u'exception_label': u'0', u'pre_mkt_val': u'0.0', u'now_mkt_val': u'16936.0', u'busi_date': u'2017-03-23', u'neg_cash_flow': u'16940.23'},
            {u'return': u'1612.0', u'trd_type': u'long_related', u'pos_cash_flow': u'0.0', u'exception_label': u'0', u'pre_mkt_val': u'208052.0', u'now_mkt_val': u'209664.0', u'busi_date': u'2017-03-23', u'neg_cash_flow': u'0.0'},
            {u'return': u'0.0', u'trd_type': u'long_related', u'pos_cash_flow': u'0.0', u'exception_label': u'0', u'pre_mkt_val': u'35466.53', u'now_mkt_val': u'18526.3', u'busi_date': u'2017-03-23', u'neg_cash_flow': u'0.0'},
            {u'return': u'4730.0', u'trd_type': u'long_related', u'pos_cash_flow': u'0.0', u'exception_label': u'0', u'pre_mkt_val': u'679400.0', u'now_mkt_val': u'684130.0', u'busi_date': u'2017-03-23', u'neg_cash_flow': u'0.0'},
            {u'return': u'-1662.0', u'trd_type': u'long_related', u'pos_cash_flow': u'0.0', u'exception_label': u'0', u'pre_mkt_val': u'271183.0', u'now_mkt_val': u'269521.0', u'busi_date': u'2017-03-23', u'neg_cash_flow': u'0.0'},
            {u'return': u'-693.0', u'trd_type': u'short_related', u'pos_cash_flow': u'0.0', u'exception_label': u'0', u'pre_mkt_val': u'-130207.0', u'now_mkt_val': u'-130900.0', u'busi_date': u'2017-03-23', u'neg_cash_flow': u'0.0'},
            {u'return': u'-21138.0', u'trd_type': u'long_related', u'pos_cash_flow': u'0.0', u'exception_label': u'0', u'pre_mkt_val': u'1284540.0', u'now_mkt_val': u'1263402.0', u'busi_date': u'2017-03-22', u'neg_cash_flow': u'0.0'},
            {u'return': u'2079.0', u'trd_type': u'short_related', u'pos_cash_flow': u'0.0', u'exception_label': u'0', u'pre_mkt_val': u'-132286.0', u'now_mkt_val': u'-130207.0', u'busi_date': u'2017-03-22', u'neg_cash_flow': u'0.0'},
            {u'return': u'6771.0', u'trd_type': u'long_related', u'pos_cash_flow': u'0.0', u'exception_label': u'0', u'pre_mkt_val': u'128405.0', u'now_mkt_val': u'135176.0', u'busi_date': u'2017-03-22', u'neg_cash_flow': u'0.0'},
            {u'return': u'0.0', u'trd_type': u'long_related', u'pos_cash_flow': u'0.0', u'exception_label': u'0', u'pre_mkt_val': u'306163.19', u'now_mkt_val': u'35466.53', u'busi_date': u'2017-03-22', u'neg_cash_flow': u'0.0'},
            {u'return': u'-12470.0', u'trd_type': u'long_related', u'pos_cash_flow': u'0.0', u'exception_label': u'0', u'pre_mkt_val': u'691870.0', u'now_mkt_val': u'679400.0', u'busi_date': u'2017-03-22', u'neg_cash_flow': u'0.0'},
            {u'return': u'-122.0', u'trd_type': u'long_related', u'pos_cash_flow': u'0.0', u'exception_label': u'0', u'pre_mkt_val': u'128527.0', u'now_mkt_val': u'128405.0', u'busi_date': u'2017-03-21', u'neg_cash_flow': u'0.0'},
            {u'return': u'11.429999999999836', u'trd_type': u'long_related', u'pos_cash_flow': u'2273.57', u'exception_label': u'0', u'pre_mkt_val': u'0.0', u'now_mkt_val': u'2285.0', u'busi_date': u'2017-03-21', u'neg_cash_flow': u'2273.57'},
            {u'return': u'539.0', u'trd_type': u'short_related', u'pos_cash_flow': u'0.0', u'exception_label': u'0', u'pre_mkt_val': u'-132825.0', u'now_mkt_val': u'-132286.0', u'busi_date': u'2017-03-21', u'neg_cash_flow': u'0.0'},
            {u'return': u'8673.0', u'trd_type': u'long_related', u'pos_cash_flow': u'0.0', u'exception_label': u'0', u'pre_mkt_val': u'1327382.0', u'now_mkt_val': u'1336055.0', u'busi_date': u'2017-03-21', u'neg_cash_flow': u'0.0'},
            {u'return': u'15399.439999999944', u'trd_type': u'long_related', u'pos_cash_flow': u'1274560.56', u'exception_label': u'0', u'pre_mkt_val': u'0.0', u'now_mkt_val': u'1289960.0', u'busi_date': u'2017-03-20', u'neg_cash_flow': u'1274560.56'},
            {u'return': u'197.7399999999907', u'trd_type': u'short_related', u'pos_cash_flow': u'0.0', u'exception_label': u'0', u'pre_mkt_val': u'0.0', u'now_mkt_val': u'-132825.0', u'busi_date': u'2017-03-20', u'neg_cash_flow': u'0.0'},
            {u'return': u'0.0', u'trd_type': u'long_related', u'pos_cash_flow': u'0.0', u'exception_label': u'0', u'pre_mkt_val': u'3497135.97', u'now_mkt_val': u'1510820.28', u'busi_date': u'2017-03-20', u'neg_cash_flow': u'0.0'},
            {u'return': u'12845.0', u'trd_type': u'short_related', u'pos_cash_flow': u'0.0', u'exception_label': u'0', u'pre_mkt_val': u'-1333311.0', u'now_mkt_val': u'-1320466.0', u'busi_date': u'2017-03-17', u'neg_cash_flow': u'0.0'},
            {u'return': u'0.0', u'trd_type': u'long_related', u'pos_cash_flow': u'0.0', u'exception_label': u'0', u'pre_mkt_val': u'3497135.97', u'now_mkt_val': u'3497135.97', u'busi_date': u'2017-03-17', u'neg_cash_flow': u'0.0'},
            {u'return': u'0.0', u'trd_type': u'long_related', u'pos_cash_flow': u'0.0', u'exception_label': u'0', u'pre_mkt_val': u'2177000.0', u'now_mkt_val': u'3497135.97', u'busi_date': u'2017-03-16', u'neg_cash_flow': u'0.0'},
            {u'return': u'-17.0', u'trd_type': u'long_related', u'pos_cash_flow': u'0.0', u'exception_label': u'0', u'pre_mkt_val': u'2247.0', u'now_mkt_val': u'2230.0', u'busi_date': u'2017-03-23', u'neg_cash_flow': u'0.0'},
            {u'return': u'15414.0', u'trd_type': u'short_related', u'pos_cash_flow': u'0.0', u'exception_label': u'0', u'pre_mkt_val': u'-1333311.0', u'now_mkt_val': u'-1317897.0', u'busi_date': u'2017-03-23', u'neg_cash_flow': u'0.0'},
            {u'return': u'-38.0', u'trd_type': u'long_related', u'pos_cash_flow': u'0.0', u'exception_label': u'0', u'pre_mkt_val': u'2285.0', u'now_mkt_val': u'2247.0', u'busi_date': u'2017-03-22', u'neg_cash_flow': u'0.0'},
            {u'return': u'5138.0', u'trd_type': u'short_related', u'pos_cash_flow': u'0.0', u'exception_label': u'0', u'pre_mkt_val': u'-1338449.0', u'now_mkt_val': u'-1333311.0', u'busi_date': u'2017-03-22', u'neg_cash_flow': u'0.0'},
            {u'return': u'-27671.0', u'trd_type': u'long_related', u'pos_cash_flow': u'0.0', u'exception_label': u'0', u'pre_mkt_val': u'1336055.0', u'now_mkt_val': u'1308384.0', u'busi_date': u'2017-03-22', u'neg_cash_flow': u'0.0'},
            {u'return': u'-2808.0', u'trd_type': u'long_related', u'pos_cash_flow': u'0.0', u'exception_label': u'0', u'pre_mkt_val': u'210860.0', u'now_mkt_val': u'208052.0', u'busi_date': u'2017-03-22', u'neg_cash_flow': u'0.0'},
            {u'return': u'486.3400000000256', u'trd_type': u'long_related', u'pos_cash_flow': u'270696.66', u'exception_label': u'0', u'pre_mkt_val': u'0.0', u'now_mkt_val': u'271183.0', u'busi_date': u'2017-03-22', u'neg_cash_flow': u'270696.66'},
            {u'return': u'-2753.609999999986', u'trd_type': u'long_related', u'pos_cash_flow': u'694623.61', u'exception_label': u'0', u'pre_mkt_val': u'0.0', u'now_mkt_val': u'691870.0', u'busi_date': u'2017-03-21', u'neg_cash_flow': u'694623.61'},
            {u'return': u'-5420.0', u'trd_type': u'long_related', u'pos_cash_flow': u'0.0', u'exception_label': u'0', u'pre_mkt_val': u'1289960.0', u'now_mkt_val': u'1284540.0', u'busi_date': u'2017-03-21', u'neg_cash_flow': u'0.0'},
            {u'return': u'-2569.0', u'trd_type': u'short_related', u'pos_cash_flow': u'0.0', u'exception_label': u'0', u'pre_mkt_val': u'-1335880.0', u'now_mkt_val': u'-1338449.0', u'busi_date': u'2017-03-21', u'neg_cash_flow': u'0.0'},
            {u'return': u'0.0', u'trd_type': u'long_related', u'pos_cash_flow': u'0.0', u'exception_label': u'0', u'pre_mkt_val': u'1510820.28', u'now_mkt_val': u'306163.19', u'busi_date': u'2017-03-21', u'neg_cash_flow': u'0.0'},
            {u'return': u'1299.6199999999953', u'trd_type': u'long_related', u'pos_cash_flow': u'209560.38', u'exception_label': u'0', u'pre_mkt_val': u'0.0', u'now_mkt_val': u'210860.0', u'busi_date': u'2017-03-21', u'neg_cash_flow': u'209560.38'},
            {u'return': u'-15414.0', u'trd_type': u'short_related', u'pos_cash_flow': u'0.0', u'exception_label': u'0', u'pre_mkt_val': u'-1320466.0', u'now_mkt_val': u'-1335880.0', u'busi_date': u'2017-03-20', u'neg_cash_flow': u'0.0'},
            {u'return': u'5451.600000000093', u'trd_type': u'long_related', u'pos_cash_flow': u'1321930.4', u'exception_label': u'0', u'pre_mkt_val': u'0.0', u'now_mkt_val': u'1327382.0', u'busi_date': u'2017-03-20', u'neg_cash_flow': u'1321930.4'},
            {u'return': u'150.9100000000035', u'trd_type': u'long_related', u'pos_cash_flow': u'128376.09', u'exception_label': u'0', u'pre_mkt_val': u'0.0', u'now_mkt_val': u'128527.0', u'busi_date': u'2017-03-20', u'neg_cash_flow': u'128376.09'},
            {u'return': u'-13175.030000000028', u'trd_type': u'short_related', u'pos_cash_flow': u'0.0', u'exception_label': u'0', u'pre_mkt_val': u'0.0', u'now_mkt_val': u'-1333311.0', u'busi_date': u'2017-03-16', u'neg_cash_flow': u'0.0'}])
        r2 = _travel_row(r, '2017-03-23')
        # assertTrue silently treated the expected value as a message; an
        # equality check is what the data in the comment above implies
        self.assertEqual(int(r2.get("return")), -44623)
def metrics(session: SparkSession, dataframe: pyspark.sql.DataFrame,
            actual: str, predicted: str) -> pyspark.sql.DataFrame:
    '''
    Calculates evaluation metrics from predicted results

    :param session: active Spark session used to build the result DataFrame
    :param dataframe: spark.sql.DataFrame with the real and predicted values
    :param actual: name of the column with observed target values
    :param predicted: name of the column with predicted values
    :return: one-row DataFrame with accuracy, sensitivity, specificity,
             precision, recall and F1-score
    '''
    # Along each row are the actual values and down each column are the predicted
    dataframe = dataframe.withColumn(actual, col(actual).cast('integer'))
    dataframe = dataframe.withColumn(predicted, col(predicted).cast('integer'))
    cm = dataframe.crosstab(actual, predicted)
    cm = cm.sort(cm.columns[0], ascending=True)

    # Adds missing column in case just one class was predicted
    if '0' not in cm.columns:
        cm = cm.withColumn('0', lit(0))
    if '1' not in cm.columns:
        cm = cm.withColumn('1', lit(0))

    # Subsets values from the confusion matrix
    zero = cm.filter(cm[cm.columns[0]] == 0.0)
    first_0 = zero.take(1)
    one = cm.filter(cm[cm.columns[0]] == 1.0)
    first_1 = one.take(1)
    tn = first_0[0][1]
    fp = first_0[0][2]
    fn = first_1[0][1]
    tp = first_1[0][2]

    # Calculate metrics from values in the confusion matrix
    if tp == 0:
        acc = float((tp + tn) / (tp + tn + fp + fn))
        sen = 0
        spe = float(tn / (tn + fp))
        prec = 0
        rec = 0
        f1 = 0
    elif tn == 0:
        acc = float((tp + tn) / (tp + tn + fp + fn))
        sen = float(tp / (tp + fn))
        spe = 0
        prec = float(tp / (tp + fp))
        rec = float(tp / (tp + fn))
        f1 = 2 * float((prec * rec) / (prec + rec))
    else:
        acc = float((tp + tn) / (tp + tn + fp + fn))
        sen = float(tp / (tp + fn))
        spe = float(tn / (tn + fp))
        prec = float(tp / (tp + fp))
        rec = float(tp / (tp + fn))
        f1 = 2 * float((prec * rec) / (prec + rec))

    # Print results
    print('Confusion Matrix and Statistics: \n')
    cm.show()
    print('True Positives:', tp)
    print('True Negatives:', tn)
    print('False Positives:', fp)
    print('False Negatives:', fn)
    print('Total:', dataframe.count(), '\n')
    print('Accuracy: {0:.2f}'.format(acc))
    print('Sensitivity: {0:.2f}'.format(sen))
    print('Specificity: {0:.2f}'.format(spe))
    print('Precision: {0:.2f}'.format(prec))
    print('Recall: {0:.2f}'.format(rec))
    print('F1-score: {0:.2f}'.format(f1))

    # Create spark dataframe with results
    l = [(acc, sen, spe, prec, rec, f1)]
    df = session.createDataFrame(
        l, ['Accuracy', 'Sensitivity', 'Specificity', 'Precision', 'Recall', 'F1'])
    return df
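A minimal usage sketch for metrics(); the toy predictions DataFrame and column names below are assumptions for illustration:

preds = spark.createDataFrame(
    [(1, 1), (0, 1), (1, 0), (0, 0), (1, 1)], ["label", "prediction"])
metrics_df = metrics(spark, preds, actual="label", predicted="prediction")
metrics_df.show()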
)
print(Q8.show(50))

print("-------------------------------Q9 ANSWER----------------------------------")

data2 = data.withColumnRenamed('user id', 'user-id')

# Create a DataFrame of 5-year age buckets, e.g. Row(rang=5, ind='1-5')
i = 1
rng = []
while i <= 80 / 5:
    rng.append((i * 5, str(i * 5 - 4) + '-' + str(i * 5)))
    i = i + 1
RG = spark.createDataFrame(rng, ['rang', 'ind'])
RG.createOrReplaceTempView('rnge')

Q9prep = data2.withColumn("rating", data2["rating"].cast(IntegerType())).join(
    item, data2["item id"] == item["movie id"])
Q9prep2 = (Q9prep.withColumnRenamed("movie id", "movie_id")
           .withColumnRenamed("item id", "item_id")
           .join(user, user["user id"] == Q9prep["user-id"])
           .withColumnRenamed("user id", "user_id")
           .withColumn("age", user["age"].cast(IntegerType()))
           .withColumnRenamed("Children's", "children")
           .withColumnRenamed("Film-Noir", "filmnoir")
           .withColumnRenamed("Sci-Fi", "scifi")
           .drop("user-id"))
Q9prep2.createOrReplaceTempView("q9t")
class IonCentroidsGenerator(object):
    """ Generator of theoretical isotope peaks for all molecules in a database.

    Args
    ----------
    sc : pyspark.SparkContext
    moldb_name : str
    isocalc : IsocalcWrapper
    """

    def __init__(self, sc, moldb_name, isocalc):
        self._sc = sc
        self._moldb_name = moldb_name
        self._isocalc = isocalc
        self._sm_config = SMConfig.get_conf()
        self._parquet_chunks_n = 64
        self._iso_gen_part_n = 512

        self._spark_session = SparkSession(self._sc)
        self._ion_centroids_path = '{}/{}/{}/{}'.format(
            self._sm_config['isotope_storage']['path'],
            self._moldb_name,
            self._isocalc.sigma,
            self._isocalc.charge)
        self.ion_df = None
        self.ion_centroids_df = None

    def exists(self):
        """ Check if ion centroids were saved to parquet """
        if self._ion_centroids_path.startswith('s3a://'):
            cred_dict = dict(
                aws_access_key_id=self._sm_config['aws']['aws_access_key_id'],
                aws_secret_access_key=self._sm_config['aws']['aws_secret_access_key'])
            bucket, key = split_s3_path(self._ion_centroids_path)
            s3 = boto3.client('s3', **cred_dict)
            try:
                s3.head_object(Bucket=bucket, Key=key + '/ions/_SUCCESS')
            except ClientError:
                return False
            else:
                return True
        else:
            return Path(self._ion_centroids_path + '/ions/_SUCCESS').exists()

    def generate(self, isocalc, sfs, adducts):
        """ Generate isotopic peaks

        Args
        ---
        isocalc : IsocalcWrapper
            Cannot be a class field as Spark doesn't allow passing 'self' to functions
        sfs : list
            Molecular sum formulas (this entry was missing from the original docstring)
        adducts : list
        """
        logger.info('Generating molecular isotopic peaks')

        def calc_centroids(args):
            ion_i, sf, adduct = args
            mzs, ints = isocalc.ion_centroids(sf, adduct)
            if mzs is not None:
                return zip(repeat(ion_i),
                           range(0, len(mzs)),
                           map(float, mzs),
                           map(float, ints))
            else:
                return []

        ion_df = pd.DataFrame(
            [(i, sf, adduct) for i, (sf, adduct)
             in enumerate(sorted(product(sfs, adducts)))],
            columns=['ion_i', 'sf', 'adduct']).set_index('ion_i')
        ion_centroids_rdd = (self._sc.parallelize(ion_df.reset_index().values,
                                                  numSlices=self._iso_gen_part_n)
                             .flatMap(calc_centroids))
        self.ion_centroids_df = (pd.DataFrame(data=ion_centroids_rdd.collect(),
                                              columns=['ion_i', 'peak_i', 'mz', 'int'])
                                 .sort_values(by='mz')
                                 .set_index('ion_i'))
        self.ion_df = ion_df.loc[self.ion_centroids_df.index.unique()]

        # Use when pandas DataFrames get way too big
        # ion_centroids_df = self._spark_session.createDataFrame(data=ion_centroids_rdd,
        #                                                        schema=self.ion_centroids_df_fields)
        # self.ion_centroids_df = (ion_centroids_df
        #                          .sort(ion_centroids_df.mz.asc())
        #                          .coalesce(self._parquet_chunks_n))

    def save(self):
        """ Save isotopic peaks """
        logger.info('Saving peaks')
        centr_spark_df = self._spark_session.createDataFrame(
            self.ion_centroids_df.reset_index())
        centr_spark_df.write.parquet(self._ion_centroids_path + '/ion_centroids',
                                     mode='overwrite')
        ion_spark_df = self._spark_session.createDataFrame(self.ion_df.reset_index())
        ion_spark_df.write.parquet(self._ion_centroids_path + '/ions', mode='overwrite')

    def restore(self):
        logger.info('Restoring peaks')
        self.ion_df = self._spark_session.read.parquet(
            self._ion_centroids_path + '/ions').toPandas().set_index('ion_i')
        self.ion_centroids_df = self._spark_session.read.parquet(
            self._ion_centroids_path + '/ion_centroids').toPandas().set_index('ion_i')

    def sf_adduct_centroids_df(self):
        return self.ion_df.join(self.ion_centroids_df).set_index(['sf', 'adduct'])

    def centroids_subset(self, ions):
        """ Restore isotopic peaks dataframe only for the 'ions'

        Args
        ---
        ions : list of tuples

        Returns
        ---
        : pandas.DataFrame
        """
        assert self.ion_df is not None

        ion_map = self.ion_df.reset_index().set_index(['sf', 'adduct']).ion_i
        ion_ids = ion_map.loc[ions].values
        return self.ion_centroids_df.loc[ion_ids].sort_values(by='mz')

    def generate_if_not_exist(self, isocalc, sfs, adducts):
        if not self.exists():
            self.generate(isocalc=isocalc, sfs=sfs, adducts=adducts)
            self.save()
        else:
            self.restore()

    def ions(self, adducts):
        return (self.ion_df[self.ion_df.adduct.isin(adducts)]
                .sort_values(by=['sf', 'adduct'])
                .to_records(index=False))
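A hypothetical end-to-end usage sketch for IonCentroidsGenerator; the database name, sum formulas, adducts, and the pre-built sc/isocalc objects are illustrative assumptions:

centroids_gen = IonCentroidsGenerator(sc, moldb_name='HMDB', isocalc=isocalc)
centroids_gen.generate_if_not_exist(isocalc=isocalc,
                                    sfs=['C6H12O6', 'C5H5N5'],
                                    adducts=['+H', '+Na'])
peaks_df = centroids_gen.centroids_subset([('C6H12O6', '+H')])
print(peaks_df.head())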
cpsv_grava_aux = cpsv_grava.select('count').collect()
cpsv_grava_aux[0] = str(cpsv_grava_aux[0])
cpsv_grava_aux2 = re.sub(r'[^0-9]', '', ''.join(cpsv_grava_aux))
cpsv_grava_counted = int(cpsv_grava_aux2)

cpsv_hielo_aux = cpsv_hielo.select('count').collect()
cpsv_hielo_aux[0] = str(cpsv_hielo_aux[0])
cpsv_hielo_aux2 = re.sub(r'[^0-9]', '', ''.join(cpsv_hielo_aux))
cpsv_hielo_counted = int(cpsv_hielo_aux2)

cpsv_seca_aux = cpsv_seca.select('count').collect()
cpsv_seca_aux[0] = str(cpsv_seca_aux[0])
cpsv_seca_aux2 = re.sub(r'[^0-9]', '', ''.join(cpsv_seca_aux))
cpsv_seca_counted = int(cpsv_seca_aux2)

df_result = spark.createDataFrame(
    [("Condiciones Meteorologicas: Granizo", cpfa_granizo_counted),
     ("Condiciones Meteorologicas: Hielo", cpfa_hielo_counted),
     ("Condiciones Meteorologicas: Niebla", cpfa_nieve_counted),
     ("Condiciones Meteorologicas: Seco y Despejado", cpfa_seco_counted),
     ("Condiciones Meteorologicas: Nieve", cpfa_nieve_counted),
     ("Condiciones de la Via: Mojada", cpsv_mojada_counted),
     ("Condiciones de la Via: Derrape por aceite", cpsv_aceite_counted),
     ("Condiciones de la Via: Derrape por barro", cpsv_barro_counted),
     ("Condiciones de la Via: Via con grava", cpsv_grava_counted),
     ("Condiciones de la Via: Derrape por hielo", cpsv_hielo_counted),
     ("Condiciones de la Via: Siniestro en via seca y despejada", cpsv_seca_counted)],
    schema)
df_result.orderBy(df_result["Number of accidents"].desc()).show(df_result.count(), False)