import os
import sys
import time

import numpy as np
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression

# `simulate_logistic` (and `clean_airlinedata` below) are project helpers
# assumed to be importable from elsewhere in the repository.

# Reuse the active session when run via spark-submit / pyspark.
spark = SparkSession.builder.getOrCreate()

tic0 = time.perf_counter()
##----------------------------------------------------------------------------------------
## Logistic Regression with SGD
##----------------------------------------------------------------------------------------
sample_size = 5000
p = 50
partition_method = "systematic"
partition_num = 20

data_pdf = simulate_logistic(sample_size, p, partition_method, partition_num)
data_sdf = spark.createDataFrame(data_pdf)
memsize = sys.getsizeof(data_pdf)

assembler = VectorAssembler(inputCols=["x" + str(x) for x in range(p)],
                            outputCol="features")

tic = time.perf_counter()
parsedData = assembler.transform(data_sdf)
time_parallelize = time.perf_counter() - tic

tic = time.perf_counter()
# Model configuration: elastic-net regularized logistic regression.
lr = LogisticRegression(maxIter=100, regParam=0.3, elasticNetParam=0.8)
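# A minimal sketch of the step the timer above presumably wraps: fitting the
# configured model and recording the elapsed time. The names `lr_model` and
# `time_fit` are illustrative, not from the original script, and we assume
# `simulate_logistic` produces the `label` column that Spark's
# LogisticRegression expects by default.
lr_model = lr.fit(parsedData)
time_fit = time.perf_counter() - tic
print("Fitted in {:.2f}s".format(time_fit))
print(lr_model.coefficients)  # fitted coefficient vector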
# `n_files`, `using_data`, `file_path`, `fit_intercept`, and
# `dummy_column_names` are assumed to come from the run configuration.
# The per-file lists are seeded so that `sample_size_sub[0]` and
# `partition_num_sub[0]` below are defined.
sample_size_sub = [sample_size]
memsize_sub = []
partition_num_sub = [partition_num]

# Read or load data chunks into pandas
#-----------------------------------------------------------------------------------------
time_2sdf_sub = []
time_repartition_sub = []

loop_counter = 0
for file_no_i in range(n_files):
    tic_2sdf = time.perf_counter()

    if using_data == "simulated_pdf":
        if file_no_i == 0:
            # To test performance, we only simulate one subset of data and replicate it.
            data_pdf_i = simulate_logistic(sample_size_sub[0], p,
                                           partition_method, partition_num_sub[0])
            memsize_sub0 = sys.getsizeof(data_pdf_i)
            memsize_sub.append(memsize_sub0)  # keep the bookkeeping lists aligned
        else:
            sample_size_sub.append(sample_size_sub[0])
            memsize_sub.append(memsize_sub0)
            partition_num_sub.append(partition_num_sub[0])

    elif using_data == "real_pdf":
        # Read real data.
        data_pdf_i0 = clean_airlinedata(os.path.expanduser(file_path[file_no_i]),
                                        fit_intercept=fit_intercept)

        # Create a full-column empty DataFrame and resize the current subset.
        edf = pd.DataFrame(
            columns=list(set(dummy_column_names) - set(data_pdf_i0.columns)))
        # DataFrame.append was removed in pandas 2.0; pd.concat is equivalent here.
        data_pdf_i = pd.concat([data_pdf_i0, edf], sort=True)
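    # The timers set up above suggest each chunk is next converted to a Spark
    # DataFrame, repartitioned, and unioned into a single DataFrame. A hedged
    # sketch of that step; the name `data_sdf_all` and the exact bookkeeping
    # are assumptions, as the original continuation is not shown here.
    data_sdf_i = spark.createDataFrame(data_pdf_i)
    time_2sdf_sub.append(time.perf_counter() - tic_2sdf)

    tic_repartition = time.perf_counter()
    data_sdf_i = data_sdf_i.repartition(partition_num_sub[0])
    time_repartition_sub.append(time.perf_counter() - tic_repartition)

    data_sdf_all = data_sdf_i if file_no_i == 0 else data_sdf_all.union(data_sdf_i)
    loop_counter += 1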