def compute_features(data, output_table, connection_string): results_sql = revo.RxSqlServerData(table=output_table, connection_string=connection_string) revo.rx_data_step(input_data=data, output_file=results_sql, overwrite=True, transform_function=featurize_transform)
def gather_image_paths(images_folder, label_to_number_dict, connection_string, mode="train"): table = None if mode == "train": table = "TrainData" elif mode == "val": table = "ValData" elif mode == "test": table = "TestData" else: raise ValueError("'mode' not recognized") query = 'SELECT ([file_stream].GetFileNamespacePath()) as image FROM [{}] WHERE [is_directory] = 0 AND file_type = \'png\''.format( table) filetable_sql = revo.RxSqlServerData(sql_query=query, connection_string=connection_string) data = revo.rx_import(filetable_sql) data["image"] = data["image"].apply(lambda x: os.path.join( images_folder, x[1:])) # TODO: assert to confirm paths exist data["class"] = data["image"].map( lambda x: os.path.basename(os.path.dirname(x))) data["label"] = data["class"].map(lambda x: label_to_number_dict[x]) print(data.iloc[0, 0]) return data
def DoInMemoryDivision_SQLPlayground(): start_time = time.time() #define the strings for connection conn_str = 'Driver=SQL Server;Server=SANDY;Database=S19SQLPlayground;Trusted_Connection=True;' customer_query = '''SELECT * FROM dbo.Customer;''' product_query = '''SELECT * FROM dbo.Product;''' purchase_query = '''SELECT * FROM dbo.Purchase;''' input_query = [customer_query, product_query, purchase_query] table_name = ['customer_data', 'product_data', 'purchase_data'] #try connection and write query to build the dataframe connected = True try: for i in range(len(input_query)): data_source = revoscale.RxSqlServerData(sql_query=input_query[i], connection_string=conn_str) revoscale.RxInSqlServer(connection_string=conn_str, num_tasks=1, auto_cleanup=False) #globals()[name] create new variable globals()[table_name[i]] = pd.DataFrame( revoscale.rx_import(data_source)) except: print( "Cannot connect to DataBase and create the data table, existing..." ) connected = False if connected == False: exit(0) #iterate 3 tables, either add a new entry to the custExistence dictionary or update the count custExistence = {} for cu_index, cu_row in customer_data.iterrows(): for pr_index, pr_row in product_data.iterrows(): for pu_index, pu_row in purchase_data.iterrows(): if pu_row["CustomerId"] == cu_row["CustomerId"] and pu_row[ "ProductId"] == pr_row["ProductId"]: currentcustomer = pu_row["CustomerId"] if currentcustomer in custExistence: custExistence[currentcustomer] += 1 else: custExistence[currentcustomer] = 1 #print(custExistence) #Count all the product (count the row) nbProducts = product_data.shape[0] #print(nbProducts) #print out the customer who has bought all the products for customerid in custExistence: if custExistence[customerid] == nbProducts: print("CustomerId : " + str(customerid)) end_time = time.time() print("This took " + str(end_time - start_time) + " s")
def perform_clustering(): conn_str = 'Driver=SQL Server;Server=DESKTOP-NLR1KT5;Database=tpcxbb_1gb;Trusted_Connection=True;' input_query = '''SELECT ss_customer_sk AS customer, ROUND(COALESCE(returns_count / NULLIF(1.0*orders_count, 0), 0), 7) AS orderRatio, ROUND(COALESCE(returns_items / NULLIF(1.0*orders_items, 0), 0), 7) AS itemsRatio, ROUND(COALESCE(returns_money / NULLIF(1.0*orders_money, 0), 0), 7) AS monetaryRatio, COALESCE(returns_count, 0) AS frequency FROM ( SELECT ss_customer_sk, COUNT(distinct(ss_ticket_number)) AS orders_count, COUNT(ss_item_sk) AS orders_items, SUM( ss_net_paid ) AS orders_money FROM store_sales s GROUP BY ss_customer_sk ) orders LEFT OUTER JOIN ( SELECT sr_customer_sk, count(distinct(sr_ticket_number)) as returns_count, COUNT(sr_item_sk) as returns_items, SUM( sr_return_amt ) AS returns_money FROM store_returns GROUP BY sr_customer_sk ) returned ON ss_customer_sk=sr_customer_sk''' column_info = { "customer": { "type": "integer" }, "orderRatio": { "type": "integer" }, "itemsRatio": { "type": "integer" }, "frequency": { "type": "integer" } } data_source = revoscale.RxSqlServerData(sql_query=input_query, column_Info=column_info, connection_string=conn_str) revoscale.RxInSqlServer(connection_string=conn_str, num_tasks=1, auto_cleanup=False) customer_data = pd.DataFrame(revoscale.rx_import(data_source)) print("Data frame:", customer_data.head(n=5)) cdata = customer_data K = range(1, 20) KM = (sk_cluster.KMeans(n_clusters=k).fit(cdata) for k in K) centroids = (k.cluster_centers_ for k in KM) D_k = (sci_distance.cdist(cdata, cent, 'euclidean') for cent in centroids) dist = (np.min(D, axis=1) for D in D_k) avgWithinSS = [sum(d) / cdata.shape[0] for d in dist] plt.plot(K, avgWithinSS, 'b*-') plt.grid(True) plt.xlabel('Number of clusters') plt.ylabel('Average within-cluster sum of squares') plt.title('Elbow for KMeans clustering') plt.show() n_clusters = 4 means_cluster = sk_cluster.KMeans(n_clusters=n_clusters, random_state=111) columns = ["orderRatio", "itemsRatio", "monetaryRatio", "frequency"] est = means_cluster.fit(customer_data[columns]) clusters = est.labels_ customer_data['cluster'] = clusters for c in range(n_clusters): cluster_members = customer_data[customer_data['cluster'] == c][:] print('Cluster{}(n={}):'.format(c, len(cluster_members))) print('-' * 17) print(customer_data.groupby(['cluster']).mean())
from scipy.spatial import distance as sci_distance from sklearn import cluster as sk_cluster # Connection string to connect to SQL Server named instance. conn_str = 'Driver=SQL Server;Server=MS;Database=ML;Trusted_Connection=True;' input_query = '''SELECT * FROM [OnlineSales].[RFM]''' # Define the columns we wish to import. column_info = { "customer": {"type": "integer"}, "orderRatio": {"type": "integer"}, "itemsRatio": {"type": "integer"}, "frequency": {"type": "integer"} } data_source = revoscalepy.RxSqlServerData(sql_query=input_query, column_Info=column_info, connection_string=conn_str) revoscalepy.RxInSqlServer(connection_string=conn_str, num_tasks=1, auto_cleanup=False) # import data source and convert to pandas dataframe. customer_data = pd.DataFrame(revoscalepy.rx_import(data_source)) print("Data frame:", customer_data.head(n=5)) print(customer_data.describe()) cdata = customer_data K = range(1, 20) KM = (sk_cluster.KMeans(n_clusters=k).fit(cdata) for k in K) centroids = (k.cluster_centers_ for k in KM) D_k = (sci_distance.cdist(cdata, cent, 'euclidean') for cent in centroids) dist = (np.min(D, axis=1) for D in D_k) avgWithinSS = [sum(d) / cdata.shape[0] for d in dist] plt.plot(K, avgWithinSS, 'b*-')
def perform_clustering(): ################################################################################################ ## Connect to DB and select data ################################################################################################ # Connection string to connect to SQL Server named instance. conn_str = 'Driver=SQL Server;Server=DESKTOP-CF2TNBV;Database=tpcxbb_1gb;Trusted_Connection=True;' input_query = '''SELECT ss_customer_sk AS customer, ROUND(COALESCE(returns_count / NULLIF(1.0*orders_count, 0), 0), 7) AS orderRatio, ROUND(COALESCE(returns_items / NULLIF(1.0*orders_items, 0), 0), 7) AS itemsRatio, ROUND(COALESCE(returns_money / NULLIF(1.0*orders_money, 0), 0), 7) AS monetaryRatio, COALESCE(returns_count, 0) AS frequency FROM ( SELECT ss_customer_sk, -- return order ratio COUNT(distinct(ss_ticket_number)) AS orders_count, -- return ss_item_sk ratio COUNT(ss_item_sk) AS orders_items, -- return monetary amount ratio SUM( ss_net_paid ) AS orders_money FROM store_sales s GROUP BY ss_customer_sk ) orders LEFT OUTER JOIN ( SELECT sr_customer_sk, -- return order ratio count(distinct(sr_ticket_number)) as returns_count, -- return ss_item_sk ratio COUNT(sr_item_sk) as returns_items, -- return monetary amount ratio SUM( sr_return_amt ) AS returns_money FROM store_returns GROUP BY sr_customer_sk ) returned ON ss_customer_sk=sr_customer_sk''' # Define the columns we wish to import. column_info = { "customer": { "type": "integer" }, "orderRatio": { "type": "integer" }, "itemsRatio": { "type": "integer" }, "frequency": { "type": "integer" } } data_source = revoscale.RxSqlServerData(sql_query=input_query, column_Info=column_info, connection_string=conn_str) revoscale.RxInSqlServer(connection_string=conn_str, num_tasks=1, auto_cleanup=False) # import data source and convert to pandas dataframe. customer_data = pd.DataFrame(revoscale.rx_import(data_source)) print("Data frame:", customer_data.head(n=5)) ################################################################################################ ## Determine number of clusters using the Elbow method ################################################################################################ cdata = customer_data K = range(1, 20) KM = (sk_cluster.KMeans(n_clusters=k).fit(cdata) for k in K) centroids = (k.cluster_centers_ for k in KM) D_k = (sci_distance.cdist(cdata, cent, 'euclidean') for cent in centroids) dist = (np.min(D, axis=1) for D in D_k) avgWithinSS = [sum(d) / cdata.shape[0] for d in dist] plt.plot(K, avgWithinSS, 'b*-') plt.grid(True) plt.xlabel('Number of clusters') plt.ylabel('Average within-cluster sum of squares') plt.title('Elbow for KMeans clustering') plt.show() ################################################################################################ ## Perform clustering using Kmeans ################################################################################################ # It looks like k=4 is a good number to use based on the elbow graph. n_clusters = 4 means_cluster = sk_cluster.KMeans(n_clusters=n_clusters, random_state=111) columns = ["orderRatio", "itemsRatio", "monetaryRatio", "frequency"] est = means_cluster.fit(customer_data[columns]) clusters = est.labels_ customer_data['cluster'] = clusters # Print some data about the clusters: # For each cluster, count the members. for c in range(n_clusters): cluster_members = customer_data[customer_data['cluster'] == c][:] print('Cluster{}(n={}):'.format(c, len(cluster_members))) print('-' * 17) # Print mean values per cluster. print(customer_data.groupby(['cluster']).mean())
def DoInMemoryDivision(): # Connection string to connect to local DB conn_str = 'Driver=SQL Server;Server=DESKTOP-M98I69D;Database=S19SQLPlayground_Seb;Trusted_Connection=True;' # Queries to retrieve the Customers, Products, and purchases data respectively query_customers = '''SELECT * FROM dbo.Customer''' query_products = '''SELECT * FROM dbo.Product''' query_purchases = '''SELECT * FROM dbo.Purchase''' # Define the columns we wish to import for Customers, Products, and purchases data respectively col_cust_info = { "Customer": { "type": "integer" }, "FirstName": { "type": "str" }, "LastName": { "type": "str" } } col_prod_info = { "Product": { "type": "integer" }, "Description": { "type": "str" }, "UnitPrice": { "type": "integer" } } col_pur_info = { "Customer": { "type": "integer" }, "Product": { "type": "integer" }, "Quantity": { "type": "integer" } } # Start Timer to measure execution time start = timer() data_source_cust = revoscale.RxSqlServerData(sql_query=query_customers, column_Info=col_cust_info, connection_string=conn_str) data_source_prod = revoscale.RxSqlServerData(sql_query=query_products, column_Info=col_prod_info, connection_string=conn_str) data_source_pur = revoscale.RxSqlServerData(sql_query=query_purchases, column_Info=col_pur_info, connection_string=conn_str) # retrieve the data sources revoscale.RxInSqlServer(connection_string=conn_str, num_tasks=3, auto_cleanup=False) # import data sources and convert to pandas dataframe. customer_data = pd.DataFrame(revoscale.rx_import(data_source_cust)) product_data = pd.DataFrame(revoscale.rx_import(data_source_prod)) purchase_data = pd.DataFrame(revoscale.rx_import(data_source_pur)) # Create a DataFrame to store customer purchase count cust_dict = {'CustId': [], 'PurchaseCount': []} cust_Purc_Count = pd.DataFrame(cust_dict) # Nb of products nb_products = len(product_data.index) # Division process # Iterate over all customers by df rows for cindex, crow in customer_data.iterrows(): # Iterate over all products by df rows for pindex, prow in product_data.iterrows(): # Iterate over all purchases by df rows for hindex, hrow in purchase_data.iterrows(): # Check whether the currentCustomer has bought the current product # print (crow['CustomerId'], hrow['CustomerId'], prow['ProductId'] ) if (crow['CustomerId'] == hrow['CustomerId']) and (prow['ProductId'] == hrow['ProductId']): # Either add a new DataRow in custPurchasesCount # if no existing for the current customer or update the count custExistence = 0 for index, cust_count in cust_Purc_Count.iterrows(): if cust_count['CustId'] == crow['CustomerId']: custExistence += 1 if (custExistence == 0): # First product for this customer td = {'CustId': crow['CustomerId'], 'PurchaseCount': 1} # Make a dataframe to be able to append to cust_Purc_Count tdf = pd.DataFrame(td, index=[0]) # Append to cust_Purc_Count # We must use the syntax below because for df append doesn't occur in place but in the object returned by the method cust_Purc_Count = cust_Purc_Count.append(tdf) else: # The product is already referenced cust_Purc_Count => Just increase the PurchaseCount cust_count['PurchaseCount'] += 1 for dindex, drow in cust_Purc_Count.iterrows(): # Check the customers that bought all the products if drow['PurchaseCount'] == nb_products: print('\nDivision CustomerId: ' + str(drow['CustId'])) end = timer() print("The In Memory Division in Python took (S) : ", end - start)
def perform_clustering(): ################################################################################################ ## Connect to DB and select data ################################################################################################ # Connection string to connect to SQL Server named instance. conn_str = 'Driver=SQL Server;Server=DESKTOP-INU1SHI;Database=tpcxbb_1gb;Trusted_Connection=True;' input_query = '''SELECT ss_customer_sk AS customer, ROUND(COALESCE(returns_count / NULLIF(1.0*orders_count, 0), 0), 7) AS orderRatio, ROUND(COALESCE(returns_items / NULLIF(1.0*orders_items, 0), 0), 7) AS itemsRatio, ROUND(COALESCE(returns_money / NULLIF(1.0*orders_money, 0), 0), 7) AS monetaryRatio, COALESCE(returns_count, 0) AS frequency FROM ( SELECT ss_customer_sk, -- return order ratio COUNT(distinct(ss_ticket_number)) AS orders_count, -- return ss_item_sk ratio COUNT(ss_item_sk) AS orders_items, -- return monetary amount ratio SUM( ss_net_paid ) AS orders_money FROM store_sales s GROUP BY ss_customer_sk ) orders LEFT OUTER JOIN ( SELECT sr_customer_sk, -- return order ratio count(distinct(sr_ticket_number)) as returns_count, -- return ss_item_sk ratio COUNT(sr_item_sk) as returns_items, -- return monetary amount ratio SUM( sr_return_amt ) AS returns_money FROM store_returns GROUP BY sr_customer_sk ) returned ON ss_customer_sk=sr_customer_sk''' # Define the columns we wish to import. column_info = { "customer": {"type": "integer"}, "orderRatio": {"type": "integer"}, "itemsRatio": {"type": "integer"}, "frequency": {"type": "integer"} } data_source = revoscale.RxSqlServerData(sql_query=input_query, column_Info=column_info, connection_string=conn_str) ComputeContext = revoscale.RxInSqlServer(connection_string=conn_str, num_tasks=1, auto_cleanup=False) # import data source and convert to pandas dataframe. customer_data = pd.DataFrame(revoscale.rx_import(data_source)) print("Data frame:", customer_data.head(n=5)) ################################################################################################ ## Determine number of clusters using the Elbow method ################################################################################################ cdata = customer_data K = range(1,20) KM = (sk_cluster.KMeans(n_clusters=k).fit(cdata) for k in K) centroids = (k.cluster_centers_ for k in KM) D_K = (sci_distance.cdist(cdata,cent,'euclidean')for cent in centroids) dist = (np.min(D, axis=1)for D in D_K) avgWithinSS = [sum(d)/cdata.shape[0] for d in dist] plt.plot(K, avgWithinSS, 'b*-') plt.grid(True) plt.xlabel('Number of Clusters') plt.ylabel('Average Within Clusters Sum of Squares') plt.title('Kmeans Clustering') plt.show()
def perform_clustering(): ################################################################################################ ## Connect to DB and select data ################################################################################################ # Connection string to connect to SQL Server named instance. conn_str = 'Driver=SQL Server;Server=VC5-SOPHIA;Database=tpcxbb_1gb;Trusted_Connection=True;' input_query = '''SELECT ss_customer_sk AS customer, ROUND(COALESCE(returns_count / NULLIF(1.0*orders_count, 0), 0), 7) AS orderRatio, ROUND(COALESCE(returns_items / NULLIF(1.0*orders_items, 0), 0), 7) AS itemsRatio, ROUND(COALESCE(returns_money / NULLIF(1.0*orders_money, 0), 0), 7) AS monetaryRatio, COALESCE(returns_count, 0) AS frequency FROM ( SELECT ss_customer_sk, -- return order ratio COUNT(distinct(ss_ticket_number)) AS orders_count, -- return ss_item_sk ratio COUNT(ss_item_sk) AS orders_items, -- return monetary amount ratio SUM( ss_net_paid ) AS orders_money FROM store_sales s GROUP BY ss_customer_sk ) orders LEFT OUTER JOIN ( SELECT sr_customer_sk, -- return order ratio count(distinct(sr_ticket_number)) as returns_count, -- return ss_item_sk ratio COUNT(sr_item_sk) as returns_items, -- return monetary amount ratio SUM( sr_return_amt ) AS returns_money FROM store_returns GROUP BY sr_customer_sk ) returned ON ss_customer_sk=sr_customer_sk''' # Define the columns we wish to import. column_info = { "customer": { "type": "integer" }, "orderRatio": { "type": "float" }, "itemsRatio": { "type": "float" }, "frequency": { "type": "integer" } } data_source = revoscale.RxSqlServerData(sql_query=input_query, column_Info=column_info, connection_string=conn_str) revoscale.RxInSqlServer(connection_string=conn_str, num_tasks=1, auto_cleanup=False) # import data source and convert to pandas dataframe. customer_data = pd.DataFrame(revoscale.rx_import(data_source)) #print("Data frame:", customer_data.head(n=5)) cdata = customer_data n_clusters = 4 means_cluster = sk_cluster.KMeans(n_clusters=n_clusters, random_state=111) columns = ["orderRatio", "itemsRatio", "monetaryRatio", "frequency"] est = means_cluster.fit(customer_data[columns]) clusters = est.labels_ customer_data['cluster'] = clusters # Print some data about the clusters: # For each cluster, count the members. for c in range(n_clusters): cluster_members = customer_data[customer_data['cluster'] == c][:] print('Cluster{}(n={}):'.format(c, len(cluster_members))) print('-' * 17) # Print mean values per cluster. print(customer_data.groupby(['cluster']).mean())