def plot_stock_with_sma():
    """Plot the SPY closing price together with its 50- and 100-day SMAs.

    Reads SPY daily data, computes both simple moving averages, and hands
    the resulting frame to plot_graph with Time/Price axis labels.
    """
    data = read_csv(path.join("StockData", "SPY.csv"), dateKey, [dateKey, closeKey])
    data[sma50Key] = data[closeKey].rolling(50).mean()
    data[sma100Key] = data[closeKey].rolling(100).mean()
    # Bug fix: the 100-day SMA was computed but omitted from the plotted
    # series, leaving it as dead work — include it alongside close and SMA50.
    stockDataKeys = [closeKey, sma50Key, sma100Key]
    plot_graph(data[stockDataKeys], stockDataKeys, "Time", "Price")
def plot_correlation(stock_one_ticker, stock_two_ticker):
    """Plot the 50-period rolling correlation between two tickers' closes.

    Loads each ticker's CSV from StockData, computes the rolling correlation
    via calculate_corr, and plots only the most recent 100 observations.
    """
    first_data = read_csv(path.join("StockData", r"%s.csv" % stock_one_ticker),
                          dateKey, [dateKey, closeKey])
    second_data = read_csv(path.join("StockData", r"%s.csv" % stock_two_ticker),
                           dateKey, [dateKey, closeKey])
    # Trailing window only: the full history makes the chart unreadable.
    recent_corr = calculate_corr(first_data, second_data, 50)[-100:]
    plot_graph(recent_corr, ["%s and %s cor" % (stock_one_ticker, stock_two_ticker)])
def fp_growth_retail(TOP_PERCENTAGE, file_name, no_of_trx, no_of_cols=20):
    """Mine association rules from a retail CSV with FP-Growth.

    Args:
        TOP_PERCENTAGE: fraction/percentage passed to top_x_per_products to
            select the most frequent items.
        file_name: dataset name (without extension) under ../Datasets/.
        no_of_trx: number of transaction rows to read from the file.
        no_of_cols: number of item columns per transaction row (was a
            hard-coded 20; now parameterized, default keeps old behavior).

    Prints item frequencies, top items, frequent itemset count and the
    confidence-filtered association rules; also plots the top items.
    """
    data = pd.read_csv('../Datasets/' + str(file_name) + '.csv', header=None)
    print("\n --- FP Growth on File " + str(file_name) + " : and Top Percentage: " + str(TOP_PERCENTAGE))
    # Convert the frame into the list-of-lists shape TransactionEncoder expects.
    trans = []
    for i in range(0, no_of_trx):
        trans.append([str(data.values[i, j]) for j in range(0, no_of_cols)])
    Items = dict(collections.Counter([x for sublist in trans for x in sublist]))
    # Zero out the 'nan' padding token so it can never rank among top items.
    Items['nan'] = 0
    print("Frequencies of Each Item:")
    print(Items)
    top_items = top_x_per_products(Items, TOP_PERCENTAGE)
    print("Top Items:")
    print(top_items)
    plot_graph(top_items, 'fp_growth', TOP_PERCENTAGE)
    # Keep only transactions containing at least one of the top items.
    Output = [b for b in trans if any(a in b for a in top_items.keys())]
    # (Removed dead `trans = np.array(trans)` — the array was never used.)
    Output = np.array(Output)
    t = TransactionEncoder()
    data = t.fit_transform(Output)
    data = pd.DataFrame(data, columns=t.columns_, dtype=int)
    # 'nan' still appears as a one-hot column; drop it before mining.
    data.drop('nan', axis=1, inplace=True)
    res = fpgrowth(data, min_support=0.01, use_colnames=True)
    print("Number of Frequent Item sets:" + str(len(res)))
    res = association_rules(res, metric="confidence", min_threshold=0.5)
    print("\n=============== ASOCIATION RULES ======================")
    # Keep only antecedents, consequents, support and confidence columns.
    cols = [0, 1, 4, 5]
    res = res[res.columns[cols]]
    print(res)
def start_apriori(top_percentage):
    """Mine association rules from the BreadBasket dataset with Apriori.

    Counts item frequencies, keeps only the top `top_percentage` items,
    one-hot encodes per-transaction baskets, runs apriori + association_rules
    (lift >= 0.5) and writes the resulting rules table to 'table.txt'.
    """
    store_data = pd.read_csv('../Datasets/BreadBasket.csv')
    # Item frequencies — Counter preserves first-seen order, so the printed
    # dict is identical to the old manual loop (and matches the other miners
    # in this file).
    Items = dict(collections.Counter(store_data['Item']))
    print(Items)
    print(len(Items))
    top_items = top_x_per_products(Items, top_percentage)
    print(top_items)
    top_item_set = set(top_items.keys())
    print(top_item_set)
    plot_graph(top_items)
    # Keep only rows whose item belongs to the top set.
    # (Removed a discarded `store_data['D'].apply(...)` no-op: its result was
    # never assigned, and 'D' is already the 0/1 membership flag.)
    store_data['D'] = store_data.Item.isin(top_item_set).astype(int)
    store_data = store_data[store_data['D'] == 1]
    print(store_data)
    store_data['Quantity'] = 1
    basket = store_data.groupby(['Transaction', 'Item'])['Quantity'].sum().unstack().fillna(0)

    def encode_units(x):
        # One-hot encode: any positive quantity -> 1, everything else -> 0.
        # (The old version silently returned None for 0 < x < 1.)
        return 1 if x >= 1 else 0

    basket_sets = basket.applymap(encode_units)
    print("Basket Data Shape:"+str(basket_sets.shape))
    frequent_itemsets = apriori(basket_sets, min_support=0.02, use_colnames=True)
    print(frequent_itemsets)
    rules = association_rules(frequent_itemsets, metric="lift", min_threshold=0.5)
    print(rules)
    with open('table.txt', 'w') as f:
        f.write(tabulate(rules))
def plot_sma_crossover_vs_buy_and_hold_strategy_comparison():
    """Compare a 50/100-day SMA crossover strategy against buy-and-hold on SPY.

    Builds cumulative-return curves for both approaches and plots them on a
    single chart with Time/Returns axis labels.
    """
    prices = read_csv(path.join("StockData", "SPY.csv"), dateKey, [dateKey, closeKey])
    close = prices[closeKey]
    prices[sma50Key] = close.rolling(50).mean()
    prices[sma100Key] = close.rolling(100).mean()
    # Long (1) while the fast SMA sits above the slow SMA, flat (0) otherwise;
    # shifting one bar means the signal is acted on the day after it appears.
    prices[positionKey] = np.where(prices[sma50Key] > prices[sma100Key], 1, 0)
    prices[positionKey] = prices[positionKey].shift(1)
    daily_returns = close.pct_change(1)
    prices[strategyPctKey] = daily_returns * prices[positionKey]
    prices[strategyKey] = (prices[strategyPctKey] + 1).cumprod()
    prices[buyHoldKey] = (daily_returns + 1).cumprod()
    curve_keys = [strategyKey, buyHoldKey]
    plot_graph(prices[curve_keys], curve_keys, "Time", "Returns")
def apriori_retail_dataset(TOP_PERCENTAGE, no_of_trx=7501, no_of_cols=20):
    """Mine association rules from Market_Basket_Optimisation.csv with Apriori.

    Args:
        TOP_PERCENTAGE: fraction/percentage passed to top_x_per_products to
            select the most frequent items.
        no_of_trx: number of transaction rows to read (default preserves the
            previously hard-coded 7501).
        no_of_cols: number of item columns per row (default preserves the
            previously hard-coded 20).

    Prints item frequencies, the selected top items, and every discovered
    rule (support, confidence, lift) that does not involve the 'nan' padding
    token; also plots the top items.
    """
    store_data = pd.read_csv('../Datasets/Market_Basket_Optimisation.csv', header=None)
    records = []
    for i in range(0, no_of_trx):
        records.append([str(store_data.values[i, j]) for j in range(0, no_of_cols)])
    Items = dict(collections.Counter([x for sublist in records for x in sublist]))
    # Drop the 'nan' padding token; pop() is safe even if it is absent
    # (the old `del` raised KeyError on datasets with no padding).
    Items.pop('nan', None)
    print("Frequencies of Each Item:")
    print(Items)
    top_items = top_x_per_products(Items, TOP_PERCENTAGE)
    print("Top Items:")
    print(top_items)
    plot_graph(top_items, 'apriori', TOP_PERCENTAGE)
    # Keep only transactions containing at least one of the top items.
    Output = [b for b in records if any(a in b for a in top_items.keys())]
    # Renamed local from `association_rules` so it no longer shadows the
    # mlxtend association_rules function used elsewhere in this file.
    rules = apriori(Output, min_support=0.01, min_confidence=0.5, min_lift=2, min_length=1)
    association_results = list(rules)
    print("\n=============== ASOCIATION RULES ======================")
    for item in association_results:
        # item[2][0] holds (base items, added items, confidence, lift);
        # skip any rule that involves the 'nan' padding token.
        if 'nan' not in list(item[2][0][0]) and 'nan' not in list(item[2][0][1]):
            print("Rule: " + str(list(item[2][0][0])) + " -> " + str(list(item[2][0][1])))
            print("Support: " + str(item[1]))
            print("Confidence: " + str(item[2][0][2]))
            print("Lift: " + str(item[2][0][3]))
            print("=====================================")