def test_sanitize(self):
    new_cols = sanitize(self.sanitize_arr)
    self.assertListEqual(new_cols, [
        "having_ip_address", "url_length", "shortining_service",
        "having_at_symbol", "double_slash_redirecting", "prefix_suffix",
        "having_sub_domain", "domain_registeration_length", "favicon",
        "port", "https_token", "request_url", "url_of_anchor",
        "links_in_tags", "sfh", "submitting_to_email", "abnormal_url",
        "redirect", "on_mouseover", "right_click", "pop_up_widnow",
        "iframe", "age_of_domain", "dns_record", "web_traffic",
        "page_rank", "google_index", "links_pointing_to_page",
        "statistical_report", "result", "485_5468a44_44_4_e3_c_cc_c_d",
    ])
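# For context, a minimal sketch of a sanitize() that would satisfy the test
# above, assuming it lower-cases names and collapses runs of non-alphanumeric
# characters into single underscores. The real implementation lives in the
# cleaner module; this is an illustration, not the source code.
import re

def sanitize(columns):
    # lower-case, replace runs of non [0-9a-z] with "_", trim edge underscores
    return [re.sub(r"[^0-9a-z]+", "_", str(col).strip().lower()).strip("_")
            for col in columns]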
import datetime

import pandas as pd
import matplotlib.pylab as plt
import matplotlib.dates as mdates
from sqlalchemy import create_engine

import cleaner

pd.set_option('display.width', 1000)
plt.ioff()

# extract the raw mall traffic table
connection = create_engine(
    "mysql://*****:*****@etcinsights.nazwa.pl/etcinsights_ws"
).connect()
mall_cols = connection.execute("select * from mall")
mall = connection.execute("select * from mall").fetchall()
print("Data extracted from mall successfully!")
connection.close()

mall = pd.DataFrame(mall)
mall.columns = list(mall_cols.keys())
mall.columns = cleaner.sanitize(mall.columns)
mall['traffic'] = mall['traffic'].astype(int)
mall['date'] = pd.to_datetime(mall['date'], format="%Y-%m-%d")

# slice the recent window before aggregating, while 'date' still exists;
# the weekly groupby below drops the date column
mall_rf_data = mall[
    mall['date'] >= datetime.datetime.strptime('2019-07-29', "%Y-%m-%d")]

mall = mall.groupby(by=['year', 'week'])['traffic'].sum().reset_index()

# make graph of total weekly traffic
fig, ax = plt.subplots(figsize=(16, 9))
ax.set(xlabel='date', ylabel='people in')
ax.grid(True)
ax.xaxis.set_tick_params(rotation=90)
ax.plot(mall['week'].astype(str) + "-" + mall['year'].astype(str),
        mall['traffic'], marker='o')
plt.tight_layout()
fig.show()
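# Side note: the execute-twice pattern above (one call for column names, one
# for rows) can be collapsed with pandas.read_sql; a sketch assuming the same
# connection string and table.
import pandas as pd
from sqlalchemy import create_engine

engine = create_engine("mysql://*****:*****@etcinsights.nazwa.pl/etcinsights_ws")
with engine.connect() as conn:
    mall = pd.read_sql("select * from mall", conn)  # column names come for free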
import pandas as pd
import pmdarima as pm
from pmdarima import auto_arima
from sqlalchemy import create_engine

import cleaner

print(f"Using pmdarima {pm.__version__}")
# pd.set_option('display.max.columns', 20)
# pd.set_option('display.width', 1000)

connection = create_engine(
    "mysql://*****:*****@etcinsights.nazwa.pl/etcinsights_harebakken"
).connect()
traffic_cols = connection.execute("select * from traffic_date")
traffic = connection.execute("select * from traffic_date").fetchall()
print("Data extracted from traffic_date successfully!")
connection.close()

traffic = pd.DataFrame(traffic)
traffic.columns = list(traffic_cols.keys())
traffic.columns = cleaner.sanitize(traffic.columns)
traffic = traffic.filter(['date', 'traffic'])
traffic.set_index(['date'], inplace=True)
traffic.index = pd.to_datetime(traffic.index)
# nudge zero counts so a multiplicative decomposition stays defined
traffic['traffic'] += 0.001

# result = seasonal_decompose(traffic, model='multiplicative', period=7)
# result.plot()
# plt.show()

pm.plot_acf(traffic)
stepwise_model = auto_arima(traffic, start_p=1, start_q=1, max_p=5, max_q=5,
                            # the remaining arguments are cut off in the
                            # source; a weekly-seasonal completion is assumed
                            m=7, seasonal=True,
                            trace=True, error_action='ignore',
                            suppress_warnings=True, stepwise=True)
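# Hedged continuation: once auto_arima returns a fitted model, forecasting is
# a single predict() call. The 14-day horizon is illustrative, not from the
# source.
print(stepwise_model.summary())
forecast = stepwise_model.predict(n_periods=14)
print(forecast)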
import pickle

import pandas as pd
import requests

import cleaner

# `date` (an ISO 'YYYY-MM-DD' string) and the accumulating `data` frame are
# expected from the enclosing per-date loop this fragment sits in
data_req = requests.get(
    'https://maxbo.link.express/external/api/v2/'
    '5d02982d29512bcc1729bb3964efb830/sales/query/'
    f'?start_date={date}T00:00:00&end_date={date}T23:59:59'
    '&store_alias=ALL&type=CASH').json()

data_date = pd.DataFrame()
for i in range(len(data_req['store'])):
    store_name = data_req['store'][i]['store_name']
    sales_count = data_req['store'][i]['salesCount']
    print("{0} - {1} from date {2}".format(i + 1, store_name, date))
    data_temp = pd.DataFrame()
    for j in range(len(data_req['store'][i]['sales'])):
        data_level_down = data_req['store'][i]['sales'][j]
        # id_tr identifies the transaction: store + date + transaction number
        temp = pd.DataFrame(data_level_down['lineItems']).assign(
            store_name=store_name, sales_count=sales_count, date=date,
            id_tr=str(i + 1) + date.replace('-', '') + str(j + 1))
        data_temp = pd.concat([data_temp, temp], axis=0)
    data_date = pd.concat([data_date, data_temp], axis=0)
data = pd.concat([data, data_date], axis=0)

data.drop(columns=['edpNr', 'productId', 'gross', 'discountPercent',
                   'itemCount'], inplace=True)
data.reset_index(drop=True, inplace=True)
data.columns = cleaner.sanitize(data.columns)  # clean up the column headers

# persist the data to a file named 'sales_data'
with open('sales_data', 'wb') as file:
    pickle.dump(data, file)

# load the data back
with open('sales_data', 'rb') as file:
    data = pickle.load(file)


def original_name(part_name):
    # recover the full vendor_name, which we know only partially
    return (data.dropna()[data.dropna()['vendor_name'].str.contains(part_name)]
            ['vendor_name'].unique().tolist())


original_name("Jordan")
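# Side note: pandas.json_normalize can flatten the nested
# store -> sales -> lineItems structure in one call, assuming the response
# layout shown above. It does not reproduce the id_tr key, which depends on
# the loop indices, so it is a sketch rather than a drop-in replacement.
import pandas as pd

flat = pd.json_normalize(
    data_req['store'],
    record_path=['sales', 'lineItems'],   # descend store -> sales -> lineItems
    meta=['store_name', 'salesCount'],    # carry store-level fields along
)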
# the head of this melt is truncated in the source; reconstructed on the
# assumption that calendar_weeks maps (Year, Week, Day) wide columns to dates,
# matching the merge keys used below
calendar_weeks = calendar_weeks.melt(
    id_vars=['Year', 'Week'],
    value_vars=['Monday', 'Tuesday', 'Wednesday', 'Thursday',
                'Friday', 'Saturday', 'Sunday'],
    var_name='Day', value_name='Date')

traffic_sales_data = pd.read_excel(
    'traffic_sales_data/traffic_sales_2018_2019_2020.xlsx')
traffic_sales_data = traffic_sales_data.melt(
    id_vars=['Week', 'Year', 'N_P'],
    value_vars=['Monday', 'Tuesday', 'Wednesday', 'Thursday',
                'Friday', 'Saturday', 'Sunday'],
    var_name='Day', value_name='Value')
traffic_sales_data = traffic_sales_data.merge(
    calendar_weeks, how='left', on=['Year', 'Week', 'Day'])
traffic_sales_data.columns = cleaner.sanitize(traffic_sales_data.columns)

traffic_sales_data_date = traffic_sales_data[
    traffic_sales_data['n_p'] == "TRAFFIC"].rename(
        columns={'value': 'people_in'})[['people_in', 'date']]
traffic_sales_data_date['people_in'] = traffic_sales_data_date[
    'people_in'].astype(int)

data_harebakken_date = data_harebakken.groupby(
    by=['date'])['people_in'].sum().reset_index()

traffic_date = pd.concat([traffic_sales_data_date, data_harebakken_date])
traffic_date['date'] = pd.to_datetime(traffic_date['date'], format="%Y-%m-%d")
traffic_date.sort_values(by=["date"], inplace=True)
traffic_date['week'] = traffic_date['date'].dt.strftime("%V")   # ISO week number
traffic_date['month'] = traffic_date['date'].dt.strftime("%B")  # month name
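# Illustrative follow-up (not in the source): with week and month labels in
# place, aggregate totals fall out of a groupby.
weekly_traffic = traffic_date.groupby(['week'])['people_in'].sum().reset_index()
monthly_traffic = traffic_date.groupby(['month'])['people_in'].sum().reset_index()
print(weekly_traffic.head())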