def make_predictions(data):
    # if os.path.exists('models/{0}_{1}.sav'.format(project_name, standard_service)):
    #     # TODO Load model and predict
    #     return

    # Rebuild the series as a two-column frame so add_datepart can expand the date.
    new_data = pd.DataFrame(index=range(0, len(data)), columns=['Date', 'Total Count'])
    for i in range(0, len(data)):
        new_data['Date'][i] = data.index[i]
        new_data['Total Count'][i] = data['Total Count'][i]

    add_datepart(new_data, 'Date')
    new_data.drop(['Dayofyear', 'Dayofweek', 'Elapsed', 'Is_quarter_end', 'Week',
                   'Is_month_end', 'Is_month_start', 'Is_quarter_start',
                   'Is_year_end', 'Is_year_start'],
                  axis=1, inplace=True)

    # Train on the full history.
    train = new_data
    x_train = train.drop('Total Count', axis=1)
    y_train = train['Total Count']

    model = LinearRegression()
    model.fit(x_train, y_train)
    # TODO Save model

    # Build a one-year daily horizon starting at the last observed date.
    test = pd.date_range(data.index[-1], data.index[-1] + timedelta(days=365), freq='D')
    test = pd.DataFrame(test, columns=['Date'])
    # Keep a copy of the raw dates for the output index before add_datepart consumes them.
    test_date = test.copy()

    add_datepart(test, 'Date')
    test.drop(['Dayofyear', 'Dayofweek', 'Elapsed', 'Is_quarter_end', 'Week',
               'Is_month_end', 'Is_month_start', 'Is_quarter_start',
               'Is_year_end', 'Is_year_start'],
              axis=1, inplace=True)

    predictions = model.predict(test)
    test['Predictions'] = predictions
    test.index = test_date.Date
    return test
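# Hedged usage sketch for make_predictions() above: it expects a DataFrame
# indexed by date with a 'Total Count' column and returns a year of daily
# predictions indexed by date. The synthetic series below is illustrative
# only, and assumes pandas/numpy plus fastai's add_datepart and sklearn's
# LinearRegression are already imported at module level, as the function requires.
idx = pd.date_range('2020-01-01', periods=200, freq='D')
history = pd.DataFrame({'Total Count': np.random.poisson(50, size=200)}, index=idx)
forecast = make_predictions(history)
print(forecast['Predictions'].head())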
def preprocess(inp_df: pd.DataFrame) -> pd.DataFrame:
    """Preprocess the dataframe for modeling.

    The data, along with the data from the gather_args() function, will get
    passed to either the training or prediction method.

    Inputs: a raw dataframe
    Output: a processed dataframe to pass to .train() or .get_preds()
    """
    df = inp_df.copy()

    # Sort by date since we have a time series.
    df.sort_values(by=['date', 'store'], inplace=True)

    # Drop week_start and day_of_week since add_datepart() will recreate them.
    df.drop('week_start', axis='columns', inplace=True)
    df.drop('day_of_week', axis='columns', inplace=True)

    # If our whole df has sales == 0, it must be a single-row df used for a
    # single prediction, so just take the first row (iloc[[0]] keeps it a
    # DataFrame; iloc[0] would return a Series and break add_datepart).
    if (df.sales == 0).all():
        df = df.iloc[[0]]
    else:
        # Drop any sales == 0 since they'll mess up RMSPE (division by zero).
        df = df[df.sales != 0]

    tabular.add_datepart(df, 'date', drop=True, time=False)
    return df
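# Illustrative call to preprocess(), assuming a raw store/sales frame with the
# column names the function body expects ('date', 'store', 'sales',
# 'week_start', 'day_of_week') and the module-level imports it relies on;
# the rows here are made up.
raw = pd.DataFrame({
    'date': pd.to_datetime(['2015-07-01', '2015-07-02']),
    'store': [1, 1],
    'sales': [5263, 0],
    'week_start': pd.to_datetime(['2015-06-29', '2015-06-29']),
    'day_of_week': [3, 4],
})
ready = preprocess(raw)  # the sales == 0 row is dropped, date expanded by add_datepart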
def LinearRegression(df, type, split):
    # Creating dataframe with date and the target variable.
    data = df.sort_index(ascending=True, axis=0)
    new_data = pd.DataFrame(index=range(0, len(df)), columns=['Date', type])
    for i in range(0, len(data)):
        new_data['Date'][i] = data['Date'][i]
        new_data[type][i] = data[type][i]

    # Create features.
    from fastai.tabular import add_datepart
    add_datepart(new_data, 'Date')
    new_data.drop('Elapsed', axis=1, inplace=True)  # Elapsed is the raw timestamp

    new_data['mon_fri'] = 0
    for i in range(0, len(new_data)):
        if (new_data['Dayofweek'][i] == 0 or new_data['Dayofweek'][i] == 4):
            # Flag Mondays and Fridays.
            new_data['mon_fri'][i] = 1
        else:
            new_data['mon_fri'][i] = 0

    # Split into train and validation.
    train = new_data[:split]
    valid = new_data[split:]

    x_train = train.drop(type, axis=1)
    y_train = train[type]
    x_valid = valid.drop(type, axis=1)
    y_valid = valid[type]

    # Implement linear regression (the sklearn import shadows this function's
    # name only inside its own scope).
    from sklearn.linear_model import LinearRegression
    model = LinearRegression()
    model.fit(x_train, y_train)

    preds = model.predict(x_valid)
    rmse = np.sqrt(np.mean(np.power((np.array(y_valid) - np.array(preds)), 2)))
    st.write('RMSE value on validation set:')
    st.write(rmse)

    valid['Predictions'] = preds
    valid.index = new_data[split:].index
    train.index = new_data[:split].index

    # For the chart, plot the training actuals under both columns so the
    # prediction line is continuous.
    append_data = pd.DataFrame(data={type: [], 'Predictions': []})
    append_data[type] = train[type]
    append_data['Predictions'] = train[type]
    pic = pd.concat(
        [append_data[[type, 'Predictions']], valid[[type, 'Predictions']]],
        axis=0)
    st.line_chart(pic)
def pred_single(date, prev, learn=infer):
    print(f'Getting predictions for date {date} with prev closing price of {prev}')
    df = pd.DataFrame(dict(Date=date, prev=prev), index=[0])
    add_datepart(df, 'Date')
    pred = learn.predict(df.iloc[0])
    print(pred)
    # The target was presumably log-transformed during training, hence np.exp here.
    res = round(np.exp(pred[0].data.item()), 2)
    print(res)
    return res
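# Example invocation of pred_single(), assuming `infer` is a fastai tabular
# learner trained on a log-transformed closing price (which is what the
# np.exp above implies); the date and price below are invented:
# next_close = pred_single('2020-03-02', prev=153.75)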
def MLPRegression(data, startAt, stopAt=None, **kwargs):
    """
    Applies the Multi-Layer Perceptron regression to data to forecast values
    between startAt and stopAt. If stopAt is not provided, forecasts until the
    end of data.

    Parameters:
        data (pandas.DataFrame): Data returned by prepare_data (may be differentiated)
        startAt (int): Index where the forecast starts
        stopAt (int): Index where the forecast stops (default is None)
        **kwargs: Additional arguments for the MLPRegressor class

    Returns:
        predictions (list): The forecast from startAt up to stopAt
    """
    data_copy = data.copy()
    if stopAt is None:
        stopAt = len(data_copy)

    from fastai.tabular import add_datepart
    add_datepart(data_copy, 'Date')
    data_copy.drop('Elapsed', axis=1, inplace=True)

    # Flag days adjacent to weekends: we assume that Fridays and Mondays are
    # more important (0 is Monday, 1 is Tuesday, ...).
    data_copy['mon_fri'] = data_copy['Dayofweek'].isin([0, 4]).astype(int)

    train = data_copy[:startAt]
    valid = data_copy[startAt:stopAt]

    x_train = train.drop('Close', axis=1)
    y_train = train['Close']
    x_valid = valid.drop('Close', axis=1)

    from sklearn.neural_network import MLPRegressor
    model = MLPRegressor(**kwargs)
    model.fit(x_train, y_train)

    predictions = model.predict(x_valid)
    return predictions
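# Sketch of driving MLPRegression(): kwargs pass straight through to sklearn's
# MLPRegressor, so network shape and training length are tunable per call.
# prepare_data() is the upstream helper named in the docstring; the argument
# values below are illustrative assumptions.
# df = prepare_data('AAPL')
# preds = MLPRegression(df, startAt=len(df) - 60,
#                       hidden_layer_sizes=(64, 32), max_iter=500)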
def svm(data, startAt, stopAt=None):
    """
    Applies the Support Vector Machine to forecast data whose index is between
    startAt and stopAt. If stopAt is not provided, forecasts until the end of
    data.

    Parameters:
        data (pandas.DataFrame): Data returned by prepare_data (may be differentiated)
        startAt (int): Index where the forecast starts
        stopAt (int): Index where the forecast stops (default is None)

    Returns:
        predictions (list): The forecast from startAt up to stopAt
    """
    data_copy = data.copy()
    if stopAt is None:
        stopAt = len(data_copy)

    from fastai.tabular import add_datepart
    add_datepart(data_copy, 'Date')
    data_copy.drop('Elapsed', axis=1, inplace=True)

    # Flag days adjacent to weekends: we assume that Fridays and Mondays are
    # more important (0 is Monday, 1 is Tuesday, ...).
    data_copy['mon_fri'] = data_copy['Dayofweek'].isin([0, 4]).astype(int)

    train = data_copy[:startAt]
    valid = data_copy[startAt:stopAt]

    x_train = train.drop('Close', axis=1)
    y_train = train['Close']
    x_valid = valid.drop('Close', axis=1)

    # The sklearn import shadows this function's name only inside its own scope.
    from sklearn import svm
    model = svm.SVR(gamma='scale', kernel='linear', degree=2, coef0=1)
    model.fit(x_train, y_train)

    predictions = model.predict(x_valid)
    return predictions
def linear_regression(data, startAt, stopAt=None):
    """
    Applies the linear regression method to data to predict points whose index
    is between startAt and stopAt. If stopAt is not provided, the default is
    the length of data.

    Parameters:
        data (pandas.DataFrame): Data returned by prepare_data (may be differentiated)
        startAt (int): Index where the forecast starts
        stopAt (int): Index where the forecast stops (default is None)

    Returns:
        predictions (list): The forecast from startAt up to stopAt
    """
    data_copy = data.copy()
    if stopAt is None:
        stopAt = len(data_copy)

    from fastai.tabular import add_datepart
    add_datepart(data_copy, 'Date')
    data_copy.drop('Elapsed', axis=1, inplace=True)

    # Flag days adjacent to weekends: we assume that Fridays and Mondays are
    # more important (0 is Monday, 1 is Tuesday, ...).
    data_copy['mon_fri'] = data_copy['Dayofweek'].isin([0, 4]).astype(int)

    train = data_copy[:startAt]
    valid = data_copy[startAt:stopAt]

    x_train = train.drop('Close', axis=1)
    y_train = train['Close']
    x_valid = valid.drop('Close', axis=1)

    from sklearn.linear_model import LinearRegression
    model = LinearRegression()
    model.fit(x_train, y_train)  # calculates the coefficients

    predictions = model.predict(x_valid)  # applies the regression
    return predictions
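# linear_regression(), svm() and MLPRegression() above share one signature, so
# they can be scored on the same holdout window. A sketch only, assuming df
# comes from prepare_data() and numpy is imported as np; the 60-day window is
# an illustrative assumption:
# split = len(df) - 60
# for forecaster in (linear_regression, svm, MLPRegression):
#     preds = forecaster(df, startAt=split)
#     actual = df['Close'][split:].to_numpy()
#     print(forecaster.__name__, np.sqrt(np.mean((actual - preds) ** 2)))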
def ticker_svm(raw_data):
    data_copy = raw_data.copy()
    add_datepart(data_copy, 'date')
    data_copy.drop('Elapsed', axis=1, inplace=True)

    # Flag days adjacent to weekends: we assume that Fridays and Mondays are
    # more important (0 is Monday, 1 is Tuesday, ...).
    data_copy['mon_fri'] = data_copy['Dayofweek'].isin([0, 4]).astype(int)

    x_features = data_copy.drop('close', axis=1)
    y_features = data_copy['close']
    return x_features, y_features
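# ticker_svm() only builds the feature matrix and target; fitting is left to
# the caller. A minimal follow-up under the same assumptions (lowercase
# 'date'/'close' columns in price_df, which is hypothetical here):
# x, y = ticker_svm(price_df)
# from sklearn.svm import SVR
# model = SVR(gamma='scale', kernel='linear')
# model.fit(x, y)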
def _get_df_from_file(self):
    all_data_list = StockController.load_data_from_file(self.stock_symbol)
    df = pd.DataFrame(all_data_list).iloc[::-1]  # reverse the stored row order
    df["adj_close"] = df["adj_close"].astype(float)

    # Add trend data, joined on date.
    df = df.set_index("date")
    df_trend = pd.read_csv(f"data/trends/{self.stock_symbol}.csv")
    df_merged = df.merge(df_trend.set_index("date"), how="inner",
                         left_index=True, right_index=True)
    df = df_merged.reset_index()[["date", "adj_close", "bmw stock"]]

    add_datepart(df, "date")
    return df
def readRawData(relativeDataFolderPath, outputFileName="processedRawData.csv"):
    import os, sys, traceback
    import pandas as pd
    import glob
    from myUtilities import getParentFolder
    from myUtilities import createFolder
    from fastai.tabular import add_datepart

    # Deployment directory path of the current Jupyter node, calculated with
    # os.path.abspath / os.path.join.
    jupyterNodePath = None
    # Dataframe built from the input data files in relativeDataFolderPath.
    inputRawDataDF = None
    # Absolute path of the source folder, derived from relativeDataFolderPath.
    dataFolderPath = None
    # Glob pattern used to find all JSON file names in dataFolderPath.
    json_pattern = None
    # List of all input JSON file paths in dataFolderPath.
    file_list = None

    # Return values of this method
    # -------------------------------------------------------------------------
    # Initialized to False; marked True once every line in the method has
    # executed without errors.
    returnValue = False
    # Complete filepath of the CSV file with the processed raw data.
    output_file_name = None
    # -------------------------------------------------------------------------

    try:
        # Calculate the deployment directory path of the current Jupyter node.
        jupyterNodePath = os.path.abspath(os.path.join('.'))

        # TO BE MODIFIED - NOT SURE WHY I USED THIS - WILL HAVE TO CHECK
        pd.set_option('display.max_columns', None)

        # Create the pandas dataframe for further modification.
        inputRawDataDF = pd.DataFrame()

        # Calculate the complete data folder path from the relative path parameter.
        dataFolderPath = jupyterNodePath + '/' + relativeDataFolderPath

        # Build an OS-queryable pattern for python to find JSON files in dataFolderPath.
        json_pattern = os.path.join(dataFolderPath, '*.json')

        # Store all the JSON file paths in dataFolderPath for further processing.
        file_list = glob.glob(json_pattern)

        print('looping through all the files to create input data')

        # Loop through all the files in the folder and build inputRawDataDF.
        for file in file_list:
            data = pd.read_json(file, lines=True)
            data = data.values[0][0]['candles']
            # DataFrame.append is deprecated; build a frame and concat instead.
            inputRawDataDF = pd.concat([inputRawDataDF, pd.DataFrame(data)],
                                       ignore_index=True)

        inputRawDataDF.columns = [
            'date-time', 'open', 'high', 'low', 'close', 'quantity', 'dont-know'
        ]

        # add_datepart consumes the date column, so keep a copy of it.
        buffer = inputRawDataDF['date-time']
        add_datepart(inputRawDataDF, 'date-time')
        inputRawDataDF = pd.concat([buffer, inputRawDataDF], axis=1)

        # Create the prior_holidays feature.
        priorHolidaysStamps = getPriorHoliDaysStamps(
            inputRawDataDF['date-timeDayofyear'])
        priorHolidaysStamps_df = pd.DataFrame(
            {'prior_holidays': priorHolidaysStamps[:]})
        inputRawDataDF = pd.concat([inputRawDataDF, priorHolidaysStamps_df],
                                   axis=1)

        # Create the following_holidays feature.
        followingHolidaysStamps = getFollowingHolidaysDaysStamp(
            inputRawDataDF['date-timeDayofyear'])
        followingHolidaysStamps_df = pd.DataFrame(
            {'following_holidays': followingHolidaysStamps[:]})
        inputRawDataDF = pd.concat(
            [inputRawDataDF, followingHolidaysStamps_df], axis=1)

        '''
        File mode reference:
        w   write mode
        r   read mode
        a   append mode
        w+  create file if it doesn't exist and open it in (over)write mode
            [it overwrites the file if it already exists]
        r+  open an existing file in read+write mode
        a+  create file if it doesn't exist and open it in append mode
        '''
        # os.path.join keeps this portable (a literal '\\processed' only works on Windows).
        output_csvdata_path = os.path.join(getParentFolder(dataFolderPath, 2),
                                           'processed')
        print('Attempting to create folder if it does not exist >>>' +
              output_csvdata_path)
        createFolder(output_csvdata_path)

        output_file_name = output_csvdata_path + '/' + outputFileName
        print('Attempting to create/update file >>>' + output_file_name)

        #f = open(output_file_name, 'w+')  # open file in append mode
        #f.write('')
        #f.close()
        #np.savetxt(output_file_name, inputRawDataDF, delimiter=",")
        inputRawDataDF.to_csv(output_file_name, sep=',', index=False)
        print('created raw easy-to-use csv data to be used for preparing '
              'training data in the location >>>' + output_file_name)

        returnValue = True
    except:
        print("Error executing method >>> ")
        # http://docs.python.org/2/library/sys.html#sys.exc_info
        exc_type, exc_value, exc_traceback = sys.exc_info()  # most recent (if any) by default
        '''
        Reason this _can_ be bad: if an (unhandled) exception happens AFTER
        this, or if we do not delete the labels on (not much) older versions
        of Python, the reference we created can linger. traceback.format_exc /
        print_exc do this very thing, BUT note this creates a temp scope
        within the function.
        '''
        traceback_details = {
            'filename': exc_traceback.tb_frame.f_code.co_filename,
            'lineno': exc_traceback.tb_lineno,
            'name': exc_traceback.tb_frame.f_code.co_name,
            'type': exc_type.__name__,
            'message': traceback.extract_tb(exc_traceback)
        }
        # So we don't leave our local labels/objects dangling.
        # This still isn't "completely safe", though!
        # Best (recommended) practice: replace all exc_type, exc_value,
        # exc_traceback with sys.exc_info()[0], sys.exc_info()[1],
        # sys.exc_info()[2].
        del (exc_type, exc_value, exc_traceback)

        print()
        print(traceback.format_exc())
        print()
        # traceback_template is expected to be a module-level format string.
        print(traceback_template % traceback_details)
        print()
        #traceback.print_exception()
        raise
    finally:
        # Note: this return swallows the re-raise above.
        return [returnValue, output_file_name, outputFileName, inputRawDataDF]
def _process_layer(input_features, date_field, distance_layers, rasters):
    if isinstance(input_features, FeatureLayer):
        input_layer = input_features
        sdf = input_features.query().sdf
    else:
        sdf = input_features
        input_layer = sdf.spatial.to_feature_collection()

    if distance_layers:
        # Use the proximity tool to add a nearest-distance column per layer.
        print("Calculating Distances.")
        count = 1
        for distance_layer in distance_layers:
            output = arcgis.features.use_proximity.find_nearest(
                input_layer, distance_layer, max_count=1)
            connecting_df = output['connecting_lines_layer'].query().sdf
            near_dist = []
            for i in range(len(connecting_df)):
                near_dist.append(connecting_df.iloc[i]['Total_Miles'])
            sdf[f'NEAR_DIST_{count}'] = near_dist
            count = count + 1

    # Process raster data to get a value for each input geometry.
    rasters_data = {}
    original_points = []
    for i in range(len(sdf)):
        original_points.append(sdf.iloc[i]["SHAPE"])
    input_layer_spatial_reference = sdf.spatial._sr

    for raster in rasters:
        # raster_type 0 averages sampled cells; 1 takes the most frequent value.
        raster_type = 0
        if isinstance(raster, tuple):
            if raster[1] is True:
                raster_type = 1
            raster = raster[0]
        rasters_data[raster.name] = []

        shape_objects_transformed = arcgis.geometry.project(
            original_points, input_layer_spatial_reference,
            raster.extent['spatialReference'])

        for shape in shape_objects_transformed:
            shape['spatialReference'] = raster.extent['spatialReference']
            if isinstance(shape, arcgis.geometry._types.Point):
                raster_value = raster.read(origin_coordinate=(shape['x'], shape['y']),
                                           ncols=1, nrows=1)
                value = raster_value[0][0][0]
            elif isinstance(shape, arcgis.geometry._types.Polygon):
                # Sample the raster on a grid of cell centers inside the polygon.
                xmin, ymin, xmax, ymax = shape.extent
                start_x, start_y = xmin + (raster.mean_cell_width / 2), ymin + (
                    raster.mean_cell_height / 2)
                values = []
                while start_y < ymax:
                    while start_x < xmax:
                        if shape.contains(
                                arcgis.geometry._types.Point({
                                    'x': start_x,
                                    'y': start_y,
                                    'sr': raster.extent['spatialReference']
                                })):
                            values.append(
                                raster.read(origin_coordinate=(
                                    start_x - raster.mean_cell_width, start_y),
                                            ncols=1, nrows=1)[0][0][0])
                        start_x = start_x + raster.mean_cell_width
                    start_y = start_y + raster.mean_cell_height
                    start_x = xmin + (raster.mean_cell_width / 2)
                if len(values) == 0:
                    # No cell center fell inside the polygon; sample at the centroid.
                    values.append(
                        raster.read(
                            origin_coordinate=(shape.true_centroid['x'] -
                                               raster.mean_cell_width,
                                               shape.true_centroid['y']),
                            ncols=1, nrows=1)[0][0][0])
                if raster_type == 0:
                    value = sum(values) / len(values)
                else:
                    value = max(values, key=values.count)
            else:
                raise Exception("Input features can be point or polygon only.")
            rasters_data[raster.name].append(value)

    # Append raster data to sdf.
    for key, value in rasters_data.items():
        sdf[key] = value

    if date_field:
        try:
            add_datepart(sdf, date_field)
        except Exception:
            # Leave the frame unchanged if the date field cannot be expanded.
            pass
    return sdf
def knn(data, startAt, stopAt=None):
    """
    Forecasts the points between startAt and stopAt with the k-nearest
    neighbors method, automatically finding the best number of neighbors.
    If stopAt is not provided, the default is the length of data.

    Parameters:
        data (pandas.DataFrame): Data returned by prepare_data (may be differentiated)
        startAt (int): Index where the forecast starts
        stopAt (int): Index where the forecast stops (default is None)

    Returns:
        predictions (list): The forecast from startAt up to stopAt
    """
    data_copy = data.copy()
    if stopAt is None:
        stopAt = len(data_copy)

    from fastai.tabular import add_datepart
    add_datepart(data_copy, 'Date')
    data_copy.drop('Elapsed', axis=1, inplace=True)

    # Flag days adjacent to weekends: we assume that Fridays and Mondays are
    # more important (0 is Monday, 1 is Tuesday, ...).
    data_copy['mon_fri'] = data_copy['Dayofweek'].isin([0, 4]).astype(int)

    train = data_copy[:startAt]
    valid = data_copy[startAt:stopAt]

    from sklearn import neighbors
    from sklearn.model_selection import GridSearchCV
    from sklearn.preprocessing import MinMaxScaler

    # Scale features to [0, 1]: k-NN distances are scale-sensitive. Fit the
    # scaler on the training slice only and reuse it on the validation slice
    # (fit_transform on both would leak validation statistics).
    scaler = MinMaxScaler(feature_range=(0, 1))
    x_train_scaled = scaler.fit_transform(train.drop('Close', axis=1))
    x_train = pd.DataFrame(x_train_scaled)
    y_train = train['Close']
    x_valid_scaled = scaler.transform(valid.drop('Close', axis=1))
    x_valid = pd.DataFrame(x_valid_scaled)

    # Grid-search the neighbor count (the iid argument was removed from
    # recent scikit-learn versions, so it is omitted here).
    params = {'n_neighbors': [2, 3, 4, 5, 6, 7, 8, 9]}
    knn = neighbors.KNeighborsRegressor()
    model = GridSearchCV(knn, params, cv=5)

    model.fit(x_train, y_train)
    predictions = model.predict(x_valid)
    return predictions
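# Usage sketch for knn(): GridSearchCV picks n_neighbors from 2..9 with 5-fold
# CV on the training slice, then the best model predicts the validation
# window. The MinMaxScaler step matters because k-NN distances are
# scale-sensitive. The 60-day window below is an illustrative assumption.
# preds = knn(df, startAt=len(df) - 60)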
parser.set_defaults(calculate_time_since_purchase_with_merchant=True)
args = vars(parser.parse_args())

trans_df = pd.read_csv(args['transactions_csv'], parse_dates=['purchase_date'])

# Suppress an annoying warning.
pd.options.mode.chained_assignment = None  # default='warn'

# Treat categorical fields as categorical.
for v in [
        'authorized_flag', 'category_1', 'category_2', 'category_3',
        'merchant_id', 'merchant_category_id', 'subsector_id', 'city_id',
        'state_id'
]:
    trans_df[v] = trans_df[v].astype('category').cat.as_ordered()

# This function takes a date field and turns it into a bunch of useful
# columns, such as "day of week", "is month end", etc.
add_datepart(trans_df, 'purchase_date')

# Sort by date.
trans_df.sort_values(by=['purchase_Elapsed'], inplace=True)

# Add new column: time since last purchase (in general or per merchant).
add_time_since_last_purchase(trans_df)
if args['calculate_time_since_purchase_with_merchant']:
    add_time_since_last_purchase_with_merchant(trans_df)

trans_df.to_csv(args['outfile'])
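# Hypothetical command line for the script above (the argument names are taken
# from the args[...] lookups visible here; the script filename and flag syntax
# are assumptions, since the add_argument calls are not shown):
#   python process_transactions.py --transactions_csv transactions.csv --outfile processed.csv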
def process_data(df, hist_trans_df, merch_trans_df):
    # Extract more useful information from the `first_active_month` date field.
    add_datepart(df, 'first_active_month')
    df.drop([
        'first_active_monthDay', 'first_active_monthDayofweek',
        'first_active_monthDayofyear', 'first_active_monthIs_month_end',
        'first_active_monthIs_month_start', 'first_active_monthIs_quarter_end',
        'first_active_monthIs_year_end'
    ], axis=1, inplace=True)

    # Do feature engineering by aggregating data from the transactions tables.
    aggs = {
        'purchase_amount': ['sum', 'mean', 'min', 'max', 'std'],
        'installments': ['sum', 'mean', 'min', 'max', 'std'],
        'month_lag': ['mean', 'min', 'max'],
        'merchant_id': ['nunique'],
        'state_id': ['nunique'],
        'city_id': ['nunique'],
        'numerical_1': ['sum', 'mean', 'min', 'max', 'std'],
        'numerical_2': ['sum', 'mean', 'min', 'max', 'std'],
        'avg_sales_lag3': ['sum', 'mean', 'min', 'max', 'std'],
        'avg_sales_lag6': ['sum', 'mean', 'min', 'max', 'std'],
        'avg_sales_lag12': ['sum', 'mean', 'min', 'max', 'std'],
        'avg_purchases_lag3': ['sum', 'mean', 'min', 'max', 'std'],
        'avg_purchases_lag6': ['sum', 'mean', 'min', 'max', 'std'],
        'avg_purchases_lag12': ['sum', 'mean', 'min', 'max', 'std'],
        'active_months_lag3': ['sum', 'mean', 'min', 'max', 'std'],
        'active_months_lag6': ['sum', 'mean', 'min', 'max', 'std'],
        'active_months_lag12': ['sum', 'mean', 'min', 'max', 'std'],
        'merchant_category_id_transaction': ['nunique'],
        'merchant_category_id_merchant': ['nunique'],
        'subsector_id_transaction': ['nunique'],
        'subsector_id_merchant': ['nunique'],
        'merchant_group_id': ['nunique'],
        'most_recent_sales_range': ['nunique'],
        'most_recent_purchases_range': ['nunique'],
        'elapsed_since_last_purchase': ['sum', 'mean', 'min', 'max', 'std'],
    }

    # First up we aggregate the data in the `historical_transactions` table.
    hist_trans_aggs = {
        'elapsed_since_last_merch_purchase': ['sum', 'mean', 'min', 'max', 'std'],
    }
    print('Aggregating numerical fields from the historical transactions ...')
    add_aggregated_numerical_fields(df, hist_trans_df,
                                    aggregators={**aggs, **hist_trans_aggs})

    # For the categorical fields, we can't aggregate by taking the mean or sum
    # values, so let's count the occurrences of each possible categorical value
    # instead. (In other words, for a category that can be either YES or NO, we
    # count the number of YESes and the number of NOs and use those values.)
    print('Aggregating categorical fields from the historical transactions ...')
    add_aggregated_categorical_fields(
        df, hist_trans_df,
        column_names=[
            'authorized_flag', 'category_1_transaction', 'category_1_merchant',
            'category_2', 'category_3', 'category_4', 'purchase_Is_month_start',
            'purchase_Is_month_end', 'purchase_Year', 'most_recent_sales_range',
            'most_recent_purchases_range'
        ])

    print('Getting top values for categorical fields from the historical transactions ...')
    add_top_categories(
        df, hist_trans_df,
        column_names=[
            'authorized_flag', 'category_1_transaction', 'category_1_merchant',
            'category_2', 'category_3', 'category_4', 'subsector_id_transaction',
            'subsector_id_merchant', 'city_id', 'state_id', 'purchase_Year',
            'purchase_Month', 'purchase_Week', 'purchase_Day',
            'purchase_Dayofweek', 'most_recent_sales_range',
            'most_recent_purchases_range'
        ])

    # Next we aggregate the data in the `new_merchants_transactions` table.
    print('Aggregating numerical fields from the new merchant transactions ...')
    add_aggregated_numerical_fields(df, merch_trans_df, aggregators=aggs,
                                    prefix='merch_')

    # These ones don't work for the new_merchant_transactions for some reason
    # (missing data?), so let's skip them for now ...
    print('Aggregating categorical fields from the new merchant transactions ...')
    add_aggregated_categorical_fields(
        df, merch_trans_df,
        column_names=[
            'authorized_flag', 'category_1_transaction', 'category_1_merchant',
            'category_2', 'category_3', 'category_4', 'purchase_Is_month_start',
            'purchase_Is_month_end', 'purchase_Year', 'most_recent_sales_range',
            'most_recent_purchases_range'
        ], prefix='merch_')

    print('Getting top values for categorical fields from the new merchant transactions ...')
    add_top_categories(
        df, merch_trans_df,
        column_names=[
            'authorized_flag', 'category_1_transaction', 'category_1_merchant',
            'category_2', 'category_3', 'category_4', 'subsector_id_transaction',
            'subsector_id_merchant', 'city_id', 'state_id', 'purchase_Year',
            'purchase_Month', 'purchase_Week', 'purchase_Day',
            'purchase_Dayofweek', 'most_recent_sales_range',
            'most_recent_purchases_range'
        ], prefix='merch_')
plt.figure(figsize=(16, 8))
plt.plot(data['Close'], label='Closing price history')

# Creating new dataframe with only Date and Close.
data = data.sort_index(ascending=True, axis=0)
new_data = pd.DataFrame(index=range(0, len(data)), columns=['Date', 'Close'])
for i in range(0, len(data)):
    new_data['Date'][i] = data.index[i]
    new_data['Close'][i] = data['Close'][i]

# Setting index.
new_data.index = new_data.Date
new_data.drop('Date', axis=1, inplace=True)

"""
# Adding date features using 'fastai'
add_datepart(new_data, 'Date')
new_data.drop('Elapsed', axis=1, inplace=True)  # Elapsed is the raw timestamp

# Flagging if date is Monday/Friday
new_data['mon_fri'] = 0
for i in range(0, len(new_data)):
    if (new_data['Dayofweek'][i] == 0 or new_data['Dayofweek'][i] == 4):
        new_data['mon_fri'][i] = 1
    else:
        new_data['mon_fri'][i] = 0
"""

# 80% train, 20% test
dataset = new_data.values
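# A plausible continuation of the 80/20 split announced above (the original
# snippet stops at `dataset = new_data.values`, so the cut point below is an
# assumption):
# split = int(len(dataset) * 0.8)
# train, test = dataset[:split], dataset[split:]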
# # Plot with plotly
# data = [go.Scatter(
#     x = df['date'],
#     y = df['price'],
#     mode = 'lines')]
#
# layout = dict(xaxis = dict(title = 'date'),
#               yaxis = dict(title = 'USD'))
#
# fig = dict(data=data, layout=layout)
# py.plot(fig, filename='price_plot')

### Feature engineering:
add_datepart(df, 'date', drop=False)
df.drop('Elapsed', axis=1, inplace=True)  # don't need this
df.head(50)

df.columns = [str(x).lower().replace(' ', '_') for x in df.columns]
df.loc[:, 'year'] = LabelEncoder().fit_transform(df['year'])
df[15:25]
df.isnull().sum()

# # Compute the average price for each month
# avg_price_mth = df.groupby("month").agg({'price': 'mean'}).reset_index()
#
# # Plot
# data = [go.Scatter(
""" Linear Regression """ #sorting data = df.sort_index(ascending=True, axis=0) #creating a separate dataset new_data = pd.DataFrame(index=range(0,len(df)),columns=['Date', 'Close']) for i in range(0,len(data)): new_data['Date'][i] = data['Date'][i] new_data['Close'][i] = data['Close'][i] #create features add_datepart(new_data, 'Date') new_data.drop('Elapsed', axis=1, inplace=True) #elapsed will be the time stamp #split into train and validation train = new_data[:7080] valid = new_data[7080:] x_train = train.drop('Close', axis=1) y_train = train['Close'] x_valid = valid.drop('Close', axis=1) y_valid = valid['Close'] #implement linear regression
def KNearestNeighbours(df, type, split):
    # Creating dataframe with date and the target variable.
    data = df.sort_index(ascending=True, axis=0)
    new_data = pd.DataFrame(index=range(0, len(df)), columns=['Date', type])
    for i in range(0, len(data)):
        new_data['Date'][i] = data['Date'][i]
        new_data[type][i] = data[type][i]

    # Create features.
    from fastai.tabular import add_datepart
    add_datepart(new_data, 'Date')
    new_data.drop('Elapsed', axis=1, inplace=True)  # Elapsed is the raw timestamp

    new_data['mon_fri'] = 0
    for i in range(0, len(new_data)):
        if (new_data['Dayofweek'][i] == 0 or new_data['Dayofweek'][i] == 4):
            # Flag Mondays and Fridays.
            new_data['mon_fri'][i] = 1
        else:
            new_data['mon_fri'][i] = 0

    # Split into train and validation.
    train = new_data[:split]
    valid = new_data[split:]

    x_train = train.drop(type, axis=1)
    y_train = train[type]
    x_valid = valid.drop(type, axis=1)
    y_valid = valid[type]

    # Importing libraries.
    from sklearn import neighbors
    from sklearn.model_selection import GridSearchCV
    from sklearn.preprocessing import MinMaxScaler

    # Scaling data: fit the scaler on the training set, then reuse it on the
    # validation set (fit_transform on both would leak validation statistics).
    scaler = MinMaxScaler(feature_range=(0, 1))
    x_train_scaled = scaler.fit_transform(x_train)
    x_train = pd.DataFrame(x_train_scaled)
    x_valid_scaled = scaler.transform(x_valid)
    x_valid = pd.DataFrame(x_valid_scaled)

    # Using grid search to find the best n_neighbors.
    params = {'n_neighbors': [2, 3, 4, 5, 6, 7, 8, 9]}
    knn = neighbors.KNeighborsRegressor()
    model = GridSearchCV(knn, params, cv=5)

    # Fit the model and make predictions.
    model.fit(x_train, y_train)
    preds = model.predict(x_valid)

    rmse = np.sqrt(np.mean(np.power((np.array(y_valid) - np.array(preds)), 2)))
    st.write('RMSE value on validation set:')
    st.write(rmse)

    # Plot: show the training actuals under both columns so the prediction
    # line is continuous.
    valid['Predictions'] = preds
    valid.index = new_data[split:].index
    train.index = new_data[:split].index

    append_data = pd.DataFrame(data={type: [], 'Predictions': []})
    append_data[type] = train[type]
    append_data['Predictions'] = train[type]
    pic = pd.concat(
        [append_data[[type, 'Predictions']], valid[[type, 'Predictions']]],
        axis=0)
    st.line_chart(pic)