def transform_values(data, n_lags, n_series, dim):
    reframed = utils.series_to_supervised(data, n_lags, n_series)
    # wall = 200 - (n_lags - 1)
    wall = len(reframed) - 25 + n_series  # row index that splits train from test
    values = reframed.values
    n_features = data.shape[1]
    n_obs = n_lags * n_features
    # Target columns: the current value and the next n_series - 1 steps of var1
    cols = ['var1(t)']
    cols += ['var1(t+%d)' % i for i in range(1, n_series)]
    y_o = reframed[cols].values
    train_X, train_y = values[:wall, :n_obs], y_o[:wall, -n_series:]
    test_X, test_y = values[wall:, :n_obs], y_o[wall:, -n_series:]
    if dim:
        # Reshape to 3D [samples, timesteps, features] for an LSTM
        train_X = train_X.reshape((train_X.shape[0], n_lags, n_features))
        test_X = test_X.reshape((test_X.shape[0], n_lags, n_features))
    return train_X, test_X, train_y, test_y
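# Not in the original: a minimal usage sketch. It assumes transform_values sits next to a
# `utils` module providing the usual series_to_supervised, and that `data` is a 2-D array
# whose first column becomes var1.
import numpy as np

# Illustrative single-feature series shaped (samples, 1)
data = np.sin(np.linspace(0, 20, 300)).reshape(-1, 1)
# dim=True requests the 3-D [samples, timesteps, features] reshape for an LSTM
train_X, test_X, train_y, test_y = transform_values(data, n_lags=10, n_series=5, dim=True)
print(train_X.shape, train_y.shape)  # e.g. (266, 10, 1) and (266, 5)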
# plt.plot(dataset)
# plt.xlabel('time(ms)')
# plt.subplot(2,1,2)
# plt.semilogy(Pxx_den)
# plt.xlabel('frequency (hz)')
# plt.xlim(0,100)
from utils import series_to_supervised, fit_lstm, forecast_lstm, make_forecasts

# Series to supervised
num_lookback = 200
num_predict = 10
# n_in is the number of samples for "input" and n_out is the number for "output", or prediction
supervised_dataset = series_to_supervised(dataset, n_in=num_lookback, n_out=num_predict)
sup_ds = supervised_dataset.values

# Scale each row to zero mean and unit range (per-window normalization)
row_mean = sup_ds.mean(axis=1, keepdims=True)
row_max = sup_ds.max(axis=1, keepdims=True)
row_min = sup_ds.min(axis=1, keepdims=True)
scl_sup_ds = (sup_ds - row_mean) / (row_max - row_min)

# Train/test split: first 80% of the windows for training
N_train = int(0.8 * np.size(scl_sup_ds, 0))
scaled_train = scl_sup_ds[0:N_train, :]
scaled_test = scl_sup_ds[N_train:, :]
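# Not in the original: the local utils.series_to_supervised is imported but not shown.
# A minimal sketch of the widely used pattern it appears to follow (shifted copies of the
# series concatenated column-wise, NaN rows dropped). Note that the TD-forecasting code later
# in this section evidently uses a variant that keeps the original column names (e.g.
# 'total_principal(t-2)') instead of the var-numbered ones below.
import pandas as pd

def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    """Frame a series as supervised learning with columns var1(t-n_in) .. var1(t+n_out-1)."""
    df = pd.DataFrame(data)
    n_vars = df.shape[1]
    cols, names = [], []
    # Input sequence: t-n_in, ..., t-1
    for i in range(n_in, 0, -1):
        cols.append(df.shift(i))
        names += ['var%d(t-%d)' % (j + 1, i) for j in range(n_vars)]
    # Forecast sequence: t, t+1, ..., t+n_out-1
    for i in range(0, n_out):
        cols.append(df.shift(-i))
        if i == 0:
            names += ['var%d(t)' % (j + 1) for j in range(n_vars)]
        else:
            names += ['var%d(t+%d)' % (j + 1, i) for j in range(n_vars)]
    agg = pd.concat(cols, axis=1)
    agg.columns = names
    if dropnan:
        agg.dropna(inplace=True)
    return agg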
# %%%%% Gating network - Perceptron %%%%%%%%%%%%%%%%%%%%%
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# Xtr - training input
# Ytr - training output
# Wg - gating network
# W  - experts
filename = os.path.realpath(
    os.path.join(os.getcwd(), os.path.dirname('treinamento.txt'))) + '/treinamento-1.txt'
series = pandas.read_csv(filename, header=None)

if __name__ == "__main__":
    D = series_to_supervised(series, 21).values
    k = 4
    folded_partitions = k_fold(D, k)
    for partition in folded_partitions:
        Dtr = folded_partitions[partition][0]  # training folds
        Dv = folded_partitions[partition][1]   # validation fold
        Xtr = Dtr[:, 0:-1]
        Ytr = Dtr[:, -1].reshape(Xtr.shape[0], 1)
        Xv = Dv[:, 0:-1]
        Yv = Dv[:, -1].reshape(Xv.shape[0], 1)
        m = 6
plt.plot(time, LFP[:, 0])
plt.plot(time, LFP_filt)
plt.xlabel('time(ms)')
plt.subplot(2, 1, 2)
plt.semilogy(f, Pxx_den)
plt.xlabel('frequency (hz)')
plt.show()

############# Process data for Neural Network ##############
# Series to supervised
num_lookback = 100
num_predict = 15
# n_in is the number of samples for "input" and n_out is the number for "output", or prediction
supervised_dataset = series_to_supervised(LFP_filt[:, 0:1], n_in=num_lookback, n_out=num_predict)
sup_ds_filt = supervised_dataset.values
supervised_dataset = series_to_supervised(LFP[:, 0:1], n_in=num_lookback, n_out=num_predict)
sup_ds_raw = supervised_dataset.values

# train
model_lstm = fit_lstm(sup_ds_filt, num_predict, 10, nb_epoch=1, n_neurons=1000)
model_lstm.summary()

# Make forecasts on test data
scaled_forecasts = make_forecasts(model_lstm, 1, sup_ds_filt, num_predict)

# Make persistence forecasts: repeat the last observed value across the forecast horizon
y_pers_test = np.transpose(np.tile(sup_ds_filt[:, -num_predict - 1], (num_predict, 1)))
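# Not in the original: one way to compare the LSTM against the persistence baseline is a
# per-step RMSE. This sketch assumes make_forecasts returns one forecast row per window,
# aligned with the rows of sup_ds_filt, whose last num_predict columns hold the true values.
y_true = sup_ds_filt[:, -num_predict:]
scaled_forecasts_arr = np.asarray(scaled_forecasts)
rmse_lstm = np.sqrt(np.mean((scaled_forecasts_arr - y_true) ** 2, axis=0))
rmse_pers = np.sqrt(np.mean((y_pers_test - y_true) ** 2, axis=0))
for step, (rl, rp) in enumerate(zip(rmse_lstm, rmse_pers), start=1):
    print('t+%d: LSTM RMSE %.4f, persistence RMSE %.4f' % (step, rl, rp))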
def __init__(self,
             csv_file: str = '../data/daily_MSFT.csv',
             use_keras: bool = False,
             index_of_plotted_feature: int = 0,
             num_of_previous_days: int = 7,
             num_of_future_days: int = 3,
             num_of_hidden_neurons: int = 256,
             train_percentage: int = 80,
             bias_term: int = 1):
    """
    Constructor parses the CSV file and initializes the training and test data

    Parameters
    ----------
    csv_file: str
        The path to the CSV file (from the AlphaVantage API) to be parsed
    use_keras: bool
        If true, the model will be a "standard" neural network implemented using the
        Keras API; if false, the model will be an ELM
    index_of_plotted_feature: int
        After training and predicting, the class generates an array that can be plotted.
        This number is between 0 and 4. 0: Open, 1: High, 2: Low, 3: Close, 4: Volume.
        Note: 4 (Volume) is currently not supported.
    num_of_previous_days: int
        The number of previous days needed in order to make a single prediction
    num_of_future_days: int
        Given num_of_previous_days, we predict num_of_future_days future days
    num_of_hidden_neurons: int
        Number of neurons in the hidden layer
    train_percentage: int
        Number between 0 and 100 that represents the percentage of the CSV file data
        that will be training data; the remainder will be test data
    bias_term: int
        Value of the column that is prepended to the X data matrix (bias term)
    """
    df = pd.read_csv(csv_file)  # By default the header will be read from the file
    # print('Head of data frame: \n' + str(df.head()))
    # print('Dimensions of data frame (row x col)' + str(df.shape))
    self.index_of_plotted_feature = index_of_plotted_feature
    self.plotted_feature_str = \
        {0: 'Open', 1: 'High', 2: 'Low', 3: 'Close', 4: 'Volume'}[self.index_of_plotted_feature]
    # Stores the bias (an integer)
    self.bias = bias_term
    self.use_keras = use_keras
    self.model_type_str = None
    self.num_of_rows_in_csv = int(df.shape[0])
    # Assume 4 features because volume is ignored
    self.num_features = 4
    self.num_hidden_layer_neurons = num_of_hidden_neurons
    self.num_prev_timesteps = num_of_previous_days
    self.num_future_timesteps = num_of_future_days
    # Number of columns in a row that are needed to make a future prediction
    self.num_prev_attributes = self.num_prev_timesteps * self.num_features
    # Number of columns in a row that represent a future prediction
    self.num_future_attributes = self.num_future_timesteps * self.num_features
    # Initialize a NumPy array that will store our stock data
    self.df_values = df[['open', 'high', 'low', 'close']].values
    # Frame our data as a rolling-window series prediction using series_to_supervised()
    self.reframed_x_y = utils.series_to_supervised(self.df_values,
                                                   self.num_prev_timesteps,
                                                   self.num_future_timesteps)
    # Convert the Pandas DataFrame to a NumPy array
    self.x_y_values = self.reframed_x_y.values
    self.plotable_dates = utils.convert_to_matplot_dates(df)
    self.plotable_dates = np.reshape(self.plotable_dates, newshape=(-1, 1)).copy()
    self.reframed_dates = utils.series_to_supervised(self.plotable_dates,
                                                     self.num_prev_timesteps,
                                                     self.num_future_timesteps)
    # Variables that will store the Mean Squared Error
    self.mse_train_cost = -1
    self.mse_test_cost = -1
    # All variables named "plotable_*" are standard Python lists that contain values of a
    # specific attribute (for example "Close") as selected by index_of_plotted_feature
    self.plotable_y_train_real = None
    self.plotable_y_train_pred = None
    self.plotable_y_test_real = None
    self.plotable_y_test_pred = None
    self.plotable_y_future_pred = None
    # The number of rows in our matrix
    self.data_size = self.x_y_values.shape[0]
    # Separate our data into training and test sets
    self.training_set_size = int((train_percentage / 100) * self.data_size)
    self.test_set_size = int(self.data_size - self.training_set_size)
    print('Rolling window model: Training set size: ' + str(self.training_set_size))
    print('Rolling window model: Test set size: ' + str(self.test_set_size))
    # Below we store the integers matplotlib uses for the plot_date() method so we can
    # plot dates on the X-axis in a neat way
    self.train_dates = self.reframed_dates.values[self.test_set_size:, -1:].flatten()
    self.test_dates = self.reframed_dates.values[:self.test_set_size, -1:].flatten()
    self.model = None
    # In our y, each row contains more than 1 time step worth of data (due to rolling-window
    # prediction), but it only makes sense to plot one y from each row, so we plot the last
    # prediction of y given a sequence of values.
    # For example, if num_future_timesteps is 5, then from each row we plot the selected
    # feature at t+4 (since our y then contains predictions for t, t+1, t+2, t+3, t+4).
    # If we predict 2 future days given 1 day, one row in self.x_y_values has the format:
    # [Open(t-1), High(t-1), Low(t-1), Close(t-1),
    #  Open(t), High(t), Low(t), Close(t),
    #  Open(t+1), High(t+1), Low(t+1), Close(t+1)]
    # self.num_prev_attributes in this example is 4, because the first 4 values in the row
    # are the data needed to predict the future; self.num_future_attributes is 8, because
    # the last 8 values represent all the future data.
    if self.use_keras:
        self.model_type_str = 'Keras'
        self.x_tr = self.x_y_values[self.test_set_size:, :self.num_prev_attributes]
        self.x_te = self.x_y_values[:self.test_set_size, :self.num_prev_attributes]
        self.y_tr = self.x_y_values[self.test_set_size:, -self.num_future_attributes:]
        self.y_te = self.x_y_values[:self.test_set_size, -self.num_future_attributes:]
        self.plotable_y_train_real = \
            self.y_tr[:, -(self.num_features + self.index_of_plotted_feature)].flatten()
        self.plotable_y_test_real = \
            self.y_te[:, -(self.num_features + self.index_of_plotted_feature)].flatten()
        self.input_shape = (self.num_prev_attributes,)
        self.model = Sequential()
        self.model.add(Dense(self.num_hidden_layer_neurons,
                             input_shape=self.input_shape,
                             activation='linear'))
        self.model.add(Dense(self.num_future_attributes))
        self.model.compile(loss='mse', optimizer='adam')
        print('Created a Keras model for disjoint X and Y stock data, summary: ')
        self.model.summary()
    else:
        self.model_type_str = 'ELM'
        self.x_tr = np.mat(self.x_y_values[self.test_set_size:, :self.num_prev_attributes])
        self.x_te = np.mat(self.x_y_values[:self.test_set_size, :self.num_prev_attributes])
        self.y_tr = np.mat(self.x_y_values[self.test_set_size:, -self.num_future_attributes:])
        self.y_te = np.mat(self.x_y_values[:self.test_set_size, -self.num_future_attributes:])
        self.plotable_y_train_real = \
            self.y_tr[:, -(self.num_features + self.index_of_plotted_feature)].flatten().tolist()[0]
        self.plotable_y_test_real = \
            self.y_te[:, -(self.num_features + self.index_of_plotted_feature)].flatten().tolist()[0]
        # Add bias term of ones to test X and train X
        self.x_tr = np.concatenate((np.ones(shape=(self.x_tr.shape[0], 1)) * self.bias, self.x_tr), axis=1)
        self.x_te = np.concatenate((np.ones(shape=(self.x_te.shape[0], 1)) * self.bias, self.x_te), axis=1)
        # Our beta will be a weight matrix between the hidden and output layer
        self.input_layer_weights = np.mat(
            utils.rand_init(shape=(self.num_hidden_layer_neurons, self.num_prev_attributes + 1)))
        self.beta = None
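# Not in the original: the ELM branch initializes beta to None, but the closed-form solve is
# not shown here. A sketch of a matching train method (hypothetical name, assuming a sigmoid
# hidden activation), using the standard ELM solution beta = pinv(H) @ Y:
def train_elm(self):
    # Hidden-layer output H = g(X W^T), with the fixed random input weights
    h_tr = np.asarray(self.x_tr @ self.input_layer_weights.T)
    h_tr = 1.0 / (1.0 + np.exp(-h_tr))  # assumed sigmoid activation
    # Closed-form least-squares solve for the output weights
    self.beta = np.linalg.pinv(h_tr) @ np.asarray(self.y_tr)
    # Training predictions and MSE
    y_pred = h_tr @ self.beta
    self.mse_train_cost = float(np.mean(np.square(y_pred - np.asarray(self.y_tr))))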
def __init__(self,
             csv_file: str = '../data/daily_MSFT.csv',
             index_of_plotted_feature: int = 0,
             num_of_previous_days: int = 7,
             num_of_future_days: int = 3,
             num_of_hidden_neurons: int = 256,
             train_percentage: int = 80):
    self.dataset = read_csv(csv_file, header=0)
    values = self.dataset[['open', 'high', 'low', 'close', 'volume']].values
    values = values.astype('float32')
    self.num_of_prev_timesteps = num_of_previous_days
    self.num_of_future_timesteps = num_of_future_days
    self.num_features = 5
    self.num_prev_objs = self.num_features * self.num_of_prev_timesteps
    self.num_future_objs = self.num_features * self.num_of_future_timesteps
    # open = 0, high = 1, low = 2, close = 3, volume = 4
    # The feature that will be plotted (all will be predicted)
    self.index_of_plotted_feature = index_of_plotted_feature
    self.plotted_feature_str = \
        {0: 'Open', 1: 'High', 2: 'Low', 3: 'Close', 4: 'Volume'}[self.index_of_plotted_feature]
    self.num_rolling_days_ahead = 30
    self.unscaled_values = values.copy()
    # Normalize features
    self.scaler = MinMaxScaler(feature_range=(0, 1))
    self.scaled = self.scaler.fit_transform(values)
    # Frame as supervised learning
    self.reframed = utils.series_to_supervised(
        self.scaled, self.num_of_prev_timesteps, self.num_of_future_timesteps)
    reframed_unscaled = utils.series_to_supervised(
        self.unscaled_values, self.num_of_prev_timesteps, self.num_of_future_timesteps)
    self.unscaled_values = reframed_unscaled.values
    self.plotable_dates = utils.convert_to_matplot_dates(self.dataset)
    self.plotable_dates = np.reshape(self.plotable_dates, newshape=(-1, 1)).copy()
    self.reframed_dates = utils.series_to_supervised(
        self.plotable_dates, self.num_of_prev_timesteps, self.num_of_future_timesteps)
    self.scaled_values = self.reframed.values  # Extract a NumPy array from the pandas DataFrame
    self.plotable_y_train_real = None
    self.plotable_y_train_pred = None
    self.plotable_y_test_real = None
    self.plotable_y_test_pred = None
    self.plotable_y_future_pred = None
    self.mse_train_cost = -1
    self.mse_test_cost = -1
    self.last_observations = self.scaled_values[0, -self.num_prev_objs:]
    self.data_size = self.reframed.shape[0]
    # Training set is train_percentage% of the examples
    self.training_set_size = int((train_percentage / 100) * self.data_size)
    self.test_set_size = self.data_size - self.training_set_size
    self.train_dates = self.reframed_dates.values[self.test_set_size:, -1:].flatten()
    self.test_dates = self.reframed_dates.values[:self.test_set_size, -1:].flatten()
    # Split into inputs and outputs
    # Training set contains the older (time-wise) part of the data
    self.training_set_x_y = self.scaled_values[self.test_set_size:, :]
    # Test set has the newer (time-wise) part of the data
    self.test_set_x_y = self.scaled_values[:self.test_set_size, :]
    print(self.dataset.head())
    self.unscaled_train_x_y = self.unscaled_values[self.test_set_size:, :]
    self.unscaled_test_x_y = self.unscaled_values[:self.test_set_size, :]
    unscaled_train_y = self.unscaled_train_x_y[:, -self.num_future_objs:]
    unscaled_test_y = self.unscaled_test_x_y[:, -self.num_future_objs:]
    self.train_x = self.training_set_x_y[:, :self.num_prev_objs]
    self.train_y = self.training_set_x_y[:, -self.num_future_objs:]
    self.test_x = self.test_set_x_y[:, :self.num_prev_objs]
    self.test_y = self.test_set_x_y[:, -self.num_future_objs:]
    self.plotable_y_train_real = unscaled_train_y[:, -(
        self.num_features + self.index_of_plotted_feature)].flatten()
    self.plotable_y_test_real = unscaled_test_y[:, -(
        self.num_features + self.index_of_plotted_feature)].flatten()
    # Reshape input to be 3D [samples, timesteps, features] as expected by an LSTM
    self.train_x = self.train_x.reshape(self.train_x.shape[0],
                                        self.num_of_prev_timesteps, self.num_features)
    self.test_x = self.test_x.reshape(self.test_x.shape[0],
                                      self.num_of_prev_timesteps, self.num_features)
    self.last_observations_reshaped = self.last_observations.reshape(
        1, self.num_of_prev_timesteps, self.num_features)
    print('LSTM Training input size: ' + str(self.train_x.shape) + '\n' +
          'Training output size: ' + str(self.train_y.shape) + '\n' +
          'Test input size: ' + str(self.test_x.shape) + '\n' +
          'Test output size: ' + str(self.test_y.shape))
    print('test_set_x_y: \n' + str(self.test_set_x_y))
    # Create the LSTM model
    self.model = Sequential()
    self.model.add(
        LSTM(num_of_hidden_neurons,
             input_shape=(self.train_x.shape[1], self.train_x.shape[2])))
    self.model.add(Dense(self.num_future_objs))
    self.model.compile(loss='mae', optimizer='adam')
    self.model.summary()
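# Not in the original: a sketch of how this model might be fit and its scaled forecasts
# mapped back to price units (hypothetical method name). MinMaxScaler.inverse_transform
# expects the original 5-feature layout, so each predicted timestep is inverted separately.
def train_and_forecast(self, epochs=50, batch_size=72):
    # Fit on the older windows, validate on the held-out newer windows
    self.model.fit(self.train_x, self.train_y,
                   epochs=epochs, batch_size=batch_size,
                   validation_data=(self.test_x, self.test_y),
                   verbose=2, shuffle=False)
    scaled_pred = self.model.predict(self.test_x)          # (rows, num_future_objs)
    per_step = scaled_pred.reshape(-1, self.num_features)  # one 5-feature row per step
    unscaled_pred = self.scaler.inverse_transform(per_step)
    return unscaled_pred.reshape(scaled_pred.shape[0], -1)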
def __init__(self, csv_file: str = '../data/daily_MSFT.csv'):
    self.dataset = read_csv(csv_file, header=0)
    print('CSV columns: ' + str(self.dataset.columns.tolist()))
    values = self.dataset[['open', 'high', 'low', 'close', 'volume']].values
    values = values.astype('float32')
    self.num_of_prev_timesteps = 7
    self.num_of_future_timesteps = 2
    self.num_features = 5
    self.num_prev_objs = self.num_features * self.num_of_prev_timesteps
    self.num_future_objs = self.num_features * self.num_of_future_timesteps
    # open = 0, high = 1, low = 2, close = 3, volume = 4
    # The feature that will be plotted (all will be predicted)
    self.index_of_plotted_feature = 0
    self.num_rolling_days_ahead = 30
    # Normalize features
    self.scaler = MinMaxScaler(feature_range=(0, 1))
    print('Shape of values before transforming: ' + str(values.shape))
    self.scaled = self.scaler.fit_transform(values)
    # Frame as supervised learning
    self.reframed = utils.series_to_supervised(
        self.scaled, self.num_of_prev_timesteps, self.num_of_future_timesteps)
    print('reframed: \n' + str(self.reframed.head()))
    self.scaled_values = self.reframed.values  # Extract a NumPy array from the pandas DataFrame
    print('Dim of scaled_values: ' + str(self.scaled_values.shape))
    self.last_observations = self.scaled_values[:self.num_of_future_timesteps, -self.num_prev_objs:]
    print('last_observations: \n' + str(self.last_observations))
    self.data_size = self.reframed.shape[0]
    print('Data size: ' + str(self.data_size))
    # Training set is 80% of the examples
    self.training_set_size = int(0.8 * self.data_size)
    self.test_set_size = self.data_size - self.training_set_size
    # Split into inputs and outputs
    # Training set contains the older (time-wise) part of the data
    self.training_set_x_y = self.scaled_values[self.test_set_size:, :]
    # Test set has the newer (time-wise) part of the data
    self.test_set_x_y = self.scaled_values[:self.test_set_size, :]
    print(self.dataset.head())
    # Targets are the last timestep's 5 features
    self.train_x = self.training_set_x_y[:, :self.num_prev_objs]
    self.train_y = self.training_set_x_y[:, -self.num_features:]
    self.test_x = self.test_set_x_y[:, :self.num_prev_objs]
    self.test_y = self.test_set_x_y[:, -self.num_features:]
    # Reshape input to be 3D [samples, timesteps, features] as expected by an LSTM
    self.train_x = self.train_x.reshape(self.train_x.shape[0],
                                        self.num_of_prev_timesteps, self.num_features)
    self.test_x = self.test_x.reshape(self.test_x.shape[0],
                                      self.num_of_prev_timesteps, self.num_features)
    self.last_observations_reshaped = self.last_observations.reshape(
        self.last_observations.shape[0], self.num_of_prev_timesteps, self.num_features)
    print('Training input size: ' + str(self.train_x.shape) + '\n' +
          'Training output size: ' + str(self.train_y.shape) + '\n' +
          'Test input size: ' + str(self.test_x.shape) + '\n' +
          'Test output size: ' + str(self.test_y.shape))
    print('test_set_x_y: \n' + str(self.test_set_x_y))
    # Create the LSTM model
    self.model = Sequential()
    self.model.add(LSTM(512, input_shape=(self.train_x.shape[1], self.train_x.shape[2])))
    self.model.add(Dense(self.num_features))
    self.model.compile(loss='mae', optimizer='adam')
    self.model.summary()
    for fold in folds:
        # All folds except the current one form the training set
        train_folds = [folds[x] for x in range(1, k + 1) if x != fold]
        Dtr = np.concatenate(train_folds)
        Dv = folds[fold]
        folded_partitions[fold] = [Dtr, Dv]
    return folded_partitions


if __name__ == '__main__':
    filename = os.path.realpath(
        os.path.join(os.getcwd(), os.path.dirname('treinamento.txt'))) + '/treinamento-1.txt'
    series = pandas.read_csv(filename, header=None)
    D = series_to_supervised(series, 15).values
    k = 4
    folded_partitions = k_fold(D, k)
    for partition in folded_partitions:
        Dtr = folded_partitions[partition][0]
        Dv = folded_partitions[partition][1]
        Xtr = Dtr[:, 0:-1]
        Ytr = Dtr[:, -1].reshape(Xtr.shape[0], 1)
        Xv = Dv[:, 0:-1]
        Yv = Dv[:, -1].reshape(Xv.shape[0], 1)
        print("Xtr", partition, ": ", Xtr)
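# Not in the original: the k_fold fragment above begins mid-function. One plausible
# reconstruction of its missing head, assuming `folds` maps fold indices 1..k to roughly
# equal row blocks of D (matching the range(1, k + 1) indexing used in the loop):
import numpy as np

def k_fold(D, k):
    """Split the rows of D into k folds; pair each validation fold with the rest as training."""
    blocks = np.array_split(D, k)
    folds = {i + 1: blocks[i] for i in range(k)}
    folded_partitions = {}
    for fold in folds:
        train_folds = [folds[x] for x in range(1, k + 1) if x != fold]
        Dtr = np.concatenate(train_folds)
        Dv = folds[fold]
        folded_partitions[fold] = [Dtr, Dv]
    return folded_partitions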
SEQ_LEN = 10
BATCH = 1
EPOCH = 100

# In[30]:

time_points = np.linspace(start=0, stop=1, num=1000, dtype=np.float32)
time_points = np.reshape(time_points, (-1, 1))

# In[31]:

# data_seq_generator = rnn_minibatch_sequencer(raw_data=time_points, batch_size=BATCH,
#                                              sequence_size=SEQ_LEN, nb_epochs=EPOCH)
data_seq_generator = series_to_supervised(time_points, n_in=SEQ_LEN, n_out=1).values
print(data_seq_generator)

# Inputs: the SEQ_LEN lagged values, reshaped to [samples, timesteps, features]
x = data_seq_generator[:, 0:SEQ_LEN].reshape(-1, SEQ_LEN, 1)
x = x[0:len(x) - (len(x) % BATCH)]  # trim so the sample count divides the batch size
# Target: the value at time t
y = data_seq_generator[:, SEQ_LEN]
y = y[0:len(y) - (len(y) % BATCH)]
print(x.shape)
print(y.shape)

# In[40]:

from keras import Sequential
from keras.layers import LSTM, Dense
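# Not in the original notebook cell: a minimal LSTM that fits the shapes prepared above
# (the layer size is an illustrative assumption).
model = Sequential()
model.add(LSTM(32, input_shape=(SEQ_LEN, 1)))  # 32 units is an illustrative choice
model.add(Dense(1))                            # regress the next value of the series
model.compile(loss='mse', optimizer='adam')
model.fit(x, y, batch_size=BATCH, epochs=EPOCH, verbose=2)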
def build_and_train_td_file_level(horizon_param, project_param,
                                  project_files_param, regressor_param,
                                  ground_truth_param):
    """
    Build file-level TD forecasting models and return forecasts for a horizon
    specified by the user.

    Arguments:
        horizon_param: The forecasting horizon up to which forecasts will be produced.
        project_param: The project for which the forecasts will be produced.
        project_files_param: The number of files for which the forecasts will be produced.
        regressor_param: The regressor models that will be used to produce forecasts.
        ground_truth_param: Whether the model will also return ground truth values.

    Returns:
        A dictionary containing file-level forecasted values (and ground truth values
        if ground_truth_param is set to yes) of the selected project, for a number of
        files specified by the user and for each intermediate step ahead up to the
        specified horizon.
    """
    # Read file-level dataset
    try:
        dataset_td_file = pd.read_csv('data/%s_class.csv' % project_param, sep=";")
    except FileNotFoundError as e:
        if debug:
            print(e)
        return -2

    # Select indicators that will be used as model variables
    metrics_td = [
        'code_smells', 'ncloc', 'complexity', 'duplicated_blocks',
        'total_principal'
    ]

    # Select sliding window length
    window_size = 2

    # Compute change proneness and TD change proneness for each file
    files_change_metrics_df = pd.DataFrame()
    for file_id in dataset_td_file['class_id'].unique().tolist():
        # Create temporary file dataframe
        temp_file_df = dataset_td_file[dataset_td_file['class_id'] == file_id]
        temp_file_metr_dict = {}
        temp_file_name = temp_file_df.class_name.iloc[0]
        temp_file_metr_dict['file_id'] = file_id
        temp_file_metr_dict['file_name'] = temp_file_name
        temp_file_metr_dict['versions'] = temp_file_df.shape[0]
        temp_file_metr_dict['td_of_last_version'] = temp_file_df.total_principal.iloc[-1]
        temp_file_metr_dict['complexity_of_last_version'] = temp_file_df.complexity.iloc[-1]

        # Compute number of changes in LOC across versions of a file
        # (the first version always counts as a change, since shift() yields NaN)
        ncloc_has_changed_list = temp_file_df.ncloc == temp_file_df.ncloc.shift()
        ncloc_has_changed_list = [0 if same else 1 for same in ncloc_has_changed_list]
        file_df_changes = sum(ncloc_has_changed_list)
        temp_file_metr_dict['number_of_changes'] = file_df_changes

        # Compute LOC Change Proneness of a file
        file_df_cp = file_df_changes / temp_file_df.shape[0]
        temp_file_metr_dict['change_proneness_(CP)'] = file_df_cp

        # Compute number of changes in TD across versions of a file
        td_has_changed_list = temp_file_df.total_principal == temp_file_df.total_principal.shift()
        td_has_changed_list = [0 if same else 1 for same in td_has_changed_list]
        file_df_changes_td = sum(td_has_changed_list)
        temp_file_metr_dict['number_of_td_changes'] = file_df_changes_td

        # Compute TD Change Proneness of a file
        file_df_cp_td = file_df_changes_td / temp_file_df.shape[0]
        temp_file_metr_dict['change_proneness_td_(CP-TD)'] = file_df_cp_td

        # Compute number of changes in complexity across versions of a file
        complexity_has_changed_list = temp_file_df.complexity == temp_file_df.complexity.shift()
        complexity_has_changed_list = [0 if same else 1 for same in complexity_has_changed_list]
        file_df_complexity_changes = sum(complexity_has_changed_list)
        temp_file_metr_dict['number_of_complexity_changes'] = file_df_complexity_changes

        # Compute complexity Change Proneness of a file
        file_df_CP_complexity = file_df_complexity_changes / temp_file_df.shape[0]
        temp_file_metr_dict['change_proneness_complexity_(CP-COMP)'] = file_df_CP_complexity

        # Compute average size of changes in LOC across versions of a file
        ncloc_changes_volume_list = temp_file_df['ncloc'].diff(periods=1)
        ncloc_changes_volume_list.fillna(0, inplace=True)
        file_df_expected_changes = sum(ncloc_changes_volume_list) / (temp_file_df.shape[0] - 1)
        temp_file_metr_dict['expected_size_change_(ED-LOC)'] = file_df_expected_changes

        # Compute average size of changes in TD across versions of a file
        td_changes_volume_list = temp_file_df['total_principal'].diff(periods=1)
        td_changes_volume_list.fillna(0, inplace=True)
        file_df_expected_td_changes = sum(td_changes_volume_list) / (temp_file_df.shape[0] - 1)
        temp_file_metr_dict['expected_td_change_(ED-TD)'] = file_df_expected_td_changes

        # Compute average size of changes in complexity across versions of a file
        complexity_changes_volume_list = temp_file_df['complexity'].diff(periods=1)
        complexity_changes_volume_list.fillna(0, inplace=True)
        file_df_expected_complexity_changes = sum(complexity_changes_volume_list) / (temp_file_df.shape[0] - 1)
        temp_file_metr_dict['expected_complexity_change_(ED-COMP)'] = file_df_expected_complexity_changes

        temp_file_metr_df = pd.DataFrame.from_records([temp_file_metr_dict],
                                                      index='file_id',
                                                      columns=temp_file_metr_dict.keys())
        # DataFrame.append is deprecated; concatenate instead
        files_change_metrics_df = pd.concat([files_change_metrics_df, temp_file_metr_df])

    # Sort files by Change Proneness (CP)
    files_change_metrics_df.sort_values(by=['change_proneness_(CP)'],
                                        ascending=False,
                                        inplace=True)
    # Keep only the first n files, where n = project_files_param
    files_change_metrics_df = files_change_metrics_df.head(project_files_param)

    # Initialise variables
    dict_result = {
        'parameters': {
            'project': project_param,
            'files': project_files_param,
            'horizon': horizon_param,
            'regressor': regressor_param,
            'ground_truth': ground_truth_param
        }
    }
    list_forecasts = []
    list_metrics = []
    list_ground_truth = []

    # Compute forecasts for each file
    for index, file_instance in files_change_metrics_df.iterrows():
        if debug:
            print('=========================== File: %s ============================'
                  % file_instance['file_name'])
        temp_file_df = dataset_td_file.loc[dataset_td_file['class_id'] == index]
        temp_file_df.reset_index(inplace=True, drop=True)
        temp_dataset_td_file = temp_file_df[metrics_td]

        # Fill list with metrics of files
        temp_metrics_dict = {
            file_instance['file_name']:
            pd.DataFrame(file_instance).T.to_dict('records')[0]
        }
        list_metrics.append(temp_metrics_dict)

        temp_list_forecasts = []
        # Make forecasts using the Direct approach, i.e. train a separate ML model
        # for each forecasting horizon
        for intermediate_horizon in range(1, horizon_param + 1):
            if debug:
                print('=========================== Horizon: %s ============================'
                      % intermediate_horizon)

            # Add time-shifted prior and future period
            data = series_to_supervised(temp_dataset_td_file, n_in=window_size)

            # Append dependent variable column with value equal to total_principal
            # of the target horizon's version
            data['forecasted_total_principal'] = data['total_principal(t)'].shift(-intermediate_horizon)
            data = data.drop(data.index[-intermediate_horizon:])

            # Remove TD as an independent variable
            data = data.drop(columns=[
                'total_principal(t-%s)' % i for i in range(window_size, 0, -1)
            ])

            # Define independent and dependent variables
            x_array = data.iloc[:, data.columns != 'forecasted_total_principal'].values
            y_array = data.iloc[:, data.columns == 'forecasted_total_principal'].values

            # Deploy model
            # Assign version counter
            version_counter = len(temp_dataset_td_file) + intermediate_horizon

            # Define X to deploy the model for real forecasts
            x_real = series_to_supervised(temp_dataset_td_file, n_in=window_size, dropnan=False)
            x_real = x_real.drop(columns=[
                'total_principal(t-%s)' % i for i in range(window_size, 0, -1)
            ])
            x_real = x_real.iloc[-1, :].values
            x_real = x_real.reshape(1, -1)

            # Make real forecasts
            regressor = create_regressor(regressor_param, x_array, y_array)
            if regressor == -1:
                return -1
            y_pred = regressor.predict(x_real)

            # Fill list with forecasts
            temp_forecasts_dict = {
                'version': version_counter,
                'value': float(y_pred[0])
            }
            temp_list_forecasts.append(temp_forecasts_dict)

        # Fill list with forecasts
        temp_file_forecasts_dict = {
            file_instance['file_name']: temp_list_forecasts
        }
        list_forecasts.append(temp_file_forecasts_dict)

        # If the model will also return ground truth values
        if ground_truth_param == 'yes':
            temp_list_ground_truth = []
            # Fill dataframe with ground truth
            for intermediate_horizon in range(0, len(temp_dataset_td_file['total_principal'])):
                temp_ground_truth_dict = {
                    'version': intermediate_horizon + 1,
                    'value': float(temp_dataset_td_file['total_principal'][intermediate_horizon])
                }
                temp_list_ground_truth.append(temp_ground_truth_dict)
            # Fill list with files
            temp_ground_truth_dict = {
                file_instance['file_name']: temp_list_ground_truth
            }
            list_ground_truth.append(temp_ground_truth_dict)

    # Fill results dictionary with change proneness and TD change proneness for each file
    dict_result['change_metrics'] = list_metrics
    # Fill results dictionary with forecasts for each file
    dict_result['forecasts'] = list_forecasts
    # If the model will also return ground truth values
    if ground_truth_param == 'yes':
        # Fill results dictionary with ground truth
        dict_result['ground_truth'] = list_ground_truth

    if debug:
        print(dict_result)

    return dict_result
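# Not in the original: a sketch of how this function might be invoked. The project name and
# regressor name are illustrative; a CSV is expected at data/<project>_class.csv.
result = build_and_train_td_file_level(horizon_param=5,
                                       project_param='apache_kafka',
                                       project_files_param=10,
                                       regressor_param='ridge',
                                       ground_truth_param='yes')
if result == -2:
    print('File-level dataset not found')
elif result == -1:
    print('Unknown regressor')
else:
    for file_forecasts in result['forecasts']:
        print(file_forecasts)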
def build_and_train_td(horizon_param, project_param, regressor_param,
                       ground_truth_param):
    """
    Build TD forecasting models and return forecasts for a horizon specified by the user.

    Arguments:
        horizon_param: The forecasting horizon up to which forecasts will be produced.
        project_param: The project for which the forecasts will be produced.
        regressor_param: The regressor models that will be used to produce forecasts.
        ground_truth_param: Whether the model will also return ground truth values.

    Returns:
        A dictionary containing forecasted values (and ground truth values if
        ground_truth_param is set to yes) for each intermediate step ahead up to the
        specified horizon.
    """
    # Select indicators that will be used as model variables
    metrics_td = [
        'code_smells', 'ncloc', 'complexity', 'duplicated_blocks',
        'sqale_index', 'reliability_remediation_effort',
        'security_remediation_effort'
    ]

    # Select sliding window length
    window_size = 2

    # Read dataset
    try:
        dataset_td = pd.read_csv('data/%s.csv' % project_param, sep=";", usecols=metrics_td)
    except FileNotFoundError as e:
        if debug:
            print(e)
        return -2

    # Total TD principal is the sum of the three remediation-effort indicators
    dataset_td['total_principal'] = dataset_td['reliability_remediation_effort'] \
        + dataset_td['security_remediation_effort'] + dataset_td['sqale_index']
    dataset_td = dataset_td.drop(columns=[
        'sqale_index', 'reliability_remediation_effort',
        'security_remediation_effort'
    ])

    # Initialise variables
    dict_result = {
        'parameters': {
            'project': project_param,
            'horizon': horizon_param,
            'regressor': regressor_param,
            'ground_truth': ground_truth_param
        }
    }
    list_forecasts = []
    list_ground_truth = []

    # Make forecasts using the Direct approach, i.e. train a separate ML model
    # for each forecasting horizon
    for intermediate_horizon in range(1, horizon_param + 1):
        if debug:
            print('=========================== Horizon: %s ============================'
                  % intermediate_horizon)

        # Add time-shifted prior and future period
        data = series_to_supervised(dataset_td, n_in=window_size)

        # Append dependent variable column with value equal to total_principal of the
        # target horizon's version
        data['forecasted_total_principal'] = data['total_principal(t)'].shift(-intermediate_horizon)
        data = data.drop(data.index[-intermediate_horizon:])

        # Remove TD as an independent variable
        data = data.drop(columns=[
            'total_principal(t-%s)' % i for i in range(window_size, 0, -1)
        ])

        # Define independent and dependent variables
        x_array = data.iloc[:, data.columns != 'forecasted_total_principal'].values
        y_array = data.iloc[:, data.columns == 'forecasted_total_principal'].values

        # Deploy model
        # Assign version counter
        version_counter = len(dataset_td) + intermediate_horizon

        # Define X to deploy the model for real forecasts
        x_real = series_to_supervised(dataset_td, n_in=window_size, dropnan=False)
        x_real = x_real.drop(columns=[
            'total_principal(t-%s)' % i for i in range(window_size, 0, -1)
        ])
        x_real = x_real.iloc[-1, :].values
        x_real = x_real.reshape(1, -1)

        # Make real forecasts
        regressor = create_regressor(regressor_param, x_array, y_array)
        if regressor == -1:
            return -1
        y_pred = regressor.predict(x_real)

        # Fill dataframe with forecasts
        temp_dict = {'version': version_counter, 'value': float(y_pred[0])}
        list_forecasts.append(temp_dict)

    # Fill results dictionary with forecasts
    dict_result['forecasts'] = list_forecasts

    # If the model will also return ground truth values
    if ground_truth_param == 'yes':
        # Fill dataframe with ground truth
        for intermediate_horizon in range(0, len(dataset_td['total_principal'])):
            temp_dict = {
                'version': intermediate_horizon + 1,
                'value': float(dataset_td['total_principal'][intermediate_horizon])
            }
            list_ground_truth.append(temp_dict)
        # Fill results dictionary with ground truth
        dict_result['ground_truth'] = list_ground_truth

    if debug:
        print(dict_result)

    return dict_result
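# Not in the original: create_regressor is called but not defined in this section. A minimal
# sketch of what it might look like, assuming scikit-learn estimators keyed by name (the
# names themselves are illustrative):
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor

def create_regressor(regressor_param, x_array, y_array):
    """Fit and return the named regressor, or -1 if the name is unknown."""
    regressors = {
        'linear': LinearRegression(),
        'ridge': Ridge(),
        'lasso': Lasso(),
        'svr': SVR(),
        'random_forest': RandomForestRegressor(),
    }
    if regressor_param not in regressors:
        return -1
    regressor = regressors[regressor_param]
    regressor.fit(x_array, y_array.ravel())
    return regressor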