Example #1
              input_dim=13,
              kernel_initializer='uniform',
              activation='linear'))
    model.add(Dense(1, activation='sigmoid'))
    optimizer = 'adam'
    model.compile(loss='mean_squared_error',
                  optimizer=optimizer,
                  metrics=['mse', 'mae'])
    return model


collection_mse = []
collection_mae = []
for i in neurons:
    model = create_model(neurons=i)
    estimator = model.fit(x_train, y_train, epochs=100, verbose=2)
    score = model.evaluate(x_test, y_test, batch_size=20, verbose=1)
    print("\nTest score:", score[0])
    print('Test accuracy:', score[1])
    # Note: tf.keras records history under the strings passed to compile()
    # ('mse'/'mae'); older standalone Keras expands them to the long names used here.
    pyplot.plot(estimator.history['mean_squared_error'])
    pyplot.plot(estimator.history['mean_absolute_error'])
    pyplot.title('Combined statistics with ' + str(i) +
                 ' neurons - MSE & MAE')
    pyplot.xlabel('Epoch')
    pyplot.legend(['MSE', 'MAE'])
    pyplot.show()

#test_mse_score, test_mae_score = model.evaluate(x_test, y_test, batch_size =20, verbose =1)
#    print(test_mse_score)
#    collection_mse.append(test_mse_score)
#    collection_mae.append(test_mae_score)
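# A sketch (not in the original) of what the commented-out lines above were
# collecting: per-architecture test scores for comparison. Note that with
# metrics=['mse', 'mae'], evaluate() returns [loss, mse, mae] -- three values.
for i in neurons:
    model = create_model(neurons=i)
    model.fit(x_train, y_train, epochs=100, verbose=0)
    _, test_mse_score, test_mae_score = model.evaluate(x_test, y_test,
                                                       batch_size=20, verbose=0)
    collection_mse.append(test_mse_score)
    collection_mae.append(test_mae_score)

pyplot.plot(neurons, collection_mse, label='Test MSE')
pyplot.plot(neurons, collection_mae, label='Test MAE')
pyplot.xlabel('Number of neurons')
pyplot.legend()
pyplot.show()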
Example #2
                callbacks = [early_stopping]
            model.fit(X_train,
                      y_train,
                      batch_size=model.batch_size,
                      epochs=epochs,  # was nb_epoch (legacy Keras 1 argument name)
                      validation_split=0.10,
                      callbacks=[early_stopping, tensorboard])
            fit_models.append(model)

        predictions = [
            dataload.predict_sequences_multiple(model, X_test, seq_len,
                                                predict_len)
            for model in top_models
        ]
        scores = [
            model.evaluate(X_test, y_test, verbose=0) for model in top_models
        ]

        # Save results
        os.makedirs(results_fname, exist_ok=True)
        folder_name = 'seq_len_{}'.format(seq_len)
        os.makedirs('{}/{}'.format(results_fname, folder_name), exist_ok=True)
        results.to_csv('{0}/{1}/results.csv'.format(results_fname,
                                                    folder_name))
        top_model_plots = [(predictions[i], 'Model {}'.format(i + 1))
                           for i in range(len(predictions))]
        plot_results_multiple(top_model_plots,
                              y_test,
                              predict_len,
                              fig_path='{0}/{1}/plots.pdf'.format(
                                  results_fname, folder_name))
Example #3
    model.add(layers.Dense(neurons, activation='relu', 
        kernel_constraint=maxnorm(weight_constraint)))
    model.add(layers.Dropout(dropout_rate))
    model.add(layers.Dense(neurons, activation='relu', 
        kernel_constraint=maxnorm(weight_constraint)))
    model.add(layers.Dropout(dropout_rate))
    model.add(layers.Dense(1))
    model.compile(optimizer=optim, loss='mse', metrics=['mae', 'mse'])
    return model



# Training the final model
# 'adam' here is presumably an optimizer instance defined earlier in the file
# (the snippet is truncated); the string 'adam' would also work.
model = grid_model5(optim=adam, dropout_rate=0.1, weight_constraint=4, neurons=30)
model.fit(X_train, y_train, epochs=200, batch_size=30, verbose=0)
loss, test_mae_score, test_mse_score = model.evaluate(X_test, y_test)  # metrics order is ['mae', 'mse']

# Save fitted model to a file
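# A minimal sketch (the filename is an assumption, not from the original):
model.save('grid_model5_final.h5')  # reload later with keras.models.load_model(...)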

#NOT USED
#
## General Model
#def gen_model(capacity, optim='rmsprop'):
#    model = models.Sequential()
#    model.add(layers.Dense(capacity, activation='relu',
#        input_shape=(X_train.shape[1], )))
#    model.add(layers.Dense(capacity, activation='relu'))
#    model.add(layers.Dense(1))
#    model.compile(optimizer=optim, loss='mse', metrics=['mae', 'mse'])
#    return model
#
Example #4
            # Both branches assign the same value, and `callbacks` is unused:
            # fit() below passes [early_stopping, tensorboard] directly.
            if (i == 0):
                callbacks = [early_stopping]
            else:
                callbacks = [early_stopping]
            model.fit(
                    X_train,
                    y_train,
                    batch_size=model.batch_size,
                    epochs=epochs,  # was nb_epoch (legacy Keras 1 argument name)
                    validation_split=0.10,
                    callbacks=[early_stopping, tensorboard]
                    )
            fit_models.append(model)

        predictions = [dataload.predict_sequences_multiple(model, X_test, seq_len, predict_len)
                for model in top_models]
        scores = [model.evaluate(X_test, y_test, verbose=0)
                for model in top_models]

        # Save results
        os.makedirs(results_fname, exist_ok=True)
        folder_name = 'seq_len_{}'.format(seq_len)
        os.makedirs('{}/{}'.format(results_fname, folder_name), exist_ok=True)
        results.to_csv('{0}/{1}/results.csv'.format(results_fname, folder_name))
        top_model_plots = [(predictions[i], 'Model {}'.format(i+1)) for i in range(len(predictions))]
        plot_results_multiple(top_model_plots, y_test, predict_len, fig_path = '{0}/{1}/plots.pdf'.format(results_fname, folder_name))
        for index, model in enumerate(fit_models, start=1):
            model.save('{}/{}/model-{}.h5'.format(results_fname, folder_name, index))
Example #5
def main(_neurons,
         _activationFunctionHidden,
         _activationFunctionOutput,
         _lossFunction,
         _batchSize,
         _learningRate,
         _numberOfEpochs,
         _writeToCSV=False,
         _hyperparameterTuning=False):

    dataset = np.loadtxt("FM_dataset.dat")

    #######################################################################
    #                       ** START OF YOUR CODE **
    #######################################################################

    input_dim = 3  # CONSTANT: Stated in specification

    #shuffle the data
    np.random.shuffle(dataset)

    # Separate data columns into x (input features) and y (output)
    x = dataset[:, :input_dim]
    y = dataset[:, input_dim:]

    split_idx = int(0.8 * len(x))

    # Split data by rows into a training set and a validation set
    x_train = x[:split_idx]
    y_train = y[:split_idx]
    x_val = x[split_idx:]
    y_val = y[split_idx:]

    # Apply preprocessing to the data
    x_prep_input = Preprocessor(x_train)
    y_prep_input = Preprocessor(y_train)

    x_train_pre = x_prep_input.apply(x_train)
    y_train_pre = y_prep_input.apply(y_train)

    x_val_pre = x_prep_input.apply(x_val)
    y_val_pre = y_prep_input.apply(y_val)

    # fix random seed for reproducibility
    seed = 7
    np.random.seed(seed)
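
    # create_model is not shown in this snippet. A plausible sketch (an
    # assumption, kept commented out so it does not shadow the real one)
    # consistent with the param_grid used below:
    #
    # def create_model(learn_rate=1e-3, neurons=5, hidden_layers=3):
    #     m = Sequential()
    #     m.add(Dense(neurons, activation='relu', input_dim=input_dim))
    #     for _ in range(hidden_layers - 1):
    #         m.add(Dense(neurons, activation='relu'))
    #     m.add(Dense(y.shape[1]))
    #     m.compile(optimizer=Adam(learning_rate=learn_rate), loss='mse')
    #     return m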

    if _hyperparameterTuning:

        #create model
        model = KerasRegressor(build_fn=create_model,
                               epochs=_numberOfEpochs,  # was nb_epoch (legacy name)
                               batch_size=_batchSize)

        # Use scikit-learn to grid search
        batch_size = [32]
        epochs = [100, 250, 500, 1000]  #10, 100, 250, 500, 1000?
        learn_rate = [1e-3]
        neurons = [5]
        hidden_layers = [3]
        #activation =  ['relu', 'sigmoid'] #tanh

        #optimizer = [ 'SGD', 'RMSprop', 'Adam']
        #dropout_rate = [0.0, 0.5, 0.9]

        param_grid = dict(epochs=epochs,
                          batch_size=batch_size,
                          learn_rate=learn_rate,
                          neurons=neurons,
                          hidden_layers=hidden_layers)

        #perform grid search with 5-fold cross validation (cv=5 below)
        grid = GridSearchCV(estimator=model,
                            param_grid=param_grid,
                            n_jobs=-1,
                            cv=5)

        grid_result = grid.fit(x_train_pre, y_train_pre)

        #summarize results of hyperparameter search
        print("Best: %f using %s" %
              (grid_result.best_score_, grid_result.best_params_))
        means = grid_result.cv_results_['mean_test_score']
        stds = grid_result.cv_results_['std_test_score']
        params = grid_result.cv_results_['params']
        for mean, stdev, param in zip(means, stds, params):
            print("%f (%f) with: %r" % (mean, stdev, param))

        #extract the best model
        best_model = grid.best_estimator_.model

        #Evaluate the best model
        preds = best_model.predict(x_val_pre)
        targets = y_val_pre
        mse = evaluate_architecture(targets, preds)
        print("Mean squared error of best model:", mse)

        #save the best model
        filename = 'trained_FM.pickle'
        pickle.dump(best_model, open(filename, 'wb'))

    else:

        model = create_model()
        history = model.fit(x_train_pre,
                            y_train_pre,
                            batch_size=_batchSize,
                            epochs=_numberOfEpochs,
                            verbose=1,
                            validation_data=(x_val_pre, y_val_pre))

        #model.fit(x_train_pre,y_train_pre)
        score = model.evaluate(x_val_pre, y_val_pre, verbose=0)
        print('Validation loss:', score[0])
        print('Validation metric:', score[1])

    #predict hidden dataset using best model
    predictions = predict_hidden(dataset)
    print(predictions)
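

# Hypothetical invocation (argument values are assumptions; in the non-tuning
# branch above only _batchSize and _numberOfEpochs are consumed directly):
if __name__ == "__main__":
    main(_neurons=5,
         _activationFunctionHidden='relu',
         _activationFunctionOutput='linear',
         _lossFunction='mse',
         _batchSize=32,
         _learningRate=1e-3,
         _numberOfEpochs=100)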
Example #6
def run_lstm(airline):
    # parameters of LSTM
    TRAIN_SPLIT = 364*24
    BATCH_SIZE = 24
    BUFFER_SIZE = 5000
    EVALUATION_INTERVAL = 364 // BATCH_SIZE  # steps_per_epoch must be an integer
    EPOCHS = 200
    accuracy_threshold = 0.25
    
    n_iter_search = 16 # Number of parameter settings that are sampled.
    
    # Parameters to evaluate in order to find best parameters for each model
    optimizers = ['rmsprop', 'adam', 'adadelta']
    init = ['glorot_uniform', 'normal', 'uniform']
    EPOCHS = np.array([100, 200, 500])  # overrides the scalar EPOCHS above for the search
    param_grid = dict(optimizer=optimizers, epochs=EPOCHS, init=init)  # 'epochs', not the legacy 'nb_epoch'
    
    
    # past history valid options: 24-future_target,2(24)-future_target,3(24)-future_target,...
    past_history = 8
    # future target valid options: 12,11,10,...
    future_target = 24 - past_history
    # note that 'past history' + 'future target' must be a multiple of 24, i.e. 24, 48, 72, ...
    
    # this is not relevant in our problem, it is always 1 (to make a prediction for each hour)
    STEP = 1 
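    # Worked example: with past_history = 8, future_target = 24 - 8 = 16,
    # and 8 + 16 = 24, so each history/target pair spans exactly one day.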
    
    # Map airline to airline name
    airline_name = airlines_cv[airlines_cv['Marketing_Airline_Network']==airline]['Airline'].reset_index()
    airline_name = airline_name['Airline'][0]
    
    # select an airline from the dataset (for example: AA)
    data = full_data[full_data['Marketing_Airline_Network'] == airline]
    
    # total delay for all airlines
    df = data[['Date','Hour','Weekday','ArrDelay3AM','DepDelay3AM']].copy()
    
    # Create variables for seasons and holidays
    df['season'] = pd.to_datetime(df['Date']).dt.quarter
    dates = df['Date'].values
    holiday = np.empty(dates.shape[0])
    for i in range(0,dates.shape[0]):
        if dt.datetime.strptime(dates[i],'%Y-%m-%d') in holidays.US():
            holiday[i] = 1
        else:
            holiday[i] = 0
    df['holiday'] = holiday
    
    df = df[df['Date'] != '2018-03-11']
    #df = df[df['Date'] != '2019-03-10']
    
    
    # don't change the seed so that we can compare the results with each other
    tf.random.set_seed(13)
    #tf.set_random_seed(13) # use this instead depending on version of TensorFlow
    
    #creating time steps
    def create_time_steps(length):
      return list(range(-length, 0))
    
    #def for plotting
    def show_plot(plot_data, delta, title):
      labels = ['History', 'True Future', 'Model Prediction']
      marker = ['.-', 'rx', 'go']
      time_steps = create_time_steps(plot_data[0].shape[0])
      if delta:
        future = delta
      else:
        future = 0
    
      plt.title(title)
      for i, x in enumerate(plot_data):
        if i:
          plt.plot(future, plot_data[i], marker[i], markersize=10,
                   label=labels[i])
        else:
          plt.plot(time_steps, plot_data[i].flatten(), marker[i], label=labels[i])
      plt.legend()
      plt.xlim([time_steps[0], (future+5)*2])
      plt.ylabel('Total Delay (min)')
      plt.xlabel('Time-Step')
      return plt
    
    #def for baseline
    def baseline(history):
      return np.mean(history)
    
    
    ######## multivariate
    features_considered = ['ArrDelay3AM','DepDelay3AM','Weekday','season','holiday']
    features = df[features_considered]
    features.index = df[['Date','Hour']]
    print(features.head())
    
    dataset = features.values
    data_mean = dataset[:TRAIN_SPLIT].mean(axis=0)
    data_std = dataset[:TRAIN_SPLIT].std(axis=0)
    
    for i in range(0,2):
        dataset[:,i] = (dataset[:,i]-data_mean[i])/data_std[i]
    
    def multivariate_data(dataset, target, start_index, end_index, history_size,
                          target_size, step, single_step=False):
      data = []
      labels = []
    
      start_index = start_index + history_size
      if end_index is None:
        end_index = len(dataset)
    
      for i in range(start_index, end_index, 24):
        indices = range(i-history_size, i, step)
        data.append(dataset[indices])
    
        if single_step:
          labels.append(target[i+target_size-1]) #added -1
        else:
          labels.append(target[i:i+target_size])
    
      return np.array(data), np.array(labels)
    
    #def for plotting the error
    def plot_train_history(history, title):
      loss = history.history['loss']
      val_loss = history.history['val_loss']
    
      epochs = range(len(loss))
    
      plt.figure()
    
      plt.plot(epochs, loss, 'b', label='Training loss')
      plt.plot(epochs, val_loss, 'r', label='Validation loss')
      plt.xlabel('Epoch')
      plt.ylabel('Mean Absolute Error')
      plt.title(title)
      plt.legend()
    
      plt.show()
    
    #multivariate_data(dataset, target, start_index, end_index, history_size,target_size, step, single_step=False)
    #preparing the dataset
    x_train_multi, y_train_multi = multivariate_data(dataset, dataset[:, 0], 0,
                                                     TRAIN_SPLIT, past_history,
                                                     future_target, STEP)
    x_val_multi, y_val_multi = multivariate_data(dataset, dataset[:, 0],
                                                 TRAIN_SPLIT, None, past_history,
                                                 future_target, STEP)
    
    print ('Single window of past history : {}'.format(x_train_multi[0].shape))
    print ('Target delay to predict : {}'.format(y_train_multi[0].shape))
    
    
    #definition for multi step plot - this shows the predictions for an individual day
    def multi_step_plot(history, true_future, prediction):
      #plt.figure(figsize=(12, 6))
      plt.figure(figsize=(8, 6))
      num_in = create_time_steps(len(history))
      num_out = len(true_future)
    
      plt.plot(num_in, np.array(history[:, 0]*data_std[0]+data_mean[0]), label='History')
      plt.plot(np.arange(num_out)/STEP, np.array(true_future)*data_std[0]+data_mean[0],color='black',
               label='True Future')
      if prediction.any():
        plt.plot(np.arange(num_out)/STEP, np.array(prediction)*data_std[0]+data_mean[0], color='red', ls='dashed',
                 label='Predicted Future')
      plt.legend(loc='upper left')
      plt.xlabel('Time of Day')
      plt.xticks(range(-past_history+2,future_target,5),range(2,24,5))
      plt.ylabel('Cumulative Delay (Minute)')
      plt.show()
    
    #train
    train_data_multi = tf.data.Dataset.from_tensor_slices((x_train_multi, y_train_multi))
    train_data_multi = train_data_multi.cache().shuffle(BUFFER_SIZE).batch(BATCH_SIZE).repeat()
    
    #validation
    val_data_multi = tf.data.Dataset.from_tensor_slices((x_val_multi, y_val_multi))
    val_data_multi = val_data_multi.batch(BATCH_SIZE).repeat()
    
    # Create model for gridsearch analysis of different parameters
    def create_model(BUFFER_SIZE=BUFFER_SIZE, optimizer='rmsprop', init='glorot_uniform'):
        # NOTE: BUFFER_SIZE and init are accepted but never used below, so the
        # random search over 'init' cannot affect this architecture.

        #Building the LSTM model
        multi_step_model = tf.keras.models.Sequential()
        multi_step_model.add(tf.keras.layers.LSTM(32,
                                                  return_sequences=True,
                                                  input_shape=x_train_multi.shape[-2:]))
        multi_step_model.add(tf.keras.layers.LSTM(16, activation='relu'))
        multi_step_model.add(tf.keras.layers.Dense(25))
        multi_step_model.add(tf.keras.layers.Dense(future_target))
        
        multi_step_model.compile(optimizer=optimizer, loss='mean_squared_error',metrics=["mse","mae"])
        return multi_step_model
    
    # Gridsearch analysis to evaluate each of the parameters
    multi_step_model = KerasRegressor(build_fn=create_model)
    random_search = RandomizedSearchCV(estimator=multi_step_model, 
                                       param_distributions=param_grid,
                                       n_iter=n_iter_search)
    random_search.fit(x_train_multi, y_train_multi)
    print("Best: %f using %s" % (random_search.best_score_, random_search.best_params_))
    # Create dataframe with best parameters
    parameters = pd.DataFrame(random_search.best_params_, index=[0])
    parameters['Airline']=airline
    parameters = parameters[['Airline','optimizer','epochs','init']]
    optimizer = parameters['optimizer'][0]
    EPOCHS = parameters['epochs'][0]
    init = parameters['init'][0]
    
    
    # Table of best parameters and performance
    def render_mpl_table(data, col_width=3.0, row_height=0.625, font_size=14,
                         header_color='#40466e', row_colors=['#f1f1f2', 'w'], edge_color='w',
                         bbox=[0, 0, 1, 1], header_columns=0,
                         ax=None, **kwargs):
        if ax is None:
            size = (np.array(data.shape[::-1]) + np.array([0, 1])) * np.array([col_width, row_height])
            fig, ax = plt.subplots(figsize=size)
            plt.title('%s Best Parameters & Performance' % airline_name,fontdict=dict(fontsize=16,fontweight='bold'),loc='center')
            ax.axis('off')
    
        mpl_table = ax.table(cellText=data.values, bbox=bbox, colLabels=data.columns, loc='center',cellLoc='center',**kwargs)
    
        mpl_table.auto_set_font_size(False)
        mpl_table.set_fontsize(font_size)
    
        for k, cell in mpl_table.get_celld().items():  # public API; drops the six dependency
            cell.set_edgecolor(edge_color)
            if k[0] == 0 or k[1] < header_columns:
                cell.set_text_props(weight='bold', color='w')
                cell.set_facecolor(header_color)
            else:
                cell.set_facecolor(row_colors[k[0]%len(row_colors) ])
        plt.savefig('%s_parameters_performance.png' % airline,bbox_inches='tight')
        return ax
    
    
    
    # Build the LSTM using the best parameters found above; create_model()
    # already compiles it (recall that 'init' is not actually consumed there).
    multi_step_model = create_model(optimizer=optimizer, init=init)
    
    
    
    
    for x, y in val_data_multi.take(1):
      print (multi_step_model.predict(x).shape)
    
    multi_step_history = multi_step_model.fit(train_data_multi, epochs=EPOCHS,
                                              steps_per_epoch=EVALUATION_INTERVAL,
                                              validation_data=val_data_multi,
                                              validation_steps=18)
    
    # plot training and validation loss
    plot_train_history(multi_step_history, 'Multi-Step Training and validation loss')
    
    # show sample results
    #rmse
    rmse = np.sqrt(multi_step_model.evaluate(x_val_multi, y_val_multi))
    print('RMSE: %s' % rmse)
    nrmse = rmse*data_std[0]  # RMSE rescaled back to the original units (minutes)
    print('NRMSE: %s' % nrmse)
    
    parameters['RMSE'] = rmse[0].round(3)
    parameters['NRMSE'] = nrmse[0].round(3)
    
    
    val_data_multi = tf.data.Dataset.from_tensor_slices((x_val_multi, y_val_multi))
    val_data_multi = val_data_multi.batch(1)
    
    #plotting the sample predictions
    for x, y in val_data_multi.take(1):
      multi_step_plot(x[0], y[0], multi_step_model.predict(x)[0])
    
    pred_data=pd.DataFrame([])
    est_date = dt.date(2019, 1, 1)
    # Consolidate true and predictions into a single dataframe
    for x, y in val_data_multi.take(365):
        true_val = y[:,15]*data_std[0]+data_mean[0]
        prediction = np.array(multi_step_model.predict(x)[:,15]*data_std[0]+data_mean[0])
        # DataFrame.append was removed in pandas 2.0; use pd.concat instead
        pred_data = pd.concat([pred_data,
                               pd.DataFrame({'Date': est_date,
                                             'True Value': true_val,
                                             'Predicted Value': prediction},
                                            index=[0])],
                              ignore_index=True)
        est_date = est_date + timedelta(days=1) 
    pred_data['Predicted Value'] = round(pred_data['Predicted Value'])
    
    print(pred_data)
    
    # Begin labeling data with old labels to test accuracy
    labels = pd.read_csv('testing_cumulative_data_%s_labeled.csv' % airline)
    labels['Date'] = pd.to_datetime(labels['Date'])
    
    cluster = pred_data.copy()
    cluster['Predicted Cluster'] = ''
    cluster['True Cluster'] = ''
    
    cluster['True Cluster']=cluster['Date'].map(dict(zip(labels['Date'],labels['Cluster_Num'])))
    
    # Find cutoffs in order to label data based on predictions and compare with actual clusters
    cutoffs = pd.DataFrame([])
    for i in cluster['True Cluster'].unique():
        data = cluster[cluster['True Cluster']==i]
        val = data['True Value'].min()
        cutoffs = pd.concat([cutoffs,
                             pd.DataFrame({'Cluster': i, 'Cutoffs': val}, index=[0])],
                            ignore_index=True)  # .append() was removed in pandas 2.0
    cutoffs = cutoffs.sort_values(by=['Cluster']).reset_index()
    cutoffs = cutoffs[['Cluster','Cutoffs']]
    
    for i in cutoffs['Cluster'].unique():
        cluster.loc[cluster['Predicted Value']>cutoffs['Cutoffs'][i],'Predicted Cluster']=i
    cluster.loc[cluster['Predicted Value']<=cutoffs['Cutoffs'][1],'Predicted Cluster']=0
    
    meltdown_cutoff = max(cutoffs['Cutoffs'])
    # Plot scatter of predictions
    fig,ax = plt.subplots(figsize=(12,8))
    meltdown = plt.scatter(pred_data[pred_data['True Value']>=meltdown_cutoff]['True Value'],pred_data[pred_data['True Value']>=meltdown_cutoff]['Predicted Value'],color='blue')
    normal = plt.scatter(pred_data[pred_data['True Value']<meltdown_cutoff]['True Value'],pred_data[pred_data['True Value']<meltdown_cutoff]['Predicted Value'],color='gray')
    plt.axhline(y=meltdown_cutoff, color='r', linestyle='-')
    plt.legend((meltdown,normal),
               ('Meltdown', 'Normal'),
               scatterpoints=1,
               loc='upper left',
               ncol=1,
               fontsize=12)
    plt.title('LSTM Predictions',fontsize=16)
    plt.xlabel('True Values',fontsize=14)
    plt.ylabel('Predicted Values',fontsize=14)
    plt.ylim(bottom=0)  # 'ymin'/'xmin' were removed in matplotlib 3.0
    plt.xlim(left=0)
    plt.axis('square')
    ax.plot([0, 1], [0, 1], transform=ax.transAxes)
    plt.savefig('%s_scatter_plot.png' % airline)
    
    #Define the function used to create a Confusion Matrix plot
    def plot_confusion_matrix(cm, classes,
                              normalize=False,
                              title='Confusion matrix',
                              cmap=plt.cm.Blues):
        """
        This function prints and plots the confusion matrix.
        Normalization can be applied by setting `normalize=True`.
        """
        if normalize:
            cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
            print("Normalized confusion matrix")
        else:
            print('Confusion matrix, without normalization')
    
        print(cm)
    
        plt.imshow(cm, interpolation='nearest', cmap=cmap)
        plt.title(title, fontdict = dict(fontsize=24))
        plt.colorbar()
        tick_marks = np.arange(len(classes))
        plt.xticks(tick_marks, classes, fontsize = 18, rotation=45)
        plt.yticks(tick_marks, classes, fontsize = 18)
    
        fmt = '.2f' if normalize else 'd'
        thresh = cm.max() / 2.
        for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
            plt.text(j, i, format(cm[i, j], fmt),
                     fontsize = 20,
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")
        plt.grid(False)  # the boolean 'b' argument is deprecated; False disables the grid
        plt.tight_layout()
        plt.ylabel('True label', fontsize = 18)
        plt.xlabel('Predicted label', fontsize = 18)
    
    y_test = cluster['True Cluster']
    y_pred = cluster['Predicted Cluster']
    
    # Model Accuracy, how often is the classifier correct?
    print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
    
    parameters['Cluster Acc.']=metrics.accuracy_score(y_test,y_pred).round(3)
    
    
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    classificationReport = classification_report(y_test, y_pred)
    cr_lines = classificationReport.split('\n')  # was split('/n'), which never matched
    cr_aveTotal = cr_lines[len(cr_lines) - 2].split()
    ave_recall = float(cr_aveTotal[len(cr_aveTotal) - 3])

    parameters['Cluster Recall'] = ave_recall

    def plot_classification_report(cr, title='Classification Report ', with_avg_total=False, cmap=plt.cm.Blues):
        lines = cr.split('\n')
        classes = []
        plotMat = []
        for line in lines[2:(len(lines) - 3)]:
            t = line.split()
            if len(t) == 0:
                break
            classes.append(t[0])
            v = [float(x) for x in t[1: len(t) - 1]]
            print(v)
            plotMat.append(v)
        if with_avg_total:
            aveTotal = lines[len(lines) - 2].split()
            classes.append('avg/total')
            vAveTotal = [float(x) for x in aveTotal[2:len(aveTotal) - 1]]
            plotMat.append(vAveTotal)

        plt.figure()
        plt.imshow(plotMat, interpolation='nearest', cmap=cmap)
        plt.title(title, fontsize=16)
        plt.colorbar()
        x_tick_marks = np.arange(3)
        y_tick_marks = np.arange(len(classes))
        plt.xticks(x_tick_marks, ['Precision', 'Recall', 'F1-Score'])
        plt.yticks(y_tick_marks, classes)
        plt.grid(False)
        plt.tight_layout()
        plt.ylabel('Classes', fontsize=14)
        plt.xlabel('Measures', fontsize=14)
        plt.savefig('%s_classif_report.png' % airline, bbox_inches='tight')
    
    plot_classification_report(classificationReport, with_avg_total=True)
    
    
    
    #Compute confusion matrix
    cnf_matrix = confusion_matrix(y_test, y_pred)
    np.set_printoptions(precision=2)
    
    #Create labels that correspond with our respective cluster labels
    if max(cluster['True Cluster'])==2:
        labs=['Good','Normal','Meltdown']
    if max(cluster['True Cluster'])==3:
        labs=['Good','Normal','Bad','Meltdown']
    if max(cluster['True Cluster'])==4:
        labs=['Great', 'Good','Normal','Bad','Meltdown']
    if max(cluster['True Cluster'])==5:
        labs=['Great', 'Good','Normal','Bad','Very Bad','Meltdown']
    
    #Plot non-normalized confusion matrix to show counts of predicted vs. actual clusters
    plt.figure()
    plt.grid(False)
    plot_confusion_matrix(cnf_matrix, classes=labs,
                          title='Confusion matrix, without normalization')
    plt.savefig('%s_confusion_matrix_count.png' % airline)
    
    #Plot normalized confusion matrix to show percentage of classifications in predicted vs. actual clusters
    plt.figure(figsize=(11,7))  # a single sized figure; the bare plt.figure() was redundant
    plot_confusion_matrix(cnf_matrix, classes=labs, normalize=True,
                          title='%s LSTM Model \nNormalized Confusion Matrix' % airline_name)
    plt.grid(False)
    plt.savefig('%s_confusion_matrix.png' % airline)
 
    plt.figure()       
    render_mpl_table(parameters, header_columns=0, col_width=2.0)
Example #7
File: final.py Project: choi0/talkingdata
def main():
    """ Main entry point of the app """

    #I/O files
    masterTrainingData = "train_sample.csv"
    masterTestData = "test.csv"
    sampleTrainingData = "train_sample_10000.csv"
    submissionTemplate = "sample_submission.csv"
    submissionOutput = "mySubmission.csv"
    
    # fix random seed for reproducibility
    seed = 69
    np.random.seed(seed)

    #load training data from csv
    dataframe = pd.read_csv(masterTrainingData, header=0)

    # split into input (X) and output (Y) variables
    x_train_master, y_train_master = preprocessTraining(dataframe)

    #print(x_train)
    #print(len(x_train))
    #print(y_train)
    
    #downsample to avoid unbalanced data
    dataframe_train_neg = dataframe[(dataframe['is_attributed'] == 0)]
    dataframe_train_pos = dataframe[(dataframe['is_attributed'] == 1)]
    print(len(dataframe_train_neg))
    print(len(dataframe_train_pos))

    dataframe_train_neg_sample = dataframe_train_neg.sample(n=4000)
    dataframe_train_pos_sample = dataframe_train_pos.sample(n=227)
    
    print(len(dataframe_train_neg_sample))
    print(len(dataframe_train_pos_sample))

    dataframe_train_comb = pd.concat([dataframe_train_neg_sample,dataframe_train_pos_sample])
    x_train, y_train = preprocessTraining(dataframe_train_comb)
    
    #submission = pd.read_csv(submissionTemplate)
    #submission['is_attributed'] = y_predss
    #submission.to_csv(submissionOutput, index=False)
    #print(submission.head())

    # create model
    #model = Sequential()
    #model.add(Dense(6, input_dim=9, kernel_initializer='normal', activation='relu'))
    #model.add(Dense(3, kernel_initializer='normal', activation='relu'))
    #model.add(Dense(1, kernel_initializer='normal', activation='tanh'))
    # Compile model
    #model.compile(loss='mean_squared_error', optimizer='adam', metrics=['accuracy', 'sparse_categorical_accuracy'])
    #model.fit(x_train, y_train, epochs=8, batch_size=64)
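
    # create_model is not defined in this snippet. A plausible sketch (an
    # assumption, kept commented out) consistent with the param_grid below and
    # the input_dim=9 used in the commented-out model above:
    #
    # def create_model(layers=[6, 3], activation='relu'):
    #     m = Sequential()
    #     m.add(Dense(layers[0], input_dim=9, activation=activation))
    #     for n in layers[1:]:
    #         m.add(Dense(n, activation=activation))
    #     m.add(Dense(1, activation='sigmoid'))
    #     m.compile(loss='mean_squared_error', optimizer='adam')
    #     return m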
    
    model = KerasRegressor(build_fn=create_model)
    layers = [[5], [6,3], [6,5,4,3]]
    # relu/sigmoid must be the callables from keras.activations, imported in the
    # truncated part of this file; the strings 'relu'/'sigmoid' would also work.
    activations = [relu, sigmoid]
    param_grid = dict(layers=layers, activation=activations, batch_size=[32, 64], epochs=[4])
    grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring='neg_mean_squared_error')

    grid_result = grid.fit(x_train, y_train)
    print(grid_result.best_score_)
    print(grid_result.best_params_)
    #[grid_result.best_score_, grid_result.best_params_]

    # Note: this refits with the wrapper's defaults, not grid_result.best_params_.
    model.fit(x_train, y_train)
    results = model.predict(x_train)
    results = np.where(results > 0.5, 1, 0)

    negCount = 0
    posCount = 0
    for i in range(0, len(results)):
        if results[i] == 1:  # the sklearn wrapper's predict() returns a 1-D array
            posCount += 1
        else:
            negCount += 1
    print(negCount)
    print(posCount)
    
    score = model.model.evaluate(x_train_master, y_train_master, batch_size=128)  # the sklearn wrapper itself has no evaluate()
    print(score)
    
    #test on full training data
    results = model.predict(x_train_master)
    results = np.where(results > 0.5, 1, 0)
    
    negCount = 0
    posCount = 0
    for i in range(0, len(results)):
        if results[i] == 1:
            posCount += 1
        else:
            negCount += 1
    print(negCount)
    print(posCount)
    false_positive_rate, recall, thresholds = roc_curve(y_train_master, results)
    roc_auc = auc(false_positive_rate, recall)
    print(roc_auc)
Example #8
File: train_1.py Project: yeahydq/DYDL
                      shuffle=shuffle,
                      validation_split=validation_split)
    grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1)
    grid_result = grid.fit(encoded_X_train, Y_train)
    # summarize results
    print("Best: %f using %s" %
          (grid_result.best_score_, grid_result.best_params_))
    means = grid_result.cv_results_['mean_test_score']
    stds = grid_result.cv_results_['std_test_score']
    params = grid_result.cv_results_['params']
    for mean, stdev, param in zip(means, stds, params):
        print("%f (%f) with: %r" % (mean, stdev, param))
else:
    model = create_model(neurons=100,
                         optimizer='rmsprop',
                         init='glorot_uniform')
    # model.fit(train_data, train_label, batch_size=20, epochs=100, shuffle=True, verbose=1, validation_split=0.2)
    model.fit(encoded_X_train,
              Y_train,
              batch_size=10,
              epochs=150,
              shuffle=True,
              verbose=1,
              validation_split=0.2)
    result = model.evaluate(encoded_X_test, Y_test, batch_size=1000)

    print('loss:%5.6f   acc:%5.6f' % (result[0], result[1]))

    # Save the trained model to disk
    model.save(MODEL_FILENAME)
    print(model.predict(encoded_X_train))
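
# create_model above is not shown in this truncated snippet. A plausible
# sketch (an assumption) matching the call
# create_model(neurons=100, optimizer='rmsprop', init='glorot_uniform'):
#
# def create_model(neurons=100, optimizer='rmsprop', init='glorot_uniform'):
#     m = Sequential()
#     m.add(Dense(neurons, kernel_initializer=init, activation='relu',
#                 input_dim=encoded_X_train.shape[1]))
#     m.add(Dense(1, kernel_initializer=init, activation='sigmoid'))
#     m.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
#     return m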
Example #9
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
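# 'model' and 'hyperparameters' come from earlier in the file (truncated here);
# model is typically a KerasRegressor/KerasClassifier wrapper so the search can
# clone it. A plausible shape for the search space (an assumption):
# hyperparameters = {'batch_size': [1, 5, 10], 'epochs': [10, 50, 100]}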
search = RandomizedSearchCV(model, hyperparameters, cv=4)
search.fit(x_train, y_train)
print(search.best_params_)

# acc = search.score(x_test, y_test, verbose=0)

###########################################

model.compile(loss='mae', optimizer='rmsprop', metrics=['mae'])
model.fit(x_train, y_train, epochs=10, batch_size=5)

loss, mae = model.evaluate(x_test, y_test)

y_pred = model.predict(test1)
print(y_pred)

# # print("mae: ", mae)

# # submission = pd.DataFrame({
# #     "PassengerId": test_[:,0].astype(int),
# #     "Survived": y_pred
# # })

# a = np.arange(10000,20000)
# y_pred = pd.DataFrame(y_pred,a)
# y_pred.to_csv('./data/dacon/comp1/sample_submission.csv', index = True, header=['hhb','hbo2','ca','na'],index_label='id')