def load_rf_data(cur_path):
    """Load (or build) the processed Titanic dataset for the random-forest model.

    Tries to reuse previously processed data on disk; if none exists, the raw
    ``train.csv`` is processed from scratch and cached by the DataProcessor.

    Parameters
    ----------
    cur_path : str
        Directory that contains the ``data/titanic`` folder.

    Returns
    -------
    DataProcessor
        Processor holding the train/validation/test splits.
    """
    # os.path.join keeps the path portable; the original "data\\titanic"
    # literal only resolved correctly on Windows.
    processed_data_folder = os.path.join(cur_path, "data", "titanic")
    # Note: Not using test.csv as it does not provide whether or not the
    # passenger survived; therefore we cannot assess how well the model
    # performed.
    data_file_path = os.path.join(processed_data_folder, "train.csv")
    data = DataProcessor(data_file_path, processed_data_folder)

    try:
        # Fast path: reuse previously processed data if present.
        data.load_processed_data()
    except FileNotFoundError:
        # No cached data found, so process the raw CSV.
        # 10% test, 10% validation, 80% training samples from data.
        splits = (0.1, 0.1, 0.8)
        # Column indices to keep (commented-out entries are dropped).
        use_cols = (  # 0, #PassengerID
            1,  # Survived
            2,  # Pclass
            # 3, #Name
            4,  # Sex
            5,  # Age
            6,  # SibSp
            7,  # Parch
            # 8, #Ticket
            9,  # Fare
            # 10, #Cabin
            11,  # Embarked
        )
        # Mark features as categorical (so we can one-hot-encode them later).
        categorical_cols = (2,  # Pclass
                            4,  # Sex
                            11  # Embarked
                            )
        # Convert certain columns to float values (so we can use numpy arrays).
        # NOTE(review): these mappings raise KeyError on unexpected or missing
        # values — presumably filter_missing=True drops such rows; confirm in
        # DataProcessor.process_data.
        converters = {4: lambda sex: {'male': 0.0, 'female': 1.0}[sex],
                      11: lambda embarked: {'S': 0.0, 'C': 1.0, 'Q': 2.0}[embarked]}
        data.process_data(splits=splits, use_cols=use_cols,
                          categorical_cols=categorical_cols,
                          converters=converters, filter_missing=True)
    return data
# 예제 #2 ("Example #2") — aggregator/scraper separator; the lines that follow
# are a truncated duplicate fragment of the same Titanic preprocessing script.
# 0
                    # 8, #Ticket
                    9,  # Fare
                    # 10, #Cabin
                    # 11,  # Embarked
    )
    # Mark features as categorical (so we can one-hot-encode them later)
    # categorical_cols = ()
    categorical_cols = (2,  # Pclass
                        4,  # Sex
                        11  # Embarked
    )
    # Convert certain columns to float values (so we can use numpy arrays)
    converters = {4: lambda sex: {'male': 0.0, 'female': 1.0}[sex],
                  11: lambda embarked: {'S': 0.0, 'C': 1.0, 'Q': 2.0}[embarked]}
    filter_missing = True
    data_processor.process_data(splits=splits, use_cols=use_cols, categorical_cols=categorical_cols,
                                converters=converters, filter_missing=filter_missing)
# Script entry: run training setup only when 'train' was passed on the command line.
if 'train' in sys.argv:
    # Extract training data, initialize neural network
    (train_x, train_y) = (data_processor.training_X, data_processor.training_y)
    # One-hot encode the binary survival label: 1 -> [0, 1], anything else -> [1, 0].
    train_y = np.array([[0, 1] if train_y[i] == 1 else [1, 0] for i in range(len(train_y))])
    (valid_x, valid_y) = (data_processor.validation_X, data_processor.validation_y)
    valid_y = np.array([[0, 1] if valid_y[i] == 1 else [1, 0] for i in range(len(valid_y))])
    if 'easydata' in sys.argv:
        # Replace the Titanic data with a tiny synthetic regression set
        # (target = (x0 + x1) / 1000) to sanity-check the network end to end.
        train_x = np.array([[i/1000, 2*i/1000] for i in range(100)])
        train_y = np.array([(train_x[i][0] + train_x[i][1])/1000 for i in range(100)])
        valid_x, valid_y = train_x, train_y  # Note: validation is on training set for easy data
    print('Loading neural network...')

    # Set NN params
    # Input width tracks the feature vector; output width tracks the label shape
    # (falls back to 1 for scalar targets such as the easydata case).
    input_dimension = len(train_x[0])
    output_dimension = len(train_y[0]) if isinstance(train_y[0], (str, list, tuple, np.ndarray)) else 1