def load_rf_data(cur_path):
    """Load the processed Titanic dataset, processing the raw CSV on a cache miss.

    Looks for previously processed data under ``<cur_path>/data/titanic``; if none
    exists, processes ``train.csv`` from scratch (column selection, categorical
    marking, string-to-float conversion, missing-value filtering) and caches it.

    Note: test.csv is not used because it does not provide whether or not the
    passenger survived; therefore we cannot assess how well the model performed.

    Parameters:
        cur_path: Base directory containing the ``data/titanic`` folder.

    Returns:
        A DataProcessor with training/validation/test splits available.
    """
    # Use os.path.join instead of a hard-coded "data\\titanic" so the
    # path separator is correct on POSIX as well as Windows.
    data_folder = os.path.join("data", "titanic")
    processed_data_folder = os.path.join(cur_path, data_folder)
    data_file_path = os.path.join(processed_data_folder, "train.csv")
    data = DataProcessor(data_file_path, processed_data_folder)
    try:
        # Try to load previously processed (cached) data
        data.load_processed_data()
    except FileNotFoundError:
        # No cached data found, so process the raw CSV now.
        # 10% test, 10% validation, 80% training samples from data
        splits = (0.1, 0.1, 0.8)
        # Only use certain columns
        use_cols = (
            # 0,  # PassengerID
            1,    # Survived
            2,    # Pclass
            # 3,  # Name
            4,    # Sex
            5,    # Age
            6,    # SibSp
            7,    # Parch
            # 8,  # Ticket
            9,    # Fare
            # 10, # Cabin
            11,   # Embarked
        )
        # Mark features as categorical (so we can one-hot-encode them later)
        categorical_cols = (
            2,    # Pclass
            4,    # Sex
            11,   # Embarked
        )
        # Convert certain columns to float values (so we can use numpy arrays)
        converters = {
            4: lambda sex: {'male': 0.0, 'female': 1.0}[sex],
            11: lambda embarked: {'S': 0.0, 'C': 1.0, 'Q': 2.0}[embarked],
        }
        data.process_data(splits=splits,
                          use_cols=use_cols,
                          categorical_cols=categorical_cols,
                          converters=converters,
                          filter_missing=True)
    return data
# 8, #Ticket 9, # Fare # 10, #Cabin # 11, # Embarked ) # Mark features as categorical (so we can one-hot-encode them later) # categorical_cols = () categorical_cols = (2, # Pclass 4, # Sex 11 # Embarked ) # Convert certain columns to float values (so we can use numpy arrays) converters = {4: lambda sex: {'male': 0.0, 'female': 1.0}[sex], 11: lambda embarked: {'S': 0.0, 'C': 1.0, 'Q': 2.0}[embarked]} filter_missing = True data_processor.process_data(splits=splits, use_cols=use_cols, categorical_cols=categorical_cols, converters=converters, filter_missing=filter_missing) if 'train' in sys.argv: # Extract training data, initialize neural network (train_x, train_y) = (data_processor.training_X, data_processor.training_y) train_y = np.array([[0, 1] if train_y[i] == 1 else [1, 0] for i in range(len(train_y))]) (valid_x, valid_y) = (data_processor.validation_X, data_processor.validation_y) valid_y = np.array([[0, 1] if valid_y[i] == 1 else [1, 0] for i in range(len(valid_y))]) if 'easydata' in sys.argv: train_x = np.array([[i/1000, 2*i/1000] for i in range(100)]) train_y = np.array([(train_x[i][0] + train_x[i][1])/1000 for i in range(100)]) valid_x, valid_y = train_x, train_y # Note: validation is on training set for easy data print('Loading neural network...') # Set NN params input_dimension = len(train_x[0]) output_dimension = len(train_y[0]) if isinstance(train_y[0], (str, list, tuple, np.ndarray)) else 1