def predict(net, labels, files, params):
    print('starting inference')
    device = torch.device(params.device)
    net.to(device)  # move the model to the device once, not per file
    predictions = []
    probs = []
    for i, file in enumerate(files):
        filename = os.path.splitext(os.path.basename(file))[0]
        processed = filename + '_proc.wav'
        pre.preprocess(file, processed)
        data = vggish_input.wavfile_to_examples(processed)
        data = torch.from_numpy(data).unsqueeze(1).float()
        data = data.to(device)
        out = net(data)
        # # for each spectrogram, take the row index of the max probability
        # pred = np.argmax(out.detach().cpu().numpy(), axis=1)
        # # majority vote: most frequent index over all spectrograms
        # consensus = np.bincount(pred).argmax()
        # print('file {} sounds like a {} to me'.format(i, labels[consensus]))
        # mean probability for each column/class over all spectrograms
        mean_probs = np.mean(out.detach().cpu().numpy(), axis=0)
        # index of the highest mean probability
        idx = np.argmax(mean_probs)
        print('file {} sounds like a {} to me'.format(i, labels[idx]))
        print('my guesses are: ')
        for j, label in enumerate(labels):
            print('{0}: {1:.04f}'.format(label, mean_probs[j]))
        # predictions.append(labels[consensus])
        predictions.append(labels[idx])
        probs.append(mean_probs)
        os.remove(processed)
    return predictions, probs
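# The commented-out block above aggregates per-spectrogram argmax predictions by
# majority vote, while the active code averages class probabilities first and
# takes the argmax of the mean. A minimal sketch on a dummy
# (n_spectrograms, n_classes) score matrix, to illustrate that the two
# strategies can disagree (the matrix below is invented for illustration):
import numpy as np

scores = np.array([[0.6, 0.4],   # spectrogram 0 leans to class 0
                   [0.6, 0.4],   # spectrogram 1 leans to class 0
                   [0.1, 0.9]])  # spectrogram 2 strongly favors class 1

# majority vote over per-row argmax: class 0 wins (2 votes to 1)
vote = np.bincount(np.argmax(scores, axis=1)).argmax()
# mean probability per class is [0.433, 0.567], so class 1 wins
mean_idx = np.argmax(np.mean(scores, axis=0))
assert (vote, mean_idx) == (0, 1)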
def create_multiartifact_sample(self, depthmap_paths):
    # stack up to 5 single-channel depthmaps into one (240, 180, 5) sample;
    # the parameter was renamed from `depthmap` because the loop variable of
    # the same name used to shadow it
    depthmaps = np.zeros((240, 180, 5))
    for i, depthmap_path in enumerate(depthmap_paths[0]):
        data, width, height, depth_scale, _max_confidence = preprocessing.load_depth(
            depthmap_path)
        depthmap = preprocessing.prepare_depthmap(data, width, height, depth_scale)
        depthmap = preprocessing.preprocess(depthmap)
        depthmaps[:, :, i] = tf.squeeze(depthmap, axis=2)
    depthmaps = tf.stack([depthmaps])  # add batch dimension -> (1, 240, 180, 5)
    return depthmaps
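# A minimal sketch of the shape flow assumed above. It assumes (hypothetically)
# that preprocessing.preprocess yields a (240, 180, 1) tensor per depthmap, so
# squeezing the channel axis gives a (240, 180) slice for each of the 5 channels:
import numpy as np
import tensorflow as tf

sample = np.zeros((240, 180, 5))
fake_depthmap = tf.zeros((240, 180, 1))              # stand-in for a preprocessed depthmap
sample[:, :, 0] = tf.squeeze(fake_depthmap, axis=2)  # -> (240, 180) slice
batch = tf.stack([sample])                           # add batch dimension
assert batch.shape == (1, 240, 180, 5)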
def process_depthmaps(self):
    depthmaps = []
    for artifact in self.artifacts:
        input_path = self.get_input_path(self.scan_directory, artifact['file'])
        data, width, height, depth_scale, _max_confidence = preprocessing.load_depth(
            input_path)
        depthmap = preprocessing.prepare_depthmap(data, width, height, depth_scale)
        depthmap = preprocessing.preprocess(depthmap)
        depthmaps.append(depthmap)
    return np.array(depthmaps)
def main(data, k_folds):
    """Trains a new ML model and uploads it to S3."""
    df = pd.read_csv(data, sep=";")
    df = preprocess(df)

    results = cross_validate_performance(df=df, n=k_folds)
    avg_results = average_evaluation_metrics(results)
    print(f"== Model Evaluation Results (Folds: {k_folds}) ==")
    for metric, value in avg_results.items():
        print(f"{metric.upper()}: {value}")
    print("=================================================")

    X, y = Xy_split(df)
    model = WineQualityModel()
    model.fit(X, y)
    model.save(path=S3_PATH)
def main(
        df_path: str = '/project/cq-training-1/project1/data/catalog.helios.public.20100101-20160101.pkl',
        image_size: int = 32,
        model: str = 'dummy',
        epochs: int = 20,
        optimizer: str = 'adam',
        lr: float = 1e-4,
        batch_size: int = 100,
        subset_perc: float = 1,
        subset_dates: bool = False,
        saved_model_dir: str = None,
        seq_len: int = 6,
        seed: bool = True,
        scale_label: bool = True,
        use_csky: bool = False,
        cache: bool = True,
        timesteps_minutes: int = 15):

    # Warn if no GPU (or more than one) is detected
    nb_gpus = len(tf.config.list_physical_devices('GPU'))
    if nb_gpus == 0:
        logger.warning('No GPU detected, training will run on CPU.')
    elif nb_gpus > 1:
        logger.warning('Multiple GPUs detected, training will run on only one GPU.')

    if subset_dates and subset_perc != 1:
        raise Exception(
            f'Invalid configuration: --subset_dates=True cannot be combined with --subset_perc={subset_perc}.')

    # Set random seed
    if seed:
        tf.random.set_seed(SEED)
        np.random.seed(SEED)

    # Load dataframe
    logger.info('Loading and preprocessing dataframe...')
    df = pd.read_pickle(df_path)
    df = preprocessing.preprocess(df, shuffle=False, scale_label=scale_label)
    metadata = data.Metadata(df, scale_label)

    # Pre-crop data
    logger.info('Getting crops...')
    images = data.Images(metadata, image_size)
    # images.crop(dest=SLURM_TMPDIR)
    images.crop(dest=images.shared_storage)

    # Split into train and valid
    if subset_dates:
        metadata_train, metadata_valid = metadata.split_with_dates()
    else:
        metadata, _ = metadata.split(1 - subset_perc)
        metadata_train, metadata_valid = metadata.split(VALID_PERC)
    nb_train_examples = metadata_train.get_number_of_examples()
    nb_valid_examples = metadata_valid.get_number_of_examples()
    logger.info(f'Number of training examples: {nb_train_examples}, '
                f'number of validation examples: {nb_valid_examples}')

    # Create model
    if model == 'dummy':
        model = baselines.DummyModel()
    elif model == 'sunset':
        model = baselines.SunsetModel()
    elif model == 'cnndem':
        model = baselines.ConvDemModel(image_size)
    elif model == 'sunset3d':
        model = baselines.Sunset3DModel()
    elif model == 'convlstm':
        model = baselines.ConvLSTM()
    elif model == 'cnngru':
        model = CnnGru(seq_len)
    elif model == 'cnngruatt':
        model = CnnGruAtt(seq_len)
    elif model == 'cnnlstm':
        model = LSTM_Resnet(seq_len)
    elif model == 'resnet':
        model = baselines.ResNetModel()
    else:
        raise Exception(f'Model "{model}" not recognized.')

    # Load model weights
    if saved_model_dir is not None:
        model.load_weights(os.path.join(saved_model_dir, "model"))

    # Loss and optimizer
    mse = tf.keras.losses.MeanSquaredError()
    if optimizer == 'adam':
        optimizer = tf.keras.optimizers.Adam(lr)
    elif optimizer == 'sgd':
        optimizer = tf.keras.optimizers.SGD(lr)
    else:
        raise Exception(f'Optimizer "{optimizer}" not recognized.')

    # Create data loaders
    dataloader_train = SequenceDataset(
        metadata_train, images, seq_len, batch_size,
        timesteps=datetime.timedelta(minutes=timesteps_minutes), cache=cache)
    dataloader_valid = SequenceDataset(
        metadata_valid, images, seq_len, batch_size,
        timesteps=datetime.timedelta(minutes=timesteps_minutes), cache=cache)

    # Training loop
    logger.info('Training...')
    losses = {'train': [], 'valid': []}
    best_valid_loss = float('inf')
    for epoch in range(epochs):
        train_epoch(model, dataloader_train, batch_size, mse, optimizer,
                    nb_train_examples, scale_label, use_csky)
        test_epoch(model, dataloader_valid, batch_size, mse,
                   nb_valid_examples, scale_label, use_csky)
        # RMSE from the accumulated MSE metrics
        train_loss = np.sqrt(train_mse_metric.result().numpy())
        valid_loss = np.sqrt(valid_mse_metric.result().numpy())
        csky_valid_loss = np.sqrt(valid_csky_mse_metric.result().numpy())

        # Keep the best model so far
        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            utils.save_model(model)

        # Logs
        logger.info(f'Epoch {epoch} - Train Loss: {train_loss:.4f}, '
                    f'Valid Loss: {valid_loss:.4f}, Csky Valid Loss: {csky_valid_loss:.4f}')
        losses['train'].append(train_loss)
        losses['valid'].append(valid_loss)
        with train_summary_writer.as_default():
            tf.summary.scalar('loss', train_loss, step=epoch)
        with test_summary_writer.as_default():
            tf.summary.scalar('loss', valid_loss, step=epoch)

    # Plot losses
    plots.plot_loss(losses['train'], losses['valid'], csky_valid_loss)
def prepare_dataloader(
        dataframe: pd.DataFrame,
        target_datetimes: typing.List[datetime.datetime],
        stations: typing.Dict[typing.AnyStr, typing.Tuple[float, float, float]],
        target_time_offsets: typing.List[datetime.timedelta],
        config: typing.Dict[typing.AnyStr, typing.Any],
) -> tf.data.Dataset:
    """This function should be modified in order to prepare & return your own data loader.

    Note that you can use either the netCDF or HDF5 data. Each iteration over your data loader
    should return a 2-element tuple containing the tensor that should be provided to the model
    as input, and the target values. In this specific case, you will not be able to provide the
    latter since the dataframe contains no GHI, and we are only interested in predictions, not
    training. Therefore, you must return a placeholder (or ``None``) as the second tuple
    element.

    Reminder: the dataframe contains imagery paths for every possible timestamp requested in
    ``target_datetimes``. However, we expect that you will use some of the "past" imagery
    (i.e. imagery at T<=0) for any T in ``target_datetimes``, but you should NEVER rely on
    "future" imagery to generate predictions (for T>0). We will be inspecting data loader
    implementations to ensure this is the case, and those who "cheat" will be dramatically
    penalized.

    See https://github.com/mila-iqia/ift6759/tree/master/projects/project1/evaluation.md
    for more information.

    Args:
        dataframe: a pandas dataframe that provides the netCDF file path (or HDF5 file path
            and offset) for all relevant timestamp values over the test period.
        target_datetimes: a list of timestamps that your data loader should use to provide
            imagery for your model. The ordering of this list is important, as each element
            corresponds to a sequence of GHI values to predict. By definition, the GHI values
            must be provided for the offsets given by ``target_time_offsets`` which are added
            to each timestamp (T=0) in this datetimes list.
        stations: a map of station names of interest paired with their coordinates
            (latitude, longitude, elevation).
        target_time_offsets: the list of timedeltas to predict GHIs for (by definition:
            [T=0, T+1h, T+3h, T+6h]).
        config: configuration dictionary holding any extra parameters that might be required
            by the user. These parameters are loaded automatically if the user provided a JSON
            file in their submission. Submitting such a JSON file is completely optional, and
            this argument can be ignored if not needed.

    Returns:
        A ``tf.data.Dataset`` object that can be used to produce input tensors for your model.
        One tensor must correspond to one sequence of past imagery data. The tensors must be
        generated in the order given by ``target_sequences``.
    """
    # Parameters parsed from the JSON config file
    image_size = config['image_size']
    seq_len = config['seq_len']
    timesteps = datetime.timedelta(minutes=config['timesteps_minutes'])
    scale_label = config['scale_label']

    # Load dataframe
    dataframe = preprocessing.preprocess(dataframe, shuffle=False, scale_label=scale_label)
    metadata = data.Metadata(dataframe, scale_label)

    # Build dataloader
    data_loader = EvaluatorDataset(metadata,
                                   image_size=image_size,
                                   seq_len=seq_len,
                                   timesteps=timesteps,
                                   target_datetimes=target_datetimes,
                                   stations=stations,
                                   target_time_offsets=target_time_offsets)

    return data_loader
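# A minimal usage sketch of the contract described in the docstring above: the
# returned dataset yields (input, target) tuples where the target is a
# placeholder at prediction time. The config values are illustrative, and
# `model` is a hypothetical trained model, not something defined here:
config = {'image_size': 32, 'seq_len': 6, 'timesteps_minutes': 15, 'scale_label': True}
dataset = prepare_dataloader(dataframe, target_datetimes, stations,
                             target_time_offsets, config)
for inputs, _placeholder_targets in dataset:
    predictions = model(inputs)  # one prediction sequence per target datetime, in order
    break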
def train():
    # hyperparameters
    discount_factor = 0.99
    num_episodes = 100000000
    exploration_rate_begin = 1
    exploration_rate_end = 0.1
    exploration_rate = exploration_rate_begin
    exploration_decay = 100000
    render = False
    render_freq = 30
    steps_done = 0
    replay_size_start = 10000
    batch_size = 64
    update_model_freq = 16
    update_target_freq = 1000
    lr = 0.00025
    momentum = 0.95

    # init objects
    buffer = Experience_buffer()
    env = gym.make('FlappyBird-v0')
    model = Model(env.action_space.n)
    # model.apply(Model.weights_init)
    target = Model(env.action_space.n)
    optimizer = optim.RMSprop(params=model.parameters(), lr=lr, momentum=momentum)
    loss = nn.SmoothL1Loss()
    agent = DQNAgent(env, model, target, optimizer, loss, update_target_freq)

    # let's play
    for i in range(num_episodes):
        # start a new episode
        print('Episode #{}'.format(i))
        done = False
        episode_reward = 0
        current_loss = 0
        current_obs = env.reset()
        current_obs = preprocess(current_obs)
        if render:
            env.render()

        while not done:
            action = agent.select_action(current_obs, exploration_rate)
            next_obs, reward, done, _ = env.step(action)
            next_obs = preprocess(next_obs)
            if render:
                env.render()
            buffer.add_experience(current_obs, action, reward, next_obs, done)

            # update data
            current_obs = next_obs
            episode_reward += reward
            steps_done += 1

            # anneal exploration once the replay buffer has warmed up
            if steps_done > replay_size_start:
                exploration_rate = exploration_rate_end + \
                    (exploration_rate_begin - exploration_rate_end) * \
                    math.exp(-1. * steps_done / exploration_decay)

            # if the buffer is filled enough, periodically update the model
            if len(buffer) > batch_size and steps_done % update_model_freq == 0 \
                    and steps_done > replay_size_start:
                print('INFO: agent updating...')
                batch = buffer.sample(batch_size)
                current_loss = agent.update(batch, i, discount_factor)

        # periodically sync the target network
        if i % update_target_freq == 0:
            agent.update_target()

        # render every render_freq-th episode
        render = (i + 1) % render_freq == 0

        print('Episode #{} reward:'.format(i), episode_reward)
        print('Episode #{} loss:'.format(i), current_loss)
        print()
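# A minimal sketch of the exponential epsilon-decay schedule used above, showing
# how the exploration rate falls from exploration_rate_begin toward
# exploration_rate_end as steps accumulate (values mirror the hyperparameters
# in train()):
import math

eps_begin, eps_end, decay = 1.0, 0.1, 100000
for steps in (0, 50000, 100000, 500000):
    eps = eps_end + (eps_begin - eps_end) * math.exp(-steps / decay)
    print(steps, round(eps, 3))
# prints: 0 -> 1.0, 50000 -> 0.646, 100000 -> 0.431, 500000 -> 0.106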
# Model directory
model_dir = args.result_dir + '/models/' + args.model + '/' + args.dataset \
    + '/' + args.loss + '/BS_' + str(args.batch_size) \
    + '/CI_' + str(args.critic_iter) + '/ND_' + str(args.noise_dim) \
    + '/L_' + str(args.LAMBDA)

# Load and prepare data
if args.dataset.lower() == 'stress_strain':
    dataset = stress_strain.StressStrainDS()
    preproc = preprocessing.standardize
elif args.dataset.lower() == 'mnist':
    dataset = mnist.MNISTDS()
    preproc = preprocessing.standardize_MNIST
else:
    # without this, `dataset` and `preproc` would be undefined below
    raise ValueError(f'Dataset "{args.dataset}" not recognized.')

train_dataset = dataset.load_dataset()
train_dataset, scaler = preprocessing.preprocess(train_dataset, args.batch_size, preproc)
INPUT_SHAPE = tuple(
    tf.compat.v1.data.get_output_shapes(train_dataset).as_list()[1:])

# Instantiate Generator and Discriminator
generator, discriminator = gans.get_models(args.model, args.loss, INPUT_SHAPE, args.noise_dim)
print('\n\n######### GENERATOR #########\n')
generator.summary()
print('\n\n####### DISCRIMINATOR #######\n')
discriminator.summary()

# Optimizers
if args.optimizer.lower() == 'adam':
def train():
    ctx = [mx.cpu(0), mx.cpu(1), mx.cpu(2), mx.cpu(3)]
    # hyperparameters
    discount_factor = 0.9
    num_episodes = 10000
    exploration_rate_begin = 0.9
    exploration_rate_end = 0.05
    exploration_rate = exploration_rate_begin
    exploration_decay = 200
    render = False
    steps_done = 0
    batch_size = 64
    update_freq = 5
    lr = 0.01

    # init objects
    buffer = Experience_buffer()
    env = gym.make('FlappyBird-v0')
    model = Model(env.action_space.n)
    model.initialize(init=mx.initializer.Xavier(), ctx=ctx)
    optimizer = mx.optimizer.Adam(learning_rate=lr)
    trainer = gluon.Trainer(params=model.collect_params(), optimizer=optimizer)
    loss = gluon.loss.HuberLoss()
    agent = DQNAgent(env, model, trainer, loss)

    # let's play!
    for i in range(num_episodes):
        # begin a new episode
        print('Episode #{}'.format(i))
        done = False
        episode_reward = 0
        current_loss = 0
        current_obs = env.reset()
        current_obs = nd.array(preprocess(current_obs))
        if render:
            env.render()

        while not done:
            action = agent.select_action(current_obs, exploration_rate)
            next_obs, reward, done, _ = env.step(action)
            next_obs = nd.array(preprocess(next_obs))
            if render:
                env.render()
            buffer.add_experience(current_obs, action, reward, next_obs, done)
            if buffer.is_full():
                print('INFO: buffer is full')

            # update information
            current_obs = next_obs
            episode_reward += reward
            steps_done += 1
            exploration_rate = exploration_rate_end + (
                exploration_rate_begin - exploration_rate_end) * math.exp(
                    -1. * steps_done / exploration_decay)

        # if the buffer is filled enough, update the model every update_freq episodes
        if len(buffer) > batch_size and i % update_freq == 0:
            print('INFO: agent updating...')
            batch = buffer.sample(batch_size)
            current_loss = agent.update(batch, batch_size, i, discount_factor)
            render = True
        else:
            render = False

        print('Episode #{} reward:'.format(i), episode_reward)
        print('Episode #{} loss:'.format(i), current_loss)
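# A minimal sketch of the update-cadence guard used above: `i % update_freq == 0`
# fires every update_freq-th episode, whereas the reversed form
# `update_freq % i == 0` raises ZeroDivisionError at i == 0 and otherwise fires
# only when i divides update_freq:
update_freq = 5
reversed_form = [i for i in range(1, 21) if update_freq % i == 0]  # [1, 5] only
every_fifth = [i for i in range(1, 21) if i % update_freq == 0]    # [5, 10, 15, 20]
print(reversed_form, every_fifth)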
def __read_data__(self):
    scaler = StandardScaler()
    print('loading dataset...')
    data_path = Path(self.root_path) / self.data_name
    pickle_path = Path(self.root_path) / f"{self.data_name}.pandas.pickle"

    # preprocess the raw CSV once and cache the result as a pickle
    if not pickle_path.exists():
        with pickle_path.open('wb') as p_fd:
            df_raw = pd.read_csv(data_path)
            print('preprocessing data...')
            df_raw = preprocess(df_raw, self.scale)
            pickle.dump(df_raw, p_fd)
    with pickle_path.open('rb') as p_fd:
        df_pickled = pickle.load(p_fd)

    # drop zero-weight rows and the earliest dates
    df_pickled = df_pickled[df_pickled.weight != 0]
    df_pickled = df_pickled[df_pickled.date > 85].reset_index(drop=True)

    print('generate target...')
    # target: mean over all resp columns
    # earlier target definitions, kept for reference:
    # df_pickled['action'] = ((df_pickled['resp'] > 0) & (df_pickled['resp_1'] > 0) &
    #                         (df_pickled['resp_2'] > 0) & (df_pickled['resp_3'] > 0) &
    #                         (df_pickled['resp_4'] > 0)).astype('int')
    # df_pickled['action'] = df_pickled['resp'].copy()
    # df_pickled['action'] = df_pickled.apply(lambda row: row.weight * row.resp, axis='columns')
    resp_cols = [c for c in df_pickled.columns if 'resp' in c]
    df_pickled['action'] = df_pickled[resp_cols].sum(axis=1) / len(resp_cols)

    print("split train, valid...")
    split_date = 400
    train_df = df_pickled.loc[df_pickled.date <= split_date].reset_index(drop=True)
    valid_df = df_pickled.loc[df_pickled.date > split_date].reset_index(drop=True)

    target_cols = ['action']
    if self.scale:
        # fit the scaler on the training targets only, then apply the same
        # statistics to the validation targets to avoid leakage
        train_df[target_cols] = scaler.fit_transform(train_df[target_cols].values)
        valid_df[target_cols] = scaler.transform(valid_df[target_cols].values)

    print('organize values...')
    features = [c for c in train_df.columns if 'feature' in c]
    if self.set_type == 0:
        self.weight = train_df.weight.values
        self.resp = train_df.resp.values
        self.data_x = train_df[features + target_cols].values
        self.data_y = train_df[features + target_cols].values
        self.data_stamp = train_df.date.values
    elif self.set_type == 1:
        self.weight = valid_df.weight.values
        self.resp = valid_df.resp.values
        self.data_x = valid_df[features + target_cols].values
        self.data_y = valid_df[features + target_cols].values
        self.data_stamp = valid_df.date.values
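# A minimal sketch of the scaler convention used above: fit on the training
# split only, then reuse the fitted statistics on the validation split. Calling
# fit_transform on both splits would scale each with its own mean/std, making
# the two targets incomparable. The arrays below are toy values:
import numpy as np
from sklearn.preprocessing import StandardScaler

train = np.array([[1.0], [2.0], [3.0]])
valid = np.array([[4.0], [5.0]])

scaler = StandardScaler()
train_scaled = scaler.fit_transform(train)  # mean/std estimated from train only
valid_scaled = scaler.transform(valid)      # same mean/std applied to valid
print(scaler.mean_, valid_scaled.ravel())   # [2.] and [2.449... 3.674...]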
import random

import numpy as np
from sklearn.model_selection import train_test_split

import utils.config as config
from utils.metrics import show_metrics, show_conf_matrix, dtree_viz
from utils.models import DecisionTree, RandomForest, KNeighbours, ArtificialNeuralNetwork, XGBoost, \
    Results, VotingClassifier
from utils.preprocessing import preprocess

# Eliminate randomness
np.random.seed(1337)
random.seed(1337)

# Treat each sequence file as a bag-of-words of bytes and turn it into a probability distribution
X1, X2, y = preprocess()

# Cast lists to numpy arrays
X1 = np.array(X1)
X2 = np.array(X2)
y = np.array(y)

# Split train and test data
X1_train, X1_test, X2_train, X2_test, y_train, y_test = train_test_split(
    X1, X2, y, test_size=config.TEST_SIZE)

# Make models and train
models_probability = {
    'dt': DecisionTree,
    'rf': RandomForest,
    'knn': KNeighbours,