def process_and_save_multidomain_datasets(data_dict, dataset_name_pairs,
                                          validation_splits, output_root_dir):
    """Generate CSV datasets for multi-domain experiments, one experiment
    per pair of dataset names.

    Arguments:
        data_dict: dict mapping dataset name -> dataset samples
        dataset_name_pairs: iterable of name pairs; pair order must line up
            with the domain order implied by validation_splits' keys
        validation_splits: dict keyed by domain name (e.g. 'source'/'target')
        output_root_dir: root directory under which one subdirectory per
            experiment is created
    """
    # Domain names come from validation_splits' key order; index i of each
    # pair is matched with domain_names[i].
    domain_names = list(validation_splits.keys())
    for dataset_name_pair in dataset_name_pairs:
        # A fresh encoder per experiment keeps label encodings from leaking
        # between experiments. (The previous version also created an unused
        # encoder before the loop; that dead assignment is removed.)
        encoder = LabelEncoder()
        experiment = '_'.join(dataset_name_pair)
        experiment_dir = os.path.join(output_root_dir, experiment)
        for i, dataset_name in enumerate(dataset_name_pair):
            domain = domain_names[i]
            # merge_new_labels is True only for the 'source' domain.
            merge_new_labels = (domain == 'source')
            process_and_save_dataset(
                data_dict[dataset_name],
                '_'.join([domain, dataset_name]),
                encoder=encoder,
                validation_splits=validation_splits[domain],
                experiment_dir=experiment_dir,
                merge_new_labels=merge_new_labels)
def process_and_save_multidataset_singledomain_datasets(
        data_dict: dict,
        dataset_names: list,
        validation_splits: dict,
        output_root_dir: str,
        merge_new_labels: bool = True):
    """Generate CSV datasets for single-domain experiments, one for every
    ordered pair of distinct datasets merged together.

    Arguments:
        data_dict: dict
        dataset_names: list
            e.g. ['PNAS','Fossil','Leaves']. Ordered pairs are formed
            inside this function.
        validation_splits: dict
        output_root_dir: str
        merge_new_labels: bool = True
    """
    for first_idx, first_name in enumerate(dataset_names):
        for second_idx, second_name in enumerate(dataset_names):
            # Skip self-pairings; (A, B) and (B, A) are both produced.
            if second_idx == first_idx:
                continue
            merged_name = '+'.join([first_name, second_name])
            # Each merged experiment gets its own label encoder.
            pair_encoder = LabelEncoder()
            experiment_dir = os.path.join(output_root_dir, merged_name)
            merged_data = pd.concat(
                [data_dict[first_name], data_dict[second_name]])
            process_and_save_dataset(merged_data,
                                     name=merged_name,
                                     encoder=pair_encoder,
                                     validation_splits=validation_splits,
                                     experiment_dir=experiment_dir,
                                     merge_new_labels=merge_new_labels,
                                     other_data_keys=['dataset'])
def main(experiment_config, experiment_results_dir):
    """Train and evaluate a single-domain model, returning the trainer.

    Arguments:
        experiment_config: config providing .domain and a
            'label_mappings' filepath, passed on to CSVTrainer.
        experiment_results_dir: directory receiving tensorboard logs.
    """
    ############################################
    # TODO: Moving towards defining most or all run parameters in
    # separate config files
    ############################################
    domain = experiment_config.domain
    label_encoder = LabelEncoder(
        filepath=experiment_config['label_mappings'])
    print(label_encoder)

    trainer = CSVTrainer(experiment_config, label_encoder=label_encoder)
    trainer.init_model_builder()
    model_filepath = os.path.join(
        trainer.model_manager.model_dir,
        f'{trainer.model_name}_{domain}_model.h5')

    train_data = trainer.get_data_loader(subset='train')
    val_data = trainer.get_data_loader(subset='val')
    test_data = trainer.get_data_loader(subset='test')

    # Parameters for fitting and callbacks.
    fit_params = trainer.get_fit_params()
    weights_best_path = os.path.join(
        trainer.model_manager.model_dir,
        f'{trainer.model_name}_{domain}_model_weights_best.h5')
    callbacks = get_callbacks(
        weights_best=weights_best_path,
        logs_dir=os.path.join(experiment_results_dir, 'tensorboard_logs'),
        restore_best_weights=True)

    history = trainer.fit(train_data,
                          steps_per_epoch=fit_params['steps_per_epoch'],
                          epochs=fit_params['epochs'],
                          validation_data=val_data,
                          validation_steps=fit_params['validation_steps'],
                          callbacks=callbacks)
    trainer.histories[domain] = history
    trainer.save_model(filepath=model_filepath)

    # Evaluate on the held-out test split.
    num_test_samples = trainer.metadata_splits['test']['num_samples']
    num_steps = num_test_samples // trainer.config['batch_size']
    test_results = [
        trainer.evaluate(test_data, steps=num_steps, log_name='test')
    ]
    trainer.test_results = test_results
    return trainer
def init_params(self, label_encoder):
    """Initialize filesystem paths, the label encoder, and the label
    encodings filepath from self.config.

    Arguments:
        label_encoder: an existing LabelEncoder to reuse, or None to
            create a fresh, empty one.
    """
    self.tfrecord_root_dir = self.config['tfrecord_root_dir']
    self.model_dir = self.config['model_dir']
    self.data_db_path = self.config['data_db_path']
    self.db = None  # no DB connection yet; populated elsewhere

    # Reuse the caller's encoder when given, otherwise start empty.
    self.encoder = LabelEncoder() if label_encoder is None else label_encoder

    if 'label_encodings_filepath' in self.config:
        # An explicit filepath was configured; it must be a valid JSON path.
        assert validate_filepath(self.config['label_encodings_filepath'],
                                 file_type='json')
        self.label_encodings_filepath = self.config[
            'label_encodings_filepath']
    else:
        # Default location alongside the model artifacts.
        self.label_encodings_filepath = os.path.join(
            self.model_dir, f'{self.name}-label_encodings.json')
    # Write the resolved path back so downstream consumers see one value.
    self.config['label_encodings_filepath'] = self.label_encodings_filepath
def process_and_save_singledomain_datasets(data_dict: dict,
                                           dataset_names: list,
                                           validation_splits: dict,
                                           output_root_dir: str,
                                           merge_new_labels: bool = True):
    """Generate CSV datasets for single-domain experiments, one for each
    individual dataset."""
    for name in dataset_names:
        # A fresh encoder per dataset keeps encodings independent.
        dataset_encoder = LabelEncoder()
        target_dir = os.path.join(output_root_dir, name)
        process_and_save_dataset(data_dict[name],
                                 name=name,
                                 encoder=dataset_encoder,
                                 validation_splits=validation_splits,
                                 experiment_dir=target_dir,
                                 merge_new_labels=merge_new_labels)
def main(experiment_configs, experiment_results_dir):
    """Train on the source domain, then 0-shot-evaluate, finetune, and
    re-evaluate on the target domain; returns the TransferTrainer with
    its histories and test results attached.

    Arguments:
        experiment_configs: indexable configs, [0] for 'source' and [1]
            for 'target'; each must provide a 'label_mappings' filepath.
        experiment_results_dir: directory that receives tensorboard logs.
    """
    ############################################
    #TODO: Moving towards defining most or all run parameters in separate config files
    ############################################
    # Build one LabelEncoder per domain from its label-mappings file.
    label_encoders = {}
    for i, domain in enumerate(['source', 'target']):
        label_mapping_filepath = experiment_configs[i]['label_mappings']
        label_encoders.update(
            {domain: LabelEncoder(filepath=label_mapping_filepath)})
        print(domain, len(label_encoders[domain]))

    trainer = TransferTrainer(experiment_configs,
                              trainer_constructor=CSVTrainer,
                              label_encoders=label_encoders)
    trainer.init_model_builder(domain='source')
    source_model_filepath = os.path.join(
        trainer.model_manager.model_dir,
        trainer.model_name + '_source_model.h5')
    # NOTE(review): target_model_filepath is computed but never used below —
    # confirm whether the finetuned model was meant to be saved to it.
    target_model_filepath = os.path.join(
        trainer.model_manager.model_dir,
        trainer.model_name + '_target_model.h5')

    source_train_data = trainer.get_data_loader(domain='source',
                                                subset='train')
    source_val_data = trainer.get_data_loader(domain='source', subset='val')

    #Get parameters for fitting and callbacks
    fit_params = trainer.get_fit_params(domain='source')
    callbacks = get_callbacks(
        weights_best=os.path.join(trainer.model_manager.model_dir,
                                  'source_domain_weights_best.h5'),
        logs_dir=os.path.join(experiment_results_dir, 'tensorboard_logs'),
        restore_best_weights=True)

    # TRAIN ON SOURCE DOMAIN
    history = trainer.fit(source_train_data,
                          steps_per_epoch=fit_params['steps_per_epoch'],
                          epochs=fit_params['epochs'],
                          validation_data=source_val_data,
                          validation_steps=fit_params['validation_steps'],
                          callbacks=callbacks,
                          history_name='source')
    trainer.histories['source'] = history
    trainer.save_model(filepath=source_model_filepath)

    #######################################################################
    # TARGET DOMAIN
    # Reload the just-saved source weights before any target-domain work.
    trainer.load_model(filepath=source_model_filepath)
    target_train_data = trainer.get_data_loader(domain='target',
                                                subset='train')
    target_val_data = trainer.get_data_loader(domain='target', subset='val')
    target_test_data = trainer.get_data_loader(domain='target',
                                               subset='test')
    fit_params = trainer.get_fit_params(domain='target')
    callbacks = get_callbacks(
        weights_best=os.path.join(trainer.model_manager.model_dir,
                                  'target_domain_weights_best.h5'),
        logs_dir=os.path.join(experiment_results_dir, 'tensorboard_logs'),
        restore_best_weights=True)

    # Test-step count derived from the target test split's sample count.
    num_test_samples = trainer.domains['target'].metadata_splits['test'][
        'num_samples']
    num_steps = num_test_samples // trainer.domains['target'].config[
        'batch_size']
    # Evaluate BEFORE finetuning to record 0-shot transfer performance.
    test_results = []
    test_results += [
        trainer.evaluate(target_test_data,
                         steps=num_steps,
                         log_name='0-shot_test')
    ]

    # FINETUNE ON TARGET DOMAIN
    history = trainer.fit(target_train_data,
                          steps_per_epoch=fit_params['steps_per_epoch'],
                          epochs=fit_params['epochs'],
                          validation_data=target_val_data,
                          validation_steps=fit_params['validation_steps'],
                          callbacks=callbacks,
                          history_name='target')
    trainer.histories['target'] = history
    # Post-finetuning evaluation on the same test split.
    test_results += [
        trainer.evaluate(target_test_data,
                         steps=num_steps,
                         log_name='test_acc')
    ]
    trainer.test_results = test_results
    return trainer
class SQLManager:
    '''
    ETL pipeline for preparing data from Leavesdb SQLite database and
    staging TFRecords for feeding into data loaders.

    Meant to be subclassed for use with BaseTrainer and future Trainer
    classes.
    '''
    def __init__(self, experiment_config, label_encoder=None):
        """Store the config and initialize paths/encoder via init_params.

        Arguments:
            experiment_config: config mapping with at least
                'tfrecord_root_dir', 'model_dir', and 'data_db_path'.
            label_encoder: optional LabelEncoder to reuse; a fresh one is
                created when None.
        """
        self.config = experiment_config
        self.configs = {'experiment_config': self.config}
        self.name = ''
        print('In SQLManager.__init__')
        self.init_params(label_encoder=label_encoder)

    def init_params(self, label_encoder):
        """Initialize filesystem paths, the label encoder, and the label
        encodings filepath from self.config."""
        self.tfrecord_root_dir = self.config['tfrecord_root_dir']
        self.model_dir = self.config['model_dir']
        self.data_db_path = self.config['data_db_path']
        self.db = None  # DB connection is opened lazily in open_db_connection
        if label_encoder is None:
            self.encoder = LabelEncoder()
        else:
            self.encoder = label_encoder
        if 'label_encodings_filepath' in self.config:
            # An explicit filepath was configured; must be a valid JSON path.
            assert validate_filepath(self.config['label_encodings_filepath'],
                                     file_type='json')
            self.label_encodings_filepath = self.config[
                'label_encodings_filepath']
        else:
            # Default location alongside the model artifacts.
            self.label_encodings_filepath = os.path.join(
                self.model_dir, f'{self.name}-label_encodings.json')
        self.config['label_encodings_filepath'] = self.label_encodings_filepath

    def extract(self, dataset_names=''):
        '''
        Query all filenames and labels associated with dataset_names.

        Argmuents:
            dataset_names: either a '+'-joined string of individual dataset
                names (e.g. 'Fossil+Leaves') or an already-split list of
                names, to load into one dataframe.
        Return:
            data, pd.DataFrame:
                DataFrame containing columns ['path','label','dataset']
        '''
        # BUGFIX: previously split the undefined name `dataset_name`
        # (NameError). Split the actual parameter, and accept a list
        # as-is, matching the documented interface.
        if isinstance(dataset_names, str):
            dataset_names = dataset_names.split('+')
        self.db_df = self.db_query(dataset_names=dataset_names)
        self.target_size = self.config.target_size
        self.num_channels = self.config.num_channels
        return self.db_df

    def transform(self, verbose=False):
        """Filter/encode the queried data and split it into subsets;
        returns the data splits and caches split metadata on self."""
        self.x, self.y = self.db_filter(self.db_df, verbose=verbose)
        self.data_splits, self.metadata_splits = self.split_data(
            self.x, self.y)
        # Class count comes from the train split's metadata and is written
        # back into the config for downstream consumers.
        self.num_classes = self.metadata_splits['train']['num_classes']
        self.config.num_classes = self.num_classes
        self.label_encodings = self.encoder.get_encodings()
        return self.data_splits

    def load(self):
        """Build the DatasetBuilder and stage TFRecords; returns the list
        of tfrecord files."""
        self.dataset_builder = DatasetBuilder(root_dir=self.tfrecord_root_dir,
                                              num_classes=self.num_classes)
        self.coder, self.tfrecord_files = self.stage_tfrecords()
        return self.tfrecord_files

    # '''
    # TODO: Refactor starting from db_query() to accept arbitrary number
    # of datasets to be queried and concatenated together
    # '''
    def open_db_connection(self):
        '''
        Returns an open connection to db, starts it if it doesn't yet exist
        '''
        if not self.db:
            self.local_db = leavesdb.init_local_db(src_db=self.data_db_path)
            self.db = dataset.connect(f'sqlite:///{self.local_db}',
                                      row_type=stuf)
        return self.db

    def load_data(self,
                  db,
                  datasets=['Fossil', 'Leaves'],
                  x_col='path',
                  y_col='family',
                  keep_cols=['dataset']):
        """Load rows of the 'dataset' table for the named datasets,
        keeping only [x_col, y_col, *keep_cols] columns.

        NOTE: the list defaults are never mutated here, so the shared
        mutable-default pitfall does not bite; kept for interface
        compatibility.
        """
        data_df = pd.DataFrame(db['dataset'].all())
        data = []
        columns = [x_col, y_col, *keep_cols]
        for name in datasets:
            data += [data_df[data_df.loc[:, 'dataset'] == name]]
        data = pd.concat(data)
        data = data.loc[:, columns]
        return data

    def db_query(self, dataset_names=['Fossil'], label_col='family'):
        '''
        Query all filenames and labels associated with dataset_names.

        Argmuents:
            dataset_names, list(str):
                list of individual dataset names to load into one dataframe
        Return:
            data, pd.DataFrame:
                DataFrame containing columns ['path','label','dataset']
        '''
        db = self.open_db_connection()
        data = self.load_data(db,
                              datasets=dataset_names,
                              x_col='path',
                              y_col=label_col)
        return data

    def db_filter(self, db_df, label_col='family', verbose=False):
        '''
        Function to apply preprocessing to output of db_query, prior to
        conversion of images to TFRecord.
        '''
        threshold = self.config.low_class_count_thresh
        db_df = filter_low_count_labels(db_df,
                                        threshold=threshold,
                                        verbose=verbose)
        if len(self.encoder) == 0:
            # Empty encoder: learn the label set from this data and persist
            # the encodings for reuse.
            self.encoder.merge_labels(labels=list(db_df[label_col]))
            self.encoder.save_labels(self.config['label_encodings_filepath'])
        db_df = self.encoder.filter(db_df, label_col=label_col)
        # x is a column vector of file paths; y the encoded integer labels.
        self.x = db_df['path'].values.reshape((-1, 1))
        self.y = np.array(self.encoder.transform(db_df[label_col]))
        return self.x, self.y

    def split_data(self, x, y, verbose=False):
        '''
        Function to split data into k-splits. Currently, default is to
        simply split into train/val/test sets.
        '''
        val_size = self.config.data_splits['val_size']
        test_size = self.config.data_splits['test_size']
        self.data_splits = train_val_test_split(x,
                                                y,
                                                val_size=val_size,
                                                test_size=test_size)
        self.metadata_splits = get_data_splits_metadata(self.data_splits,
                                                        self.db_df,
                                                        encoder=self.encoder,
                                                        verbose=verbose)
        return self.data_splits, self.metadata_splits

    def get_class_counts(self):
        """Count samples per label for each subset in self.data_splits;
        returns {subset: {label: count}}."""
        class_count_splits = {}
        for subset, subset_data in self.data_splits.items():
            print(subset)
            # Flatten array-valued path columns so they can be loaded into
            # a DataFrame. isinstance replaces the non-idiomatic
            # `type(...) == np.ndarray` check.
            if isinstance(subset_data['path'], np.ndarray):
                subset_data['path'] = subset_data['path'].flatten().tolist()
            labels, label_counts = get_class_counts(
                pd.DataFrame.from_dict(subset_data))
            class_count_splits[subset] = {
                l: c
                for l, c in zip(labels, label_counts)
            }
        return class_count_splits

    def stage_tfrecords(self, verbose=False):
        '''
        Looks for tfrecords corresponding to DatasetConfig parameters; if
        nonexistent then proceeds to create tfrecords.
        '''
        self.root_dir = self.tfrecord_root_dir
        dataset_name = self.config.dataset_name
        val_size = self.config.data_splits['val_size']
        test_size = self.config.data_splits['test_size']
        # Store records in subdirectories labeled with relevant metadata
        record_subdirs = [
            dataset_name,
            f'num_channels-3_thresh-{self.config.low_class_count_thresh}',
            f'val_size={val_size}-test_size={test_size}'
        ]
        tfrecords = self.dataset_builder.recursive_search(
            self.root_dir, subdirs=record_subdirs, verbose=verbose)
        if tfrecords is None:
            # Nothing staged yet: create tfrecords from the current splits.
            return create_tfrecords(self.config,
                                    record_subdirs,
                                    data_splits=self.data_splits,
                                    metadata_splits=self.metadata_splits)
        else:
            # Records already exist; build a coder matching their layout.
            coder = TFRecordCoder(self.data_splits['train'],
                                  self.root_dir,
                                  record_subdirs=record_subdirs,
                                  target_size=self.target_size,
                                  num_channels=self.num_channels,
                                  num_classes=self.num_classes)
            return coder, tfrecords