def start(self, **kwargs):
    """Decorate the merged swatch dataframe and split it into train/validation/test.

    Required kwargs:
        pct_train: int — percentage of unique videos assigned to training.
        pct_test: int — percentage of unique videos assigned to testing.
        (Validation gets the remainder.)

    Returns:
        dict with key 'output_path' pointing at the persisted dataframe.

    NOTE(review): the split is over unique video stems (so all swatches from
    one video land in the same partition), and np.random.shuffle makes the
    split non-deterministic unless the caller seeds numpy's RNG.
    """
    pct_train = kwargs['pct_train']
    pct_test = kwargs['pct_test']
    df, _ = pickle_service.concat_pickled_dataframes(
        config.MERGED_SWATCH_DATA_PATH)

    # Derived columns from the swatch file path.
    df['filename'] = df['path'].apply(lambda x: Path(x).name)
    df['video_name_stem'] = df['path'].apply(
        lambda x: Path(x).stem.split('_')[0])
    # Files ending in '1.0.png' are real swatches; everything else is fake.
    df['gross_label'] = df['path'].apply(
        lambda x: 'real' if x.endswith('1.0.png') else 'fake')
    df['real_or_fake_digit'] = df['gross_label'].apply(
        lambda x: 1 if x == 'fake' else 0)

    # Rounded score variants for downstream bucketing/reporting.
    df['score_2places'] = np.around(df['score'].values, decimals=2)
    df['score_3places'] = np.around(df['score'].values, decimals=3)
    df['score_4places'] = np.around(df['score'].values, decimals=4)

    # Partition the *videos* (not the rows) so swatches never leak across splits.
    video_names = df['video_name_stem'].unique()
    np.random.shuffle(video_names)
    num_rows = video_names.shape[0]
    train_num = int(num_rows * (pct_train / 100))
    test_num = int(num_rows * (pct_test / 100))
    val_num = num_rows - train_num - test_num
    vid_train = video_names[:train_num]
    vid_validation = video_names[train_num:train_num + val_num]
    logger.info(
        f'Will attempt to set rows for training: {vid_train.shape[0]}.')

    # Vectorized labeling with O(1) set membership instead of a per-row
    # apply() scanning a Python list (which was O(rows * videos)).
    train_set = set(vid_train)
    val_set = set(vid_validation)
    df['test_train_split'] = np.select(
        [df['video_name_stem'].isin(train_set),
         df['video_name_stem'].isin(val_set)],
        ['train', 'validation'],
        default='test')

    train_rows = df[df['test_train_split'] == 'train'].shape[0]
    val_rows = df[df['test_train_split'] == 'validation'].shape[0]
    test_rows = df[df['test_train_split'] == 'test'].shape[0]
    logger.info(f'Train {train_rows} rows.')
    logger.info(f'Validation {val_rows} rows.')
    logger.info(f'Test {test_rows} rows.')
    logger.info(f'Head: {df.head()}')

    result = {'output_path': self.persist_output_dataframe(df)}
    self.validate_start_output(result)
    return result
def consolidate_pickles(output_path: Path):
    """Concatenate every pickled dataframe found in output_path's parent
    directory and write the combined dataframe to output_path.

    Args:
        output_path: destination pickle file; its parent directory must
            already exist and contain the pickles to merge.

    Raises:
        FileNotFoundError: if the parent directory does not exist.
    """
    output_par_path = output_path.parent
    logger.info(f'Parent path: {output_par_path}')
    # Explicit raise instead of assert: asserts are stripped under `python -O`,
    # which would let a bogus path slip through to concat_pickled_dataframes.
    if not output_par_path.exists():
        raise FileNotFoundError(
            f'Parent path does not exist: {output_par_path}')
    df, _ = pickle_service.concat_pickled_dataframes(output_par_path)
    logger.info("About to pickle.")
    df.to_pickle(output_path)
def consolidate_persisted_dataframes(self):
    """Merge all persisted dataframe pickles under self.dataframes_path into
    one dataframe, archive the originals, delete them, and persist the merge.

    Returns:
        (output_path, archive_path): path of the consolidated dataframe and
        path of the archive containing the original pickles.

    Raises:
        FileNotFoundError: if the archive was not actually created — in that
        case the source pickles are NOT deleted.
    """
    logger.info(f'Output path: {self.dataframes_path}')
    df, all_df_paths = pickle_service.concat_pickled_dataframes(
        self.dataframes_path)
    archive_path = file_service.archive_paths(
        all_df_paths, self.dataframes_path, 'archive', 'pkl')
    # Explicit raise instead of assert (stripped under -O): we must never
    # unlink the source pickles unless the archive verifiably exists.
    if not archive_path.exists():
        raise FileNotFoundError(
            f'Archive was not created: {archive_path}')
    for f in all_df_paths:
        f.unlink()
    output_path = self.persist_output_dataframe(df)
    return output_path, archive_path
def get_decorated_df(pct_train: int = 80, pct_test: int = 5) -> pd.DataFrame:
    """Load the merged swatch dataframe, add derived columns, and assign each
    row a 'test_train_split' label by partitioning unique video stems.

    Args:
        pct_train: percentage of unique videos assigned to 'train'.
        pct_test: percentage of unique videos assigned to 'test'
            (validation receives the remainder).

    Returns:
        The decorated dataframe.

    NOTE(review): np.random.shuffle makes the split non-deterministic unless
    the caller seeds numpy's RNG.
    """
    df, _ = pickle_service.concat_pickled_dataframes(
        config.MERGED_SWATCH_DATA_PATH)
    df['video_name_stem'] = df['path'].apply(
        lambda x: Path(x).stem.split('_')[0])
    # Files ending in '1.0.png' are real swatches; everything else is fake.
    df['gross_label'] = df['path'].apply(
        lambda x: 'real' if x.endswith('1.0.png') else 'fake')

    # Partition the *videos* so swatches from one video stay in one split.
    video_names = df['video_name_stem'].unique()
    np.random.shuffle(video_names)
    num_rows = video_names.shape[0]
    train_num = int(num_rows * (pct_train / 100))
    test_num = int(num_rows * (pct_test / 100))
    val_num = num_rows - train_num - test_num
    vid_train = video_names[:train_num]
    vid_validation = video_names[train_num:train_num + val_num]
    logger.info(
        f'Will attempt to set rows for training: {vid_train.shape[0]}.')

    # Vectorized labeling with O(1) set membership instead of a per-row
    # apply() scanning a Python list.
    train_set = set(vid_train)
    val_set = set(vid_validation)
    df['test_train_split'] = np.select(
        [df['video_name_stem'].isin(train_set),
         df['video_name_stem'].isin(val_set)],
        ['train', 'validation'],
        default='test')

    train_rows = df[df['test_train_split'] == 'train'].shape[0]
    val_rows = df[df['test_train_split'] == 'validation'].shape[0]
    test_rows = df[df['test_train_split'] == 'test'].shape[0]
    logger.info(f'Train {train_rows} rows.')
    logger.info(f'Validation {val_rows} rows.')
    logger.info(f'Test {test_rows} rows.')
    logger.info(f'Head: {df.head()}')
    # Bug fix: the original built the decorated dataframe but never returned
    # it, so every caller received None.
    return df
def read_existing_output_dataframe(self) -> pd.DataFrame:
    """Load and return the concatenated dataframe persisted under
    self.dataframes_path."""
    combined, _source_paths = pickle_service.concat_pickled_dataframes(
        self.dataframes_path)
    return combined
def load_history(max_pickles=None):
    """Load the real-swatch process history dataframe.

    Args:
        max_pickles: optional cap on how many pickles to concatenate
            (passed through to the pickle service; None means no cap).

    Returns:
        The concatenated history dataframe.
    """
    logger.info("About to load real swatch process history ...")
    history_df, _unused = pickle_service.concat_pickled_dataframes(
        config.SSIM_REALS_DATA_OUTPUT_PATH, max_pickles)
    return history_df
def test_columns(self):
    """Smoke test: load the merged swatch dataframe and log its columns."""
    merged_df, _paths = pickle_service.concat_pickled_dataframes(
        config.MERGED_SWATCH_DATA_PATH)
    logger.info(f'Cols: {merged_df.columns}')