def setUp(
    self,
    output_dir: str = OUTPUT_DIR,
    root_data_dir: str = ROOT_DATA_DIR,
    feature_config_fname: str = FEATURE_CONFIG_FNAME,
):
    self.output_dir = output_dir
    self.root_data_dir = root_data_dir
    self.feature_config_fname = feature_config_fname

    # Make temp output directory
    file_io.make_directory(self.output_dir, clear_dir=True)

    # Fix random seed values for repeatability
    tf.keras.backend.clear_session()
    np.random.seed(123)
    tf.random.set_seed(123)
    random.seed(123)

    # Setup arguments
    self.args: Namespace = get_args([])
    self.args.models_dir = output_dir
    self.args.logs_dir = output_dir

    # Load model_config
    self.model_config = file_io.read_yaml(self.args.model_config)

    # Setup logging
    outfile: str = os.path.join(self.args.logs_dir, "output_log.csv")
    self.logger = setup_logging(reset=True, file_name=outfile, log_to_file=True)

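# A minimal usage sketch for the setUp fixture above: it is written as a
# unittest.TestCase fixture, so a test class can inherit it and rely on
# self.args, self.model_config, and self.logger being ready in every test.
# The base class name RelevanceTestBase is hypothetical (the class holding
# the setUp above); os/np imports are assumed at module scope as in the source.
import unittest


class ExampleRelevanceTest(RelevanceTestBase):  # hypothetical base class containing setUp above
    def test_output_dir_created(self):
        # setUp created (and cleared) the temp output directory
        self.assertTrue(os.path.isdir(self.output_dir))

    def test_args_point_at_output_dir(self):
        # setUp redirects models_dir and logs_dir to the temp directory
        self.assertEqual(self.args.models_dir, self.output_dir)
        self.assertEqual(self.args.logs_dir, self.output_dir)


if __name__ == "__main__":
    unittest.main()
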
def __init__(self, args: Namespace):
    self.args = args

    # Generate Run ID
    if len(self.args.run_id) > 0:
        self.run_id: str = self.args.run_id
    else:
        self.run_id = "-".join([socket.gethostname(), time.strftime("%Y%m%d-%H%M%S")])
    self.start_time = time.time()

    self.logs_dir: str = os.path.join(self.args.logs_dir, self.run_id)

    # Setup logging
    file_io.make_directory(self.logs_dir, clear_dir=True, log=None)
    self.logger: Logger = self.setup_logging()
    self.logger.info("Logging initialized. Saving logs to: {}".format(self.logs_dir))
    self.logger.info("Run ID: {}".format(self.run_id))
    self.logger.info("CLI args: \n{}".format(json.dumps(vars(self.args)).replace(",", "\n")))

    # Setup directories
    self.models_dir: str = os.path.join(self.args.models_dir, self.run_id)
    self.data_dir: str = self.args.data_dir
    file_io.make_directory(self.models_dir, clear_dir=False, log=self.logger)

    # Read/Parse model config YAML
    self.model_config_file = self.args.model_config

    # Setup other arguments
    self.loss_key: str = self.args.loss_key
    self.optimizer_key: str = self.args.optimizer_key
    if self.args.metrics_keys[0] == "[":
        self.metrics_keys: List[str] = ast.literal_eval(self.args.metrics_keys)
    else:
        self.metrics_keys = [self.args.metrics_keys]
    self.data_format: str = self.args.data_format
    self.tfrecord_type: str = self.args.tfrecord_type

    # Validate args
    self.validate_args()

    # Set random seeds
    self.set_seeds()

    # Load and parse feature config
    self.feature_config: FeatureConfig = parse_config(
        tfrecord_type=self.tfrecord_type,
        feature_config=self.args.feature_config,
        logger=self.logger,
    )
    self.logger.info("Feature config parsed and loaded")

    # Finished initialization
    self.logger.info("Relevance Pipeline successfully initialized!")

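# A minimal sketch of driving the constructor above from a CLI entry point,
# assuming get_args parses the flags the __init__ reads (run_id, logs_dir,
# models_dir, data_dir, model_config, loss_key, optimizer_key, metrics_keys,
# data_format, tfrecord_type, feature_config). The class name RelevancePipeline
# is inferred from the final log message; the main() wrapper is illustrative.
import sys


def main(argv):
    args: Namespace = get_args(argv)
    pipeline = RelevancePipeline(args=args)
    # At this point logs_dir/models_dir exist for this run_id, seeds are set,
    # and pipeline.feature_config is parsed and ready for downstream use.
    return pipeline


if __name__ == "__main__":
    main(sys.argv[1:])
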
def setup_logging():
    run_id = "-".join([socket.gethostname(), time.strftime("%Y%m%d-%H%M%S")])
    logs_dir: str = os.path.join("logs", run_id)
    file_io.make_directory(logs_dir, clear_dir=True, log=None)
    outfile: str = os.path.join(logs_dir, "output_log.csv")
    logger = logging_utils.setup_logging(reset=True, file_name=outfile, log_to_file=True)
    logger.info("Logging initialized. Saving logs to: {}".format(logs_dir))
    logger.info("Run ID: {}".format(run_id))
    return logger

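# Quick usage sketch for setup_logging above: each call creates a fresh
# "logs/<hostname>-<timestamp>" directory with an output_log.csv inside,
# so repeated runs never clobber each other's logs. The extra info message
# here is illustrative.
logger = setup_logging()
logger.info("Starting synthetic data run")
# e.g. logs/myhost-20240101-120000/output_log.csv now holds the two
# initialization messages plus the one above
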
def run_dataset_creation(data_dir: str = DATA_DIR,
                         out_dir: str = OUT_DIR,
                         feature_config: str = FEATURE_CONFIG,
                         feature_highval: dict = FEATURE_HIGHVAL,
                         feature_num_results: str = FEATURE_NUM_RESULTS,
                         max_num_records: int = MAX_NUM_RECORDS,
                         num_samples: int = NUM_SAMPLES,
                         random_state: int = RANDOM_STATE):
    """
    1. Loads example data
    2. Builds a synthetic dataset of the specified size by sampling from the example data
    3. Adds specified catastrophic failures
    4. For now, writes out to CSV; in the future, could return the DataFrame directly
    """
    # Setup logging
    logger: Logger = setup_logging()

    try:
        # Set seeds
        set_seeds(random_state)
        logger.info("Set seeds with initial random state {}".format(random_state))

        # Load and parse feature config
        feature_config: FeatureConfig = parse_config(
            tfrecord_type="", feature_config=feature_config, logger=logger)
        logger.info("Feature config parsed and loaded")

        # Create output location
        file_io.make_directory(out_dir, log=logger)
        out_file = os.path.join(
            out_dir,
            "synthetic_data_{}.csv".format(dt.datetime.now().strftime("%Y%m%d-%H%M%S")))

        # Build data
        seed_data = load_seed_data(data_dir, logger)
        df_synthetic = fill_data(seed_data,
                                 max_num_records,
                                 feature_config,
                                 feature_highval,
                                 feature_num_results,
                                 num_samples,
                                 logger)
        file_io.write_df(df_synthetic, outfile=out_file, index=False)
        logger.info("Synthetic data created! Location: {}".format(out_file))
        return df_synthetic
    except Exception as e:
        logger.error("!!! Error creating synthetic data: !!!\n{}".format(str(e)))
        traceback.print_exc()
        return

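# Usage sketch for run_dataset_creation above. The keyword values shown are
# illustrative overrides of the module-level defaults (DATA_DIR, OUT_DIR, ...);
# only the parameter names come from the signature above.
df = run_dataset_creation(
    num_samples=1000,      # number of synthetic queries to sample
    max_num_records=25,    # cap on records per query
    random_state=123,      # fixed seed for a repeatable dataset
)
if df is not None:
    print(df.head())       # on failure the function logs, prints a traceback, and returns None
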
def setUp(self,
          root_data_dir: str = ROOT_DATA_DIR,
          feature_config: str = FEATURE_CONFIG,
          output_dir: str = OUTPUT_DIR,
          log_dir: str = LOG_DIR):
    self.root_data_dir = root_data_dir
    self.feature_config = feature_config
    self.output_dir = output_dir
    self.log_dir = log_dir

    # Set up logging
    file_io.make_directory(self.log_dir, clear_dir=True)
    outfile: str = os.path.join(self.log_dir, "output_log.csv")
    self.logger = setup_logging(reset=True, file_name=outfile, log_to_file=True)

def read(data_dir: str,
         feature_config: FeatureConfig,
         tfrecord_type: str,
         tfrecord_dir: str,
         batch_size: int = 128,
         preprocessing_keys_to_fns: dict = {},
         use_part_files: bool = False,
         max_sequence_size: int = 25,
         parse_tfrecord: bool = True,
         logger=None,
         **kwargs) -> tf.data.TFRecordDataset:
    """
    - Reads CSV-formatted data from an input directory
    - Selects relevant features
    - Creates Dataset X and y

    Current execution plan:
        1. Load CSVs as pandas dataframes
        2. Convert each query into tf.train.SequenceExample protobufs
        3. Write the protobufs into a .tfrecord file
        4. Load the .tfrecord file into a TFRecordDataset and parse the protobufs

    Args:
        data_dir: Path to directory containing CSV files to read
        feature_config: ml4ir.config.features.FeatureConfig object extracted from the feature config
        tfrecord_type: String specifying the TFRecord serialization type
        tfrecord_dir: Path to directory where the serialized .tfrecord files will be stored
        batch_size: int value specifying the size of the batch
        preprocessing_keys_to_fns: dict mapping preprocessing keys to their functions
        use_part_files: bool value specifying whether to look for part files
        max_sequence_size: int value specifying max number of records per query
        parse_tfrecord: bool value specifying whether to parse the serialized protobufs
        logger: logging object

    Returns:
        tensorflow TFRecordDataset
    """
    csv_files: List[str] = file_io.get_files_in_directory(
        data_dir,
        extension="" if use_part_files else ".csv",
        prefix="part-" if use_part_files else "",
    )

    # Create a directory for storing tfrecord files
    file_io.make_directory(tfrecord_dir, clear_dir=True)

    # Write tfrecord files
    tfrecord_writer.write_from_files(
        csv_files=csv_files,
        tfrecord_file=os.path.join(tfrecord_dir, TFRECORD_FILE),
        feature_config=feature_config,
        tfrecord_type=tfrecord_type,
        logger=logger,
    )

    dataset = tfrecord_reader.read(
        data_dir=tfrecord_dir,
        feature_config=feature_config,
        tfrecord_type=tfrecord_type,
        max_sequence_size=max_sequence_size,
        batch_size=batch_size,
        preprocessing_keys_to_fns=preprocessing_keys_to_fns,
        parse_tfrecord=parse_tfrecord,
        logger=logger,
    )

    return dataset

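# Usage sketch for read() above, assuming a FeatureConfig has already been
# parsed (e.g. via parse_config, as in run_dataset_creation). The directory
# paths and the "sequence_example" tfrecord_type string are assumptions for
# illustration, not values taken from the source.
dataset = read(
    data_dir="data/csv",               # directory of .csv files, one row per record
    feature_config=feature_config,     # parsed FeatureConfig object
    tfrecord_type="sequence_example",  # serialization used for the intermediate protobufs
    tfrecord_dir="data/tfrecord",      # intermediate .tfrecord files land here
    batch_size=64,
    max_sequence_size=25,
)
# Per the docstring, the dataset yields X, y batches once parsed
for features, labels in dataset.take(1):
    print({name: tensor.shape for name, tensor in features.items()})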