def main():
    """Swap the requested faces into the selected image, saving every stage.

    Reads ``data.json`` from ``JSON_DIR`` and ``input.json`` from
    ``INPUT_DIR``, opens the image selected by ``input_data["image_index"]``
    and, for each entry returned by ``get_faces_to_replace``:

    1. inpaints the region covered by the face mask,
    2. renders the replacement face at ``pos_to_replace`` (once as a blurred
       blend mask, once as the actual image), and
    3. composites the replacement onto the running result.

    The mask, the inpainted image, the blend mask, the rendered face and the
    composite of every iteration are written to ``RESULT_DIR``.
    """

    def result_path(template, index):
        # All debug artefacts share the ``<prefix>_<index>.png`` naming.
        return os.path.join(RESULT_DIR, template.format(index))

    data = utils.from_json_file(os.path.join(JSON_DIR, "data.json"))
    input_data = utils.from_json_file(os.path.join(INPUT_DIR, "input.json"))

    selected_image = data["images"][input_data["image_index"]]
    good_image = utils.get_image_by_path(selected_image["path"],
                                         config.FGFI_DIR)

    for i, job in enumerate(get_faces_to_replace(input_data, data)):
        face = job["face"]

        # The inpainting radius scales with the smaller side of the face crop.
        width, height = face["image"].size
        radius = math.floor(min(width, height) * 0.018)

        face_ref = face["data"]
        source_faces = data["images"][face_ref["image_index"]]["faces"]
        face_data = source_faces[face_ref["face_index"]]

        # Only the inpainted image is used; the returned mask is discarded.
        good_image, _ = inpaint(good_image, face["mask"].convert("L"),
                                face_data, radius)

        face["mask"].save(result_path("mask_{}.png", i))
        good_image.save(result_path("in_{}.png", i))

        position = job["pos_to_replace"]
        x, y = position["x"], position["y"]
        replacement = job["replacer"]["nobg"]

        # Bordered rendering, softened, acts as the alpha mask for blending.
        face_image_mask = creatMask.copy_face_to_image_v2(
            good_image, replacement, x, y, border=True)
        face_image_mask = face_image_mask.filter(
            ImageFilter.GaussianBlur(radius=1))
        face_image = creatMask.copy_face_to_image_v2(
            good_image, replacement, x, y, False)

        face_image_mask.save(result_path("fm_{}.png", i))
        face_image.save(result_path("f_{}.png", i))

        good_image = Image.composite(face_image, good_image,
                                     face_image_mask.convert("L"))
        good_image.save(result_path("comp_{}.png", i))

    # NOTE(review): assumed to run once, after every face has been replaced.
    good_image.show()
def main():
    """Inpaint and replace the requested faces, saving every stage.

    Reads ``data.json`` from ``JSON_DIR`` and ``input.json`` from
    ``INPUT_DIR``, opens the image selected by ``input_data["image_index"]``
    and, for each entry returned by ``get_faces_to_replace``:

    1. runs ``inpaintTest.inpaint_image`` on the current image and face mask,
    2. composites the (resized) inpainted result through the returned mask,
    3. renders the replacement face with ``creatMask.copy_face_to_image``
       (once as a blurred blend mask, once as the actual image), and
    4. composites the replacement onto the running result.

    Every intermediate image is written to ``RESULT_DIR``, and the sizes of
    the mask, current and inpainted images are printed for debugging.
    """

    def result_path(template, index):
        # All debug artefacts share the ``<prefix>_<index>.png`` naming.
        return os.path.join(RESULT_DIR, template.format(index))

    data = utils.from_json_file(os.path.join(JSON_DIR, "data.json"))
    input_data = utils.from_json_file(os.path.join(INPUT_DIR, "input.json"))

    selected_image = data["images"][input_data["image_index"]]
    good_image = utils.get_image_by_path(selected_image["path"],
                                         config.FGFI_DIR)

    for i, job in enumerate(get_faces_to_replace(input_data, data)):
        face = job["face"]

        # --- Remove the old face -------------------------------------------
        inpainted_array, mask_array = inpaintTest.inpaint_image(
            good_image, face["mask"])
        inpainted_image = Image.fromarray(
            skimage.util.img_as_ubyte(inpainted_array))
        mask_image = Image.fromarray(skimage.util.img_as_ubyte(mask_array))
        mask_image.save(result_path("mask_{}.png", i))

        # The inpainted image is brought back to the working image's size
        # before it is composited.
        inpainted_image = inpainted_image.resize(good_image.size)
        for debug_image in (mask_image, good_image, inpainted_image):
            print(debug_image.size)

        good_image = Image.composite(inpainted_image, good_image,
                                     mask_image.convert("L"))
        good_image.save(result_path("in_{}.png", i))

        # --- Paste the new face --------------------------------------------
        replacement = job["replacer"]["nobg"]
        face_image_mask = creatMask.copy_face_to_image(
            good_image,
            replacement,
            data,
            face["image_index"],
            face["face_index"],
            border=True)
        face_image_mask = face_image_mask.filter(
            ImageFilter.GaussianBlur(radius=2))
        face_image = creatMask.copy_face_to_image(
            good_image,
            replacement,
            data,
            face["image_index"],
            face["face_index"],
            False)

        face_image_mask.save(result_path("fm_{}.png", i))
        face_image.save(result_path("f_{}.png", i))

        good_image = Image.composite(face_image, good_image,
                                     face_image_mask.convert("L"))
        good_image.save(result_path("comp_{}.png", i))
def hex(bounds_north, bounds_south, bounds_east, bounds_west, theradial):
    """Build the hexagon mapping layer and export it in three formats.

    Creates a ``Tiles`` model for the given bounds and radial, deploys the
    reference datasets listed in ``datasets.json``, generates the hexagon
    array, passes it through ``add_poly_nb`` and writes the result as a
    shapefile, a KML file and a GeoJSON file.

    Args:
        bounds_north: Northern bound passed to ``Tiles``.
        bounds_south: Southern bound passed to ``Tiles``.
        bounds_east: Eastern bound passed to ``Tiles``.
        bounds_west: Western bound passed to ``Tiles``.
        theradial: Radial passed to ``Tiles``; its integer part is used in
            the output file name (``aus_hex_<radial>km_layer``).
    """
    theshape = 'hex'
    t_mod = Tiles(shape=theshape,
                  north=bounds_north,
                  south=bounds_south,
                  east=bounds_east,
                  west=bounds_west,
                  radial=theradial)

    # Deploy every reference dataset, in this order.
    datasets = from_json_file('datasets.json', t_mod.json_files_path)
    reference_key_paths = (
        ('Australia', 'ShapeFormat'),
        ('AGILDataset', 'CSVFormat'),
        ('MBSP', 'CSVFormat'),
        ('NASAActiveFireData', 'ModisC61km', 'CSVFormat'),
    )
    for key_path in reference_key_paths:
        ref_data = datasets['DataSets']
        for key in key_path:
            ref_data = ref_data[key]
        file_deploy(ref_data)

    layer_polygons = add_poly_nb(t_mod.hexagons(), "p")

    # All three exports share the same base file name.
    layer_name = 'aus_{}_{}km_layer'.format(theshape, str(int(theradial)))
    f_path_shp = os.path.join(t_mod.shape_files_path, layer_name)
    f_path_kml = os.path.join(t_mod.kml_files_path, layer_name)
    f_path_geojson = os.path.join(t_mod.geojson_files_path, layer_name)

    to_shp_file(layer_polygons, f_path_shp)
    to_kml_file(layer_polygons, f_path_kml, 'Active_Fires')
    to_geojson_file(layer_polygons, f_path_geojson)
def main(argv):
    """Cache retrievals for the ELI5 splits as sharded TFRecord files.

    For each of the ``train``, ``eval`` and ``test`` splits, every question is
    encoded with the retriever's query encoder, the closest blocks are found
    with an exact inner-product search against the reference database, and
    the question, the answer (not for ``test``) and the retrieved blocks are
    tokenized with the GPT-2 tokenizer. One serialized ``tf.train.Example``
    is written per sample, round-robin over ``_FLAG_NUM_SHARDS`` files.

    Args:
        argv: Argument list; it must only contain the program name.

    Raises:
        RuntimeError: If extra arguments are passed, if the accelerator type
            is not one of TPU/GPU/CPU, or if a TPU is detected without a TPU
            configuration.
    """
    if len(argv) > 1:
        raise RuntimeError(argv)
    absl_logging.use_python_logging()
    utils.log_module_args(LOGGER, argv[0])

    retriever_config = tf_utils.REALMSave(
        **utils.from_json_file(_FLAG_RETRIEVER_CONFIG_PATH.value))
    assert not _FLAG_USE_SUBSET.value

    time_stamp = time.strftime("%Y%m%d-%H%M%S")
    target_path = os.path.join(_FLAG_OUTPUT_PATH.value, time_stamp.strip())
    if target_path[-1] != "/":
        target_path += "/"

    ##########################################################################
    # Setup devices and strategy
    ##########################################################################
    with utils.log_duration(LOGGER, "main", "Initializing devices"):
        tpu_config = tf_utils.init_tpus()
        device_type = tf_utils.current_accelerator_type()
        LOGGER.debug("Devices: %s", str(tf_utils.devices_to_use()))

        if device_type == "TPU":
            if tpu_config is None:
                raise RuntimeError("We should have a tpu_config.")
            strategy = tf.distribute.TPUStrategy(tpu_config.resolver)
            batch_size = len(
                tf_utils.devices_to_use()) * _FLAG_BATCH_SIZE.value
        elif device_type == "GPU" or device_type == "CPU":
            strategy = tf.distribute.MirroredStrategy()
            batch_size = len(
                tf_utils.devices_to_use()) * _FLAG_BATCH_SIZE.value
        else:
            raise RuntimeError(device_type)

    ##########################################################################
    # Load the dataset.
    ##########################################################################
    eli5: Dict[str, datasets.Dataset] = {}
    keys = ["train", "eval", "test"]
    gpt2_tokenizer = transformers.GPT2TokenizerFast.from_pretrained("gpt2-xl")
    # GPT-2 has no padding token of its own; the EOS token is reused.
    gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token

    with utils.log_duration(LOGGER, "main", "Loading the ELI5 datasets."):
        for split in tqdm.tqdm(keys):
            load_path = os.path.join(_FLAG_DATASET_ROOT.value,
                                     "HuggingfaceDatasets",
                                     f"{split}_kilt_eli5.hf")
            with tf.device("/job:localhost"):
                eli5[split] = datasets.load_from_disk(load_path)

    ##########################################################################
    # Load the text blocks that the retrieved indices point to.
    ##########################################################################
    with utils.log_duration(LOGGER, "Main", "Load the textual dataset"):
        # Extract the appropriate text
        # The buffer_size is taken from the original ORQA code.
        blocks_dataset = tf.data.TFRecordDataset(
            retriever_config.text_records, buffer_size=512 * 1024 * 1024)
        blocks_dataset = blocks_dataset.batch(
            retriever_config.num_block_records, drop_remainder=True)
        blocks = tf.data.experimental.get_single_element(blocks_dataset)

    ##########################################################################
    # Prepare the output files.
    ##########################################################################
    writers = {}
    all_paths = {}
    for split in keys:
        maybe_subset = "_subset" if _FLAG_USE_SUBSET.value else ""
        paths = [
            os.path.join(target_path + maybe_subset, f"{split}_{i}.tfr")
            for i in range(_FLAG_NUM_SHARDS.value)
        ]
        all_paths[split] = paths
        writers[split] = [
            tf.io.TFRecordWriter(filename) for filename in paths
        ]

    # The reference database does not depend on the split, so it is loaded
    # exactly once instead of once per split.
    with utils.log_duration(LOGGER, "main", "Loading the reference db."):
        checkpoint_path = os.path.join(retriever_config.query_embedder_path,
                                       "encoded", "encoded.ckpt")
        reference_db_device = tf_utils.device_mapping().CPUs[0].name
        with tf.device(reference_db_device):
            reference_db = tf_utils.load_reference_db(
                checkpoint_path,
                variable_name="block_emb",
            )

    ##########################################################################
    # Prep the encoder and the tokenizer
    ##########################################################################
    with utils.log_duration(LOGGER, "main",
                            "Loading the encoder model and the tokenizer."):
        with strategy.scope():
            query_encoder = hub.load(retriever_config.query_embedder_path,
                                     tags={})
            encode_fn = _make_encode_fn(query_encoder)
            encode_fn_strategy_run = make_encode_fn_strategy_run_fn(
                strategy=strategy,
                encode_fn=encode_fn,
            )

        vocab_file = os.path.join(retriever_config.query_embedder_path,
                                  "assets", "vocab.txt")
        utils.check_exists(vocab_file)
        do_lower_case = query_encoder.signatures["tokenization_info"](
        )["do_lower_case"]
        tokenization_info = dict(vocab_file=vocab_file,
                                 do_lower_case=do_lower_case)

        tokenizer, vocab_lookup_table = bert_utils.get_tf_tokenizer(
            query_encoder, tokenization_info)

    ##########################################################################
    # Preprocess the dataset
    ##########################################################################
    cls_token_id = tf.cast(vocab_lookup_table.lookup(tf.constant("[CLS]")),
                           tf.int32)
    sep_token_id = tf.cast(vocab_lookup_table.lookup(tf.constant("[SEP]")),
                           tf.int32)
    transform = _make_transform_fn(
        bert_tokenizer=tokenizer,
        bert_cls_token_id=cls_token_id,
        bert_sep_token_id=sep_token_id,
    )

    # Dtype each feature is cast to before being serialized.
    feature_dtypes = {
        constants.CTH5Fields.distances: tf.float32,
        constants.CTH5Fields.gpt2_retrieved_ids: tf.int32,
        constants.CTH5Fields.gpt2_answer_ids_inputs: tf.int32,
        constants.CTH5Fields.gpt2_question_ids_inputs: tf.int32,
    }

    with utils.log_duration(LOGGER, "main", "generating codes"):
        for split in keys:
            sample_count = 0

            if split != "test":
                for_slices = dict(sample_id=eli5[split]["id"],
                                  question=eli5[split]["input"],
                                  answer=[
                                      sample["answer"][0]
                                      for sample in eli5[split]["output"]
                                  ])
            else:
                # The test split is processed without answers.
                for_slices = dict(
                    sample_id=eli5[split]["id"],
                    question=eli5[split]["input"],
                )

            ds = tf.data.Dataset.from_tensor_slices(for_slices)
            ds = ds.map(transform,
                        num_parallel_calls=tf.data.experimental.AUTOTUNE)
            ds = ds.apply(
                tf.data.experimental.dense_to_ragged_batch(batch_size))
            ds = ds.map(_squeeze,
                        num_parallel_calls=tf.data.experimental.AUTOTUNE)

            # The dataset is batched with the global `batch_size` and keeps
            # the last partial batch, hence the ceiling division.
            num_samples = len(eli5[split]["id"])
            tqdm_inner = tqdm.tqdm(
                ds,
                total=(num_samples + batch_size - 1) // batch_size,
                desc=f"Split `{split}`: Batches")

            for batch in tqdm_inner:
                features = collections.defaultdict(list)

                ##############################################################
                # Enforce the current real batch size
                ##############################################################
                current_batch_size = batch["sample_id"].shape[0]
                for batch_value in batch.values():
                    utils.check_equal(batch_value.shape[0],
                                      current_batch_size)

                ##############################################################
                # Tokenize the question and the answer with GPT-2
                ##############################################################
                gpt2_question_ids_inputs = _prep_field(
                    batch["question"], gpt2_tokenizer)
                utils.check_equal(gpt2_question_ids_inputs.dtype, np.int32)
                utils.check_equal(gpt2_question_ids_inputs.shape[0],
                                  current_batch_size)

                if split != "test":
                    gpt2_answer_ids_inputs = _prep_field(
                        batch["answer"], gpt2_tokenizer)
                    utils.check_equal(gpt2_answer_ids_inputs.dtype,
                                      np.int32)
                    utils.check_equal(gpt2_answer_ids_inputs.shape[0],
                                      current_batch_size)
                    assert len(gpt2_answer_ids_inputs.shape) == 2, (
                        gpt2_answer_ids_inputs.shape)

                ##############################################################
                # Save the gpt2 tokenized question and answer
                ##############################################################
                features[
                    constants.CTH5Fields.gpt2_question_ids_inputs].extend(
                        gpt2_question_ids_inputs)
                if split != "test":
                    features[
                        constants.CTH5Fields.gpt2_answer_ids_inputs].extend(
                            gpt2_answer_ids_inputs)

                ##############################################################
                # Encode the samples.
                ##############################################################
                batch = strategy.experimental_distribute_values_from_function(
                    tf_utils.make_dict_distribute_fn(batch))

                embeddings = encode_fn_strategy_run(batch)
                embeddings = tf_utils.process_strat_output(
                    embeddings, "embeddings", strategy, current_batch_size)
                utils.check_isinstance(embeddings, ops.EagerTensor)
                utils.check_equal(embeddings.shape[0], current_batch_size)

                # pytype doesn't seem to see that we check the type
                utils.check_equal(
                    embeddings.shape[1],
                    _FLAG_EMBEDDING_DEPTH.value
                )  # pytype: disable=attribute-error

                ##############################################################
                # Retrieve.
                ##############################################################
                with tf.device(reference_db_device):
                    top_k, inner_prods = tf_utils.mips_exact_search(
                        embeddings, _FLAG_NUM_RETRIEVALS.value, reference_db)
                top_k = tf_utils.process_strat_output(
                    top_k, "top_k", strategy, current_batch_size)
                utils.check_equal(
                    inner_prods.shape,
                    (current_batch_size, _FLAG_NUM_RETRIEVALS.value))
                utils.check_equal(
                    top_k.shape,
                    (current_batch_size, _FLAG_NUM_RETRIEVALS.value))

                features[constants.CTH5Fields.distances].extend(inner_prods)

                # Fetch the text of the retrieved blocks.
                gathered = tf.gather(blocks, top_k).numpy()
                utils.check_equal(gathered.shape[0], current_batch_size)

                retrievals = []
                for j in range(gathered.shape[0]):
                    local_gathered = gathered[j].tolist()
                    utils.check_equal(len(local_gathered),
                                      _FLAG_NUM_RETRIEVALS.value)
                    local_gathered = [
                        sample.decode() for sample in local_gathered
                    ]
                    token_ids = np.array(
                        gpt2_tokenizer.batch_encode_plus(
                            local_gathered,
                            padding="max_length",
                            truncation=True,
                        ).input_ids)
                    for line in token_ids:
                        assert not np.all(line == 0), line

                    # The pad token is the EOS token (see above); both are
                    # stored as -1.
                    token_ids[token_ids == gpt2_tokenizer.eos_token_id] = -1
                    retrievals.append(token_ids)

                features[constants.CTH5Fields.gpt2_retrieved_ids] = retrievals
                utils.check_equal(
                    retrievals[0].shape,
                    (_FLAG_NUM_RETRIEVALS.value, _FLAG_CONTEXT_SIZE.value))

                for feature_values in features.values():
                    utils.check_equal(len(feature_values),
                                      current_batch_size)

                ##############################################################
                # Serialize and write one example per sample.
                ##############################################################
                for index_in_batch in range(current_batch_size):
                    # The comprehension variables must not reuse the name of
                    # the sample index: a comprehension has its own scope, so
                    # a shared name would index the per-feature lists with
                    # the feature key instead of the sample position.
                    feature = tf.train.Features(
                        feature={
                            feature_k: _bytes_feature(
                                tf.io.serialize_tensor(
                                    tf.cast(feature_v[index_in_batch],
                                            feature_dtypes[feature_k])))
                            for feature_k, feature_v in features.items()
                        })

                    # Samples are spread round-robin over the shards.
                    writers[split][
                        sample_count % _FLAG_NUM_SHARDS.value].write(
                            tf.train.Example(
                                features=feature).SerializeToString())
                    sample_count += 1
                    if sample_count % 1000 == 0:
                        LOGGER.debug("Paths: %s", str(all_paths[split][0]))

            # Make sure everything buffered for this split reaches the disk
            # and release the file handles.
            LOGGER.debug("Flushing and closing the `%s` writers", split)
            for writer in writers[split]:
                writer.flush()
                writer.close()

    LOGGER.debug("Done.")
outline="#000000") img2.paste(without_bg_img, top_left) return img2 if __name__ == "__main__": OUTPUT_DIR = os.path.join(config.IHS_DIR, "output") DEBUG_DIR = os.path.join(config.IHS_DIR, "debug") FGFI_OUTPUT__DIR = os.path.join(config.FGFI_DIR, "debug") if not os.path.exists(OUTPUT_DIR): os.makedirs(OUTPUT_DIR) nobg = utils.from_json_file(os.path.join(DEBUG_DIR, "nobg.json")) #get path for original image image_index = nobg[1]["info"]["image_index"] face_index = nobg[1]["info"]["face_index"] data = utils.from_json_file(os.path.join(FGFI_OUTPUT__DIR, "data.json")) path_for_image = data["images"][image_index]["path"] img = utils.get_image_by_path(path_for_image, config.FGFI_DIR) img = img.convert("RGBA") width, height = img.size ##get image after removed bg nobg = utils.from_json_file(os.path.join(DEBUG_DIR, "nobg.json")) image_from = utils.base64_image_to_image(nobg[1]["image_data"]) creat_mask(img, image_from, data)
def main(argv):
    """Cache retrievals for the KILT ELI5 splits as sharded TFRecord files.

    For each of the ``train``, ``validation`` and ``test`` splits, every
    question is encoded with the retriever's query encoder, the closest
    blocks are found with an exact inner-product search against the reference
    database, and the question, the answer (not for ``test``) and the
    retrieved blocks are tokenized with the GPT-2 tokenizer. One serialized
    ``tf.train.Example`` is written per sample, round-robin over
    ``_FLAG_NUM_SHARDS`` files, and the writers of a split are flushed and
    closed once the split is done.

    Args:
        argv: Argument list; it must only contain the program name.

    Raises:
        RuntimeError: If extra arguments are passed, if a TPU name was given
            but the detected device is a CPU, if the accelerator type is not
            one of TPU/GPU/CPU, or if a TPU is detected without a TPU
            configuration.
    """
    # Arguments and logging boilerplate
    if len(argv) > 1:
        raise RuntimeError(argv)
    absl_logging.use_python_logging()
    utils.log_module_args(LOGGER, argv[0])

    # Load a retriever config.
    retriever_config = tf_utils.REALMConfig(
        **utils.from_json_file(_FLAG_RETRIEVER_CONFIG_PATH.value))
    assert not _FLAG_USE_SUBSET.value

    # Preparation of the output path
    time_stamp = time.strftime("%Y%m%d-%H%M%S")
    target_path = os.path.join(_FLAG_OUTPUT_PATH.value, time_stamp.strip())
    if target_path[-1] != "/":
        target_path += "/"

    ##########################################################################
    # Setup devices and strategy
    ##########################################################################
    # Duration is pretty much instantaneous
    with utils.log_duration(LOGGER, "main", "Initializing devices"):
        tpu_config = tf_utils.init_tpus(local=_FLAG_TPU_IS_LOCAL.value,
                                        tpu_name=_FLAG_TPU_NAME.value)
        device_type = tf_utils.current_accelerator_type()
        LOGGER.debug("Devices: %s", str(tf_utils.devices_to_use()))
        if _FLAG_TPU_NAME.value and device_type == "CPU":
            raise RuntimeError("Device is CPU and we expected a TPU.")

        if device_type == "TPU":
            if tpu_config is None:
                raise RuntimeError("We should have a tpu_config.")
            strategy = tf.distribute.TPUStrategy(tpu_config.resolver)
            batch_size = len(
                tf_utils.devices_to_use()) * _FLAG_BATCH_SIZE.value
        elif device_type == "GPU" or device_type == "CPU":
            strategy = tf.distribute.MirroredStrategy()
            batch_size = len(
                tf_utils.devices_to_use()) * _FLAG_BATCH_SIZE.value
        else:
            raise RuntimeError(device_type)

    ##########################################################################
    # Load the KILT ELI5 dataset.
    ##########################################################################
    # Takes a while
    eli5: Dict[str, datasets.Dataset] = {}
    keys = ["train", "validation", "test"]
    gpt2_tokenizer = transformers.GPT2TokenizerFast.from_pretrained("gpt2-xl")
    # GPT-2 has no padding token of its own; the EOS token is reused.
    gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token

    with utils.log_duration(LOGGER, "main", "Loading the ELI5 datasets."):
        if _FLAG_DATASET_ROOT.value:
            # Load the pre-saved splits from disk.
            for split in tqdm.tqdm(keys):
                load_path = os.path.join(_FLAG_DATASET_ROOT.value,
                                         "HuggingfaceDatasets",
                                         f"{split}_kilt_eli5.hf")
                with tf.device("/job:localhost"):
                    eli5[split] = datasets.load_from_disk(load_path)
        else:
            # No dataset root was given: load through `datasets`.
            eli5 = datasets.load_dataset("kilt_tasks", "eli5")

    ##########################################################################
    # Load the dataset of the text that will be retrieved.
    ##########################################################################
    # Takes a long time
    with utils.log_duration(LOGGER, "Main", "Load the textual dataset"):
        # Extract the appropriate text
        # The buffer_size is taken from the original ORQA code.
        blocks_dataset = tf.data.TFRecordDataset(
            retriever_config.text_records, buffer_size=512 * 1024 * 1024)
        blocks_dataset = blocks_dataset.batch(
            retriever_config.num_block_records, drop_remainder=False)
        blocks: tf.Tensor = tf.data.experimental.get_single_element(
            blocks_dataset)

    ##########################################################################
    # Increase the number of maximum open file descriptors to make space
    # for all the shards.
    ##########################################################################
    # One writer per shard for each of the three splits, plus a base amount.
    max_num_fd = _FLAG_NUM_SHARDS.value * 3 + _MIN_N_FD
    # Both the soft and the hard limit are set to the same value.
    resource.setrlimit(resource.RLIMIT_NOFILE, (max_num_fd, max_num_fd))

    ##########################################################################
    # Prepare the output files.
    ##########################################################################
    writers = {}
    all_paths = {}
    for split in keys:
        maybe_subset = "_subset" if _FLAG_USE_SUBSET.value else ""
        # Prepare paths. They are materialized in a list because they are
        # reused below (writer creation, logging).
        paths = [
            os.path.join(target_path + maybe_subset, f"{split}_{i}.tfr")
            for i in range(_FLAG_NUM_SHARDS.value)
        ]
        all_paths[split] = paths

        # Create the TFR writers.
        writers[split] = [tf.io.TFRecordWriter(path) for path in paths]

    # Load the reference DB. We used to accidentally do this once per split.
    with utils.log_duration(LOGGER, "main", "Loading the reference db."):
        checkpoint_path = os.path.join(retriever_config.query_embedder_path,
                                       "encoded", "encoded.ckpt")
        reference_db_device = tf_utils.device_mapping().CPUs[0].name
        with tf.device(reference_db_device):
            reference_db = tf_utils.load_reference_db(
                checkpoint_path,
                variable_name="block_emb",
            )

    ##########################################################################
    # Prep the encoder and the tokenizer
    ##########################################################################
    with utils.log_duration(LOGGER, "main",
                            "Loading the encoder model and the tokenizer."):
        with strategy.scope():
            query_encoder = hub.load(retriever_config.query_embedder_path,
                                     tags={})
            encode_fn = _make_encode_fn(query_encoder)
            encode_fn_strategy_run = make_encode_fn_strategy_run_fn(
                strategy=strategy,
                encode_fn=encode_fn,
            )

        vocab_file = os.path.join(retriever_config.query_embedder_path,
                                  "assets", "vocab.txt")
        utils.check_exists(vocab_file)
        do_lower_case = query_encoder.signatures["tokenization_info"](
        )["do_lower_case"]
        tokenization_info = dict(vocab_file=vocab_file,
                                 do_lower_case=do_lower_case)

        tokenizer, vocab_lookup_table = bert_utils.get_tf_tokenizer(
            query_encoder, tokenization_info)

    ##########################################################################
    # Preprocess the dataset
    ##########################################################################
    cls_token_id = tf.cast(vocab_lookup_table.lookup(tf.constant("[CLS]")),
                           tf.int32)
    sep_token_id = tf.cast(vocab_lookup_table.lookup(tf.constant("[SEP]")),
                           tf.int32)
    transform = _make_transform_fn(
        bert_tokenizer=tokenizer,
        bert_cls_token_id=cls_token_id,
        bert_sep_token_id=sep_token_id,
    )

    # Dtype each feature is cast to before being serialized.
    feature_dtypes = {
        constants.CTH5Fields.distances: tf.float32,
        constants.CTH5Fields.gpt2_retrieved_ids: tf.int32,
        constants.CTH5Fields.gpt2_answer_ids_inputs: tf.int32,
        constants.CTH5Fields.gpt2_question_ids_inputs: tf.int32,
    }

    with utils.log_duration(LOGGER, "main", "generating codes"):
        for split in keys:
            sample_count = 0

            if split != "test":
                for_slices = dict(sample_id=eli5[split]["id"],
                                  question=eli5[split]["input"],
                                  answer=[
                                      sample[0]["answer"]
                                      for sample in eli5[split]["output"]
                                  ])
            else:
                # The test split is processed without answers.
                for_slices = dict(
                    sample_id=eli5[split]["id"],
                    question=eli5[split]["input"],
                )

            ds = tf.data.Dataset.from_tensor_slices(for_slices)
            ds = ds.map(transform,
                        num_parallel_calls=tf.data.experimental.AUTOTUNE)
            ds = ds.apply(
                tf.data.experimental.dense_to_ragged_batch(batch_size))
            ds = ds.map(_squeeze,
                        num_parallel_calls=tf.data.experimental.AUTOTUNE)

            # The dataset is batched with the global `batch_size` and keeps
            # the last partial batch, hence the ceiling division.
            num_samples = len(eli5[split]["id"])
            tqdm_inner = tqdm.tqdm(
                ds,
                total=(num_samples + batch_size - 1) // batch_size,
                desc=f"Split `{split}`: Batches")

            for batch in tqdm_inner:
                features = collections.defaultdict(list)

                ##############################################################
                # Enforce the current real batch size
                ##############################################################
                current_batch_size = batch["sample_id"].shape[0]
                for batch_value in batch.values():
                    utils.check_equal(batch_value.shape[0],
                                      current_batch_size)

                ##############################################################
                # Tokenize the question and the answer with GPT-2
                ##############################################################
                gpt2_question_ids_inputs = _prep_field(
                    batch["question"], gpt2_tokenizer)
                utils.check_equal(gpt2_question_ids_inputs.dtype, np.int32)
                utils.check_equal(gpt2_question_ids_inputs.shape[0],
                                  current_batch_size)

                if split != "test":
                    gpt2_answer_ids_inputs = _prep_field(
                        batch["answer"], gpt2_tokenizer)
                    utils.check_equal(gpt2_answer_ids_inputs.dtype,
                                      np.int32)
                    utils.check_equal(gpt2_answer_ids_inputs.shape[0],
                                      current_batch_size)
                    assert len(gpt2_answer_ids_inputs.shape) == 2, (
                        gpt2_answer_ids_inputs.shape)

                ##############################################################
                # Save the gpt2 tokenized question and answer
                ##############################################################
                features[
                    constants.CTH5Fields.gpt2_question_ids_inputs].extend(
                        gpt2_question_ids_inputs)
                if split != "test":
                    features[
                        constants.CTH5Fields.gpt2_answer_ids_inputs].extend(
                            gpt2_answer_ids_inputs)

                ##############################################################
                # Encode the samples.
                ##############################################################
                batch = strategy.experimental_distribute_values_from_function(
                    tf_utils.make_dict_distribute_fn(batch))

                embeddings = encode_fn_strategy_run(batch)
                embeddings = tf_utils.process_strat_output(
                    embeddings, "embeddings", strategy, current_batch_size)
                utils.check_isinstance(embeddings, ops.EagerTensor)
                utils.check_equal(embeddings.shape[0], current_batch_size)

                # pytype doesn't seem to see that we check the type
                utils.check_equal(
                    embeddings.shape[1],
                    _FLAG_EMBEDDING_DEPTH.value
                )  # pytype: disable=attribute-error

                ##############################################################
                # Retrieve.
                ##############################################################
                # Do exact retrieval
                with tf.device(reference_db_device):
                    top_k, inner_prods = tf_utils.mips_exact_search(
                        embeddings, _FLAG_NUM_RETRIEVALS.value, reference_db)

                # Collate the results
                top_k = tf_utils.process_strat_output(
                    top_k, "top_k", strategy, current_batch_size)

                # Check the shapes
                utils.check_equal(
                    inner_prods.shape,
                    (current_batch_size, _FLAG_NUM_RETRIEVALS.value))
                utils.check_equal(
                    top_k.shape,
                    (current_batch_size, _FLAG_NUM_RETRIEVALS.value))

                # Save the distances
                features[constants.CTH5Fields.distances].extend(inner_prods)

                # Retrieve the text fields associated to the indices
                gathered = tf.gather(blocks, top_k).numpy()
                utils.check_equal(gathered.shape[0], current_batch_size)
                utils.check_equal(gathered.shape[1],
                                  _FLAG_NUM_RETRIEVALS.value)

                retrievals = []
                for index_in_batch in range(current_batch_size):
                    # Put the appropriate byte strings in a list
                    local_gathered = gathered[index_in_batch].tolist()
                    utils.check_equal(len(local_gathered),
                                      _FLAG_NUM_RETRIEVALS.value)
                    # Decode the byte strings to text
                    local_gathered = [
                        sample.decode() for sample in local_gathered
                    ]
                    # Encode to GPT2 BPE
                    token_ids = np.array(
                        gpt2_tokenizer.batch_encode_plus(
                            local_gathered,
                            padding="max_length",
                            truncation=True,
                        ).input_ids)

                    # Make sure no line is empty
                    # TODO(julesgm): Maybe optional
                    for line in token_ids:
                        assert not np.all(line == 0), line

                    # Convert the eos_tokens. The pad token is the EOS token
                    # (see above), so both are stored as -1.
                    token_ids[token_ids == gpt2_tokenizer.eos_token_id] = -1

                    # Save the retrievals
                    retrievals.append(token_ids)

                # Save the feature
                features[constants.CTH5Fields.gpt2_retrieved_ids] = retrievals

                utils.check_equal(
                    retrievals[0].shape,
                    (_FLAG_NUM_RETRIEVALS.value, _FLAG_CONTEXT_SIZE.value))

                for feature_values in features.values():
                    utils.check_equal(len(feature_values),
                                      current_batch_size)

                ##############################################################
                # Serialize and write one example per sample.
                ##############################################################
                for index_in_batch in range(current_batch_size):
                    feature_dict = {}
                    for feature_k, feature_v in features.items():
                        # Cast the feature to its appropriate dtype
                        casted_feats = tf.cast(feature_v[index_in_batch],
                                               feature_dtypes[feature_k])
                        # Serialize the tensor to bytes
                        feature_bytes = tf.io.serialize_tensor(casted_feats)
                        # Build a bytes list tf.train.Feature object,
                        # the serialization tree node
                        feature_dict[feature_k] = _bytes_feature(
                            feature_bytes)

                    # Create the serialization tree root
                    # Expects a dict of features
                    feature = tf.train.Features(feature=feature_dict)
                    # Expects a tf.train.Features object
                    example_obj = tf.train.Example(features=feature)
                    # Serialize that to bytes
                    serialized_example = example_obj.SerializeToString()

                    # Write the bytes; samples are spread round-robin over
                    # the shards.
                    # TODO(julesgm): Parallelize this with a thread or a
                    # process pool & futures.
                    writers[split][
                        sample_count % _FLAG_NUM_SHARDS.value].write(
                            serialized_example)
                    sample_count += 1
                    if sample_count % 1000 == 0:
                        LOGGER.debug("Paths: %s", str(all_paths[split][0]))

            LOGGER.debug("Flushing and closing the `%s` writers", split)
            for writer in tqdm.tqdm(writers[split]):
                writer.flush()
                writer.close()

    LOGGER.debug("Done.")
def main(argv): if len(argv) > 1: raise RuntimeError(argv) absl_logging.use_python_logging() retriever_config = tf_utils.REALMSave( **utils.from_json_file(_FLAG_RETRIEVER_CONFIG_PATH.value)) extra = "_FROM_SUBSET" if _FLAG_USE_SUBSET.value else "" time_stamp = time.strftime("%Y%m%d-%H%M%S") target_path = os.path.join(_FLAG_OUTPUT_PATH.value, time_stamp + extra).strip() if target_path[-1] != "/": target_path += "/" ############################################################################## # Setup devices and strategy ############################################################################## with utils.log_duration(LOGGER, "main", "Initializing devices"): tpu_config = tf_utils.init_tpus() device_type = tf_utils.current_accelerator_type() LOGGER.debug("Devices: %s", str(tf_utils.devices_to_use())) if device_type == "TPU": if tpu_config is None: raise RuntimeError("We should have a tpu_config.") strategy = tf.distribute.TPUStrategy(tpu_config.resolver) batch_size = len( tf_utils.devices_to_use()) * _FLAG_BATCH_SIZE.value elif device_type == "GPU" or device_type == "CPU": strategy = tf.distribute.MirroredStrategy() batch_size = len( tf_utils.devices_to_use()) * _FLAG_BATCH_SIZE.value else: raise RuntimeError(device_type) ############################################################################## # Load the dataset. 
############################################################################## eli5 = {} keys = ["train", "eval", "test"] gpt2_tokenizer = transformers.GPT2TokenizerFast.from_pretrained("gpt2-xl") gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token with utils.log_duration(LOGGER, "main", "Loading the ELI5 datasets."): for split in tqdm.tqdm(keys): load_path = os.path.join(_FLAGS_DATASET_ROOT.value, "HuggingfaceDatasets", f"{split}_kilt_eli5.hf") with tf.device("/job:localhost"): eli5[split] = datasets.load_from_disk(load_path) if _FLAG_USE_SUBSET.value: _warn_subset() ############################################################################## # ############################################################################## with utils.log_duration(LOGGER, "Main", "Load the textual dataset"): # Extract the appropriate text # The buffer_size is taken from the original ORQA code. blocks_dataset = tf.data.TFRecordDataset(retriever_config.text_records, buffer_size=512 * 1024 * 1024) blocks_dataset = blocks_dataset.batch( retriever_config.num_block_records, drop_remainder=True) blocks = tf.data.experimental.get_single_element(blocks_dataset) with tempfile.TemporaryDirectory() as tmp_dir: ############################################################################ # Prepare the output file. 
############################################################################ tmp_dir = pathlib.Path(tmp_dir) h5_output_path = tmp_dir / "codes.h5" output_file = h5py.File(h5_output_path, "w") flags_dict = { flag.name: flag.value for flag in flags.FLAGS.flags_by_module_dict()[argv[0]] } utils.to_json_file(tmp_dir / "params.json", flags_dict) for split in keys: with utils.log_duration( LOGGER, "main", "Creating the output hdf5 file, embeddings."): num_entries = len(eli5[split]["id"]) if _FLAG_USE_SUBSET.value: num_entries = min(num_entries, _FLAG_SUBSET_AMOUNT.value) split_group = output_file.create_group(split) with utils.log_duration( LOGGER, "main", "Creating the output hdf5 file, retrieval."): split_group.create_dataset( constants.CTH5Fields.distances, shape=(num_entries, _FLAG_NUM_RETRIEVALS.value), dtype=np.float32, ) split_group.create_dataset( constants.CTH5Fields.gpt2_question_ids_inputs, shape=(num_entries, _FLAG_CONTEXT_SIZE.value), dtype=np.int32) if split != "test": split_group.create_dataset( constants.CTH5Fields.gpt2_answer_ids_inputs, shape=(num_entries, _FLAG_CONTEXT_SIZE.value), dtype=np.int32) split_group.create_dataset( constants.CTH5Fields.gpt2_retrieved_ids, shape=( num_entries, _FLAG_NUM_RETRIEVALS.value, _FLAG_MAX_LENGTH_RETRIEVALS.value, ), dtype=np.int32) with utils.log_duration(LOGGER, "main", "Loading the reference db."): checkpoint_path = os.path.join( retriever_config.query_embedder_path, "encoded", "encoded.ckpt") reference_db_device = tf_utils.device_mapping().CPUs[0].name with tf.device(reference_db_device): reference_db = tf_utils.load_reference_db( checkpoint_path, variable_name="block_emb", ) ############################################################################ # Prep the encoder and the tokenizer ############################################################################ with utils.log_duration( LOGGER, "main", "Loading the encoder model and the tokenizer."): with strategy.scope(): query_encoder = 
hub.load(retriever_config.query_embedder_path, tags={}) encode_fn = _make_encode_fn(query_encoder) encode_fn_strategy_run = _make_encode_fn_strategy_run_fn( strategy=strategy, encode_fn=encode_fn, ) vocab_file = os.path.join(retriever_config.query_embedder_path, "assets", "vocab.txt") utils.check_exists(vocab_file) do_lower_case = query_encoder.signatures["tokenization_info"]( )["do_lower_case"] tokenization_info = dict(vocab_file=vocab_file, do_lower_case=do_lower_case) tokenizer, vocab_lookup_table = bert_utils.get_tf_tokenizer( query_encoder, tokenization_info) ############################################################################ # Preprocess the dataset ############################################################################ cls_token_id = tf.cast(vocab_lookup_table.lookup(tf.constant("[CLS]")), tf.int32) sep_token_id = tf.cast(vocab_lookup_table.lookup(tf.constant("[SEP]")), tf.int32) transform = _make_transform_fn( bert_tokenizer=tokenizer, bert_cls_token_id=cls_token_id, bert_sep_token_id=sep_token_id, ) with utils.log_duration(LOGGER, "main", "generating codes"): tqdm_splits = tqdm.tqdm(keys) for split in tqdm_splits: tqdm_splits.set_description(f"Split `{split}`") eli5: Dict[str, datasets.Dataset] write_start = 0 if _FLAG_USE_SUBSET.value: _warn_subset(tqdm_splits) eli5[split] = eli5[split][:_FLAG_SUBSET_AMOUNT.value] utils.check_operator(operator.le, len(eli5[split]["id"]), _FLAG_SUBSET_AMOUNT.value) utils.check_operator(operator.le, len(eli5[split]["input"]), _FLAG_SUBSET_AMOUNT.value) else: utils.check_equal(len(eli5[split]), len(eli5[split]["id"])) utils.check_equal(len(eli5[split]), len(eli5[split]["input"])) if split != "test": for_slices = dict(sample_id=eli5[split]["id"], question=eli5[split]["input"], answer=[ sample["answer"][0] for sample in eli5[split]["output"] ]) else: for_slices = dict( sample_id=eli5[split]["id"], question=eli5[split]["input"], ) ds = tf.data.Dataset.from_tensor_slices(for_slices) ds = ds.map(transform, 
num_parallel_calls=tf.data.experimental.AUTOTUNE) ds = ds.apply( tf.data.experimental.dense_to_ragged_batch(batch_size)) ds = ds.map(_squeeze, num_parallel_calls=tf.data.experimental.AUTOTUNE) tqdm_inner = tqdm.tqdm(enumerate(ds), total=len(eli5[split]["id"]) // _FLAG_BATCH_SIZE.value, desc=f"Split `{split}`: Batches") for i, batch in tqdm_inner: ###################################################################### # Enforce the current real batch size ###################################################################### current_batch_size = batch["sample_id"].shape[0] for k, v in batch.items(): utils.check_equal(v.shape[0], current_batch_size) ###################################################################### gpt2_question_ids_inputs = _prep_field( batch["question"], gpt2_tokenizer) utils.check_equal(gpt2_question_ids_inputs.dtype, np.int32) utils.check_equal(gpt2_question_ids_inputs.shape[0], current_batch_size) if split != "test": gpt2_answer_ids_inputs = _prep_field( batch["answer"], gpt2_tokenizer) utils.check_equal(gpt2_answer_ids_inputs.dtype, np.int32) utils.check_equal(gpt2_answer_ids_inputs.shape[0], current_batch_size) assert len(gpt2_answer_ids_inputs.shape) == 2, ( gpt2_answer_ids_inputs.shape) ###################################################################### # Save the gpt2 tokenized question and answer ###################################################################### end = write_start + current_batch_size utils.check_equal( output_file[split][ constants.CTH5Fields.gpt2_question_ids_inputs] [write_start:end].shape[0], current_batch_size) output_file[split][ constants.CTH5Fields.gpt2_question_ids_inputs][ write_start:end] = gpt2_question_ids_inputs if split != "test": output_file[split][ constants.CTH5Fields.gpt2_answer_ids_inputs][ write_start:end] = gpt2_answer_ids_inputs ###################################################################### # Encode the samples. 
###################################################################### batch = strategy.experimental_distribute_values_from_function( tf_utils.make_dict_distribute_fn(batch)) embeddings = encode_fn_strategy_run(batch) embeddings = tf_utils.process_strat_output( embeddings, "embeddings", strategy, current_batch_size) utils.check_isinstance(embeddings, ops.EagerTensor) utils.check_equal(embeddings.shape[0], current_batch_size) # pytype doesn't seem to see that we check the type utils.check_equal(embeddings.shape[1], _FLAG_EMBEDDING_DEPTH.value) # pytype: disable=attribute-error ###################################################################### # Retrieve. ###################################################################### with tf.device(reference_db_device): top_k, inner_prods = tf_utils.mips_exact_search( embeddings, _FLAG_NUM_RETRIEVALS.value, reference_db) top_k = tf_utils.process_strat_output( top_k, "top_k", strategy, current_batch_size) utils.check_equal( inner_prods.shape, (current_batch_size, _FLAG_NUM_RETRIEVALS.value)) utils.check_equal( top_k.shape, (current_batch_size, _FLAG_NUM_RETRIEVALS.value)) output_file[split]["distances"][ write_start:end] = inner_prods gathered = tf.gather(blocks, top_k).numpy() utils.check_equal(gathered.shape[0], current_batch_size) utils.check_equal(write_start + gathered.shape[0], end) for j in range(gathered.shape[0]): local_gathered = gathered[j].tolist() utils.check_equal(len(local_gathered), _FLAG_NUM_RETRIEVALS.value) local_gathered = [ sample.decode() for sample in local_gathered ] token_ids = np.array( gpt2_tokenizer.batch_encode_plus( local_gathered, padding="max_length", truncation=True, ).input_ids) for line in token_ids: assert not np.all(line == 0), line token_ids[token_ids == gpt2_tokenizer.eos_token_id] = -1 output_file[split][ constants.CTH5Fields.gpt2_retrieved_ids][ write_start + j] = token_ids[:, :_FLAG_MAX_LENGTH_RETRIEVALS. 
value] write_start += current_batch_size ############################################################################ # Upload the results to GCS ############################################################################ LOGGER.debug("DONE WITH THE PRODUCTION") output_file.close() with utils.log_duration(LOGGER, "main", "gsutil transfer"): command = [ "/root/google-cloud-sdk/bin/gsutil", "-m", "cp", "-r", str(tmp_dir / "*"), target_path ] LOGGER.debug("Command: %s", " ".join(command)) subprocess.check_call(command) LOGGER.debug("ALL DONE")
def main(argv):
  """Estimates the recall of the ScaNN searcher and saves it as a json file.

  `FLAGS.test_how_many` rows of the `block_emb` matrix are sampled without
  replacement and used as queries, in batches of `FLAGS.batch_size`. What the
  ScaNN predictions are compared against depends on `FLAGS.mode`:
    - "all": the neighbors returned by `exact_search` for the query batch.
    - "any": the index of the query vector itself.
  The recall is the number of label / prediction matches divided by the total
  number of labels. The flags, the recall and both counts are written to
  `test_recall_results_<timestamp>.json` inside `FLAGS.output_dir`.

  Args:
    argv: Command-line arguments left over after flag parsing. Only the
      program name (`argv[0]`) is expected.

  Raises:
    app.UsageError: If extra command-line arguments are passed.
    RuntimeError: If the output dir is not a directory, or if `FLAGS.mode` is
      neither "all" nor "any".
    ValueError: If `FLAGS.test_how_many` is not strictly positive.
  """
  if len(argv) > 1:
    raise app.UsageError("Too many command-line arguments.")
  absl_logging.use_python_logging()
  utils.log_module_args(LOGGER, argv[0])

  utils.check_exists(FLAGS.scann_config_path)
  utils.check_glob_prefix(FLAGS.embeddings_ckpt_path)
  utils.check_exists(FLAGS.output_dir)
  if not tf.io.gfile.isdir(FLAGS.output_dir):
    raise RuntimeError("Output dir needs to be a directory.")
  # Fail before the expensive searcher load: with no test points the recall
  # would be 0 / 0.
  if FLAGS.test_how_many <= 0:
    raise ValueError(
        "test_how_many needs to be strictly positive, got "
        f"`{FLAGS.test_how_many}`."
    )

  ##############################################################################
  # Setup: Build the ScaNN searcher
  ##############################################################################
  with utils.log_duration(LOGGER, "main", "load_scann_searcher"):
    checkpoint_path = FLAGS.embeddings_ckpt_path
    # The conversion to a ScannConfig object enforces that all the fields we
    # expect are present in the json file.
    scann_config = retrievers.ScannConfig(
        **utils.from_json_file(FLAGS.scann_config_path))
    block_emb, scann_searcher = scann_utils.load_scann_searcher(
        var_name="block_emb",
        checkpoint_path=checkpoint_path,
        **vars(scann_config))
  # We sample without replacement, so we can't ask for more rows than exist.
  utils.check_operator(operator.ge, block_emb.shape[0], FLAGS.test_how_many)

  ##############################################################################
  # Recall Computation
  ##############################################################################
  with utils.log_duration(LOGGER, "main", "all retrievals & comparisons"):
    LOGGER.debug("block_emb.shape: %s", str(block_emb.shape))
    LOGGER.debug("FLAGS.test_how_many: %d", FLAGS.test_how_many)
    all_indices = np.random.choice(block_emb.shape[0],
                                   FLAGS.test_how_many,
                                   replace=False)

    count_total = 0
    count_good = 0
    for i, idx_start in tqdm.tqdm(
        enumerate(range(0, len(all_indices), FLAGS.batch_size))):
      indices = all_indices[idx_start:idx_start + FLAGS.batch_size]
      vectors = tf.gather(block_emb, indices)

      if FLAGS.mode == "all":
        with utils.log_duration(LOGGER, "main", "exact_search"):
          labels = exact_search(FLAGS.num_neighbors, vectors, block_emb)
      elif FLAGS.mode == "any":
        # The only label of a query is its own index in `block_emb`.
        labels = tf.cast(tf.expand_dims(indices, -1), tf.int32)
      else:
        raise RuntimeError(FLAGS.mode)

      with utils.log_duration(LOGGER, "main", "scann_search"):
        predictions, _ = scann_searcher.search_batched(vectors)

      good = tf.sets.intersection(labels, predictions)
      count_good += len(good.values)
      count_total += tf.math.reduce_prod(labels.shape)

      if i % FLAGS.print_every_n_batches == 0 and i != 0:
        LOGGER.debug("Recall so far: %f %%", 100 * count_good / count_total)

  final_recall = count_good / count_total
  LOGGER.debug(
      "Final recall for mode `%(mode)s` with `%(num_neighbors)d` "
      "neighbors: %(recall)f %%",
      dict(mode=FLAGS.mode,
           num_neighbors=FLAGS.num_neighbors,
           recall=100 * final_recall))
  LOGGER.debug("%d true positives over %d points.", count_good, count_total)

  ##############################################################################
  # Build the output object and save it.
  ##############################################################################
  output = {}
  output["flags"] = {
      flag.name: flag.value
      for flag in FLAGS.flags_by_module_dict()[argv[0]]
  }
  output["recall"] = float(final_recall)
  # Redundant but easier to read
  output["count_goods"] = int(count_good)
  output["count_total"] = int(count_total)
  output_path = os.path.join(
      FLAGS.output_dir,
      "test_recall_" + time.strftime("results_%Y%m%d-%H%M%S.json"))
  utils.to_json_file(output_path, output)
def main(argv):
  """Trains the language model, interleaving training and evaluation batches.

  The function sets up logging, the random seeds and the instance output
  directory (where the flag values are saved as `training_params.json`),
  initializes the accelerators and, depending on `FLAG_APPROACH_TYPE`, builds
  a retriever. For the `train` task it then loops over epochs forever
  (`itertools.count()`): each epoch alternates `FLAG_BATCHES_BETWEEN_EVALS`
  training batches with `FLAG_NUMBER_EVAL_BATCHES` evaluation batches until
  the training iterator is exhausted, logging the losses to the logger and to
  Tensorboard, and saving checkpoints periodically and after each epoch.

  Args:
    argv: Command-line arguments left over after flag parsing. Only the
      program name (`argv[0]`) is expected.

  Raises:
    app.UsageError: If extra command-line arguments are passed.
    RuntimeError: If a local output dir is not a directory, or if the
      `split_and_data_parallel` distribute mode is used while the loaded model
      is not a list of replicas.
    NotImplementedError: If `FLAG_DATASET_NAME` is not supported.
  """
  #######################################################################
  # Initial Setup. Logging, Flags, Random seeds.
  #######################################################################
  if len(argv) > 1:
    raise app.UsageError("Too many command-line arguments.")
  absl_logging.use_python_logging()
  flags_dict = {
      flag.name: flag.value
      for flag in FLAGS.flags_by_module_dict()[argv[0]]
  }

  if FLAG_USE_SUBSET.value:
    message = (f"{colorama.Back.RED}{colorama.Fore.WHITE}"
               f"{colorama.Style.BRIGHT}USING A SUBSET OF THE DATASET"
               f"{colorama.Style.RESET_ALL}")
    LOGGER.warning(message)

  utils.log_module_args(LOGGER, argv[0])
  if not FLAG_OUTPUT_DIR.value.startswith("gs://"):
    utils.check_exists(FLAG_OUTPUT_DIR.value)
    if not tf.io.gfile.isdir(FLAG_OUTPUT_DIR.value):
      raise RuntimeError("Output dir needs to be a directory.")

  tf.random.set_seed(FLAG_RANDOM_SEED.value)
  np.random.seed(FLAG_RANDOM_SEED.value)

  # Prepare the instance output directory path and save the config there
  folder_name = time.strftime(
      f"{FLAG_RUN_NAME.value}_{FLAG_APPROACH_TYPE.value}_%Y%m%d-%H%M%S"
  )
  instance_output_dir = os.path.join(
      FLAG_OUTPUT_DIR.value, folder_name
  ).strip()
  if not instance_output_dir.endswith("/"):
    instance_output_dir += "/"
  json_target = os.path.join(instance_output_dir, "training_params.json")
  if not json_target.strip().startswith("gs://"):
    # Local directories have to be created explicitly before writing to them.
    os.makedirs(instance_output_dir, exist_ok=True)
  # Save the flag values; the file is the record of this run's parameters.
  utils.to_json_file(json_target, flags_dict)

  ##############################################################################
  # Initialization and Configuration of the Devices.
  ##############################################################################
  tpu_setup = None
  # current_acelerator_type is always "CPU" in the beginning with TPUs
  if tf_utils.current_accelerator_type() == "CPU":
    tpu_setup = tf_utils.init_tpus()

  LOGGER.debug("Devices we are computing on:\n%s",
               utils.wrap_iterable(map(str, tf_utils.devices_to_use())))
  LOGGER.debug("All devices:")
  LOGGER.debug(tf_utils.device_mapping())

  if tf_utils.current_accelerator_type() == "GPU":
    tf.config.set_soft_device_placement(True)

  if tf_utils.current_accelerator_type() != "TPU":
    tf.debugging.set_log_device_placement(True)

  # Number of replicas a global batch is split over.
  if FLAG_DISTRIBUTE_MODE.value in constants.PURE_DATA_PARALLEL_STRATEGIES:
    actual_num_replicas = len(tf_utils.devices_to_use())
  elif FLAG_DISTRIBUTE_MODE.value in constants.DATA_PARALLEL_DMC:
    actual_num_replicas = FLAG_NUM_REPLICAS.value
  else:
    actual_num_replicas = 1

  ##############################################################################
  # We load the retriever model if it is needed.
  ##############################################################################
  # Not currently used.
  retriever = None
  if (FLAG_APPROACH_TYPE.value ==
      constants.ApproachTypeChoices.lm_and_realm):
    config_path = FLAG_RETRIEVER_CONFIG_PATH.value
    realm_save = tf_utils.REALMSave(**utils.from_json_file(config_path))

    # Approx 15 min when not in dev mode, on CPU
    with utils.log_duration(LOGGER, "main",
                            "whole of BERTScaNNRetriever.__init__",
                            logging.INFO):
      scann_config = retrievers.ScannConfig(
          **utils.from_json_file(FLAG_SCANN_CONFIG_PATH.value))
      retriever = retrievers.BERTScaNNRetriever(
          retriever_module_path=realm_save.query_embedder_path,
          block_records_path=realm_save.text_records,
          num_block_records=realm_save.num_block_records,
          mode=tf.estimator.ModeKeys.EVAL,
          scann_config=scann_config)

  elif (FLAG_APPROACH_TYPE.value ==
        constants.ApproachTypeChoices.cached_realm):
    config_path = FLAG_RETRIEVER_CONFIG_PATH.value
    realm_save = tf_utils.REALMSave(**utils.from_json_file(config_path))

    # Approx 15 min when not in dev mode, on CPU
    with utils.log_duration(LOGGER, "main",
                            "whole of FullyCachedRetriever.__init__",
                            logging.INFO):
      retriever = retrievers.FullyCachedRetriever(
          db_path=FLAG_FULLYCACHED_H5_PATH.value,
          block_records_path=realm_save.text_records,
          num_block_records=realm_save.num_block_records,
      )

  ##############################################################################
  # Distributed training task
  ##############################################################################
  if FLAG_TASK.value == constants.TaskChoices.train:
    with utils.log_duration(LOGGER, "main", "Load model"):
      utils.print_mem("before loading model", LOGGER)
      model_specific = task_specific.load_model(FLAG_MODEL_LOAD_PATH.value,
                                                FLAG_MODEL_KEY.value,
                                                FLAG_DISTRIBUTE_MODE.value,
                                                tpu_setup,
                                                FLAG_NUM_REPLICAS.value)
      utils.print_mem("after loading model", LOGGER)

    model_or_replicas = model_specific.model
    # These annotations have no runtime effect, they only inform the type
    # checker of the two possible types.
    if isinstance(model_or_replicas, list):
      model_or_replicas: List[transformers.TFGPT2LMHeadModel]
    else:
      model_or_replicas: transformers.TFGPT2LMHeadModel

    tokenizer = model_specific.tokenizer

    def make_optimizer():
      return tensor2tensor.utils.adafactor.AdafactorOptimizer(
          learning_rate=FLAG_LEARNING_RATE.value)

    if model_specific.strategy:
      with model_specific.strategy.scope():
        optimizer = make_optimizer()
    else:
      optimizer = make_optimizer()

    ############################################################################
    # Prepare the dataset functions
    ############################################################################
    rg = np.random.default_rng(FLAG_RANDOM_SEED.value)

    def call_lm_preproc(
        repeat,
        split,
        random_seed
    ):
      """Using functools.partial prevents the linter from doing its job."""
      if FLAG_DATASET_NAME.value == constants.DatasetNameChoices.kilt_eli5:
        return task_specific.create_lm_ds_kilt_eli5(
            tokenizer=tokenizer,
            context_window_size=(
                model_or_replicas[0].config.n_positions
                if isinstance(model_or_replicas, list)
                else model_or_replicas.config.n_positions
            ),
            dataset_name=FLAG_DATASET_NAME.value,
            # Batches are split over the replicas:
            batch_size=FLAG_BATCH_SIZE.value * actual_num_replicas,
            db_path=FLAG_DB_PATH.value,
            random_seed=random_seed,
            use_subset=FLAG_USE_SUBSET.value,
            subset_size=FLAG_SUBSET_SIZE.value,
            use_helper_words=FLAG_USE_HELPER_WORDS.value,
            approach_type=FLAG_APPROACH_TYPE.value,
            num_retrievals=FLAG_NUM_RETRIEVALS.value,
            retrieval_temperature=FLAG_RETRIEVAL_TEMPERATURE.value,
            retriever=retriever,
            repeat=repeat,
            split=split,
            enable_debug_checks=FLAG_DATASET_DEBUG.value,
            retrieval_bank_size=FLAG_RETRIEVAL_BANK_SIZE.value,
            dataset_type=FLAG_DATASET_TYPE.value,
            qty_shuffle=FLAG_QTY_SHUFFLE.value,
            tfr_prefix=FLAG_TFR_PREFIX.value,
            max_length_generation=FLAG_MAX_LENGTH_GENERATION.value,
        )
      else:
        raise NotImplementedError(
            f"FLAG_DATASET_NAME.value unsupported: `{FLAG_DATASET_NAME.value}`"
        )

    make_training_dataset: Callable[Ellipsis, tf.data.Dataset] = (
        functools.partial(
            call_lm_preproc,
            split="train",
            repeat=False,
        )
    )
    make_eval_dataset: Callable[Ellipsis, tf.data.Dataset] = (
        functools.partial(
            call_lm_preproc,
            split="eval",
            repeat=True,
        )
    )

    ############################################################################
    # Prepare the step functions
    ############################################################################
    utils.check_contained(
        FLAG_DISTRIBUTE_MODE.value,
        constants.DistributeModeChoices.choices()
    )
    tf_function_flags = dict(
        experimental_compile=FLAG_EXPERIMENTAL_COMPILE.value,
        experimental_relax_shapes=not FLAG_INPUT_FIXED_SIZE.value
    )

    if (FLAG_DISTRIBUTE_MODE.value ==
        constants.DistributeModeChoices.split_and_data_parallel):
      if not isinstance(model_or_replicas, list):
        raise RuntimeError(type(model_or_replicas))
      training_step = build_manual_data_parallel_training_step(
          model_or_replicas, optimizer, tf_function_flags
      )
    else:
      training_step = build_regular_training_step(
          model_or_replicas,
          optimizer,
          strategy=model_specific.strategy,
          tf_function_kwargs=tf_function_flags
      )

    evaluation_step = build_evaluation_step(
        model_or_replicas, tf_function_flags
    )

    secs_since_last_ckpt = time.time()
    # Minimum delay between two time-triggered checkpoints.
    checkpoint_period_secs = 60 * 20
    # Model checkpoints are saved to the tmp_directory and then rsynced to GCS

    ##########################################################################
    # Prepare the different logging facilities
    ##########################################################################
    train_log_dir = os.path.join(instance_output_dir, "tensorboard", "train")
    eval_log_dir = os.path.join(instance_output_dir, "tensorboard", "eval")
    flags_log_dir = os.path.join(instance_output_dir, "tensorboard", "params")
    writers = dict(
        train=tf.summary.create_file_writer(train_log_dir),
        eval=tf.summary.create_file_writer(eval_log_dir),
        flags=tf.summary.create_file_writer(flags_log_dir)
    )

    # The epoch loop below never terminates by itself, so the writers are
    # closed in a `finally` clause; this way they are also closed when the
    # run is interrupted or fails.
    try:
      with writers["flags"].as_default():
        tf.summary.text(
            "Flags",
            # Tensorboard takes Markdown:
            json.dumps(flags_dict, indent=4).replace("\n", "\n\n"),
            step=0
        )

      ma_loss = dict(
          train=utils.MovingAverage(0.9),
          eval=utils.MovingAverage(0.9)
      )
      step_counters = dict(train=0, eval=0)
      batch_counters = dict(train=0, eval=0)
      prev_batch_end = time.time()

      # The eval ds has no real concept of epoch, repeats forever, shuffling
      # each time it reaches its end
      with utils.log_duration(LOGGER, "main", "All of make_eval_dataset"):
        eval_ds_instance = make_eval_dataset(
            random_seed=rg.integers(-2**63, 2**63 - 1),
        )
      LOGGER.debug("Distributing the eval dataset to the replicas.")
      if FLAG_DATASET_TYPE.value == "tfr":
        eval_ds_instance = (
            model_specific.strategy.experimental_distribute_dataset(
                eval_ds_instance
            )
        )
      LOGGER.debug("Done distributing the eval dataset to the replcias.")
      eval_ds_instance = iter(eval_ds_instance)

      ########################################################################
      # Training Loop
      ########################################################################
      for epoch in itertools.count():
        ####################################################################
        # Epoch Setup
        ####################################################################
        LOGGER.debug("EPOCH %d START", epoch)
        # Shuffle differently every epoch
        with utils.log_duration(
            LOGGER, "main", "All of make_training_dataset"
        ):
          train_ds_instance = make_training_dataset(
              random_seed=rg.integers(-2**63, 2**63 - 1),
          )
        LOGGER.debug(
            "Attempting to distribute the training dataset to the replicas."
        )
        if FLAG_DATASET_TYPE.value == "tfr":
          train_ds_instance = (
              model_specific.strategy.experimental_distribute_dataset(
                  train_ds_instance
              )
          )
        LOGGER.debug(
            "Done distributing the training dataset to the replicas."
        )
        train_ds_instance = iter(train_ds_instance)

        # This allows us to see if we reached the end of the training
        # iterator, in which case "did_at_least_one_training_batch == False".
        # We could also test that it did all the batches, to similar results.
        did_at_least_one_training_batch = True
        # Starts at "eval" so that the first inversion selects "train".
        split = "eval"
        while did_at_least_one_training_batch:
          # Invert split
          if split == "train":
            split = "eval"
          else:
            split = "train"

          # Prepare to test if we did at least one training batch
          if split == "train":
            did_at_least_one_training_batch = False

          if split == "train":
            dataset_iterator = itertools.islice(
                train_ds_instance, FLAG_BATCHES_BETWEEN_EVALS.value
            )
          else:
            # The eval iterator repeats forever, so we only take the next
            # `FLAG_NUMBER_EVAL_BATCHES` batches from it.
            dataset_iterator = itertools.islice(
                eval_ds_instance, FLAG_NUMBER_EVAL_BATCHES.value
            )

          LOGGER.debug("Batching")
          for batch in dataset_iterator:
            # LOGGER.debug("Input sentence:\n\"%s\"",
            #              tokenizer.decode([x for x in batch["input_ids"][0]
            #                                if x != tokenizer.eos_token_id]))
            # LOGGER.debug("Label:\n\"%s\"",
            #              tokenizer.decode([(x if x != -100 else 0)
            #                                for x in batch["label_ids"][0]]))
            if FLAG_DATASET_TYPE.value != "tfr":
              batch = (
                  model_specific.strategy
                  .experimental_distribute_values_from_function(
                      tf_utils.make_dict_distribute_fn(batch)
                  ))

            # We only care about training epochs as, obviously, we don't
            # train over eval samples; the number of eval samples seen only
            # contributes to lowering the variance in the evaluation of when
            # to do early stopping.
            if split == "train":
              did_at_least_one_training_batch = True

            input_ids = batch["input_ids"]
            label_ids = batch["label_ids"]

            ##################################################################
            # Training Step
            ##################################################################
            step_counters[split] += (
                FLAG_BATCH_SIZE.value * actual_num_replicas
            )

            if split == "train":
              batch_counters[split] += 1
              training_kwargs = dict(
                  input_ids=input_ids,
                  label_ids=label_ids,
              )

              if model_specific.strategy:
                utils.print_mem("before running", LOGGER)

                LOGGER.debug("Training, Calling strategy.run")
                loss = model_specific.strategy.run(
                    training_step,
                    kwargs=training_kwargs
                )
                LOGGER.debug("Training, Done with strategy.run")
                utils.print_mem("after running", LOGGER)

              else:
                loss = training_step(**training_kwargs)  # pytype: disable=wrong-arg-count
                # If we are in the strategy-free data parallel mode, we need
                # to change the weights of all replicas to those of the model
                # at index 0
                if (
                    FLAG_DISTRIBUTE_MODE.value ==
                    constants.DistributeModeChoices.split_and_data_parallel
                ):
                  for replica in model_or_replicas[1:]:
                    replica.set_weights(model_or_replicas[0].get_weights())

            ##################################################################
            # Evaluation Step
            ##################################################################
            elif split == "eval":
              evaluation_kwargs = dict(
                  input_ids=input_ids,
                  label_ids=label_ids,
              )

              if model_specific.strategy:
                loss = model_specific.strategy.run(
                    evaluation_step,
                    kwargs=evaluation_kwargs
                )
              else:
                loss = evaluation_step(**evaluation_kwargs)
            else:
              raise ValueError(f"Unexpected value for split: {split}")

            ##################################################################
            # Logging
            ##################################################################
            if (FLAG_DISTRIBUTE_MODE.value in
                constants.PURE_DATA_PARALLEL_STRATEGIES):
              utils.check_equal(len(loss.values), actual_num_replicas)
              LOGGER.debug("Split: %s", split)
              LOGGER.debug("Real num replicas: %s", actual_num_replicas)
              LOGGER.debug("Loss: %s", loss)
              LOGGER.debug("Loss values: %s", loss.values)

              average_loss = float(tf.math.reduce_mean(loss.values).numpy())
            else:
              average_loss = float(loss.numpy())

            # tf.debugging.check_numerics(loss)
            now = time.time()
            batch_duration = now - prev_batch_end
            prev_batch_end = now
            ma_loss[split].update(average_loss)

            # Actual logging
            LOGGER.info("Epoch: # %d", epoch)
            LOGGER.info("Tensorboard_dir: %s", instance_output_dir)
            LOGGER.info("Batch: %s # %d", split, batch_counters[split])
            LOGGER.info("Step: %s # %d", split, step_counters[split])
            if FLAG_USE_SUBSET.value:
              LOGGER.warning(">> USING A SUBSET OF THE DATASET <<")
            LOGGER.info(
                "%(split)s Batch loss:           %(metric)f",
                dict(split=split, metric=average_loss)
            )
            LOGGER.info(
                "%(split)s Moving average loss:  %(metric)f",
                dict(split=split, metric=ma_loss[split].average)
            )
            LOGGER.info(
                "%(split)s Moving average ppl:   %(metric)f",
                dict(split=split, metric=np.exp(ma_loss[split].average))
            )
            LOGGER.info(
                "%(split)s Batch duration:       %(duration)s",
                dict(
                    split=split,
                    duration=utils.TimeStamp.from_seconds(
                        batch_duration).format()
                )
            )
            if FLAG_DISTRIBUTE_MODE.value in constants.DATA_PARALLEL_DMC:
              LOGGER.info(
                  "%(split)s Duration per sample:  %(duration)s",
                  dict(
                      split=split,
                      # Formatted the same way as the batch duration above.
                      duration=utils.TimeStamp.from_seconds(
                          batch_duration / (
                              FLAG_BATCH_SIZE.value * actual_num_replicas
                          )
                      ).format()
                  )
              )

            # Write to Tensorboard
            with writers[split].as_default():
              tf.summary.scalar(
                  f"Loss/{split}", average_loss, step_counters[split]
              )
              tf.summary.scalar(
                  f"PPL/{split}", np.exp(average_loss), step_counters[split]
              )
            writers[split].flush()

            # Save a checkpoint at most every 20 minutes
            if time.time() - secs_since_last_ckpt >= checkpoint_period_secs:
              secs_since_last_ckpt = time.time()
              save_model(
                  train_steps=step_counters["train"],
                  model_or_replicas=model_or_replicas,
                  instance_output_dir=instance_output_dir
              )

        # The training iterator is exhausted: save at the end of the epoch
        # and restart the checkpoint timer.
        secs_since_last_ckpt = time.time()
        save_model(
            train_steps=step_counters["train"],
            model_or_replicas=model_or_replicas,
            instance_output_dir=instance_output_dir
        )
    finally:
      ########################################################################
      # Post Training Cleanup
      ########################################################################
      for writer in writers.values():
        writer.close()
os.makedirs(NOBG_DIR)


def removebg(image):
    """Remove the background of an image through the remove.bg HTTP API.

    Args:
        image: Image to process. It is serialized with `utils.image_to_bytes`
            before being uploaded as the `image_file` form field.

    Returns:
        The API result opened with `Image.open`, or None when the API answers
        with a non-OK status code (the status and body are printed).
    """
    response = requests.post(
        'https://api.remove.bg/v1.0/removebg',
        files={'image_file': utils.image_to_bytes(image)},
        data={'size': 'regular'},
        headers={'X-Api-Key': API_KEY},
    )
    if response.status_code == requests.codes.ok:
        return Image.open(io.BytesIO(response.content))
    print("Error:", response.status_code, response.text)
    return None


def _image_to_base64_png(image):
    """Encode an image as PNG and return the bytes as a base64 text string."""
    # NOTE(review): assumes `image` is a PIL image (it comes from
    # `Image.open`) - confirm `Image` is `PIL.Image` in this file.
    buffer = io.BytesIO()
    image.save(buffer, format="PNG")
    return base64.b64encode(buffer.getvalue()).decode("utf-8")


if __name__ == "__main__":
    data = utils.from_json_file(os.path.join(JSON_DIR, "data.json"))
    # Not named `input` so that the builtin is not shadowed.
    input_data = utils.from_json_file(os.path.join(DEBUG_DIR, "input.json"))

    nobg = []
    for face in input_data["faces"]:
        cropped = utils.get_cropped_image(data, face, config.FGFI_DIR)
        # `removebg` takes the image only.
        nobg_image = removebg(cropped)
        if nobg_image is None:
            # `removebg` already printed the API error; stop with an explicit
            # message instead of failing later while encoding `None`.
            raise RuntimeError(
                "Background removal failed for face: {}".format(face))
        nobg.append({
            "info": face,
            # base64 works on bytes, so the image is serialized first.
            "image_data": _image_to_base64_png(nobg_image),
        })

    utils.save_to_json_file(os.path.join(OUTPUT_DIR, "nobg.json"),
                            json.dumps(nobg))
def place_wt(the_shape, the_radial):
    """
    Place count based weighting used to post process the polygon data set.

    Derives every file, table and database name from the shape and radial of
    the PostProcess helper, deploys the reference data sets, loads the
    shapefiles and census CSV tables with the helper's ``shp_to_db`` /
    ``csv_to_db`` calls, runs the tabular SQL, and finally writes the
    ``*_place_11_16`` output and converts it to GeoJSON and KML.

    :param the_shape: shape handed to PostProcess
    :param the_radial: radial handed to PostProcess (names use it with 'km')
    """
    p_mod = PostProcess(shape=the_shape, radial=the_radial)

    # NOTE(review): the GeoJSON conversion below uses 4283 while every
    # database load uses 4823 - confirm that the difference is intentional.
    db_srid = 4823

    # Names derived from the shape and the radial.
    radial_km = str(p_mod.radial) + 'km'
    shape_and_size = p_mod.shape + '_' + str(p_mod.radial)
    aust_shape_file_name = f'aust_{p_mod.shape}_shape_{radial_km}'
    gj_name = f'aus_{shape_and_size}km_layer'
    vrt_ref = f'all_{shape_and_size}'
    vrt_file = f'{vrt_ref}.vrt'
    feat_sa1_11 = f'feat_aust_{radial_km}_sa1_11'
    feat_sa1_16 = f'feat_aust_{radial_km}_sa1_16'
    db_name = f'db_place_{shape_and_size}'
    tabular_sql_name = f'tabular_place_wt_{shape_and_size}.txt'
    output_shape = f'{shape_and_size}km_place_11_16'

    def load_shape(layer):
        """Load a shapefile into the database under its own name."""
        p_mod.shp_to_db(layer, db_name, layer, db_srid)

    p_mod.vrt_shape_and_size('vrt', 'template.vrt', vrt_file)
    p_mod.do_spatialite('table_goes_here.txt', db_name)

    # Deploy the reference files of every data set that is needed.
    datasets = from_json_file('datasets', p_mod.json_files_path, p_mod.slash)
    required_datasets = (
        ('Australia', 'ShapeFormat'),
        ('StatisticalAreasLevel12011', 'ShapeFormat'),
        ('StatisticalAreasLevel12016', 'ShapeFormat'),
        ('AGILDataset', 'CSVFormat'),
        ('OpenStreetMaps', 'ShapeFormat'),
    )
    for dataset_key, format_key in required_datasets:
        file_deploy(datasets['DataSets'][dataset_key][format_key])

    print('aust_shape')
    p_mod.geojson_to_shp(gj_name, aust_shape_file_name, 4283)
    load_shape(aust_shape_file_name)

    for census_year, feat_name in (('11', feat_sa1_11), ('16', feat_sa1_16)):
        print(f'feat_aust_{census_year}_area')
        p_mod.sql_to_ogr(f'feat_aust_{census_year}', vrt_ref, feat_name)
        load_shape(feat_name)

    print('tabular_place_wt')
    census_tables = (
        '2011Census_B18_AUST_SA1_long',
        '2011Census_B21_AUST_SA1_long',
        '2011Census_B22B_AUST_SA1_long',
        '2016Census_G18_AUS_SA1',
        '2016Census_G21_AUS_SA1',
        '2016Census_G22B_AUS_SA1',
    )
    for table in census_tables:
        p_mod.csv_to_db(table, db_name, table)

    # The Australia and feature shapes are loaded a second time here, as
    # they were before this rewrite.
    file_name = f'aust_{p_mod.shape}_shape_{p_mod.radial}km'
    for layer in (file_name, feat_sa1_11, feat_sa1_16,
                  'gis_osm_places_free_1', 'gis_osm_roads_free_1'):
        load_shape(layer)

    p_mod.sql_to_ogr('shape_pois_shp', vrt_ref, 'POI')
    load_shape('POI')

    p_mod.shape_and_size('spatialite_db', 'tabular_place_wt.txt',
                         tabular_sql_name)
    p_mod.do_spatialite(tabular_sql_name, db_name)

    print('shape_11_16_place')
    p_mod.sql_to_ogr('shape_11_16_place', vrt_ref, output_shape)
    p_mod.shp_to_geojson(output_shape, output_shape)
    p_mod.shp_to_kml(output_shape, output_shape)