Example #1
def main():
    data = utils.from_json_file(os.path.join(JSON_DIR, "data.json"))
    input_data = utils.from_json_file(os.path.join(INPUT_DIR, "input.json"))

    good_image_path = data["images"][input_data["image_index"]]["path"]
    good_image = utils.get_image_by_path(good_image_path, config.FGFI_DIR)

    faces_to_replace = get_faces_to_replace(input_data, data)

    for i, face_to_replace in enumerate(faces_to_replace):
        #print(face_to_replace["face"]["image"].size)
        #print(face_to_replace["replacer"]["image"].size)
        #print(face_to_replace["face"]["nobg"].size)
        #print(face_to_replace["replacer"]["nobg"].size)

        width, height = face_to_replace["face"]["image"].size
        radius = math.floor(min(width, height) * 0.018)
        #print("width = {}, height = {}, radius = {}".format(width, height, radius))

        face_data = data["images"][
            face_to_replace["face"]["data"]["image_index"]]["faces"][
                face_to_replace["face"]["data"]["face_index"]]

        good_image, inpainted_mask_image = inpaint(
            good_image, face_to_replace["face"]["mask"].convert("L"),
            face_data, radius)
        mask_image = face_to_replace["face"]["mask"]

        mask_image.save(os.path.join(RESULT_DIR, "mask_{}.png".format(i)))
        good_image.save(os.path.join(RESULT_DIR, "in_{}.png".format(i)))

        x, y = face_to_replace["pos_to_replace"]["x"], face_to_replace[
            "pos_to_replace"]["y"]

        face_image_mask = creatMask.copy_face_to_image_v2(
            good_image, face_to_replace["replacer"]["nobg"], x, y,
            border=True).filter(ImageFilter.GaussianBlur(radius=1))
        face_image = creatMask.copy_face_to_image_v2(
            good_image, face_to_replace["replacer"]["nobg"], x, y, False)

        face_image_mask.save(os.path.join(RESULT_DIR, "fm_{}.png".format(i)))
        face_image.save(os.path.join(RESULT_DIR, "f_{}.png".format(i)))

        good_image = Image.composite(face_image, good_image,
                                     face_image_mask.convert("L"))
        good_image.save(os.path.join(RESULT_DIR, "comp_{}.png".format(i)))
    good_image.show()
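The compositing step above is PIL's Image.composite applied with a Gaussian-blurred mask, which feathers the pasted face into the background. Below is a minimal, self-contained sketch of that pattern; the file names and the paste_with_soft_edges helper are hypothetical stand-ins, not the project's creatMask module.

# Minimal sketch of mask-based compositing with feathered edges (hypothetical file names).
from PIL import Image, ImageFilter


def paste_with_soft_edges(base, patch, mask, blur_radius=1):
    # Blend `patch` over `base` where `mask` is white; blurring the mask softens the seam.
    soft_mask = mask.convert("L").filter(ImageFilter.GaussianBlur(radius=blur_radius))
    return Image.composite(patch, base, soft_mask)


if __name__ == "__main__":
    base = Image.open("scene.png").convert("RGBA")
    patch = Image.open("face.png").convert("RGBA").resize(base.size)
    mask = Image.open("face_mask.png").resize(base.size)  # white where the face goes
    paste_with_soft_edges(base, patch, mask).save("composited.png")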
Example #2
def main():
    data = utils.from_json_file(os.path.join(JSON_DIR, "data.json"))
    input_data = utils.from_json_file(os.path.join(INPUT_DIR, "input.json"))

    good_image_path = data["images"][input_data["image_index"]]["path"]
    good_image = utils.get_image_by_path(good_image_path, config.FGFI_DIR)

    faces_to_replace = get_faces_to_replace(input_data, data)

    for i, face_to_replace in enumerate(faces_to_replace):
        inpainted_array, mask_array = inpaintTest.inpaint_image(
            good_image, face_to_replace["face"]["mask"])
        inpainted_image = Image.fromarray(
            skimage.util.img_as_ubyte(inpainted_array))
        mask_image = Image.fromarray(skimage.util.img_as_ubyte(mask_array))
        mask_image.save(os.path.join(RESULT_DIR, "mask_{}.png".format(i)))
        inpainted_image = inpainted_image.resize(good_image.size)

        print(mask_image.size)
        print(good_image.size)
        print(inpainted_image.size)

        good_image = Image.composite(inpainted_image, good_image,
                                     mask_image.convert("L"))
        good_image.save(os.path.join(RESULT_DIR, "in_{}.png".format(i)))

        face_image_mask = creatMask.copy_face_to_image(
            good_image,
            face_to_replace["replacer"]["nobg"],
            data,
            face_to_replace["face"]["image_index"],
            face_to_replace["face"]["face_index"],
            border=True).filter(ImageFilter.GaussianBlur(radius=2))
        face_image = creatMask.copy_face_to_image(
            good_image, face_to_replace["replacer"]["nobg"], data,
            face_to_replace["face"]["image_index"],
            face_to_replace["face"]["face_index"], False)
        face_image_mask.save(os.path.join(RESULT_DIR, "fm_{}.png".format(i)))
        face_image.save(os.path.join(RESULT_DIR, "f_{}.png".format(i)))
        good_image = Image.composite(face_image, good_image,
                                     face_image_mask.convert("L"))
        good_image.save(os.path.join(RESULT_DIR, "comp_{}.png".format(i)))
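This variant receives the inpainting result as a floating-point NumPy array and converts it back to a PIL image with skimage.util.img_as_ubyte before compositing. A small sketch of that round trip, with made-up array and target sizes:

# Sketch: converting a float image array (values in [0, 1]) back to a PIL image.
import numpy as np
import skimage.util
from PIL import Image

float_image = np.random.rand(64, 64, 3)              # e.g. an inpainting result
byte_image = skimage.util.img_as_ubyte(float_image)  # uint8 array in [0, 255]
pil_image = Image.fromarray(byte_image)
pil_image = pil_image.resize((128, 128))             # match the target size before Image.composite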
Example #3
def hex(bounds_north, bounds_south, bounds_east, bounds_west, theradial):
    """
    Hexagon-specific function that creates the hexagon mapping layer.

    """
    theshape = 'hex'
    t_mod = Tiles(shape=theshape,
                  north=bounds_north,
                  south=bounds_south,
                  east=bounds_east,
                  west=bounds_west,
                  radial=theradial)
    #u_mod = Util(shape=theshape, radial=theradial)

    datasets = from_json_file('datasets.json', t_mod.json_files_path)
    ref_data = datasets['DataSets']['Australia']['ShapeFormat']
    file_deploy(ref_data)

    ref_data = datasets['DataSets']['AGILDataset']['CSVFormat']
    file_deploy(ref_data)

    ref_data = datasets['DataSets']['MBSP']['CSVFormat']
    file_deploy(ref_data)

    ref_data = datasets['DataSets']['NASAActiveFireData']['ModisC61km'][
        'CSVFormat']
    file_deploy(ref_data)

    aus_hex_array = t_mod.hexagons()
    nb_aus_hex_array = add_poly_nb(aus_hex_array, "p")

    f_path_shp = os.path.join(
        t_mod.shape_files_path,
        'aus_{}_{}km_layer'.format(theshape, str(int(theradial))))
    f_path_kml = os.path.join(
        t_mod.kml_files_path,
        'aus_{}_{}km_layer'.format(theshape, str(int(theradial))))
    f_path_geojson = os.path.join(
        t_mod.geojson_files_path,
        'aus_{}_{}km_layer'.format(theshape, str(int(theradial))))

    to_shp_file(nb_aus_hex_array, f_path_shp)
    to_kml_file(nb_aus_hex_array, f_path_kml, 'Active_Fires')
    to_geojson_file(nb_aus_hex_array, f_path_geojson)
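A hypothetical call to hex() could look like the following; the bounding-box values are illustrative decimal degrees roughly covering Australia, and the radial is in kilometres, matching the file names built above.

# Hypothetical usage of hex(); the bounds and radial are illustrative values only.
if __name__ == "__main__":
    hex(bounds_north=-9.0,
        bounds_south=-44.0,
        bounds_east=154.0,
        bounds_west=112.0,
        theradial=50)  # 50 km hexagons -> aus_hex_50km_layer.{shp,kml,geojson}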
Example #4
def main(argv):
    if len(argv) > 1:
        raise RuntimeError(argv)
    absl_logging.use_python_logging()
    utils.log_module_args(LOGGER, argv[0])

    retriever_config = tf_utils.REALMSave(
        **utils.from_json_file(_FLAG_RETRIEVER_CONFIG_PATH.value))
    assert not _FLAG_USE_SUBSET.value

    time_stamp = time.strftime("%Y%m%d-%H%M%S")
    target_path = os.path.join(_FLAG_OUTPUT_PATH.value, time_stamp.strip())
    if target_path[-1] != "/":
        target_path += "/"

    ##############################################################################
    # Setup devices and strategy
    ##############################################################################
    with utils.log_duration(LOGGER, "main", "Initializing devices"):
        tpu_config = tf_utils.init_tpus()
        device_type = tf_utils.current_accelerator_type()
        LOGGER.debug("Devices: %s", str(tf_utils.devices_to_use()))

        if device_type == "TPU":
            if tpu_config is None:
                raise RuntimeError("We should have a tpu_config.")
            strategy = tf.distribute.TPUStrategy(tpu_config.resolver)
            batch_size = len(
                tf_utils.devices_to_use()) * _FLAG_BATCH_SIZE.value
        elif device_type == "GPU" or device_type == "CPU":
            strategy = tf.distribute.MirroredStrategy()
            batch_size = len(
                tf_utils.devices_to_use()) * _FLAG_BATCH_SIZE.value
        else:
            raise RuntimeError(device_type)

    ##############################################################################
    # Load the dataset.
    ##############################################################################
    eli5 = {}
    keys = ["train", "eval", "test"]
    gpt2_tokenizer = transformers.GPT2TokenizerFast.from_pretrained("gpt2-xl")
    gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token

    with utils.log_duration(LOGGER, "main", "Loading the ELI5 datasets."):
        for split in tqdm.tqdm(keys):
            load_path = os.path.join(_FLAG_DATASET_ROOT.value,
                                     "HuggingfaceDatasets",
                                     f"{split}_kilt_eli5.hf")
            with tf.device("/job:localhost"):
                eli5[split] = datasets.load_from_disk(load_path)

    ##############################################################################
    # Load the dataset of the text that will be retrieved.
    ##############################################################################
    with utils.log_duration(LOGGER, "Main", "Load the textual dataset"):
        # Extract the appropriate text
        # The buffer_size is taken from the original ORQA code.
        blocks_dataset = tf.data.TFRecordDataset(retriever_config.text_records,
                                                 buffer_size=512 * 1024 * 1024)
        blocks_dataset = blocks_dataset.batch(
            retriever_config.num_block_records, drop_remainder=True)
        blocks = tf.data.experimental.get_single_element(blocks_dataset)

    ############################################################################
    # Prepare the output file.
    ############################################################################
    writers = {}

    all_paths = {}
    for split in keys:
        maybe_subset = "_subset" if _FLAG_USE_SUBSET.value else ""
        paths = [
            os.path.join(target_path + maybe_subset, f"{split}_{i}.tfr")
            for i in range(_FLAG_NUM_SHARDS.value)
        ]
        all_paths[split] = paths
        writers[split] = [tf.io.TFRecordWriter(filename) for filename in paths]

        with utils.log_duration(LOGGER, "main", "Loading the reference db."):
            checkpoint_path = os.path.join(
                retriever_config.query_embedder_path, "encoded",
                "encoded.ckpt")

            reference_db_device = tf_utils.device_mapping().CPUs[0].name
            with tf.device(reference_db_device):
                reference_db = tf_utils.load_reference_db(
                    checkpoint_path,
                    variable_name="block_emb",
                )

    ############################################################################
    # Prep the encoder and the tokenizer
    ############################################################################
    with utils.log_duration(LOGGER, "main",
                            "Loading the encoder model and the tokenizer."):
        with strategy.scope():
            query_encoder = hub.load(retriever_config.query_embedder_path,
                                     tags={})
        encode_fn = _make_encode_fn(query_encoder)
        encode_fn_strategy_run = make_encode_fn_strategy_run_fn(
            strategy=strategy,
            encode_fn=encode_fn,
        )

        vocab_file = os.path.join(retriever_config.query_embedder_path,
                                  "assets", "vocab.txt")
        utils.check_exists(vocab_file)
        do_lower_case = query_encoder.signatures["tokenization_info"](
        )["do_lower_case"]
        tokenization_info = dict(vocab_file=vocab_file,
                                 do_lower_case=do_lower_case)

        tokenizer, vocab_lookup_table = bert_utils.get_tf_tokenizer(
            query_encoder, tokenization_info)

    ############################################################################
    # Preprocess the dataset
    ############################################################################
    cls_token_id = tf.cast(vocab_lookup_table.lookup(tf.constant("[CLS]")),
                           tf.int32)
    sep_token_id = tf.cast(vocab_lookup_table.lookup(tf.constant("[SEP]")),
                           tf.int32)
    transform = _make_transform_fn(
        bert_tokenizer=tokenizer,
        bert_cls_token_id=cls_token_id,
        bert_sep_token_id=sep_token_id,
    )

    feature_dtypes = {
        constants.CTH5Fields.distances: tf.float32,
        constants.CTH5Fields.gpt2_retrieved_ids: tf.int32,
        constants.CTH5Fields.gpt2_answer_ids_inputs: tf.int32,
        constants.CTH5Fields.gpt2_question_ids_inputs: tf.int32,
    }

    with utils.log_duration(LOGGER, "main", "generating codes"):
        for split in keys:
            sample_count = 0
            eli5: Dict[str, datasets.Dataset]

            if split != "test":
                for_slices = dict(sample_id=eli5[split]["id"],
                                  question=eli5[split]["input"],
                                  answer=[
                                      sample["answer"][0]
                                      for sample in eli5[split]["output"]
                                  ])
            else:
                for_slices = dict(
                    sample_id=eli5[split]["id"],
                    question=eli5[split]["input"],
                )

            ds = tf.data.Dataset.from_tensor_slices(for_slices)
            ds = ds.map(transform,
                        num_parallel_calls=tf.data.experimental.AUTOTUNE)

            ds = ds.apply(
                tf.data.experimental.dense_to_ragged_batch(batch_size))
            ds = ds.map(_squeeze,
                        num_parallel_calls=tf.data.experimental.AUTOTUNE)

            tqdm_inner = tqdm.tqdm(enumerate(ds),
                                   total=len(eli5[split]["id"]) //
                                   _FLAG_BATCH_SIZE.value,
                                   desc=f"Split `{split}`: Batches")

            for i, batch in tqdm_inner:
                features = collections.defaultdict(list)

                ######################################################################
                # Enforce the current real batch size
                ######################################################################
                current_batch_size = batch["sample_id"].shape[0]
                for k, v in batch.items():
                    utils.check_equal(v.shape[0], current_batch_size)
                ######################################################################

                gpt2_question_ids_inputs = _prep_field(batch["question"],
                                                       gpt2_tokenizer)
                utils.check_equal(gpt2_question_ids_inputs.dtype, np.int32)
                utils.check_equal(gpt2_question_ids_inputs.shape[0],
                                  current_batch_size)

                if split != "test":
                    gpt2_answer_ids_inputs = _prep_field(
                        batch["answer"], gpt2_tokenizer)
                    utils.check_equal(gpt2_answer_ids_inputs.dtype, np.int32)
                    utils.check_equal(gpt2_answer_ids_inputs.shape[0],
                                      current_batch_size)

                    assert len(gpt2_answer_ids_inputs.shape) == 2, (
                        gpt2_answer_ids_inputs.shape)

                ######################################################################
                # Save the gpt2 tokenized question and answer
                ######################################################################

                features[constants.CTH5Fields.gpt2_question_ids_inputs].extend(
                    gpt2_question_ids_inputs)

                if split != "test":
                    features[
                        constants.CTH5Fields.gpt2_answer_ids_inputs].extend(
                            gpt2_answer_ids_inputs)

                ######################################################################
                # Encode the samples.
                ######################################################################
                batch = strategy.experimental_distribute_values_from_function(
                    tf_utils.make_dict_distribute_fn(batch))

                embeddings = encode_fn_strategy_run(batch)
                embeddings = tf_utils.process_strat_output(
                    embeddings, "embeddings", strategy, current_batch_size)
                utils.check_isinstance(embeddings, ops.EagerTensor)
                utils.check_equal(embeddings.shape[0], current_batch_size)

                # pytype doesn't seem to see that we check the type
                utils.check_equal(embeddings.shape[1],
                                  _FLAG_EMBEDDING_DEPTH.value)  # pytype: disable=attribute-error

                ######################################################################
                # Retrieve.
                ######################################################################
                with tf.device(reference_db_device):
                    top_k, inner_prods = tf_utils.mips_exact_search(
                        embeddings, _FLAG_NUM_RETRIEVALS.value, reference_db)
                top_k = tf_utils.process_strat_output(top_k, "top_k", strategy,
                                                      current_batch_size)
                utils.check_equal(
                    inner_prods.shape,
                    (current_batch_size, _FLAG_NUM_RETRIEVALS.value))
                utils.check_equal(
                    top_k.shape,
                    (current_batch_size, _FLAG_NUM_RETRIEVALS.value))

                features[constants.CTH5Fields.distances].extend(inner_prods)

                gathered = tf.gather(blocks, top_k).numpy()
                utils.check_equal(gathered.shape[0], current_batch_size)
                retrievals = []
                for j in range(gathered.shape[0]):
                    local_gathered = gathered[j].tolist()
                    utils.check_equal(len(local_gathered),
                                      _FLAG_NUM_RETRIEVALS.value)
                    local_gathered = [
                        sample.decode() for sample in local_gathered
                    ]
                    token_ids = np.array(
                        gpt2_tokenizer.batch_encode_plus(
                            local_gathered,
                            padding="max_length",
                            truncation=True,
                        ).input_ids)
                    for line in token_ids:
                        assert not np.all(line == 0), line

                    token_ids[token_ids == gpt2_tokenizer.eos_token_id] = -1
                    retrievals.append(token_ids)
                features[constants.CTH5Fields.gpt2_retrieved_ids] = retrievals

                utils.check_equal(
                    retrievals[0].shape,
                    (_FLAG_NUM_RETRIEVALS.value, _FLAG_CONTEXT_SIZE.value))

                for k, v in features.items():
                    utils.check_equal(len(v), current_batch_size)

                for index_in_batch in range(current_batch_size):
                    # Serialize each feature of this sample into a bytes Feature.
                    feature = tf.train.Features(
                        feature={
                            k: _bytes_feature(
                                tf.io.serialize_tensor(
                                    tf.cast(v[index_in_batch],
                                            feature_dtypes[k])))
                            for k, v in features.items()
                        })

                    writers[split][
                        sample_count % _FLAG_NUM_SHARDS.value].write(
                            tf.train.Example(
                                features=feature).SerializeToString())
                    sample_count += 1
                if sample_count % 1000 == 0:
                    LOGGER.debug("Paths: %s", str(all_paths[split][0]))

    LOGGER.debug("Done.")
Example #5
                    outline="#000000")

    img2.paste(without_bg_img, top_left)

    return img2


if __name__ == "__main__":
    OUTPUT_DIR = os.path.join(config.IHS_DIR, "output")
    DEBUG_DIR = os.path.join(config.IHS_DIR, "debug")
    FGFI_OUTPUT__DIR = os.path.join(config.FGFI_DIR, "debug")

    if not os.path.exists(OUTPUT_DIR):
        os.makedirs(OUTPUT_DIR)

    nobg = utils.from_json_file(os.path.join(DEBUG_DIR, "nobg.json"))

    # Get the path of the original image
    image_index = nobg[1]["info"]["image_index"]
    face_index = nobg[1]["info"]["face_index"]
    data = utils.from_json_file(os.path.join(FGFI_OUTPUT__DIR, "data.json"))
    path_for_image = data["images"][image_index]["path"]
    img = utils.get_image_by_path(path_for_image, config.FGFI_DIR)
    img = img.convert("RGBA")
    width, height = img.size

    # Get the image with the background removed
    nobg = utils.from_json_file(os.path.join(DEBUG_DIR, "nobg.json"))
    image_from = utils.base64_image_to_image(nobg[1]["image_data"])

    creat_mask(img, image_from, data)
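utils.base64_image_to_image is project-specific and not shown here; a minimal stand-in using the standard library and Pillow could look like the sketch below, assuming image_data is a raw base64 string without a data-URI prefix.

# Sketch of a base64-to-PIL decoder (an assumed stand-in, not the project's helper).
import base64
import io

from PIL import Image


def base64_image_to_image(image_data: str) -> Image.Image:
    # Decode the base64 payload and let Pillow sniff the image format from the bytes.
    return Image.open(io.BytesIO(base64.b64decode(image_data)))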
Example #6
def main(argv):
    # Arguments and logging boilerplate
    if len(argv) > 1:
        raise RuntimeError(argv)

    absl_logging.use_python_logging()
    utils.log_module_args(LOGGER, argv[0])

    # Load a retriever config.
    retriever_config = tf_utils.REALMConfig(
        **utils.from_json_file(_FLAG_RETRIEVER_CONFIG_PATH.value))
    assert not _FLAG_USE_SUBSET.value

    # Preparation of the output path
    time_stamp = time.strftime("%Y%m%d-%H%M%S")
    target_path = os.path.join(_FLAG_OUTPUT_PATH.value, time_stamp.strip())
    if target_path[-1] != "/":
        target_path += "/"

    ##############################################################################
    # Setup devices and strategy
    ##############################################################################
    # Duration is pretty much instantaneous
    with utils.log_duration(LOGGER, "main", "Initializing devices"):
        tpu_config = tf_utils.init_tpus(local=_FLAG_TPU_IS_LOCAL.value,
                                        tpu_name=_FLAG_TPU_NAME.value)
        device_type = tf_utils.current_accelerator_type()
        LOGGER.debug("Devices: %s", str(tf_utils.devices_to_use()))
        if _FLAG_TPU_NAME.value and device_type == "CPU":
            raise RuntimeError("Device is CPU and we expected a TPU.")

        if device_type == "TPU":
            if tpu_config is None:
                raise RuntimeError("We should have a tpu_config.")
            strategy = tf.distribute.TPUStrategy(tpu_config.resolver)
            batch_size = len(
                tf_utils.devices_to_use()) * _FLAG_BATCH_SIZE.value
        elif device_type == "GPU" or device_type == "CPU":
            strategy = tf.distribute.MirroredStrategy()
            batch_size = len(
                tf_utils.devices_to_use()) * _FLAG_BATCH_SIZE.value
        else:
            raise RuntimeError(device_type)

    ##############################################################################
    # Load the KILT ELI5 dataset.
    ##############################################################################
    # Takes a while
    eli5 = {}
    keys = ["train", "validation", "test"]
    gpt2_tokenizer = transformers.GPT2TokenizerFast.from_pretrained("gpt2-xl")
    gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token

    with utils.log_duration(LOGGER, "main", "Loading the ELI5 datasets."):
        if _FLAG_DATASET_ROOT.value:
            for split in tqdm.tqdm(keys):
                load_path = os.path.join(_FLAG_DATASET_ROOT.value,
                                         "HuggingfaceDatasets",
                                         f"{split}_kilt_eli5.hf")
                with tf.device("/job:localhost"):
                    eli5[split] = datasets.load_from_disk(load_path)
        else:
            eli5 = datasets.load_dataset("kilt_tasks", "eli5")

    ##############################################################################
    # Load the dataset of the text that will be retrieved.
    ##############################################################################
    # Takes a long time
    with utils.log_duration(LOGGER, "Main", "Load the textual dataset"):
        # Extract the appropriate text
        # The buffer_size is taken from the original ORQA code.
        blocks_dataset = tf.data.TFRecordDataset(retriever_config.text_records,
                                                 buffer_size=512 * 1024 * 1024)
        blocks_dataset = blocks_dataset.batch(
            retriever_config.num_block_records, drop_remainder=False)
        blocks: tf.Tensor = tf.data.experimental.get_single_element(
            blocks_dataset)

    ############################################################################
    # Increase the number of maximum open file descriptors to make space
    # for all the shards.
    ############################################################################
    max_num_fd = _FLAG_NUM_SHARDS.value * 3 + _MIN_N_FD
    resource.setrlimit(resource.RLIMIT_NOFILE, (max_num_fd, max_num_fd))

    ############################################################################
    # Prepare the output files.
    ############################################################################
    writers = {}
    all_paths = {}

    for split in keys:
        maybe_subset = "_subset" if _FLAG_USE_SUBSET.value else ""
        # Prepare the paths. They can't be a generator expression (they are used
        # more than once below); a generator function would be fine, though.
        paths = [
            os.path.join(target_path + maybe_subset, f"{split}_{i}.tfr")
            for i in range(_FLAG_NUM_SHARDS.value)
        ]
        all_paths[split] = paths
        writers[split] = []

        # Create The TFR writers.
        for i, path in enumerate(paths):
            writers[split].append(tf.io.TFRecordWriter(path))

    # Load the reference DB. We used to accidentally do this once per split :O
    with utils.log_duration(LOGGER, "main", "Loading the reference db."):
        checkpoint_path = os.path.join(retriever_config.query_embedder_path,
                                       "encoded", "encoded.ckpt")
        reference_db_device = tf_utils.device_mapping().CPUs[0].name
        with tf.device(reference_db_device):
            reference_db = tf_utils.load_reference_db(
                checkpoint_path,
                variable_name="block_emb",
            )

    ############################################################################
    # Prep the encoder and the tokenizer
    ############################################################################
    with utils.log_duration(LOGGER, "main",
                            "Loading the encoder model and the tokenizer."):
        with strategy.scope():
            query_encoder = hub.load(retriever_config.query_embedder_path,
                                     tags={})
        encode_fn = _make_encode_fn(query_encoder)
        encode_fn_strategy_run = make_encode_fn_strategy_run_fn(
            strategy=strategy,
            encode_fn=encode_fn,
        )

        vocab_file = os.path.join(retriever_config.query_embedder_path,
                                  "assets", "vocab.txt")
        utils.check_exists(vocab_file)
        do_lower_case = query_encoder.signatures["tokenization_info"](
        )["do_lower_case"]
        tokenization_info = dict(vocab_file=vocab_file,
                                 do_lower_case=do_lower_case)

        tokenizer, vocab_lookup_table = bert_utils.get_tf_tokenizer(
            query_encoder, tokenization_info)

    ############################################################################
    # Preprocess the dataset
    ############################################################################
    cls_token_id = tf.cast(vocab_lookup_table.lookup(tf.constant("[CLS]")),
                           tf.int32)
    sep_token_id = tf.cast(vocab_lookup_table.lookup(tf.constant("[SEP]")),
                           tf.int32)
    transform = _make_transform_fn(
        bert_tokenizer=tokenizer,
        bert_cls_token_id=cls_token_id,
        bert_sep_token_id=sep_token_id,
    )

    feature_dtypes = {
        constants.CTH5Fields.distances: tf.float32,
        constants.CTH5Fields.gpt2_retrieved_ids: tf.int32,
        constants.CTH5Fields.gpt2_answer_ids_inputs: tf.int32,
        constants.CTH5Fields.gpt2_question_ids_inputs: tf.int32,
    }

    with utils.log_duration(LOGGER, "main", "generating codes"):
        for split in keys:
            sample_count = 0
            eli5: Dict[str, datasets.Dataset]

            if split != "test":
                for_slices = dict(sample_id=eli5[split]["id"],
                                  question=eli5[split]["input"],
                                  answer=[
                                      sample[0]["answer"]
                                      for sample in eli5[split]["output"]
                                  ])
            else:
                for_slices = dict(
                    sample_id=eli5[split]["id"],
                    question=eli5[split]["input"],
                )

            ds = tf.data.Dataset.from_tensor_slices(for_slices)
            ds = ds.map(transform,
                        num_parallel_calls=tf.data.experimental.AUTOTUNE)

            ds = ds.apply(
                tf.data.experimental.dense_to_ragged_batch(batch_size))
            ds = ds.map(_squeeze,
                        num_parallel_calls=tf.data.experimental.AUTOTUNE)

            tqdm_inner = tqdm.tqdm(enumerate(ds),
                                   total=len(eli5[split]["id"]) //
                                   _FLAG_BATCH_SIZE.value,
                                   desc=f"Split `{split}`: Batches")

            for i, batch in tqdm_inner:
                features = collections.defaultdict(list)

                ######################################################################
                # Enforce the current real batch size
                ######################################################################
                current_batch_size = batch["sample_id"].shape[0]
                for k, v in batch.items():
                    utils.check_equal(v.shape[0], current_batch_size)
                ######################################################################

                gpt2_question_ids_inputs = _prep_field(batch["question"],
                                                       gpt2_tokenizer)
                utils.check_equal(gpt2_question_ids_inputs.dtype, np.int32)
                utils.check_equal(gpt2_question_ids_inputs.shape[0],
                                  current_batch_size)

                if split != "test":
                    gpt2_answer_ids_inputs = _prep_field(
                        batch["answer"], gpt2_tokenizer)
                    utils.check_equal(gpt2_answer_ids_inputs.dtype, np.int32)
                    utils.check_equal(gpt2_answer_ids_inputs.shape[0],
                                      current_batch_size)

                    assert len(gpt2_answer_ids_inputs.shape) == 2, (
                        gpt2_answer_ids_inputs.shape)

                ######################################################################
                # Save the gpt2 tokenized question and answer
                ######################################################################

                features[constants.CTH5Fields.gpt2_question_ids_inputs].extend(
                    gpt2_question_ids_inputs)

                if split != "test":
                    features[
                        constants.CTH5Fields.gpt2_answer_ids_inputs].extend(
                            gpt2_answer_ids_inputs)

                ######################################################################
                # Encode the samples.
                ######################################################################
                batch = strategy.experimental_distribute_values_from_function(
                    tf_utils.make_dict_distribute_fn(batch))

                embeddings = encode_fn_strategy_run(batch)
                embeddings = tf_utils.process_strat_output(
                    embeddings, "embeddings", strategy, current_batch_size)
                utils.check_isinstance(embeddings, ops.EagerTensor)
                utils.check_equal(embeddings.shape[0], current_batch_size)

                # pytype doesn't seem to see that we check the type
                utils.check_equal(embeddings.shape[1],
                                  _FLAG_EMBEDDING_DEPTH.value)  # pytype: disable=attribute-error

                ######################################################################
                # Retrieve.
                ######################################################################
                # Do exact retrieval
                with tf.device(reference_db_device):
                    top_k, inner_prods = tf_utils.mips_exact_search(
                        embeddings, _FLAG_NUM_RETRIEVALS.value, reference_db)

                # Collate the results
                top_k = tf_utils.process_strat_output(top_k, "top_k", strategy,
                                                      current_batch_size)

                # Check the shapes
                utils.check_equal(
                    inner_prods.shape,
                    (current_batch_size, _FLAG_NUM_RETRIEVALS.value))
                utils.check_equal(
                    top_k.shape,
                    (current_batch_size, _FLAG_NUM_RETRIEVALS.value))

                # Save the distances
                features[constants.CTH5Fields.distances].extend(inner_prods)

                # Retrieve the text fields associated to the indices
                gathered = tf.gather(blocks, top_k).numpy()
                utils.check_equal(gathered.shape[0], current_batch_size)
                utils.check_equal(gathered.shape[1],
                                  _FLAG_NUM_RETRIEVALS.value)

                retrievals = []
                for index_in_batch in range(current_batch_size):
                    # Put the appropriate byte strings in a list
                    local_gathered = gathered[index_in_batch].tolist()
                    utils.check_equal(len(local_gathered),
                                      _FLAG_NUM_RETRIEVALS.value)
                    # Decode to utf-8
                    local_gathered = [
                        sample.decode() for sample in local_gathered
                    ]
                    # Encode to GPT2 BPE
                    token_ids = np.array(
                        gpt2_tokenizer.batch_encode_plus(
                            local_gathered,
                            padding="max_length",
                            truncation=True,
                        ).input_ids)

                    # Make sure no line is empty
                    # TODO(julesgm): Maybe optional
                    for line in token_ids:
                        assert not np.all(line == 0), line

                    # Convert the eos_tokens
                    token_ids[token_ids == gpt2_tokenizer.eos_token_id] = -1

                    # Save the retrievals
                    retrievals.append(token_ids)

                # Save the feature
                features[constants.CTH5Fields.gpt2_retrieved_ids] = retrievals

                utils.check_equal(
                    retrievals[0].shape,
                    (_FLAG_NUM_RETRIEVALS.value, _FLAG_CONTEXT_SIZE.value))

                for k, v in features.items():
                    utils.check_equal(len(v), current_batch_size)

                for index_in_batch in range(current_batch_size):
                    feature_dict = {}
                    for feature_k, feature_v in features.items():
                        # Cast the feature to its appropriate dtype
                        casted_feats = tf.cast(feature_v[index_in_batch],
                                               feature_dtypes[feature_k])
                        # Serialize the tensor to bytes
                        feature_bytes = tf.io.serialize_tensor(casted_feats)
                        # Build a bytes list tf.train.Feature object,
                        # the serialization tree node
                        feature_dict[feature_k] = _bytes_feature(feature_bytes)

                    # Create the serialization tree root
                    # Expects a list of features
                    feature = tf.train.Features(feature=feature_dict)
                    # Expects a tf.train.Features object
                    example_obj = tf.train.Example(features=feature)

                    # Serialize that to bytes
                    serialized_example = example_obj.SerializeToString()

                    # Write the bytes
                    # TODO(julesgm): Parallelize this with a thread or a process pool &
                    #   futures.
                    writers[split][sample_count %
                                   _FLAG_NUM_SHARDS.value].write(
                                       serialized_example)
                    sample_count += 1

                if sample_count % 1000 == 0:
                    LOGGER.debug("Paths: %s", str(all_paths[split][0]))

            LOGGER.debug("Flushing and closing the `%s` writers", split)
            for writer in tqdm.tqdm(writers[split]):
                writer.flush()
                writer.close()

    LOGGER.debug("Done.")
Example #7
def main(argv):
    if len(argv) > 1:
        raise RuntimeError(argv)
    absl_logging.use_python_logging()
    retriever_config = tf_utils.REALMSave(
        **utils.from_json_file(_FLAG_RETRIEVER_CONFIG_PATH.value))

    extra = "_FROM_SUBSET" if _FLAG_USE_SUBSET.value else ""
    time_stamp = time.strftime("%Y%m%d-%H%M%S")
    target_path = os.path.join(_FLAG_OUTPUT_PATH.value,
                               time_stamp + extra).strip()
    if target_path[-1] != "/":
        target_path += "/"

    ##############################################################################
    # Setup devices and strategy
    ##############################################################################
    with utils.log_duration(LOGGER, "main", "Initializing devices"):
        tpu_config = tf_utils.init_tpus()
        device_type = tf_utils.current_accelerator_type()
        LOGGER.debug("Devices: %s", str(tf_utils.devices_to_use()))

        if device_type == "TPU":
            if tpu_config is None:
                raise RuntimeError("We should have a tpu_config.")
            strategy = tf.distribute.TPUStrategy(tpu_config.resolver)
            batch_size = len(
                tf_utils.devices_to_use()) * _FLAG_BATCH_SIZE.value
        elif device_type == "GPU" or device_type == "CPU":
            strategy = tf.distribute.MirroredStrategy()
            batch_size = len(
                tf_utils.devices_to_use()) * _FLAG_BATCH_SIZE.value
        else:
            raise RuntimeError(device_type)

    ##############################################################################
    # Load the dataset.
    ##############################################################################
    eli5 = {}
    keys = ["train", "eval", "test"]
    gpt2_tokenizer = transformers.GPT2TokenizerFast.from_pretrained("gpt2-xl")
    gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token

    with utils.log_duration(LOGGER, "main", "Loading the ELI5 datasets."):
        for split in tqdm.tqdm(keys):
            load_path = os.path.join(_FLAGS_DATASET_ROOT.value,
                                     "HuggingfaceDatasets",
                                     f"{split}_kilt_eli5.hf")
            with tf.device("/job:localhost"):
                eli5[split] = datasets.load_from_disk(load_path)

    if _FLAG_USE_SUBSET.value:
        _warn_subset()

    ##############################################################################
    # Load the dataset of the text that will be retrieved.
    ##############################################################################
    with utils.log_duration(LOGGER, "Main", "Load the textual dataset"):
        # Extract the appropriate text
        # The buffer_size is taken from the original ORQA code.
        blocks_dataset = tf.data.TFRecordDataset(retriever_config.text_records,
                                                 buffer_size=512 * 1024 * 1024)
        blocks_dataset = blocks_dataset.batch(
            retriever_config.num_block_records, drop_remainder=True)
        blocks = tf.data.experimental.get_single_element(blocks_dataset)

    with tempfile.TemporaryDirectory() as tmp_dir:
        ############################################################################
        # Prepare the output file.
        ############################################################################
        tmp_dir = pathlib.Path(tmp_dir)
        h5_output_path = tmp_dir / "codes.h5"
        output_file = h5py.File(h5_output_path, "w")
        flags_dict = {
            flag.name: flag.value
            for flag in flags.FLAGS.flags_by_module_dict()[argv[0]]
        }
        utils.to_json_file(tmp_dir / "params.json", flags_dict)

        for split in keys:
            with utils.log_duration(
                    LOGGER, "main",
                    "Creating the output hdf5 file, embeddings."):
                num_entries = len(eli5[split]["id"])
                if _FLAG_USE_SUBSET.value:
                    num_entries = min(num_entries, _FLAG_SUBSET_AMOUNT.value)
                split_group = output_file.create_group(split)

            with utils.log_duration(
                    LOGGER, "main",
                    "Creating the output hdf5 file, retrieval."):
                split_group.create_dataset(
                    constants.CTH5Fields.distances,
                    shape=(num_entries, _FLAG_NUM_RETRIEVALS.value),
                    dtype=np.float32,
                )
                split_group.create_dataset(
                    constants.CTH5Fields.gpt2_question_ids_inputs,
                    shape=(num_entries, _FLAG_CONTEXT_SIZE.value),
                    dtype=np.int32)
                if split != "test":
                    split_group.create_dataset(
                        constants.CTH5Fields.gpt2_answer_ids_inputs,
                        shape=(num_entries, _FLAG_CONTEXT_SIZE.value),
                        dtype=np.int32)

                split_group.create_dataset(
                    constants.CTH5Fields.gpt2_retrieved_ids,
                    shape=(
                        num_entries,
                        _FLAG_NUM_RETRIEVALS.value,
                        _FLAG_MAX_LENGTH_RETRIEVALS.value,
                    ),
                    dtype=np.int32)

            with utils.log_duration(LOGGER, "main",
                                    "Loading the reference db."):
                checkpoint_path = os.path.join(
                    retriever_config.query_embedder_path, "encoded",
                    "encoded.ckpt")

                reference_db_device = tf_utils.device_mapping().CPUs[0].name
                with tf.device(reference_db_device):
                    reference_db = tf_utils.load_reference_db(
                        checkpoint_path,
                        variable_name="block_emb",
                    )

        ############################################################################
        # Prep the encoder and the tokenizer
        ############################################################################
        with utils.log_duration(
                LOGGER, "main",
                "Loading the encoder model and the tokenizer."):
            with strategy.scope():
                query_encoder = hub.load(retriever_config.query_embedder_path,
                                         tags={})
            encode_fn = _make_encode_fn(query_encoder)
            encode_fn_strategy_run = _make_encode_fn_strategy_run_fn(
                strategy=strategy,
                encode_fn=encode_fn,
            )

            vocab_file = os.path.join(retriever_config.query_embedder_path,
                                      "assets", "vocab.txt")
            utils.check_exists(vocab_file)
            do_lower_case = query_encoder.signatures["tokenization_info"](
            )["do_lower_case"]
            tokenization_info = dict(vocab_file=vocab_file,
                                     do_lower_case=do_lower_case)

            tokenizer, vocab_lookup_table = bert_utils.get_tf_tokenizer(
                query_encoder, tokenization_info)

        ############################################################################
        # Preprocess the dataset
        ############################################################################

        cls_token_id = tf.cast(vocab_lookup_table.lookup(tf.constant("[CLS]")),
                               tf.int32)
        sep_token_id = tf.cast(vocab_lookup_table.lookup(tf.constant("[SEP]")),
                               tf.int32)
        transform = _make_transform_fn(
            bert_tokenizer=tokenizer,
            bert_cls_token_id=cls_token_id,
            bert_sep_token_id=sep_token_id,
        )

        with utils.log_duration(LOGGER, "main", "generating codes"):
            tqdm_splits = tqdm.tqdm(keys)
            for split in tqdm_splits:
                tqdm_splits.set_description(f"Split `{split}`")
                eli5: Dict[str, datasets.Dataset]
                write_start = 0

                if _FLAG_USE_SUBSET.value:
                    _warn_subset(tqdm_splits)
                    eli5[split] = eli5[split][:_FLAG_SUBSET_AMOUNT.value]
                    utils.check_operator(operator.le, len(eli5[split]["id"]),
                                         _FLAG_SUBSET_AMOUNT.value)
                    utils.check_operator(operator.le,
                                         len(eli5[split]["input"]),
                                         _FLAG_SUBSET_AMOUNT.value)
                else:
                    utils.check_equal(len(eli5[split]), len(eli5[split]["id"]))
                    utils.check_equal(len(eli5[split]),
                                      len(eli5[split]["input"]))

                if split != "test":
                    for_slices = dict(sample_id=eli5[split]["id"],
                                      question=eli5[split]["input"],
                                      answer=[
                                          sample["answer"][0]
                                          for sample in eli5[split]["output"]
                                      ])
                else:
                    for_slices = dict(
                        sample_id=eli5[split]["id"],
                        question=eli5[split]["input"],
                    )

                ds = tf.data.Dataset.from_tensor_slices(for_slices)
                ds = ds.map(transform,
                            num_parallel_calls=tf.data.experimental.AUTOTUNE)

                ds = ds.apply(
                    tf.data.experimental.dense_to_ragged_batch(batch_size))
                ds = ds.map(_squeeze,
                            num_parallel_calls=tf.data.experimental.AUTOTUNE)

                tqdm_inner = tqdm.tqdm(enumerate(ds),
                                       total=len(eli5[split]["id"]) //
                                       _FLAG_BATCH_SIZE.value,
                                       desc=f"Split `{split}`: Batches")

                for i, batch in tqdm_inner:
                    ######################################################################
                    # Enforce the current real batch size
                    ######################################################################
                    current_batch_size = batch["sample_id"].shape[0]
                    for k, v in batch.items():
                        utils.check_equal(v.shape[0], current_batch_size)
                    ######################################################################

                    gpt2_question_ids_inputs = _prep_field(
                        batch["question"], gpt2_tokenizer)
                    utils.check_equal(gpt2_question_ids_inputs.dtype, np.int32)
                    utils.check_equal(gpt2_question_ids_inputs.shape[0],
                                      current_batch_size)

                    if split != "test":
                        gpt2_answer_ids_inputs = _prep_field(
                            batch["answer"], gpt2_tokenizer)
                        utils.check_equal(gpt2_answer_ids_inputs.dtype,
                                          np.int32)
                        utils.check_equal(gpt2_answer_ids_inputs.shape[0],
                                          current_batch_size)

                        assert len(gpt2_answer_ids_inputs.shape) == 2, (
                            gpt2_answer_ids_inputs.shape)

                    ######################################################################
                    # Save the gpt2 tokenized question and answer
                    ######################################################################
                    end = write_start + current_batch_size

                    utils.check_equal(
                        output_file[split][
                            constants.CTH5Fields.gpt2_question_ids_inputs]
                        [write_start:end].shape[0], current_batch_size)
                    output_file[split][
                        constants.CTH5Fields.gpt2_question_ids_inputs][
                            write_start:end] = gpt2_question_ids_inputs

                    if split != "test":
                        output_file[split][
                            constants.CTH5Fields.gpt2_answer_ids_inputs][
                                write_start:end] = gpt2_answer_ids_inputs

                    ######################################################################
                    # Encode the samples.
                    ######################################################################
                    batch = strategy.experimental_distribute_values_from_function(
                        tf_utils.make_dict_distribute_fn(batch))

                    embeddings = encode_fn_strategy_run(batch)
                    embeddings = tf_utils.process_strat_output(
                        embeddings, "embeddings", strategy, current_batch_size)
                    utils.check_isinstance(embeddings, ops.EagerTensor)
                    utils.check_equal(embeddings.shape[0], current_batch_size)

                    # pytype doesn't seem to see that we check the type
                    utils.check_equal(embeddings.shape[1],
                                      _FLAG_EMBEDDING_DEPTH.value)  # pytype: disable=attribute-error

                    ######################################################################
                    # Retrieve.
                    ######################################################################
                    with tf.device(reference_db_device):
                        top_k, inner_prods = tf_utils.mips_exact_search(
                            embeddings, _FLAG_NUM_RETRIEVALS.value,
                            reference_db)
                    top_k = tf_utils.process_strat_output(
                        top_k, "top_k", strategy, current_batch_size)
                    utils.check_equal(
                        inner_prods.shape,
                        (current_batch_size, _FLAG_NUM_RETRIEVALS.value))
                    utils.check_equal(
                        top_k.shape,
                        (current_batch_size, _FLAG_NUM_RETRIEVALS.value))

                    output_file[split]["distances"][
                        write_start:end] = inner_prods

                    gathered = tf.gather(blocks, top_k).numpy()
                    utils.check_equal(gathered.shape[0], current_batch_size)

                    utils.check_equal(write_start + gathered.shape[0], end)
                    for j in range(gathered.shape[0]):
                        local_gathered = gathered[j].tolist()
                        utils.check_equal(len(local_gathered),
                                          _FLAG_NUM_RETRIEVALS.value)
                        local_gathered = [
                            sample.decode() for sample in local_gathered
                        ]
                        token_ids = np.array(
                            gpt2_tokenizer.batch_encode_plus(
                                local_gathered,
                                padding="max_length",
                                truncation=True,
                            ).input_ids)
                        for line in token_ids:
                            assert not np.all(line == 0), line

                        token_ids[token_ids ==
                                  gpt2_tokenizer.eos_token_id] = -1
                        max_len = _FLAG_MAX_LENGTH_RETRIEVALS.value
                        output_file[split][
                            constants.CTH5Fields.gpt2_retrieved_ids][
                                write_start + j] = token_ids[:, :max_len]

                    write_start += current_batch_size
        ############################################################################
        # Upload the results to GCS
        ############################################################################
        LOGGER.debug("DONE WITH THE PRODUCTION")
        output_file.close()
        with utils.log_duration(LOGGER, "main", "gsutil transfer"):
            command = [
                "/root/google-cloud-sdk/bin/gsutil", "-m", "cp", "-r",
                str(tmp_dir / "*"), target_path
            ]
            LOGGER.debug("Command: %s", " ".join(command))
            subprocess.check_call(command)
        LOGGER.debug("ALL DONE")
Example #8
def main(argv):
    if len(argv) > 1:
        raise app.UsageError("Too many command-line arguments.")

    absl_logging.use_python_logging()
    utils.log_module_args(LOGGER, argv[0])

    utils.check_exists(FLAGS.scann_config_path)
    utils.check_glob_prefix(FLAGS.embeddings_ckpt_path)
    utils.check_exists(FLAGS.output_dir)
    if not tf.io.gfile.isdir(FLAGS.output_dir):
        raise RuntimeError("Output dir needs to be a directory.")

    ##############################################################################
    # Setup: Build the ScaNN (Scam) searcher
    ##############################################################################
    with utils.log_duration(LOGGER, "main", "load_scann_searcher"):
        checkpoint_path = os.path.join(FLAGS.embeddings_ckpt_path)
        # The conversion to a ScannConfig object enforces that all the fields we
        # expect are present in the json file.
        scann_config = retrievers.ScannConfig(
            **utils.from_json_file(FLAGS.scann_config_path))
        block_emb, scann_searcher = scann_utils.load_scann_searcher(
            var_name="block_emb",
            checkpoint_path=checkpoint_path,
            **vars(scann_config))
    utils.check_operator(operator.ge, block_emb.shape[0], FLAGS.test_how_many)

    ##############################################################################
    # Recall Computation
    ##############################################################################
    LOGGER.debug(block_emb.shape)
    utils.check_operator(operator.ge, block_emb.shape[0], FLAGS.test_how_many)
    with utils.log_duration(LOGGER, "main", "all retrievals & comparisons"):
        LOGGER.debug("block_emb.shape: %s", str(block_emb.shape))
        LOGGER.debug("FLAGS.test_how_many: %d", FLAGS.test_how_many)
        all_indices = np.random.choice(block_emb.shape[0],
                                       FLAGS.test_how_many,
                                       replace=False)
        count_total = 0
        count_good = 0
        for i, idx_start in tqdm.tqdm(
                enumerate(range(0, len(all_indices), FLAGS.batch_size))):
            indices = all_indices[idx_start:idx_start + FLAGS.batch_size]
            vectors = tf.gather(block_emb, indices)

            if FLAGS.mode == "all":
                with utils.log_duration(LOGGER, "main", "exact_search"):
                    labels = exact_search(FLAGS.num_neighbors, vectors,
                                          block_emb)
            elif FLAGS.mode == "any":
                labels = tf.cast(tf.expand_dims(indices, -1), tf.int32)
            else:
                raise RuntimeError(FLAGS.mode)

            with utils.log_duration(LOGGER, "main", "scann_search"):
                predictions, _ = scann_searcher.search_batched(vectors)
            good = tf.sets.intersection(labels, predictions)
            count_good += len(good.values)
            count_total += tf.math.reduce_prod(labels.shape)
            ratio = count_good / count_total
            if i % FLAGS.print_every_n_batches == 0 and i != 0:
                LOGGER.debug("Recall so far: %f %%", 100 * ratio)

    final_recall = count_good / count_total
    LOGGER.debug(
        "Final recall for mode `%(mode)s` with `%(num_neighbors)d` "
        "neighbors: %(recall)f %%",
        dict(mode=FLAGS.mode,
             num_neighbors=FLAGS.num_neighbors,
             recall=100 * final_recall))
    LOGGER.debug("%d true positives over %d points.", count_good, count_total)

    ##############################################################################
    # Build the output object and save it.
    ##############################################################################
    output = {}
    output["flags"] = {
        flag.name: flag.value
        for flag in FLAGS.flags_by_module_dict()[argv[0]]
    }
    output["recall"] = float(final_recall)
    # Redundant but easier to read
    output["count_goods"] = int(count_good)
    output["count_total"] = int(count_total)
    output_path = os.path.join(
        FLAGS.output_dir,
        "test_recall_" + time.strftime("results_%Y%m%d-%H%M%S.json"))
    utils.to_json_file(output_path, output)
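    # (Illustrative sketch) The saved metrics can be reloaded later with the
    # same JSON helpers, e.g.:
    #   results = utils.from_json_file(output_path)
    #   LOGGER.info("Recall: %f %%", 100 * results["recall"])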
Example #9
def main(argv):
  #######################################################################
  # Initial Setup. Logging, Flags, Random seeds.
  #######################################################################
  if len(argv) > 1:
    raise app.UsageError("Too many command-line arguments.")
  absl_logging.use_python_logging()
  flags_dict = {
      flag.name: flag.value
      for flag in FLAGS.flags_by_module_dict()[argv[0]]
  }

  if FLAGS.use_subset:
    message = (f"{colorama.Back.RED}{colorama.Fore.WHITE}"
               f"{colorama.Style.BRIGHT}USING A SUBSET OF THE DATASET"
               f"{colorama.Style.RESET_ALL}")
    LOGGER.warning(
        message
    )

  utils.log_module_args(LOGGER, argv[0])
  if not FLAGS.output_dir.startswith("gs://"):
    utils.check_exists(FLAG_OUTPUT_DIR.value)
    if not tf.io.gfile.isdir(FLAG_OUTPUT_DIR.value):
      raise RuntimeError("Output dir needs to be a directory.")

  tf.random.set_seed(FLAG_RANDOM_SEED.value)
  np.random.seed(FLAG_RANDOM_SEED.value)

  # Prepare the instance output directory path and save the config there
  folder_name = time.strftime(
      f"{FLAG_RUN_NAME.value}_{FLAG_APPROACH_TYPE.value}_%Y%m%d-%H%M%S"
  )
  instance_output_dir = os.path.join(FLAG_OUTPUT_DIR.value, folder_name).strip()
  if not instance_output_dir.endswith("/"):
    instance_output_dir += "/"
  json_target = os.path.join(instance_output_dir, "training_params.json")
  if not json_target.strip().startswith("gs://"):
    subprocess.check_call(["mkdir", "-p", instance_output_dir])
  # Save the run configuration (the flag values) to training_params.json.
  utils.to_json_file(json_target, flags_dict)

  ##############################################################################
  # Initialization and Configuration of the Devices.
  ##############################################################################
  tpu_setup = None
  # current_accelerator_type is always "CPU" in the beginning with TPUs
  if tf_utils.current_accelerator_type() == "CPU":
    tpu_setup = tf_utils.init_tpus()

  LOGGER.debug("Devices we are computing on:\n%s",
               utils.wrap_iterable(map(str, tf_utils.devices_to_use())))
  LOGGER.debug("All devices:")
  LOGGER.debug(tf_utils.device_mapping())

  if tf_utils.current_accelerator_type() == "GPU":
    tf.config.set_soft_device_placement(True)

  if tf_utils.current_accelerator_type() != "TPU":
    tf.debugging.set_log_device_placement(True)

  if FLAG_DISTRIBUTE_MODE.value in constants.PURE_DATA_PARALLEL_STRATEGIES:
    actual_num_replicas = len(tf_utils.devices_to_use())
  elif FLAG_DISTRIBUTE_MODE.value in constants.DATA_PARALLEL_DMC:
    actual_num_replicas = FLAG_NUM_REPLICAS.value
  else:
    actual_num_replicas = 1
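  # actual_num_replicas is used to scale the global batch size, the step
  # counters and the per-sample timing: pure data-parallel strategies use
  # every visible device, the manual split-and-data-parallel mode uses
  # FLAG_NUM_REPLICAS, and everything else behaves as a single replica.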

  ##############################################################################
  # We load the retriever model if it is needed.
  ##############################################################################
  # Not currently used.

  retriever = None
  if (FLAG_APPROACH_TYPE.value ==
      constants.ApproachTypeChoices.lm_and_realm):
    config_path = FLAG_RETRIEVER_CONFIG_PATH.value
    realm_save = tf_utils.REALMSave(**utils.from_json_file(config_path))

    # Approx 15 min when not in dev mode, on CPU
    with utils.log_duration(LOGGER, "main",
                            "whole of BERTScaNNRetriever.__init__",
                            logging.INFO):
      scann_config = retrievers.ScannConfig(
          **utils.from_json_file(FLAG_SCANN_CONFIG_PATH.value))
      retriever = retrievers.BERTScaNNRetriever(
          retriever_module_path=realm_save.query_embedder_path,
          block_records_path=realm_save.text_records,
          num_block_records=realm_save.num_block_records,
          mode=tf.estimator.ModeKeys.EVAL,
          scann_config=scann_config)

  elif (FLAG_APPROACH_TYPE.value ==
        constants.ApproachTypeChoices.cached_realm):
    config_path = FLAG_RETRIEVER_CONFIG_PATH.value
    realm_save = tf_utils.REALMSave(**utils.from_json_file(config_path))

    # Approx 15 min when not in dev mode, on CPU
    with utils.log_duration(LOGGER, "main",
                            "whole of FullyCachedRetriever.__init__",
                            logging.INFO):

      retriever = retrievers.FullyCachedRetriever(
          db_path=FLAG_FULLYCACHED_H5_PATH.value,
          block_records_path=realm_save.text_records,
          num_block_records=realm_save.num_block_records,
          )

  ##############################################################################
  # Distributed training task
  ##############################################################################
  if FLAG_TASK.value == constants.TaskChoices.train:
    with utils.log_duration(LOGGER, "main", "Load model"):
      utils.print_mem("before loading model", LOGGER)
      model_specific = task_specific.load_model(FLAG_MODEL_LOAD_PATH.value,
                                                FLAG_MODEL_KEY.value,
                                                FLAG_DISTRIBUTE_MODE.value,
                                                tpu_setup,
                                                FLAG_NUM_REPLICAS.value)
      utils.print_mem("after loading model", LOGGER)
      model_or_replicas = model_specific.model
      if isinstance(model_or_replicas, list):
        model_or_replicas: List[transformers.TFGPT2LMHeadModel]
      else:
        model_or_replicas: transformers.TFGPT2LMHeadModel

      tokenizer = model_specific.tokenizer

      def make_optimizer():
        return tensor2tensor.utils.adafactor.AdafactorOptimizer(
            learning_rate=FLAG_LEARNING_RATE.value)

      if model_specific.strategy:
        with model_specific.strategy.scope():
          optimizer = make_optimizer()
      else:
        optimizer = make_optimizer()
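      # Building the optimizer inside the strategy scope (when a strategy is
      # used) ensures the variables it creates are placed and distributed the
      # same way as the model's variables.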

    ############################################################################
    # Prepare the dataset functions
    ############################################################################
    rg = np.random.default_rng(FLAG_RANDOM_SEED.value)

    def call_lm_preproc(
        repeat,
        split,
        random_seed
    ):
      """Using functools.partial prevents the linter from doing its job."""
      if FLAG_DATASET_NAME.value == constants.DatasetNameChoices.kilt_eli5:
        return task_specific.create_lm_ds_kilt_eli5(
            tokenizer=tokenizer,
            context_window_size=(
                model_or_replicas[0].config.n_positions
                if isinstance(model_or_replicas, list)
                else model_or_replicas.config.n_positions
            ),
            dataset_name=FLAG_DATASET_NAME.value,
            # Batches are split over the replicas:
            batch_size=FLAG_BATCH_SIZE.value * actual_num_replicas,
            db_path=FLAG_DB_PATH.value,
            random_seed=random_seed,
            use_subset=FLAG_USE_SUBSET.value,
            subset_size=FLAG_SUBSET_SIZE.value,
            use_helper_words=FLAG_USE_HELPER_WORDS.value,
            approach_type=FLAG_APPROACH_TYPE.value,
            num_retrievals=FLAG_NUM_RETRIEVALS.value,
            retrieval_temperature=FLAG_RETRIEVAL_TEMPERATURE.value,
            retriever=retriever,
            repeat=repeat,
            split=split,
            enable_debug_checks=FLAG_DATASET_DEBUG.value,
            retrieval_bank_size=FLAG_RETRIEVAL_BANK_SIZE.value,
            dataset_type=FLAG_DATASET_TYPE.value,
            qty_shuffle=FLAG_QTY_SHUFFLE.value,
            tfr_prefix=FLAG_TFR_PREFIX.value,
            max_length_generation=FLAG_MAX_LENGTH_GENERATION.value,
        )
      else:
        raise NotImplementedError(
            f"FLAG_DATASET_NAME.value unsupported: `{FLAG_DATASET_NAME.value}`"
        )

    make_training_dataset: Callable[Ellipsis, tf.data.Dataset] = functools.partial(
        call_lm_preproc,
        split="train",
        repeat=False,
    )
    make_eval_dataset: Callable[Ellipsis, tf.data.Dataset] = functools.partial(
        call_lm_preproc,
        split="eval",
        repeat=True,
    )

    ############################################################################
    # Prepare the step functions
    ############################################################################
    utils.check_contained(
        FLAG_DISTRIBUTE_MODE.value, constants.DistributeModeChoices.choices()
    )
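    # Options for the tf.function wrapping of the step functions:
    # experimental_compile enables XLA compilation, and
    # experimental_relax_shapes reduces retracing when batch shapes vary.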
    tf_function_flags = dict(
        experimental_compile=FLAG_EXPERIMENTAL_COMPILE.value,
        experimental_relax_shapes=not FLAG_INPUT_FIXED_SIZE.value
    )

    if (FLAG_DISTRIBUTE_MODE.value ==
        constants.DistributeModeChoices.split_and_data_parallel):
      if not isinstance(model_or_replicas, list):
        raise RuntimeError(type(model_or_replicas))
      training_step = build_manual_data_parallel_training_step(
          model_or_replicas, optimizer, tf_function_flags
      )

    else:
      training_step = build_regular_training_step(
          model_or_replicas,
          optimizer,
          strategy=model_specific.strategy,
          tf_function_kwargs=tf_function_flags
      )

    evaluation_step = build_evaluation_step(
        model_or_replicas, tf_function_flags
    )

    secs_since_last_ckpt = time.time()
    # Model checkpoints are saved to the tmp_directory and then rsynced to GCS
    ##########################################################################
    # Prepare the different logging facilities
    ##########################################################################
    train_log_dir = os.path.join(instance_output_dir, "tensorboard", "train")
    eval_log_dir = os.path.join(instance_output_dir, "tensorboard", "eval")
    flags_log_dir = os.path.join(instance_output_dir, "tensorboard", "params")
    writers = dict(
        train=tf.summary.create_file_writer(train_log_dir),
        eval=tf.summary.create_file_writer(eval_log_dir),
        flags=tf.summary.create_file_writer(flags_log_dir)
    )
    with writers["flags"].as_default():
      tf.summary.text(
          "Flags",
          # Tensorboard takes Markdown:
          json.dumps(flags_dict, indent=4).replace("\n", "\n\n"),
          step=0
          )

    ma_loss = dict(
        train=utils.MovingAverage(0.9),
        eval=utils.MovingAverage(0.9)
        )
    step_counters = dict(train=0, eval=0)
    batch_counters = dict(train=0, eval=0)
    prev_batch_end = time.time()

    # The eval ds has no real concept of epoch, repeats forever, shuffling
    # each time it reaches its end
    with utils.log_duration(LOGGER, "main", "All of make_eval_dataset"):
      eval_ds_instance = make_eval_dataset(
          random_seed=rg.integers(-2**63, 2**63 - 1),
      )
    LOGGER.debug("Distributing the eval dataset to the replicas.")
    if FLAG_DATASET_TYPE.value == "tfr":
      eval_ds_instance = (
          model_specific.strategy.experimental_distribute_dataset(
              eval_ds_instance
          )
      )

    LOGGER.debug("Done distributing the eval dataset to the replcias.")
    eval_ds_instance = iter(eval_ds_instance)

    ##########################################################################
    # Training Loop
    ##########################################################################
    for epoch in itertools.count():
      ####################################################################
      # Epoch Setup
      ####################################################################
      LOGGER.debug("EPOCH %d START", epoch)
      # Shuffle differently every epoch
      with utils.log_duration(
          LOGGER, "main", "All of make_training_dataset"
      ):
        train_ds_instance = make_training_dataset(
            random_seed=rg.integers(-2**63, 2**63 - 1),
        )
      LOGGER.debug(
          "Attempting to distribute the training dataset to the replicas."
      )
      if FLAG_DATASET_TYPE.value == "tfr":
        train_ds_instance = (
            model_specific.strategy.experimental_distribute_dataset(
                train_ds_instance
            )
        )

      LOGGER.debug(
          "Done distributing the training dataset to the replicas."
      )
      train_ds_instance = iter(train_ds_instance)

      # This allows us to see if we reached the end of the training iterator,
      # in which case "did_at_least_one_training_batch == False".
      # We could also test that it did all the batches, to similar results.
      did_at_least_one_training_batch = True
      split = "eval"
      while did_at_least_one_training_batch:
        # Invert split
        if split == "train":
          split = "eval"
        else:
          split = "train"

        # Prepare to test if we did at least one training batch
        if split == "train":
          did_at_least_one_training_batch = False

        if split == "train":
          dataset_iterator = itertools.islice(
              train_ds_instance, FLAG_BATCHES_BETWEEN_EVALS.value
          )
        else:
          # The evaluation DS is tiny, so we reshuffle it and take a random
          # sample of FLAG_NUMBER_EVAL_BATCHES batches each time.
          dataset_iterator = itertools.islice(
              eval_ds_instance, FLAG_NUMBER_EVAL_BATCHES.value
          )

        LOGGER.debug("Batching")
        for batch in dataset_iterator:
          # LOGGER.debug("Input sentence:\n\"%s\"",
          #              tokenizer.decode([x for x in batch["input_ids"][0]
          #                                if x != tokenizer.eos_token_id]))
          # LOGGER.debug("Label:\n\"%s\"",
          #              tokenizer.decode([(x if x != -100 else 0)
          #                                for x in batch["label_ids"][0]]))

          if FLAG_DATASET_TYPE.value != "tfr":
            batch = (
                model_specific.strategy
                .experimental_distribute_values_from_function(
                    tf_utils.make_dict_distribute_fn(batch)
                ))

          # We only care about training epochs as, obviously, we don't train
          # over eval samples; the number of eval samples seen only
          # contributes to lowering the variance in the evaluation of when to
          # do early stopping.
          if split == "train":
            did_at_least_one_training_batch = True

          input_ids = batch["input_ids"]
          label_ids = batch["label_ids"]

          ####################################################################
          # Training Step
          ####################################################################
          step_counters[split] += (
              FLAG_BATCH_SIZE.value * actual_num_replicas
          )

          if split == "train":
            batch_counters[split] += 1
            training_kwargs = dict(
                input_ids=input_ids,
                label_ids=label_ids,
            )

            if model_specific.strategy:
              utils.print_mem("before running", LOGGER)

              LOGGER.debug("Training, Calling strategy.run")
              loss = model_specific.strategy.run(
                  training_step,
                  kwargs=training_kwargs
              )
              LOGGER.debug("Training, Done with strategy.run")
              utils.print_mem("after running", LOGGER)

            else:
              loss = training_step(**training_kwargs)  # pytype: disable=wrong-arg-count
              # If we are in the strategy-free data parallel mode, we need
              # to change the weights of all replicas to those of the model at
              # index 0
              if (
                  FLAG_DISTRIBUTE_MODE.value ==
                  constants.DistributeModeChoices.split_and_data_parallel
              ):
                for replica in model_or_replicas[1:]:
                  replica.set_weights(model_or_replicas[0].get_weights())

          ####################################################################
          # Evaluation Step
          ####################################################################
          elif split == "eval":
            evaluation_kwargs = dict(
                input_ids=input_ids,
                label_ids=label_ids,
            )

            if model_specific.strategy:
              loss = model_specific.strategy.run(
                  evaluation_step,
                  kwargs=evaluation_kwargs
              )
            else:
              loss = evaluation_step(**evaluation_kwargs)
          else:
            raise ValueError(f"Unexpected value for split: {split}")

          ####################################################################
          # Logging
          ####################################################################
          if (FLAG_DISTRIBUTE_MODE.value in
              constants.PURE_DATA_PARALLEL_STRATEGIES):
            utils.check_equal(len(loss.values), actual_num_replicas)
            LOGGER.debug("Split: %s", split)
            LOGGER.debug("Real num replicas: %s", actual_num_replicas)
            LOGGER.debug("Loss: %s", loss)
            LOGGER.debug("Loss values: %s", loss.values)

            average_loss = float(tf.math.reduce_mean(loss.values).numpy())
          else:
            average_loss = float(loss.numpy())

          # tf.debugging.check_numerics(loss)
          now = time.time()
          batch_duration = now - prev_batch_end
          prev_batch_end = now
          ma_loss[split].update(average_loss)

          # Actual logging
          LOGGER.info("Epoch: # %d", epoch)
          LOGGER.info("Tensorboard_dir: %s", instance_output_dir)
          LOGGER.info("Batch: %s # %d", split, batch_counters[split])
          LOGGER.info("Step: %s # %d", split, step_counters[split])
          if FLAG_USE_SUBSET.value:
            LOGGER.warning(">> USING A SUBSET OF THE DATASET <<")
          LOGGER.info(
              "%(split)s Batch loss:           %(metric)f",
              dict(split=split, metric=average_loss)
          )
          LOGGER.info(
              "%(split)s Moving average loss:  %(metric)f",
              dict(split=split, metric=ma_loss[split].average)
          )
          LOGGER.info(
              "%(split)s Moving average ppl:   %(metric)f",
              dict(split=split, metric=np.exp(ma_loss[split].average))
          )
          LOGGER.info(
              "%(split)s Batch duration:       %(duration)s",
              dict(
                  split=split,
                  duration=utils.TimeStamp.from_seconds(
                      batch_duration).format()
              )
          )
          if FLAG_DISTRIBUTE_MODE.value in constants.DATA_PARALLEL_DMC:
            LOGGER.info(
                "%(split)s Duration per sample:  %(duration)s",
                dict(
                    split=split,
                    duration=utils.TimeStamp.from_seconds(
                        batch_duration / (
                            FLAG_BATCH_SIZE.value * actual_num_replicas
                        )
                    )
                )
            )

          # Write to Tensorboard
          with writers[split].as_default():
            tf.summary.scalar(
                f"Loss/{split}", average_loss, step_counters[split]
            )
            tf.summary.scalar(
                f"PPL/{split}", np.exp(average_loss), step_counters[split]
            )
          writers[split].flush()

          # Save a checkpoint every 20 minutes
          if (time.time() - secs_since_last_ckpt) / (60 * 20) >= 1:
            secs_since_last_ckpt = time.time()
            save_model(
                train_steps=step_counters["train"],
                model_or_replicas=model_or_replicas,
                instance_output_dir=instance_output_dir
            )

        secs_since_last_ckpt = time.time()
        save_model(
            train_steps=step_counters["train"],
            model_or_replicas=model_or_replicas,
            instance_output_dir=instance_output_dir
        )
    ############################################################################
    # Post Training Cleanup
    ############################################################################
    for writer in writers.values():
      writer.close()
Example #10
    os.makedirs(NOBG_DIR)


def removebg(image):
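    """Call the remove.bg REST API to strip the background from an image.

    Returns the cut-out as a new PIL Image on success; prints the error and
    returns None otherwise.
    """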
    response = requests.post(
        'https://api.remove.bg/v1.0/removebg',
        files={'image_file': utils.image_to_bytes(image)},
        data={'size': 'regular'},
        headers={'X-Api-Key': API_KEY},
    )

    if response.status_code == requests.codes.ok:
        return Image.open(io.BytesIO(response.content))
    else:
        print("Error:", response.status_code, response.text)


if __name__ == "__main__":
    data = utils.from_json_file(os.path.join(JSON_DIR, "data.json"))
    input_data = utils.from_json_file(os.path.join(DEBUG_DIR, "input.json"))
    nobg = []
    for face in input_data["faces"]:
        cropped = utils.get_cropped_image(data, face, config.FGFI_DIR)
        # removebg() takes only the cropped image; encode the returned
        # cut-out as base64 so it can be stored in JSON.
        nobg.append({
            "info": face,
            "image_data": base64.b64encode(
                utils.image_to_bytes(removebg(cropped))).decode("utf-8"),
        })
    utils.save_to_json_file(os.path.join(OUTPUT_DIR, "nobg.json"), nobg)
Example #11
def place_wt(the_shape, the_radial):
    """
    Place count based weighting used to post process the polygon data set
    :param: the_shape
    :param: the_radial
    """
    p_mod = PostProcess(shape=the_shape, radial=the_radial)
    #u_mod = Util(shape=the_shape, radial=the_radial)
    shape_and_size = p_mod.shape + '_' + str(p_mod.radial)
    aust_shape_file_name = ('aust_' + p_mod.shape + '_shape_' +
                            str(p_mod.radial) + 'km')
    gj_name = 'aus_' + shape_and_size + 'km_layer'
    vrt_ref = 'all_' + shape_and_size
    vrt_file = 'all_' + shape_and_size + '.vrt'
    feat_sa1_11 = 'feat_aust_' + str(p_mod.radial) + 'km_sa1_11'
    feat_sa1_16 = 'feat_aust_' + str(p_mod.radial) + 'km_sa1_16'
    db_name = 'db_place_' + shape_and_size
    tabular_sql_name = 'tabular_place_wt_' + shape_and_size + '.txt'
    output_shape = shape_and_size + 'km_place_11_16'
    p_mod.vrt_shape_and_size('vrt', 'template.vrt', vrt_file)
    p_mod.do_spatialite('table_goes_here.txt', db_name)
    
    datasets = from_json_file('datasets', p_mod.json_files_path, p_mod.slash)

    ref_data = datasets['DataSets']['Australia']['ShapeFormat']
    file_deploy(ref_data)

    ref_data = datasets['DataSets']['StatisticalAreasLevel12011']['ShapeFormat']
    file_deploy(ref_data)

    ref_data = datasets['DataSets']['StatisticalAreasLevel12016']['ShapeFormat']
    file_deploy(ref_data)

    ref_data = datasets['DataSets']['AGILDataset']['CSVFormat']
    file_deploy(ref_data)

    ref_data = datasets['DataSets']['OpenStreetMaps']['ShapeFormat']
    file_deploy(ref_data)
    
    #ref_files_poly_wt('datasets',p_mod.json_files_path, p_mod.slash)

    print('aust_shape')
    p_mod.geojson_to_shp(gj_name, aust_shape_file_name, 4283)
    p_mod.shp_to_db(aust_shape_file_name, db_name, aust_shape_file_name, 4823)

    print('feat_aust_11_area')
    p_mod.sql_to_ogr('feat_aust_11', vrt_ref, feat_sa1_11)
    p_mod.shp_to_db(feat_sa1_11, db_name, feat_sa1_11, 4823)

    print('feat_aust_16_area')
    p_mod.sql_to_ogr('feat_aust_16', vrt_ref, feat_sa1_16)
    p_mod.shp_to_db(feat_sa1_16, db_name, feat_sa1_16, 4823)

    print('tabular_place_wt')
    p_mod.csv_to_db('2011Census_B18_AUST_SA1_long',
                    db_name, '2011Census_B18_AUST_SA1_long')
    p_mod.csv_to_db('2011Census_B21_AUST_SA1_long',
                    db_name, '2011Census_B21_AUST_SA1_long')
    p_mod.csv_to_db('2011Census_B22B_AUST_SA1_long',
                    db_name, '2011Census_B22B_AUST_SA1_long')
    p_mod.csv_to_db('2016Census_G18_AUS_SA1',
                    db_name, '2016Census_G18_AUS_SA1')
    p_mod.csv_to_db('2016Census_G21_AUS_SA1',
                    db_name, '2016Census_G21_AUS_SA1')
    p_mod.csv_to_db('2016Census_G22B_AUS_SA1',
                    db_name, '2016Census_G22B_AUS_SA1')
    file_name = 'aust_{shape}_shape_{size}km'.format(shape=p_mod.shape,
                                                     size=p_mod.radial)
    p_mod.shp_to_db(file_name, db_name, file_name, 4823)
    p_mod.shp_to_db(feat_sa1_11, db_name, feat_sa1_11, 4823)
    p_mod.shp_to_db(feat_sa1_16, db_name, feat_sa1_16, 4823)
    p_mod.shp_to_db('gis_osm_places_free_1',
                    db_name, 'gis_osm_places_free_1', 4823)
    p_mod.shp_to_db('gis_osm_roads_free_1',
                    db_name, 'gis_osm_roads_free_1', 4823)
    p_mod.sql_to_ogr('shape_pois_shp', vrt_ref, 'POI')
    p_mod.shp_to_db('POI', db_name, 'POI', 4823)
    p_mod.shape_and_size('spatialite_db', 'tabular_place_wt.txt',
                         tabular_sql_name)
    p_mod.do_spatialite(tabular_sql_name, db_name)

    print('shape_11_16_place')
    p_mod.sql_to_ogr('shape_11_16_place', vrt_ref, output_shape)
    p_mod.shp_to_geojson(output_shape, output_shape)
    p_mod.shp_to_kml(output_shape, output_shape)