Example #1
    def train(self, data_from_labeled_set: List[tf.Tensor],
              data_from_unlabeled_set: List[tf.Tensor]):

        assert len(data_from_labeled_set) == len(data_from_unlabeled_set)
        y = tf.concat([
            tf.zeros(dtype=tf.int32, shape=[len(data_from_labeled_set)]),
            tf.ones(dtype=tf.int32, shape=[len(data_from_unlabeled_set)])
        ],
                      axis=0)
        x = tf.concat([data_from_labeled_set, data_from_unlabeled_set], axis=0)
        utils.check_equal(len(y), len(x))
        self._model.compile(optimizer="adam",
                            loss="binary_crossentropy",
                            metrics=["accuracy"])

        logging.info("Fitting Model")
        shuffle = np.random.permutation(len(x))

        x: np.ndarray = x.numpy()[shuffle].astype(np.int32)
        y = y.numpy()[shuffle]
        x_tr = x[:int(SPLIT * len(x))]
        x_va = x[int(SPLIT * len(x)):]
        y_tr = y[:int(SPLIT * len(y))]
        y_va = y[int(SPLIT * len(y)):]

        self._model.fit(x=x_tr,
                        y=y_tr,
                        batch_size=self.batch_size,
                        validation_data=(x_va, y_va),
                        verbose=True)

        # predict() returns probabilities; threshold them before comparing to the labels.
        predictions = (self._model.predict(x_va) > 0.5).astype(np.int32).reshape(-1)
        accuracy = np.mean(predictions == y_va)
        logging.info(f"Eval: {accuracy:0.2%}")
        logging.info(f"Done {type(self)}")
Example #2
def _create_float_feature(values, feature_len):
    feature_list = list(values)
    utils.check_equal(len(feature_list), feature_len)

    feature = tf.train.Feature(
        float_list=tf.train.FloatList(value=feature_list))
    return feature
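A hedged usage sketch (not part of the original snippet): a feature built this way is typically packed into a tf.train.Example and serialized for a TFRecord file. The feature key "scores" below is an illustrative name, not taken from the original project.

import tensorflow as tf

# Hypothetical usage of _create_float_feature (illustrative only).
scores = [0.1, 0.7, 0.2]
feature = _create_float_feature(scores, feature_len=3)
example = tf.train.Example(
    features=tf.train.Features(feature={"scores": feature}))
serialized = example.SerializeToString()  # ready for tf.io.TFRecordWriter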
Example #3
def process_strat_output(
    strategy_outputs,
    name,
    strategy,
    current_batch_size,
):
    """Uniformizes the different outputs of strategy.run calls."""
    if isinstance(strategy_outputs, values.PerReplica):
        strategy_outputs: values.PerReplica
        # LOGGER.debug("process_strat_output: %s: %s", name, str(strategy_outputs))
        output = deal_w_entry(strategy_outputs)
        utils.check_equal(output.shape, current_batch_size)
    elif (isinstance(strategy_outputs, tuple)
          and isinstance(strategy_outputs[0], values.PerReplica)):
        strategy_outputs: Tuple[values.PerReplica, Ellipsis]
        output = []
        for indiv_val in strategy_outputs:
            output.append(deal_w_entry(indiv_val))
        output = tuple(output)
    elif (isinstance(strategy_outputs, dict) and isinstance(
            next(iter(strategy_outputs.values())), values.PerReplica)):
        strategy_outputs: Dict[str, values.PerReplica]
        output = {}
        for k, indiv_val in strategy_outputs.items():
            output[k] = deal_w_entry(indiv_val)
    elif isinstance(strategy_outputs, ops.EagerTensor) or (
            isinstance(strategy_outputs, tuple)
            and isinstance(strategy_outputs[0], ops.EagerTensor)):
        output = strategy_outputs
    else:
        raise RuntimeError(
            f"{name}: {type(strategy_outputs)}, {type(strategy)}")

    return output
Example #4
def validate_instance_type_flag():
    # Validate the value:
    instance_tuple = _FLAG_INSTANCE_TYPE.value.strip().split("-")
    utils.check_equal(len(instance_tuple), 3)
    utils.check_contained(instance_tuple[0], {"n1", "n2"})

    utils.check_contained(instance_tuple[1], {"standard", "highmem"})
    num_cpus = int(instance_tuple[2])
    utils.check_operator(operator.le, num_cpus, 64)
    utils.check_operator(operator.ge, num_cpus, 0)
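Many of these examples lean on small `utils.check_*` helpers that are not shown on this page. A minimal sketch of what they could look like (an assumption, not the project's actual implementation):

# Hypothetical reimplementations of the check helpers used above (assumption).
def check_equal(a, b):
    if a != b:
        raise ValueError(f"check_equal failed: {a!r} != {b!r}")

def check_contained(value, container):
    if value not in container:
        raise ValueError(f"check_contained failed: {value!r} not in {container!r}")

def check_operator(op, a, b):
    # e.g. check_operator(operator.le, num_cpus, 64) asserts num_cpus <= 64.
    if not op(a, b):
        raise ValueError(f"check_operator failed: {op.__name__}({a!r}, {b!r})")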
Example #5
def _stack_per_sent(samples_a, samples_b):
    """Extract both of the sentences of a sample, and stack all of them.
    We need both sets of samples because we need to pad the the longuest
    sentence of both sets.

    There are two sentences in a sample. We want to train the filter
    as if they were independent samples. So, we extract the sentences from
    the samples by using the segment_ids. We added a third segment id
    for the padding in order to not get the padding when we filter
    with the segment_ids.
    """

    lengths = []
    packs = []

    for i, samples in enumerate([samples_a, samples_b]):
        # The [1:-1] slice removes the <cls> token and the <sep>
        # token from the first sentence of a sample.

        sents_0 = [
            sample["input_ids"][sample["segment_ids"] == 0][1:-1]
            for sample in tqdm.tqdm(samples)
        ]
        sents_1 = [
            sample["input_ids"][sample["segment_ids"] == 1][:-1]
            for sample in tqdm.tqdm(samples)
        ]
        # if i == 1:
        #     for sample in itertools.islice(samples, 0, 100, 10):
        #         logging.info(sample["segment_ids"])

        # itertools.chain simply chains iteration over two iterables:
        # [x for x in itertools.chain(range(3), range(3))] yields
        # [0, 1, 2, 0, 1, 2].
        length = max(itertools.chain(map(len, sents_0), map(len, sents_1)))
        packs.append((sents_0, sents_1))
        lengths.append(length)

    maxlen = max(lengths)
    output = []

    for pack in tqdm.tqdm(packs):
        sents = [
            tf.pad(sent, [[0, maxlen - len(sent)]])
            for sent in itertools.chain(*pack)
        ]
        output.append(sents)

    utils.check_equal(len(output), 2)
    return tf.stack(output[0]), tf.stack(output[1])
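The padding step above right-pads every sentence with zeros up to the length of the longest sentence across both sets. A tiny standalone illustration of that tf.pad call:

import tensorflow as tf

sent = tf.constant([101, 2054, 102])        # a short sentence of token ids
maxlen = 5
# [[0, n]] pads nothing before and n zeros after the single dimension.
padded = tf.pad(sent, [[0, maxlen - len(sent)]])
print(padded.numpy())                       # [ 101 2054  102    0    0]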
Example #6
def process_strat_output(
    strategy_outputs,
    name,
    strategy,
    current_batch_size,
):
    """Uniformizes the different outputs of strategy.run calls.
  """
    ##############################################################################
    # Single PerReplica
    ##############################################################################
    if isinstance(strategy_outputs, values.PerReplica):
        strategy_outputs: values.PerReplica
        output = to_eager_tensor(strategy_outputs)
        utils.check_equal(output.shape, current_batch_size)

    ##############################################################################
    # Tuple of PerReplicas
    ##############################################################################
    elif (isinstance(strategy_outputs, tuple)
          and isinstance(strategy_outputs[0], values.PerReplica)):
        strategy_outputs: Tuple[values.PerReplica, Ellipsis]
        output = []
        for indiv_val in strategy_outputs:
            output.append(to_eager_tensor(indiv_val))
        output = tuple(output)

    ##############################################################################
    # Dict of PerReplicas
    ##############################################################################
    elif (isinstance(strategy_outputs, dict) and isinstance(
            next(iter(strategy_outputs.values())), values.PerReplica)):
        strategy_outputs: Dict[str, values.PerReplica]
        output = {}
        for k, indiv_val in strategy_outputs.items():
            output[k] = to_eager_tensor(indiv_val)

    ##############################################################################
    # EagerTensor
    ##############################################################################
    elif (isinstance(strategy_outputs, ops.EagerTensor)
          or (isinstance(strategy_outputs, tuple)
              and isinstance(strategy_outputs[0], ops.EagerTensor))):
        output = strategy_outputs
    else:
        raise RuntimeError(
            f"{name}: {type(strategy_outputs)}, {type(strategy)}")

    return output
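`to_eager_tensor` (like `deal_w_entry` in the earlier variant) is not shown in these snippets. A plausible minimal version, assuming it only merges the per-replica components into a single tensor, might be:

import tensorflow as tf

def to_eager_tensor(per_replica):
    # Assumption: a PerReplica value exposes one tensor per device via `.values`;
    # concatenating them along the batch axis yields a single eager tensor.
    return tf.concat(per_replica.values, axis=0)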
Example #7
File: nginx.py Project: UnknowName/aliops
async def get_domain_attrs(request):
    data = await request.post()
    domain = data.get("domain", "")
    nginx_user, nginxs = config.get_domain_nginxs(domain)
    config_file = config.get_domain(domain).get("config_file", "")
    backend_port = config.get_domain(domain).get("backend_port")
    all_servers = [
        GatewayNGINX(nginx_user, host).get_servers(config_file, backend_port)
        for host in nginxs
    ]
    if not check_equal(all_servers):
        # The gateway data differ, possibly because a host connection failed.
        # Print them to the terminal for debugging.
        logger.info("Fetched gateway data for {0}; data is inconsistent: {1}".format(domain, all_servers))
        response = dict(servers=[], status="501", err_msg="An error occurred: gateway data is inconsistent")
    else:
        ok, servers = all_servers.pop()
        if ok and servers:
            response = dict(servers=tuple(servers), status="200", err_msg="")
        else:
            # If the remote command failed, the servers variable holds the
            # standard error output.
            if not servers:
                stderr = "Could not retrieve the backend servers; please contact an administrator to verify the configuration"
            else:
                stderr = servers
            logger.info("Failed to get upstreams, output: {0}".format(stderr))
            response = dict(servers=[], status="500", err_msg=str(stderr))
    return web.json_response(response)
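Here `check_equal` takes a single list and verifies that every gateway returned the same data. A minimal sketch of such a helper (an assumption; the project's implementation is not shown):

def check_equal(items):
    items = list(items)
    # True when the list is empty or every entry equals the first one.
    return all(item == items[0] for item in items)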
Example #8
def create_one_vm_vm():
    runtime = _ONEVM_RUNTIME_VERSION

    if runtime == "v2-alpha":
        utils.check_equal(_FLAG_TPU_QTY.value, "8")

    command = [
        "gcloud",
        "alpha",
        "compute",
        "tpus",
        "tpu-vm",
        "create",
        f"{_FLAG_INSTANCE_NAME.value}",
        f"--zone={_FLAG_ZONE.value}",
        f"--accelerator-type={make_accelerator_type()}",
        f"--version={runtime}",
    ]

    run_gcloud_command(command)
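`run_gcloud_command` and `make_accelerator_type` are not shown in this snippet. A plausible minimal version of the former (an assumption) would simply shell out and raise on a non-zero exit code:

import subprocess

def run_gcloud_command(command):
    # Log the exact command, then let subprocess raise if gcloud fails.
    print(" ".join(command))
    subprocess.check_call(command)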
Example #9
    qty_shuffle=1,  # Will never change
    max_length_generation=350
  ), tokenizer, BATCH_SIZE, SPLIT)

num_entries_in_split = (
  task_specific.DATASET_CARDINALITIES["kilt_eli5"][SPLIT]
)
entries_counter = tqdm.tqdm(total=num_entries_in_split)
for batch_no, batch in enumerate(itertools.islice(ds, NUM_ENTRIES)):
  #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  # Display the inputs and outputs.
  #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  rich_console = rich.console.Console(color_system="256")
  print_sample = generation.make_print_sample()

  assert not np.all(batch[0] == batch[1]), batch[0] == batch[1]
  
  with utils.log_duration(
      LOGGER, "main", "all of tokenizer.decode for a batch."
  ):
    for i in range(batch.shape[0]):
      print(f"{batch.shape = }")
      utils.check_equal(len(batch.shape), 2)
      utils.check_equal(batch.shape[0], BATCH_SIZE)
      tokens = batch.numpy()[i]
      input_text = tokenizer.decode(tokens)
      print(f"Batch {batch_no}, Sample {i} / {BATCH_SIZE} of batch:")
      print(f"\tNum tokens: {len(tokens)}")
      print_sample(
        input_text, f"input batch_no {batch_no}", rich_console
      )
Example #10
File: cis3.py Project: ysoldak/syco
__email__ = "*****@*****.**"
__credits__ = ["???"]
__license__ = "???"
__version__ = "1.0.0"
__status__ = "Production"

from utils import check_empty, check_equal, check_equal_re, check_equals, check_not_empty, check_return_code, print_header, view_output, print_warning, print_info

from utils import x, assert_contains

#
print_header("3. Special Purpose Services")

#
print_header("3.1 Set Daemon umask (Scored)")
check_equal("grep umask /etc/sysconfig/init", "umask 027")

#
print_header("3.2 Remove X Windows (Scored)")
# Original CIS test
# check_equal(
#     'grep "^id:" /etc/inittab',
#     "id:3:initdefault"
# )
# Syco hardened servers use this.
check_equal('grep "^\~\~\:S\:wait\:\/sbin\/sulogin" /etc/inittab',
            "~~:S:wait:/sbin/sulogin")

result = x('yum grouplist "X Window System"')
max_lines = len(result)
assert_contains(result[max_lines - 3], "Available Groups:")
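In the syco CIS hardening scripts, `check_equal` appears to run a shell command and check that the expected text occurs in its output. A rough sketch under that assumption (not the project's actual code):

import subprocess

def check_equal(command, expected):
    # Run the audit command and report whether the expected text shows up.
    result = subprocess.run(command, shell=True, capture_output=True, text=True)
    if expected in result.stdout:
        print(f"OK   {command}")
    else:
        print(f"FAIL {command} (expected {expected!r})")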
Example #11
def main(argv):
    if len(argv) > 1:
        raise RuntimeError(argv)
    absl_logging.use_python_logging()
    retriever_config = tf_utils.REALMSave(
        **utils.from_json_file(_FLAG_RETRIEVER_CONFIG_PATH.value))

    extra = "_FROM_SUBSET" if _FLAG_USE_SUBSET.value else ""
    time_stamp = time.strftime("%Y%m%d-%H%M%S")
    target_path = os.path.join(_FLAG_OUTPUT_PATH.value,
                               time_stamp + extra).strip()
    if target_path[-1] != "/":
        target_path += "/"

    ##############################################################################
    # Setup devices and strategy
    ##############################################################################
    with utils.log_duration(LOGGER, "main", "Initializing devices"):
        tpu_config = tf_utils.init_tpus()
        device_type = tf_utils.current_accelerator_type()
        LOGGER.debug("Devices: %s", str(tf_utils.devices_to_use()))

        if device_type == "TPU":
            if tpu_config is None:
                raise RuntimeError("We should have a tpu_config.")
            strategy = tf.distribute.TPUStrategy(tpu_config.resolver)
            batch_size = len(
                tf_utils.devices_to_use()) * _FLAG_BATCH_SIZE.value
        elif device_type == "GPU" or device_type == "CPU":
            strategy = tf.distribute.MirroredStrategy()
            batch_size = len(
                tf_utils.devices_to_use()) * _FLAG_BATCH_SIZE.value
        else:
            raise RuntimeError(device_type)

    ##############################################################################
    # Load the dataset.
    ##############################################################################
    eli5 = {}
    keys = ["train", "eval", "test"]
    gpt2_tokenizer = transformers.GPT2TokenizerFast.from_pretrained("gpt2-xl")
    gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token

    with utils.log_duration(LOGGER, "main", "Loading the ELI5 datasets."):
        for split in tqdm.tqdm(keys):
            load_path = os.path.join(_FLAGS_DATASET_ROOT.value,
                                     "HuggingfaceDatasets",
                                     f"{split}_kilt_eli5.hf")
            with tf.device("/job:localhost"):
                eli5[split] = datasets.load_from_disk(load_path)

    if _FLAG_USE_SUBSET.value:
        _warn_subset()

    ##############################################################################
    #
    ##############################################################################
    with utils.log_duration(LOGGER, "Main", "Load the textual dataset"):
        # Extract the appropriate text
        # The buffer_size is taken from the original ORQA code.
        blocks_dataset = tf.data.TFRecordDataset(retriever_config.text_records,
                                                 buffer_size=512 * 1024 * 1024)
        blocks_dataset = blocks_dataset.batch(
            retriever_config.num_block_records, drop_remainder=True)
        blocks = tf.data.experimental.get_single_element(blocks_dataset)

    with tempfile.TemporaryDirectory() as tmp_dir:
        ############################################################################
        # Prepare the output file.
        ############################################################################
        tmp_dir = pathlib.Path(tmp_dir)
        h5_output_path = tmp_dir / "codes.h5"
        output_file = h5py.File(h5_output_path, "w")
        flags_dict = {
            flag.name: flag.value
            for flag in flags.FLAGS.flags_by_module_dict()[argv[0]]
        }
        utils.to_json_file(tmp_dir / "params.json", flags_dict)

        for split in keys:
            with utils.log_duration(
                    LOGGER, "main",
                    "Creating the output hdf5 file, embeddings."):
                num_entries = len(eli5[split]["id"])
                if _FLAG_USE_SUBSET.value:
                    num_entries = min(num_entries, _FLAG_SUBSET_AMOUNT.value)
                split_group = output_file.create_group(split)

            with utils.log_duration(
                    LOGGER, "main",
                    "Creating the output hdf5 file, retrieval."):
                split_group.create_dataset(
                    constants.CTH5Fields.distances,
                    shape=(num_entries, _FLAG_NUM_RETRIEVALS.value),
                    dtype=np.float32,
                )
                split_group.create_dataset(
                    constants.CTH5Fields.gpt2_question_ids_inputs,
                    shape=(num_entries, _FLAG_CONTEXT_SIZE.value),
                    dtype=np.int32)
                if split != "test":
                    split_group.create_dataset(
                        constants.CTH5Fields.gpt2_answer_ids_inputs,
                        shape=(num_entries, _FLAG_CONTEXT_SIZE.value),
                        dtype=np.int32)

                split_group.create_dataset(
                    constants.CTH5Fields.gpt2_retrieved_ids,
                    shape=(
                        num_entries,
                        _FLAG_NUM_RETRIEVALS.value,
                        _FLAG_MAX_LENGTH_RETRIEVALS.value,
                    ),
                    dtype=np.int32)

            with utils.log_duration(LOGGER, "main",
                                    "Loading the reference db."):
                checkpoint_path = os.path.join(
                    retriever_config.query_embedder_path, "encoded",
                    "encoded.ckpt")

                reference_db_device = tf_utils.device_mapping().CPUs[0].name
                with tf.device(reference_db_device):
                    reference_db = tf_utils.load_reference_db(
                        checkpoint_path,
                        variable_name="block_emb",
                    )

        ############################################################################
        # Prep the encoder and the tokenizer
        ############################################################################
        with utils.log_duration(
                LOGGER, "main",
                "Loading the encoder model and the tokenizer."):
            with strategy.scope():
                query_encoder = hub.load(retriever_config.query_embedder_path,
                                         tags={})
            encode_fn = _make_encode_fn(query_encoder)
            encode_fn_strategy_run = _make_encode_fn_strategy_run_fn(
                strategy=strategy,
                encode_fn=encode_fn,
            )

            vocab_file = os.path.join(retriever_config.query_embedder_path,
                                      "assets", "vocab.txt")
            utils.check_exists(vocab_file)
            do_lower_case = query_encoder.signatures["tokenization_info"](
            )["do_lower_case"]
            tokenization_info = dict(vocab_file=vocab_file,
                                     do_lower_case=do_lower_case)

            tokenizer, vocab_lookup_table = bert_utils.get_tf_tokenizer(
                query_encoder, tokenization_info)

        ############################################################################
        # Preprocess the dataset
        ############################################################################

        cls_token_id = tf.cast(vocab_lookup_table.lookup(tf.constant("[CLS]")),
                               tf.int32)
        sep_token_id = tf.cast(vocab_lookup_table.lookup(tf.constant("[SEP]")),
                               tf.int32)
        transform = _make_transform_fn(
            bert_tokenizer=tokenizer,
            bert_cls_token_id=cls_token_id,
            bert_sep_token_id=sep_token_id,
        )

        with utils.log_duration(LOGGER, "main", "generating codes"):
            tqdm_splits = tqdm.tqdm(keys)
            for split in tqdm_splits:
                tqdm_splits.set_description(f"Split `{split}`")
                eli5: Dict[str, datasets.Dataset]
                write_start = 0

                if _FLAG_USE_SUBSET.value:
                    _warn_subset(tqdm_splits)
                    eli5[split] = eli5[split][:_FLAG_SUBSET_AMOUNT.value]
                    utils.check_operator(operator.le, len(eli5[split]["id"]),
                                         _FLAG_SUBSET_AMOUNT.value)
                    utils.check_operator(operator.le,
                                         len(eli5[split]["input"]),
                                         _FLAG_SUBSET_AMOUNT.value)
                else:
                    utils.check_equal(len(eli5[split]), len(eli5[split]["id"]))
                    utils.check_equal(len(eli5[split]),
                                      len(eli5[split]["input"]))

                if split != "test":
                    for_slices = dict(sample_id=eli5[split]["id"],
                                      question=eli5[split]["input"],
                                      answer=[
                                          sample["answer"][0]
                                          for sample in eli5[split]["output"]
                                      ])
                else:
                    for_slices = dict(
                        sample_id=eli5[split]["id"],
                        question=eli5[split]["input"],
                    )

                ds = tf.data.Dataset.from_tensor_slices(for_slices)
                ds = ds.map(transform,
                            num_parallel_calls=tf.data.experimental.AUTOTUNE)

                ds = ds.apply(
                    tf.data.experimental.dense_to_ragged_batch(batch_size))
                ds = ds.map(_squeeze,
                            num_parallel_calls=tf.data.experimental.AUTOTUNE)

                tqdm_inner = tqdm.tqdm(enumerate(ds),
                                       total=len(eli5[split]["id"]) //
                                       _FLAG_BATCH_SIZE.value,
                                       desc=f"Split `{split}`: Batches")

                for i, batch in tqdm_inner:
                    ######################################################################
                    # Enforce the current real batch size
                    ######################################################################
                    current_batch_size = batch["sample_id"].shape[0]
                    for k, v in batch.items():
                        utils.check_equal(v.shape[0], current_batch_size)
                    ######################################################################

                    gpt2_question_ids_inputs = _prep_field(
                        batch["question"], gpt2_tokenizer)
                    utils.check_equal(gpt2_question_ids_inputs.dtype, np.int32)
                    utils.check_equal(gpt2_question_ids_inputs.shape[0],
                                      current_batch_size)

                    if split != "test":
                        gpt2_answer_ids_inputs = _prep_field(
                            batch["answer"], gpt2_tokenizer)
                        utils.check_equal(gpt2_answer_ids_inputs.dtype,
                                          np.int32)
                        utils.check_equal(gpt2_answer_ids_inputs.shape[0],
                                          current_batch_size)

                        assert len(gpt2_answer_ids_inputs.shape) == 2, (
                            gpt2_answer_ids_inputs.shape)

                    ######################################################################
                    # Save the gpt2 tokenized question and answer
                    ######################################################################
                    end = write_start + current_batch_size

                    utils.check_equal(
                        output_file[split][
                            constants.CTH5Fields.gpt2_question_ids_inputs]
                        [write_start:end].shape[0], current_batch_size)
                    output_file[split][
                        constants.CTH5Fields.gpt2_question_ids_inputs][
                            write_start:end] = gpt2_question_ids_inputs

                    if split != "test":
                        output_file[split][
                            constants.CTH5Fields.gpt2_answer_ids_inputs][
                                write_start:end] = gpt2_answer_ids_inputs

                    ######################################################################
                    # Encode the samples.
                    ######################################################################
                    batch = strategy.experimental_distribute_values_from_function(
                        tf_utils.make_dict_distribute_fn(batch))

                    embeddings = encode_fn_strategy_run(batch)
                    embeddings = tf_utils.process_strat_output(
                        embeddings, "embeddings", strategy, current_batch_size)
                    utils.check_isinstance(embeddings, ops.EagerTensor)
                    utils.check_equal(embeddings.shape[0], current_batch_size)

                    # pytype doesn't seem to see that we check the type
                    utils.check_equal(embeddings.shape[1],
                                      _FLAG_EMBEDDING_DEPTH.value)  # pytype: disable=attribute-error

                    ######################################################################
                    # Retrieve.
                    ######################################################################
                    with tf.device(reference_db_device):
                        top_k, inner_prods = tf_utils.mips_exact_search(
                            embeddings, _FLAG_NUM_RETRIEVALS.value,
                            reference_db)
                    top_k = tf_utils.process_strat_output(
                        top_k, "top_k", strategy, current_batch_size)
                    utils.check_equal(
                        inner_prods.shape,
                        (current_batch_size, _FLAG_NUM_RETRIEVALS.value))
                    utils.check_equal(
                        top_k.shape,
                        (current_batch_size, _FLAG_NUM_RETRIEVALS.value))

                    output_file[split]["distances"][
                        write_start:end] = inner_prods

                    gathered = tf.gather(blocks, top_k).numpy()
                    utils.check_equal(gathered.shape[0], current_batch_size)

                    utils.check_equal(write_start + gathered.shape[0], end)
                    for j in range(gathered.shape[0]):
                        local_gathered = gathered[j].tolist()
                        utils.check_equal(len(local_gathered),
                                          _FLAG_NUM_RETRIEVALS.value)
                        local_gathered = [
                            sample.decode() for sample in local_gathered
                        ]
                        token_ids = np.array(
                            gpt2_tokenizer.batch_encode_plus(
                                local_gathered,
                                padding="max_length",
                                truncation=True,
                            ).input_ids)
                        for line in token_ids:
                            assert not np.all(line == 0), line

                        token_ids[token_ids ==
                                  gpt2_tokenizer.eos_token_id] = -1
                        output_file[split][
                            constants.CTH5Fields.gpt2_retrieved_ids][
                                write_start +
                                j] = token_ids[:, :_FLAG_MAX_LENGTH_RETRIEVALS.
                                               value]

                    write_start += current_batch_size
        ############################################################################
        # Upload the results to GCS
        ############################################################################
        LOGGER.debug("DONE WITH THE PRODUCTION")
        output_file.close()
        with utils.log_duration(LOGGER, "main", "gsutil transfer"):
            command = [
                "/root/google-cloud-sdk/bin/gsutil", "-m", "cp", "-r",
                str(tmp_dir / "*"), target_path
            ]
            LOGGER.debug("Command: %s", " ".join(command))
            subprocess.check_call(command)
        LOGGER.debug("ALL DONE")
Example #12
File: cis1.py Project: Nemie/syco
    check_return_code,
    print_header,
    view_output,
    print_warning,
    print_info,
)

#
print_header("1. Install Updates, Patches and Additional Security Software")

#
print_header("1.1 Filesystem Configuration")

#
print_header("1.1.1 Create Separate Partition for /tmp (Scored)")
check_equal('grep "[[:space:]]/tmp[[:space:]]" /etc/fstab', "/tmp")

#
print_header("1.1.2 Set nodev option for /tmp Partition (Scored)")
# No tmp partition should have nodev.
check_equal("grep /tmp /etc/fstab", "nodev")
check_equal("mount | grep /tmp", "nodev")

#
print_header("1.1.3 Set nosuid option for /tmp Partition (Scored)")
# No tmp partition should have nosuid.
check_equal("grep /tmp /etc/fstab", "nosuid")
check_equal("mount | grep /tmp", "nosuid")

#
print_header("1.1.4 Set noexec option for /tmp Partition (Scored)")
Example #13
def create_lm_ds_kilt_eli5(
    *,
    tokenizer,
    context_window_size,
    dataset_name,  # pylint: disable=unused-argument
    batch_size,
    split,
    db_path,  # pylint: disable=unused-argument
    random_seed,
    use_subset,  # pylint: disable=unused-argument
    subset_size,  # pylint: disable=unused-argument
    repeat,
    use_helper_words,
    approach_type,
    retriever,
    num_retrievals,
    retrieval_temperature,
    enable_debug_checks,
    retrieval_bank_size,  # pylint: disable=unused-argument
    dataset_type,
    qty_shuffle,
    tfr_prefix,
    max_length_generation,
):
    """Dataset preparation function for the Kilt version of the ELI5 dataset.

  This is for when the dataset is consumed by language models.

  Args:
    tokenizer: Tokenizer of the reader model.
    context_window_size: Size of the context of the reader model.
      Not used here.
    dataset_name: Exact name of the dataset. Some datasets share the same
      function, with small specific differences. Not used here.
    batch_size: Size of the batch for the reader model.
    split: The train, evaluation or test split.
    random_seed: Seed used to shuffle the dataset. Should change at each epoch.
    use_subset: Whether to use a subset of the data
    subset_size: Size of the subset
    repeat: Whether to repeat the dataset
    use_helper_words: Whether to add helper words in the merged samples.
    approach_type: Type of overall solution we are using.
    retriever: Object that does the retrieval.
    num_retrievals: Number of retrievals to do.
    retrieval_temperature: For the retrieval methods that do sampling, what
      temperature to use.
  Returns:
    A tf.data.Dataset object that generates input_ids and label_ids for the
    generator model.
  Raises:
    RuntimeError: If we didn't find any files with the glob pattern.
    RuntimeError: If we are using a dataset type that is not supported.
  """

    maybe_retrieve_and_merge = _make_maybe_retrieve_and_merge_fn(
        tokenizer=tokenizer,
        context_size=context_window_size,
        retriever=retriever,
        temperature=retrieval_temperature,
        num_retrievals=num_retrievals,
        ds_split=split,
        approach_type=approach_type,  # FLAG_APPROACH_TYPE.value
        use_helper_words=use_helper_words,  # FLAG_USE_HELPER_WORDS
        enable_debug_checks=enable_debug_checks,
        max_length_generation=max_length_generation,
    )
    utils.check_equal(dataset_type, constants.DatasetTypeChoices.tfr)
    glob_pattern = os.path.join(tfr_prefix, f"{split}*")
    filenames = list(tf.io.gfile.glob(glob_pattern))
    if not filenames:
        raise RuntimeError(
            f"filnames is empty. Glob pattern was: {glob_pattern}")

    parse = make_parse_fn(split, context_window_size)

    ds = tf.data.TFRecordDataset(
        filenames=filenames,
        num_parallel_reads=tf.data.experimental.AUTOTUNE,
    )

    ds = ds.map(
        parse,
        num_parallel_calls=tf.data.experimental.AUTOTUNE,
        deterministic=False,
    )

    if repeat:
        ds = ds.repeat()

    utils.check_not_none(random_seed)
    utils.check_not_none(qty_shuffle)
    ds = ds.shuffle(qty_shuffle, seed=random_seed)

    ds = ds.batch(
        batch_size,
        drop_remainder=split != constants.SplitChoices.test,
    )

    # We can't use parallel calls here; the huggingface Rust fast tokenizer
    # breaks with multiple threads. It still seems worth using over the slow
    # tokenizer, even without parallel calls.

    ds = ds.map(maybe_retrieve_and_merge)
    # return map(maybe_retrieve_and_merge, ds)
    return ds
Example #14
File: cis2.py Project: Nemie/syco
__version__ = "1.0.0"
__status__ = "Production"


from utils import check_empty, check_equal, check_equal_re, check_equals, check_not_empty, check_return_code, print_header, view_output, print_warning, print_info

#
print_header("2. OS Services")

#
print_header("2.1 Remove Legacy Services")

#
print_header("2.1.1 Remove telnet-server (Scored)")
check_equal(
    "rpm -q telnet-server",
    "package telnet-server is not installed"
)

#
print_header("2.1.2 Remove telnet Clients (Scored)")
check_equal(
    "rpm -q telnet",
    "package telnet is not installed"
)

#
print_header("2.1.3 Remove rsh-server (Scored)")
check_equal(
    "rpm -q rsh-server",
    "package rsh-server is not installed"
)
Example #15
File: cis3.py Project: Nemie/syco
__license__ = "???"
__version__ = "1.0.0"
__status__ = "Production"


from utils import check_empty, check_equal, check_equal_re, check_equals, check_not_empty, check_return_code, print_header, view_output, print_warning, print_info

from utils import x, assert_contains

#
print_header("3. Special Purpose Services")

#
print_header("3.1 Set Daemon umask (Scored)")
check_equal(
    "grep umask /etc/sysconfig/init",
    "umask 027"
)

#
print_header("3.2 Remove X Windows (Scored)")
# Original CIS test
# check_equal(
#     'grep "^id:" /etc/inittab',
#     "id:3:initdefault"
# )
# Syco hardened servers use this.
check_equal(
    'grep "^\~\~\:S\:wait\:\/sbin\/sulogin" /etc/inittab',
    "~~:S:wait:/sbin/sulogin"
)
Example #16
def main(argv):
    #######################################################################
    # Initial Setup. Logging, Flags, Random seeds.
    #######################################################################
    if len(argv) > 1:
        raise app.UsageError("Too many command-line arguments.")
    absl_logging.use_python_logging()
    flags_dict = {
        flag.name: flag.value
        for flag in FLAGS.flags_by_module_dict()[argv[0]]
    }

    if FLAGS.use_subset:
        message = (f"{colorama.Back.RED}{colorama.Fore.WHITE}"
                   f"{colorama.Style.BRIGHT}USING A SUBSET OF THE DATASET"
                   f"{colorama.Style.RESET_ALL}")
        LOGGER.warning(message)

    utils.log_module_args(LOGGER, argv[0])
    if not FLAGS.output_dir.startswith("gs://"):
        utils.check_exists(FLAG_OUTPUT_DIR.value)
        if not tf.io.gfile.isdir(FLAG_OUTPUT_DIR.value):
            raise RuntimeError("Output dir needs to be a directory.")

    tf.random.set_seed(FLAG_RANDOM_SEED.value)
    np.random.seed(FLAG_RANDOM_SEED.value)

    # Prepare the instance output directory path and save the config there
    folder_name = time.strftime(
        f"{FLAG_RUN_NAME.value}_{FLAG_APPROACH_TYPE.value}_%Y%m%d-%H%M%S")
    instance_output_dir = os.path.join(FLAG_OUTPUT_DIR.value,
                                       folder_name).strip()
    if not instance_output_dir.endswith("/"):
        instance_output_dir += "/"
    json_target = os.path.join(instance_output_dir, "training_params.json")
    if not json_target.strip().startswith("gs://"):
        subprocess.check_call(["mkdir", "-p", instance_output_dir])
    utils.to_json_file(json_target, flags_dict)

    ##############################################################################
    # Initialization and Configuration of the Devices.
    ##############################################################################
    tpu_setup = None
    # current_accelerator_type is always "CPU" in the beginning with TPUs
    if tf_utils.current_accelerator_type() == "CPU":
        tpu_setup = tf_utils.init_tpus()

    LOGGER.debug("Devices we are computing on:\n%s",
                 utils.wrap_iterable(map(str, tf_utils.devices_to_use())))
    LOGGER.debug("All devices:")
    LOGGER.debug(tf_utils.device_mapping())

    if tf_utils.current_accelerator_type() == "GPU":
        tf.config.set_soft_device_placement(True)

    if tf_utils.current_accelerator_type() != "TPU":
        tf.debugging.set_log_device_placement(True)

    if FLAG_DISTRIBUTE_MODE.value in constants.PURE_DATA_PARALLEL_STRATEGIES:
        actual_num_replicas = len(tf_utils.devices_to_use())
    elif FLAG_DISTRIBUTE_MODE.value in constants.DATA_PARALLEL_DMC:
        actual_num_replicas = FLAG_NUM_REPLICAS.value
    else:
        actual_num_replicas = 1

    ##############################################################################
    # We load the retriever model if it is needed.
    ##############################################################################
    # Not currently used.

    retriever = None
    # if (FLAG_APPROACH_TYPE.value ==
    #     constants.ApproachTypeChoices.lm_and_realm):
    #   raise NotImplementedError("This part needs to be tested anew.")
    # config_path = FLAG_RETRIEVER_CONFIG_PATH.value
    # realm_save = tf_utils.REALMSave(**utils.from_json_file(config_path))
    #
    # # Approx 15 min when not in dev mode, on CPU
    # with utils.log_duration(LOGGER, "main",
    #                         "whole of BERTScaNNRetriever.__init__",
    #                         logging.INFO):
    #   scann_config = retrievers.ScannConfig(
    #       **utils.from_json_file(FLAG_SCANN_CONFIG_PATH.value))
    #   retriever = retrievers.BERTScaNNRetriever(
    #       retriever_module_path=realm_save.query_embedder_path,
    #       block_records_path=realm_save.text_records,
    #       num_block_records=realm_save.num_block_records,
    #       mode=tf.estimator.ModeKeys.EVAL,
    #       scann_config=scann_config)

    # elif (FLAG_APPROACH_TYPE.value ==
    #       constants.ApproachTypeChoices.cached_realm):
    #   raise NotImplementedError("This part needs to be tested anew.")
    # config_path = FLAG_RETRIEVER_CONFIG_PATH.value
    # realm_save = tf_utils.REALMSave(**utils.from_json_file(config_path))
    #
    # # Approx 15 min when not in dev mode, on CPU
    # with utils.log_duration(LOGGER, "main",
    #                         "whole of FullyCachedRetriever.__init__",
    #                         logging.INFO):
    #
    #   retriever = retrievers.FullyCachedRetriever(
    #       db_path=FLAG_FULLYCACHED_H5_PATH.value,
    #       block_records_path=realm_save.text_records,
    #       num_block_records=realm_save.num_block_records,
    #       )

    ##############################################################################
    # Distributed training task
    ##############################################################################
    if FLAG_TASK.value == constants.TaskChoices.train:
        with utils.log_duration(LOGGER, "main", "Load model"):
            utils.print_mem("before loading model", LOGGER)
            model_specific = task_specific.load_model(
                FLAG_MODEL_LOAD_PATH.value, FLAG_MODEL_KEY.value,
                FLAG_DISTRIBUTE_MODE.value, tpu_setup, FLAG_NUM_REPLICAS.value)
            utils.print_mem("after loading model", LOGGER)
            model_or_replicas = model_specific.model
            if isinstance(model_or_replicas, list):
                model_or_replicas: List[transformers.TFGPT2LMHeadModel]
            else:
                model_or_replicas: transformers.TFGPT2LMHeadModel

            tokenizer = model_specific.tokenizer

            def make_optimizer():
                return tensor2tensor.utils.adafactor.AdafactorOptimizer(
                    learning_rate=FLAG_LEARNING_RATE.value)

            if model_specific.strategy:
                with model_specific.strategy.scope():
                    optimizer = make_optimizer()
            else:
                optimizer = make_optimizer()

        ############################################################################
        # Prepare the dataset functions
        ############################################################################
        rg = np.random.default_rng(FLAG_RANDOM_SEED.value)

        def call_lm_preproc(repeat, split, random_seed):
            """Using functools.partial prevents the linter from doing its job."""
            if FLAG_DATASET_NAME.value == constants.DatasetNameChoices.kilt_eli5:
                return task_specific.create_lm_ds_kilt_eli5(
                    tokenizer=tokenizer,
                    context_window_size=(
                        model_or_replicas[0].config.n_positions if isinstance(
                            model_or_replicas,
                            list) else model_or_replicas.config.n_positions),
                    dataset_name=FLAG_DATASET_NAME.value,
                    # Batches are split over the replicas:
                    batch_size=FLAG_BATCH_SIZE.value * actual_num_replicas,
                    db_path=FLAG_DB_PATH.value,
                    random_seed=random_seed,
                    use_subset=FLAG_USE_SUBSET.value,
                    subset_size=FLAG_SUBSET_SIZE.value,
                    use_helper_words=FLAG_USE_HELPER_WORDS.value,
                    approach_type=FLAG_APPROACH_TYPE.value,
                    num_retrievals=FLAG_NUM_RETRIEVALS.value,
                    retrieval_temperature=FLAG_RETRIEVAL_TEMPERATURE.value,
                    retriever=retriever,
                    repeat=repeat,
                    split=split,
                    enable_debug_checks=FLAG_DATASET_DEBUG.value,
                    retrieval_bank_size=FLAG_RETRIEVAL_BANK_SIZE.value,
                    dataset_type=FLAG_DATASET_TYPE.value,
                    qty_shuffle=FLAG_QTY_SHUFFLE.value,
                    tfr_prefix=FLAG_TFR_PREFIX.value,
                    max_length_generation=FLAG_MAX_LENGTH_GENERATION.value,
                )
            else:
                raise NotImplementedError(
                    f"FLAG_DATASET_NAME.value unsupported: `{FLAG_DATASET_NAME.value}`"
                )

        make_training_dataset: Callable[Ellipsis,
                                        tf.data.Dataset] = functools.partial(
                                            call_lm_preproc,
                                            split="train",
                                            repeat=False,
                                        )
        make_eval_dataset: Callable[Ellipsis,
                                    tf.data.Dataset] = functools.partial(
                                        call_lm_preproc,
                                        split="eval",
                                        repeat=True,
                                    )

        ############################################################################
        # Prepare the step functions
        ############################################################################
        utils.check_contained(FLAG_DISTRIBUTE_MODE.value,
                              constants.DistributeModeChoices.choices())
        tf_function_flags = dict(
            experimental_compile=FLAG_EXPERIMENTAL_COMPILE.value,
            experimental_relax_shapes=not FLAG_INPUT_FIXED_SIZE.value)

        if (FLAG_DISTRIBUTE_MODE.value ==
                constants.DistributeModeChoices.split_and_data_parallel):
            if not isinstance(model_or_replicas, list):
                raise RuntimeError(type(model_or_replicas))
            training_step = build_manual_data_parallel_training_step(
                model_or_replicas, optimizer, tf_function_flags)

        else:
            training_step = build_regular_training_step(
                model_or_replicas,
                optimizer,
                strategy=model_specific.strategy,
                tf_function_kwargs=tf_function_flags)

        evaluation_step = build_evaluation_step(model_or_replicas,
                                                tf_function_flags)

        secs_since_last_ckpt = time.time()
        # Model checkpoints are saved to the tmp_directory and then rsynced to GCS
        ##########################################################################
        # Prepare the different logging facilities
        ##########################################################################
        train_log_dir = os.path.join(instance_output_dir, "tensorboard",
                                     "train")
        eval_log_dir = os.path.join(instance_output_dir, "tensorboard", "eval")
        flags_log_dir = os.path.join(instance_output_dir, "tensorboard",
                                     "params")
        writers = dict(train=tf.summary.create_file_writer(train_log_dir),
                       eval=tf.summary.create_file_writer(eval_log_dir),
                       flags=tf.summary.create_file_writer(flags_log_dir))
        with writers["flags"].as_default():
            tf.summary.text(
                "Flags",
                # Tensorboard takes Markdown:
                json.dumps(flags_dict, indent=4).replace("\n", "\n\n"),
                step=0)

        ma_loss = dict(train=utils.MovingAverage(0.9),
                       eval=utils.MovingAverage(0.9))
        step_counters = dict(train=0, eval=0)
        batch_counters = dict(train=0, eval=0)
        prev_batch_end = time.time()

        # The eval ds has no real concept of epoch, repeats forever, shuffling
        # each time it reaches its end
        with utils.log_duration(LOGGER, "main", "All of make_eval_dataset"):
            eval_ds_instance = make_eval_dataset(random_seed=rg.integers(
                -2**63, 2**63 - 1), )
        LOGGER.debug("Distributing the eval dataset to the replicas.")
        if FLAG_DATASET_TYPE.value == "tfr":
            eval_ds_instance = (
                model_specific.strategy.experimental_distribute_dataset(
                    eval_ds_instance))

        LOGGER.debug("Done distributing the eval dataset to the replcias.")
        eval_ds_instance = iter(eval_ds_instance)

        ##########################################################################
        # Training Loop
        ##########################################################################
        for epoch in itertools.count():
            ####################################################################
            # Epoch Setup
            ####################################################################
            LOGGER.debug("EPOCH %d START", epoch)
            # Shuffle differently every epoch
            with utils.log_duration(LOGGER, "main",
                                    "All of make_training_dataset"):
                train_ds_instance = make_training_dataset(
                    random_seed=rg.integers(-2**63, 2**63 - 1), )
            LOGGER.debug(
                "Attempting to distribute the training dataset to the replicas."
            )
            if FLAG_DATASET_TYPE.value == "tfr":
                train_ds_instance = (
                    model_specific.strategy.experimental_distribute_dataset(
                        train_ds_instance))

            LOGGER.debug(
                "Done distributing the training dataset to the replicas.")
            train_ds_instance = iter(train_ds_instance)

            # This allows us to see if we reached the end of the training iterator,
            # in which case "did_at_least_one_training_batch == False".
            # We could also test that it did all the batches, to similar results.
            did_at_least_one_training_batch = True
            split = "eval"
            while did_at_least_one_training_batch:
                # Invert split
                if split == "train":
                    split = "eval"
                else:
                    split = "train"

                # Prepare to test if we did at least one training batch
                if split == "train":
                    did_at_least_one_training_batch = False

                if split == "train":
                    dataset_iterator = itertools.islice(
                        train_ds_instance, FLAG_BATCHES_BETWEEN_EVALS.value)
                else:
                    # The evaluation DS is tiny, so we reshuffle and take a random
                    dataset_iterator = itertools.islice(
                        eval_ds_instance, FLAG_NUMBER_EVAL_BATCHES.value)

                LOGGER.debug("Batching")
                for batch in dataset_iterator:
                    # LOGGER.debug("Input sentence:\n\"%s\"",
                    #              tokenizer.decode([x for x in batch["input_ids"][0]
                    #                                if x != tokenizer.eos_token_id]))
                    # LOGGER.debug("Label:\n\"%s\"",
                    #              tokenizer.decode([(x if x != -100 else 0)
                    #                                for x in batch["label_ids"][0]]))

                    if FLAG_DATASET_TYPE.value != "tfr":
                        batch = (model_specific.strategy.
                                 experimental_distribute_values_from_function(
                                     tf_utils.make_dict_distribute_fn(batch)))

                    # We only care about training epochs as, obviously, we don't train
                    # over eval samples; the number of eval samples seen only
                    # contributes to lowering the variance in the evaluation of when to
                    # do early stopping.
                    if split == "train":
                        did_at_least_one_training_batch = True

                    input_ids = batch["input_ids"]
                    label_ids = batch["label_ids"]

                    ####################################################################
                    # Training Step
                    ####################################################################
                    step_counters[split] += (FLAG_BATCH_SIZE.value *
                                             actual_num_replicas)

                    if split == "train":
                        batch_counters[split] += 1
                        training_kwargs = dict(
                            input_ids=input_ids,
                            label_ids=label_ids,
                        )

                        if model_specific.strategy:
                            utils.print_mem("before running", LOGGER)

                            LOGGER.debug("Training, Calling strategy.run")
                            loss = model_specific.strategy.run(
                                training_step, kwargs=training_kwargs)
                            LOGGER.debug("Training, Done with strategy.run")
                            utils.print_mem("after running", LOGGER)

                        else:
                            loss = training_step(**training_kwargs)  # pytype: disable=wrong-arg-count
                            # If we are in the strategy-free data parallel mode, we need
                            # to change the weights of all replicas to those of the model at
                            # index 0
                            if (FLAG_DISTRIBUTE_MODE.value ==
                                    constants.DistributeModeChoices.
                                    split_and_data_parallel):
                                for replica in model_or_replicas[1:]:
                                    replica.set_weights(
                                        model_or_replicas[0].get_weights())

                    ####################################################################
                    # Evaluation Step
                    ####################################################################
                    elif split == "eval":
                        evaluation_kwargs = dict(
                            input_ids=input_ids,
                            label_ids=label_ids,
                        )

                        if model_specific.strategy:
                            loss = model_specific.strategy.run(
                                evaluation_step, kwargs=evaluation_kwargs)
                        else:
                            loss = evaluation_step(**evaluation_kwargs)
                    else:
                        raise ValueError(
                            f"Unexpected value for split: {split}")

                    ####################################################################
                    # Logging
                    ####################################################################
                    if (FLAG_DISTRIBUTE_MODE.value
                            in constants.PURE_DATA_PARALLEL_STRATEGIES):
                        utils.check_equal(len(loss.values),
                                          actual_num_replicas)
                        LOGGER.debug("Split: %s", split)
                        LOGGER.debug("Real num replicas: %s",
                                     actual_num_replicas)
                        LOGGER.debug("Loss: %s", loss)
                        LOGGER.debug("Loss values: %s", loss.values)

                        average_loss = float(
                            tf.math.reduce_mean(loss.values).numpy())
                    else:
                        average_loss = float(loss.numpy())

                    # tf.debugging.check_numerics(loss)
                    now = time.time()
                    batch_duration = now - prev_batch_end
                    prev_batch_end = now
                    ma_loss[split].update(average_loss)

                    # Actual logging
                    LOGGER.info("Epoch: # %d", epoch)
                    LOGGER.info("Tensorboard_dir: %s", instance_output_dir)
                    LOGGER.info("Batch: %s # %d", split, batch_counters[split])
                    LOGGER.info("Step: %s # %d", split, step_counters[split])
                    if FLAG_USE_SUBSET.value:
                        LOGGER.warning(">> USING A SUBSET OF THE DATASET <<")
                    LOGGER.info("%(split)s Batch loss:           %(metric)f",
                                dict(split=split, metric=average_loss))
                    LOGGER.info(
                        "%(split)s Moving average loss:  %(metric)f",
                        dict(split=split, metric=ma_loss[split].average))
                    LOGGER.info(
                        "%(split)s Moving average ppl:   %(metric)f",
                        dict(split=split,
                             metric=np.exp(ma_loss[split].average)))
                    LOGGER.info(
                        "%(split)s Batch duration:       %(duration)s",
                        dict(split=split,
                             duration=utils.TimeStamp.from_seconds(
                                 batch_duration).format()))
                    if FLAG_DISTRIBUTE_MODE.value in constants.DATA_PARALLEL_DMC:
                        LOGGER.info(
                            "%(split)s Duration per sample:  %(duration)s",
                            dict(split=split,
                                 duration=utils.TimeStamp.from_seconds(
                                     batch_duration / (FLAG_BATCH_SIZE.value *
                                                       actual_num_replicas))))

                    # Write to Tensorboard
                    with writers[split].as_default():
                        tf.summary.scalar(f"Loss/{split}", average_loss,
                                          step_counters[split])
                        tf.summary.scalar(f"PPL/{split}", np.exp(average_loss),
                                          step_counters[split])
                    writers[split].flush()

                    # Save a checkpoint every 20 minutes.
                    if (time.time() - secs_since_last_ckpt) / (60 * 20) >= 1:
                        secs_since_last_ckpt = time.time()
                        save_model(train_steps=step_counters["train"],
                                   model_or_replicas=model_or_replicas,
                                   instance_output_dir=instance_output_dir)

                secs_since_last_ckpt = time.time()
                save_model(train_steps=step_counters["train"],
                           model_or_replicas=model_or_replicas,
                           instance_output_dir=instance_output_dir)
        #############################################################
        # Post Training Cleanup
        #######################################################################
        for writer in writers.values():
            writer.close()
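`utils.MovingAverage(0.9)` is used above to smooth the logged loss. A minimal exponential-moving-average sketch consistent with that usage (an assumption, not the project's code):

class MovingAverage:
    def __init__(self, decay):
        self.decay = decay
        self.average = None

    def update(self, value):
        # First update seeds the average; later updates decay toward new values.
        if self.average is None:
            self.average = value
        else:
            self.average = self.decay * self.average + (1 - self.decay) * value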
Example #17
def generate_textid_corpus(args: argparse.Namespace) -> None:
    """
    Read raw files (in specified directory), parse and filter, then output
    the Bert token-ids for all files to another directory

    :param args: ArgumentParser-parsed arguments
    :return: None
    """

    if args.mode not in VALID_MODES:
        raise ValueError(f"The argument 'mode' needs to be one of "
                         f"{VALID_MODES}, got {args.mode}.")

    if platform.system() == "Darwin" and args.mode in MODES_NEEDING_BLINGFIRE:
        raise Exception(
            f"Got a mode requiring Blingfire (mode = {args.mode}), "
            "yet Blingfire doesn't support Macos.")

    if not blingfire:
        # If we aren't using blingfire, then we must use spacy
        # for sentence segmentation.
        try:
            spacy_model = spacy.load("en_core_web_sm")
        except OSError:
            print()
            print("Exception:")
            print("Didn't find the model for spacy.")
            print("Run 'python -m spacy download en_core_web_sm'")
            exit(-1)

    # Get list of input file paths
    in_list = sorted(glob.glob(os.path.join(args.input_dir, "*.txt")))
    if args.max_number_of_books:
        in_list = in_list[:args.max_number_of_books]

        logging.warning(
            f"{colorama.Fore.RED}>>> USING A MAX NUMBER OF BOOKS <<<"
            f"{colorama.Style.RESET_ALL}")

    # Load blingfire textid model
    if args.mode == "blingfire" and platform.system() == "Darwin":
        raise Exception("BlingFire is not compatible with MacOS.")

    idtok_model = None
    if blingfire and args.mode in MODES_NEEDING_BLINGFIRE:
        model_path = os.path.join(args.textid_dir, args.base_tok_file)
        utils.check_file_exists(model_path)
        idtok_model = blingfire.load_model(model_path)

    utils.check_file_exists(args.vocab_path)
    bert_full_tokenizer = tokenization.FullTokenizer(vocab_file=str(
        args.vocab_path),
                                                     do_lower_case=False)

    if args.mode == "check":
        with open(args.vocab_path) as fin:
            ids_to_words = fin.read().strip().split("\n")
            words_to_ids = {i: word for i, word in enumerate(ids_to_words)}

    # Iterate through each raw file
    if args.mode != "blingfire":
        print("WARNING: We aren't in a mode that doesn't "
              f"exclusively use Blingfire. Will be slow.\nMode: {args.mode}")

    logging.info(f"Main Loop - {args.mode}")
    for i, in_file_path in enumerate(tqdm.tqdm(in_list)):
        # Generate output file path
        file_basename = os.path.splitext(os.path.basename(in_file_path))[0]
        out_file_path = os.path.join(args.output_dir, file_basename)

        # Read file chunk by chunk
        with open(in_file_path) as in_file:
            # We read the whole file, then cut it into chunks of CHUNK_MAX_LEN
            # characters. This seems like a more robust way to guarantee that
            # we correctly get full sentences.
            # A chunk length of 100k characters is the longest that doesn't
            # break spacy's sentence tokenizer.
            logging.debug("Loading a file >")
            file_text = in_file.read().strip()
            if not file_text:
                continue

            logging.debug("< Done loading a file")

            # Use a separate name for the chunk index so that it doesn't
            # shadow the file index `i` from the outer loop.
            for chunk_idx in range(len(file_text) // CHUNK_MAX_LEN):
                logging.debug("Chunking. >")
                chunk = file_text[chunk_idx * CHUNK_MAX_LEN:
                                  (chunk_idx + 1) * CHUNK_MAX_LEN]
                # Get the blingfire-processed sentences from this chunk
                # (NOTE: maybe redundant; consider removing it if it's slow.)
                sent_tok_start = time.time()
                logging.debug("< Done chunking.")

                logging.debug("Segmentizing sentence. >")
                if blingfire:
                    sentences = chunk_to_sentences(chunk)
                else:
                    sentences = [str(x) for x in spacy_model(chunk).sents]
                # Ignore the first and last sentences, as they've
                # likely been cut weirdly by the chunking process.
                # We lose less than 1/1000th of all sentences by doing this.
                # (with a CHUNK_MAX_LEN of 100k).
                logging.debug(f"Number of sentences: {len(sentences)}")
                sentences = sentences[1:-1]

                logging.debug(f"< Done segmentizing sentence. It took "
                              f"{time.time() - sent_tok_start} seconds.")
                # Additional filtering for plaintext sentences
                filter_time_start = time.time()
                logging.debug("Filtering sentences >")
                ft_sentences = filter_sentences(sentences)
                logging.debug(f"< Done filtering sentences. It took "
                              f"{time.time() - filter_time_start} seconds.")

                # Convert each sentence to its textid
                bpe_tok_time_start = time.time()
                logging.debug("Tokenizing sentences >")

                curr_ids = utils.TypedList(np.ndarray)
                for ft_sent in ft_sentences:
                    ids = None
                    if blingfire:
                        ids = blingfire.text_to_ids(idtok_model, ft_sent,
                                                    args.id_seq_length,
                                                    args.oov_id)

                    if args.mode == "bert-native" or args.mode == "check":
                        bert_tokens = bert_full_tokenizer.tokenize(ft_sent)
                        bert_tok_ids = bert_full_tokenizer.convert_tokens_to_ids(
                            bert_tokens)

                        bert_tok_ids_ = utils.TypedList(int)
                        for x in bert_tok_ids:
                            bert_tok_ids_.append(x)
                        bert_tok_ids = bert_tok_ids_

                        while len(bert_tok_ids) < args.id_seq_length:
                            bert_tok_ids.append(0)

                        bert_tok_ids = np.array(
                            list(bert_tok_ids),
                            dtype=np.int32)[:args.id_seq_length]

                        if args.mode == "bert-native":
                            ids = bert_tok_ids

                    if args.mode == "check":
                        # In the "check" mode, we test that both the
                        # bert native tokenizer and blingfire return
                        # the same thing.

                        utils.check_equal(ids.shape, bert_tok_ids.shape)
                        comp = ids == bert_tok_ids

                        if not np.all(comp):

                            def bert_decode(ids):
                                return " ".join(
                                    ids_to_words[wid] for wid in ids
                                    if wid != 0)  #.replace(" ##", "")

                            # print("Blingfire ids:")
                            # print(ids)
                            print(
                                "\n################################################"
                            )
                            print("Mismatch between decoders:")
                            print(
                                f"\t Blingfire decoded: \"{bert_decode(ids)}\""
                            )
                            print(
                                f"\t- Bert-native decoded: \"{bert_decode(bert_tok_ids)}\""
                            )
                            print(
                                "################################################\n"
                            )
                            # print("Bert-native tokenizer ids:")
                            # print(bert_tok_ids)

                            num_errors = np.sum(np.logical_not(comp))
                            out_of = max(np.sum(ids != 0),
                                         np.sum(bert_tok_ids != 0))

                            if num_errors / out_of >= 1:
                                raise ValueError(f"{num_errors} "
                                                 f"different out of {out_of} "
                                                 f"non padding values")

                    curr_ids.append(ids)

                logging.debug(f"< Done tokenizing sentences. It took "
                              f"{time.time() - bpe_tok_time_start} seconds.")

                concat_time_start = time.time()
                logging.debug("Concatenating the ids. >")

                if not curr_ids:
                    logging.warning(">> Warning: empty cur_file_ids")

                id_mat = np.array(list(curr_ids), dtype=np.int32)

                logging.debug(f"< Done Concatenating the ids. Took "
                              f"{time.time() - concat_time_start} seconds.")
                if len(id_mat) == 0:
                    logging.warning(
                        f"We got an id_mat of size 0.\nFile index = {i}."
                        f"\nBook file path = {in_file_path}.")
                logging.debug("Saving >")
                path = pathlib.Path(out_file_path)
                # Prefix with the chunk index so that chunks of the same book
                # don't overwrite each other.
                np.save(path.parent / (f"{chunk_idx}_" + str(path.name)),
                        id_mat)
                logging.debug("< Done saving.")

    # Free model
    if blingfire:
        blingfire.free_model(idtok_model)
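
chunk_to_sentences and filter_sentences are project helpers that this example doesn't include. A minimal sketch of what chunk_to_sentences might look like, assuming it simply wraps Blingfire's sentence splitter:

import blingfire

def chunk_to_sentences(chunk: str) -> list:
    # blingfire.text_to_sentences returns one sentence per line.
    return blingfire.text_to_sentences(chunk).split("\n")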
Example #18
0
File: cis5.py Project: Nemie/syco
print_header("5 Logging and Auditing")

#
print_header("5.1 Configure Syslog")

#
print_header("5.1.1 Install the rsyslog package (Scored)")
check_equal_re(
    "rpm -q rsyslog",
    "rsyslog.*"
)

#
print_header("5.1.2 Activate the rsyslog Service (Scored)")
check_equal(
    "rpm -q syslog",
    "package syslog is not installed"
)
check_empty("chkconfig --list | grep syslog")
check_equal_re(
    "chkconfig --list rsyslog",
    "rsyslog.*0:off.*1:off.*2:on.*3:on.*4:on.*5:on.*6:off"
)

#
print_header("5.1.3 Configure /etc/rsyslog.conf (Not Scored)")
print_warning("Manually review the contents of the /etc/rsyslog.conf file to ensure appropriate logging is set. ")
view_output("ls -l /var/log/")

#
print_header("5.1.4 Create and Set Permissions on rsyslog Log Files (Scored)")
print_header(" TODO - Ensure that the log files are logging information")
Example #19
0
def make_accelerator_type() -> str:
    utils.check_equal(_FLAG_TPU_TYPE.value, "v3")
    utils.check_equal(_FLAG_TPU_QTY.value, "8")
    assert not _FLAG_PREEMPTIBLE_TPU.value, _FLAG_PREEMPTIBLE_TPU.value
    return f"{_FLAG_TPU_TYPE.value}-{_FLAG_TPU_QTY.value}"
Example #20
0
File: cis4.py Project: Nemie/syco
__version__ = "1.0.0"
__status__ = "Production"


from utils import check_empty, check_equal, check_equal_re, check_equals, check_not_empty, check_return_code, print_header, view_output, print_warning, print_info

#
print_header("4 Network Configuration and Firewalls")

#
print_header("4.1 Modify Network Parameters (Host Only)")

#
print_header("4.1.1 Disable IP Forwarding (Scored)")
check_equal(
    "/sbin/sysctl net.ipv4.ip_forward",
    "net.ipv4.ip_forward = 0"
)

#
print_header("4.1.2 Disable Send Packet Redirects (Scored)")
check_equal(
    "/sbin/sysctl net.ipv4.conf.all.send_redirects",
    "net.ipv4.conf.all.send_redirects = 0"
)
check_equal(
    "/sbin/sysctl net.ipv4.conf.default.send_redirects",
    "net.ipv4.conf.default.send_redirects = 0"
)

#
print_header("4.2 Modify Network Parameters (Host and Router)")
Example #21
0
def main(argv):
    # Arguments and logging boilerplate
    if len(argv) > 1:
        raise RuntimeError(argv)

    absl_logging.use_python_logging()
    utils.log_module_args(LOGGER, argv[0])

    # Load a retriever config.
    retriever_config = tf_utils.REALMConfig(
        **utils.from_json_file(_FLAG_RETRIEVER_CONFIG_PATH.value))
    assert not _FLAG_USE_SUBSET.value

    # Preparation of the output path
    time_stamp = time.strftime("%Y%m%d-%H%M%S")
    target_path = os.path.join(_FLAG_OUTPUT_PATH.value, time_stamp.strip())
    if target_path[-1] != "/":
        target_path += "/"

    ##############################################################################
    # Setup devices and strategy
    ##############################################################################
    # Duration is pretty much instantaneous
    with utils.log_duration(LOGGER, "main", "Initializing devices"):
        tpu_config = tf_utils.init_tpus(local=_FLAG_TPU_IS_LOCAL.value,
                                        tpu_name=_FLAG_TPU_NAME.value)
        device_type = tf_utils.current_accelerator_type()
        LOGGER.debug("Devices: %s", str(tf_utils.devices_to_use()))
        if _FLAG_TPU_NAME.value and device_type == "CPU":
            raise RuntimeError("Device is CPU and we expected a TPU.")

        if device_type == "TPU":
            if tpu_config is None:
                raise RuntimeError("We should have a tpu_config.")
            strategy = tf.distribute.TPUStrategy(tpu_config.resolver)
            batch_size = len(
                tf_utils.devices_to_use()) * _FLAG_BATCH_SIZE.value
        elif device_type == "GPU" or device_type == "CPU":
            strategy = tf.distribute.MirroredStrategy()
            batch_size = len(
                tf_utils.devices_to_use()) * _FLAG_BATCH_SIZE.value
        else:
            raise RuntimeError(device_type)

    ##############################################################################
    # Load the KILT ELI5 dataset.
    ##############################################################################
    # Takes a while
    eli5 = {}
    keys = ["train", "validation", "test"]
    gpt2_tokenizer = transformers.GPT2TokenizerFast.from_pretrained("gpt2-xl")
    gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token

    with utils.log_duration(LOGGER, "main", "Loading the ELI5 datasets."):
        if _FLAG_DATASET_ROOT.value:
            for split in tqdm.tqdm(keys):
                load_path = os.path.join(_FLAG_DATASET_ROOT.value,
                                         "HuggingfaceDatasets",
                                         f"{split}_kilt_eli5.hf")
                with tf.device("/job:localhost"):
                    eli5[split] = datasets.load_from_disk(load_path)
        else:
            eli5 = datasets.load_dataset("kilt_tasks", "eli5")

    ##############################################################################
    # Load the dataset of the text that will be retrieved.
    ##############################################################################
    # Takes a long time
    with utils.log_duration(LOGGER, "Main", "Load the textual dataset"):
        # Extract the appropriate text
        # The buffer_size is taken from the original ORQA code.
        blocks_dataset = tf.data.TFRecordDataset(retriever_config.text_records,
                                                 buffer_size=512 * 1024 * 1024)
        blocks_dataset = blocks_dataset.batch(
            retriever_config.num_block_records, drop_remainder=False)
        blocks: tf.Tensor = tf.data.experimental.get_single_element(
            blocks_dataset)

    ############################################################################
    # Increase the number of maximum open file descriptors to make space
    # for all the shards.
    ############################################################################
    max_num_fd = _FLAG_NUM_SHARDS.value * 3 + _MIN_N_FD
    resource.setrlimit(resource.RLIMIT_NOFILE, (max_num_fd, max_num_fd))

    ############################################################################
    # Prepare the output files.
    ############################################################################
    writers = {}
    all_paths = {}

    for split in keys:
        maybe_subset = "_subset" if _FLAG_USE_SUBSET.value else ""
        # Prepare the paths. They can't be a generator, since they're indexed
        # and reused later; a generator function would be fine though.
        paths = [
            os.path.join(target_path + maybe_subset, f"{split}_{i}.tfr")
            for i in range(_FLAG_NUM_SHARDS.value)
        ]
        all_paths[split] = paths
        writers[split] = []

        # Create The TFR writers.
        for i, path in enumerate(paths):
            writers[split].append(tf.io.TFRecordWriter(path))

    # Load the reference DB. We used to accidentally do this once per split :O
    with utils.log_duration(LOGGER, "main", "Loading the reference db."):
        checkpoint_path = os.path.join(retriever_config.query_embedder_path,
                                       "encoded", "encoded.ckpt")
        reference_db_device = tf_utils.device_mapping().CPUs[0].name
        with tf.device(reference_db_device):
            reference_db = tf_utils.load_reference_db(
                checkpoint_path,
                variable_name="block_emb",
            )

    ############################################################################
    # Prep the encoder and the tokenizer
    ############################################################################
    with utils.log_duration(LOGGER, "main",
                            "Loading the encoder model and the tokenizer."):
        with strategy.scope():
            query_encoder = hub.load(retriever_config.query_embedder_path,
                                     tags={})
        encode_fn = _make_encode_fn(query_encoder)
        encode_fn_strategy_run = make_encode_fn_strategy_run_fn(
            strategy=strategy,
            encode_fn=encode_fn,
        )

        vocab_file = os.path.join(retriever_config.query_embedder_path,
                                  "assets", "vocab.txt")
        utils.check_exists(vocab_file)
        do_lower_case = query_encoder.signatures["tokenization_info"](
        )["do_lower_case"]
        tokenization_info = dict(vocab_file=vocab_file,
                                 do_lower_case=do_lower_case)

        tokenizer, vocab_lookup_table = bert_utils.get_tf_tokenizer(
            query_encoder, tokenization_info)

    ############################################################################
    # Preprocess the dataset
    ############################################################################
    cls_token_id = tf.cast(vocab_lookup_table.lookup(tf.constant("[CLS]")),
                           tf.int32)
    sep_token_id = tf.cast(vocab_lookup_table.lookup(tf.constant("[SEP]")),
                           tf.int32)
    transform = _make_transform_fn(
        bert_tokenizer=tokenizer,
        bert_cls_token_id=cls_token_id,
        bert_sep_token_id=sep_token_id,
    )

    feature_dtypes = {
        constants.CTH5Fields.distances: tf.float32,
        constants.CTH5Fields.gpt2_retrieved_ids: tf.int32,
        constants.CTH5Fields.gpt2_answer_ids_inputs: tf.int32,
        constants.CTH5Fields.gpt2_question_ids_inputs: tf.int32,
    }

    with utils.log_duration(LOGGER, "main", "generating codes"):
        for split in keys:
            sample_count = 0
            eli5: Dict[str, datasets.Dataset]

            if split != "test":
                for_slices = dict(sample_id=eli5[split]["id"],
                                  question=eli5[split]["input"],
                                  answer=[
                                      sample[0]["answer"]
                                      for sample in eli5[split]["output"]
                                  ])
            else:
                for_slices = dict(
                    sample_id=eli5[split]["id"],
                    question=eli5[split]["input"],
                )

            ds = tf.data.Dataset.from_tensor_slices(for_slices)
            ds = ds.map(transform,
                        num_parallel_calls=tf.data.experimental.AUTOTUNE)

            ds = ds.apply(
                tf.data.experimental.dense_to_ragged_batch(batch_size))
            ds = ds.map(_squeeze,
                        num_parallel_calls=tf.data.experimental.AUTOTUNE)

            tqdm_inner = tqdm.tqdm(enumerate(ds),
                                   total=len(eli5[split]["id"]) //
                                   _FLAG_BATCH_SIZE.value,
                                   desc=f"Split `{split}`: Batches")

            for i, batch in tqdm_inner:
                features = collections.defaultdict(list)

                ######################################################################
                # Enforce the current real batch size
                ######################################################################
                current_batch_size = batch["sample_id"].shape[0]
                for k, v in batch.items():
                    utils.check_equal(v.shape[0], current_batch_size)
                ######################################################################

                gpt2_question_ids_inputs = _prep_field(batch["question"],
                                                       gpt2_tokenizer)
                utils.check_equal(gpt2_question_ids_inputs.dtype, np.int32)
                utils.check_equal(gpt2_question_ids_inputs.shape[0],
                                  current_batch_size)

                if split != "test":
                    gpt2_answer_ids_inputs = _prep_field(
                        batch["answer"], gpt2_tokenizer)
                    utils.check_equal(gpt2_answer_ids_inputs.dtype, np.int32)
                    utils.check_equal(gpt2_answer_ids_inputs.shape[0],
                                      current_batch_size)

                    assert len(gpt2_answer_ids_inputs.shape) == 2, (
                        gpt2_answer_ids_inputs.shape)

                ######################################################################
                # Save the gpt2 tokenized question and answer
                ######################################################################

                features[constants.CTH5Fields.gpt2_question_ids_inputs].extend(
                    gpt2_question_ids_inputs)

                if split != "test":
                    features[
                        constants.CTH5Fields.gpt2_answer_ids_inputs].extend(
                            gpt2_answer_ids_inputs)

                ######################################################################
                # Encode the samples.
                ######################################################################
                batch = strategy.experimental_distribute_values_from_function(
                    tf_utils.make_dict_distribute_fn(batch))

                embeddings = encode_fn_strategy_run(batch)
                embeddings = tf_utils.process_strat_output(
                    embeddings, "embeddings", strategy, current_batch_size)
                utils.check_isinstance(embeddings, ops.EagerTensor)
                utils.check_equal(embeddings.shape[0], current_batch_size)

                # pytype doesn't seem to see that we check the type
                utils.check_equal(embeddings.shape[1],
                                  _FLAG_EMBEDDING_DEPTH.value)  # pytype: disable=attribute-error

                ######################################################################
                # Retrieve.
                ######################################################################
                # Do exact retrieval
                with tf.device(reference_db_device):
                    top_k, inner_prods = tf_utils.mips_exact_search(
                        embeddings, _FLAG_NUM_RETRIEVALS.value, reference_db)

                # Collate the results
                top_k = tf_utils.process_strat_output(top_k, "top_k", strategy,
                                                      current_batch_size)

                # Check the shapes
                utils.check_equal(
                    inner_prods.shape,
                    (current_batch_size, _FLAG_NUM_RETRIEVALS.value))
                utils.check_equal(
                    top_k.shape,
                    (current_batch_size, _FLAG_NUM_RETRIEVALS.value))

                # Save the distances
                features[constants.CTH5Fields.distances].extend(inner_prods)

                # Retrieve the text fields associated to the indices
                gathered = tf.gather(blocks, top_k).numpy()
                utils.check_equal(gathered.shape[0], current_batch_size)
                utils.check_equal(gathered.shape[1],
                                  _FLAG_NUM_RETRIEVALS.value)

                retrievals = []
                for index_in_batch in range(current_batch_size):
                    # Put the appropriate byte strings in a list
                    local_gathered = gathered[index_in_batch].tolist()
                    utils.check_equal(len(local_gathered),
                                      _FLAG_NUM_RETRIEVALS.value)
                    # Decode to utf-8
                    local_gathered = [
                        sample.decode() for sample in local_gathered
                    ]
                    # Encode to GPT2 BPE
                    token_ids = np.array(
                        gpt2_tokenizer.batch_encode_plus(
                            local_gathered,
                            padding="max_length",
                            truncation=True,
                        ).input_ids)

                    # Make sure no line is empty
                    # TODO(julesgm): Maybe optional
                    for line in token_ids:
                        assert not np.all(line == 0), line

                    # Convert the eos_tokens
                    token_ids[token_ids == gpt2_tokenizer.eos_token_id] = -1

                    # Save the retrievals
                    retrievals.append(token_ids)

                # Save the feature
                features[constants.CTH5Fields.gpt2_retrieved_ids] = retrievals

                utils.check_equal(
                    retrievals[0].shape,
                    (_FLAG_NUM_RETRIEVALS.value, _FLAG_CONTEXT_SIZE.value))

                for k, v in features.items():
                    utils.check_equal(len(v), current_batch_size)

                for index_in_batch in range(current_batch_size):
                    feature_dict = {}
                    for feature_k, feature_v in features.items():
                        # Cast the feature to its appropriate dtype
                        casted_feats = tf.cast(feature_v[index_in_batch],
                                               feature_dtypes[feature_k])
                        # Serialize the tensor to bytes
                        feature_bytes = tf.io.serialize_tensor(casted_feats)
                        # Build a bytes list tf.train.Feature object,
                        # the serialization tree node
                        feature_dict[feature_k] = _bytes_feature(feature_bytes)

                    # Create the serialization tree root
                    # Expects a list of features
                    feature = tf.train.Features(feature=feature_dict)
                    # Expects a tf.train.Features object
                    example_obj = tf.train.Example(features=feature)

                    # Serialize that to bytes
                    serialized_example = example_obj.SerializeToString()

                    # Write the bytes
                    # TODO(julesgm): Parallelize this with a thread or a process pool &
                    #   futures.
                    writers[split][sample_count %
                                   _FLAG_NUM_SHARDS.value].write(
                                       serialized_example)
                    sample_count += 1

                if sample_count % 1000 == 0:
                    LOGGER.debug("Paths: %s", str(all_paths[split][0]))

            LOGGER.debug("Flushing and closing the `%s` writers", split)
            for writer in tqdm.tqdm(writers[split]):
                writer.flush()
                writer.close()

    LOGGER.debug("Done.")
Example #22
0
__credits__ = ["???"]
__license__ = "???"
__version__ = "1.0.0"
__status__ = "Production"

from utils import check_empty, check_equal, check_equal_re, check_equals, check_not_empty, check_return_code, print_header, view_output, print_warning, print_info

#
print_header("1. Install Updates, Patches and Additional Security Software")

#
print_header("1.1 Filesystem Configuration")

#
print_header("1.1.1 Create Separate Partition for /tmp (Scored)")
check_equal('grep "[[:space:]]/tmp[[:space:]]" /etc/fstab', '/tmp')

#
print_header("1.1.2 Set nodev option for /tmp Partition (Scored)")
# The /tmp partition should have the nodev option set.
check_equal("grep /tmp /etc/fstab", "nodev")
check_equal("mount | grep /tmp", "nodev")

#
print_header("1.1.3 Set nosuid option for /tmp Partition (Scored)")
# The /tmp partition should have the nosuid option set.
check_equal("grep /tmp /etc/fstab", "nosuid")
check_equal("mount | grep /tmp", "nosuid")

#
print_header("1.1.4 Set noexec option for /tmp Partition (Scored)")
Example #23
0
def _create_int_feature(values, feature_len):
    feature_list = list(values)
    utils.check_equal(len(feature_list), feature_len)
    feature = tf.train.Feature(int64_list=tf.train.Int64List(
        value=feature_list))
    return feature
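
A hypothetical usage of _create_int_feature, showing how such a feature is typically packed into a serialized tf.train.Example (the values and the field name are made up):

import tensorflow as tf

feature = _create_int_feature(values=[7, 3, 9], feature_len=3)
example = tf.train.Example(
    features=tf.train.Features(feature={"token_ids": feature}))
serialized_bytes = example.SerializeToString()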
Example #24
0
File: cis2.py Project: ysoldak/syco
__credits__ = ["???"]
__license__ = "???"
__version__ = "1.0.0"
__status__ = "Production"

from utils import check_empty, check_equal, check_equal_re, check_equals, check_not_empty, check_return_code, print_header, view_output, print_warning, print_info

#
print_header("2. OS Services")

#
print_header("2.1 Remove Legacy Services")

#
print_header("2.1.1 Remove telnet-server (Scored)")
check_equal("rpm -q telnet-server", "package telnet-server is not installed")

#
print_header("2.1.2 Remove telnet Clients (Scored)")
check_equal("rpm -q telnet", "package telnet is not installed")

#
print_header("2.1.3 Remove rsh-server (Scored)")
check_equal("rpm -q rsh-server", "package rsh-server is not installed")

#
print_header("2.1.4 Remove rsh (Scored)")
check_equal("rpm -q rsh", "package rsh is not installed")

#
print_header("2.1.5 Remove NIS Client (Scored)")
Example #25
def main(argv):
    if len(argv) > 1:
        raise app.UsageError("Too many command-line arguments.")

    absl_logging.use_python_logging()
    utils.log_module_args(LOGGER, argv[0])

    # Some checks for the flags
    utils.check_exists(FLAGS.source_text_path)
    utils.check_exists(os.path.dirname(FLAGS.subset_text_path))
    utils.check_exists(os.path.dirname(FLAGS.subset_embeddings_ds_path))
    utils.check_operator(operator.lt, FLAGS.subset_total, FLAGS.source_total)

    utils.check_glob_prefix(FLAGS.source_embeddings_prefix)

    # Select a random subset
    with utils.log_duration(LOGGER, "main", "preparing indices"):
        indices = np.random.choice(FLAGS.source_total,
                                   FLAGS.subset_total,
                                   replace=False)
        indices.sort()

    # Process the textual data
    # Much (5 min vs 2 h) faster than iterating through the records and writing
    # only those we want. One hypothesis is that get_single_element can fetch
    # elements without parsing every element along the way, unlike simply
    # iterating through the records. Or did TFRecords gain constant-time
    # indexing?
    # Inspired by the ORQA codebase:
    # https://github.com/google-research/language/blob/master/language/orqa/models/orqa_model.py#L147
    with utils.log_duration(LOGGER, "main", "preparing data"):
        text_ds = tf.data.TFRecordDataset(FLAGS.source_text_path,
                                          buffer_size=512 * 1024 * 1024,
                                          num_parallel_reads=os.cpu_count())
        text_ds = text_ds.batch(FLAGS.source_total)
        text_ds = tf.data.experimental.get_single_element(text_ds)
        subset = tf.gather(text_ds, tf.constant(indices))

    with utils.log_duration(LOGGER, "main", "writing text data"):
        with tf.io.TFRecordWriter(FLAGS.subset_text_path) as text_writer:
            for text in tqdm.tqdm(subset, total=FLAGS.subset_total):
                text = text.numpy()
                # REALM's data uses no packaging of the data into features, etc.
                text_writer.write(text)

    with utils.log_duration(LOGGER, "main", "All of the embedding task"):
        # Process the embeddings data
        with tf.device("/cpu:0"):
            with utils.log_duration(LOGGER, "main", "Loading the checkpoint"):
                embs = tf.train.load_checkpoint(
                    FLAGS.source_embeddings_prefix).get_tensor("block_emb")
                utils.check_equal(embs.shape[0], FLAGS.source_total)

            with utils.log_duration(LOGGER, "main",
                                    "taking a subset of the indices"):
                subset = embs[indices]

            tf_db = tf.Variable(subset, shape=subset.shape)
            ckpt = tf.train.Checkpoint(block_emb=tf_db)

            with utils.log_duration(LOGGER, "main", "Saving the checkpoint"):
                ckpt.save(FLAGS.subset_embeddings_ds_path)

        LOGGER.debug("Done")
Example #26
0
File: cis6.py Project: Nemie/syco
__license__ = "???"
__version__ = "1.0.0"
__status__ = "Production"


from utils import check_empty, check_equal, check_equal_re, check_equals, check_not_empty, check_return_code, print_header, view_output, print_warning, print_info

#
print_header("6 System Access, Authentication and Authorization")

#
print_header("6.1 Configure cron and anacron")

#
print_header("6.1.1 Enable anacron Daemon (Scored)")
check_equal("rpm -q anacron", "package anacron is not installed")
print_info("Not installed syco servers.")

print_header("6.1.2 Enable crond Daemon (Scored)")
check_equal_re(
    "chkconfig --list crond",
    "crond.*0:off.*1:off.*2:on.*3:on.*4:on.*5:on.*6:off"
)

#
print_header("6.1.3 Set User/Group Owner and Permission on /etc/anacrontab (Scored)")
check_equal('stat -c "%a %u %g" /etc/anacrontab | egrep "600 0 0"', "600 0 0")

#
print_header("6.1.4 Set User/Group Owner and Permission on /etc/crontab (Scored)")
check_equal('stat -c "%a %u %g" /etc/crontab | egrep "600 0 0"', "600 0 0")
Example #27
def main(argv):
    if len(argv) > 1:
        raise RuntimeError(argv)
    absl_logging.use_python_logging()
    utils.log_module_args(LOGGER, argv[0])

    retriever_config = tf_utils.REALMSave(
        **utils.from_json_file(_FLAG_RETRIEVER_CONFIG_PATH.value))
    assert not _FLAG_USE_SUBSET.value

    time_stamp = time.strftime("%Y%m%d-%H%M%S")
    target_path = os.path.join(_FLAG_OUTPUT_PATH.value, time_stamp.strip())
    if target_path[-1] != "/":
        target_path += "/"

    ##############################################################################
    # Setup devices and strategy
    ##############################################################################
    with utils.log_duration(LOGGER, "main", "Initializing devices"):
        tpu_config = tf_utils.init_tpus()
        device_type = tf_utils.current_accelerator_type()
        LOGGER.debug("Devices: %s", str(tf_utils.devices_to_use()))

        if device_type == "TPU":
            if tpu_config is None:
                raise RuntimeError("We should have a tpu_config.")
            strategy = tf.distribute.TPUStrategy(tpu_config.resolver)
            batch_size = len(
                tf_utils.devices_to_use()) * _FLAG_BATCH_SIZE.value
        elif device_type == "GPU" or device_type == "CPU":
            strategy = tf.distribute.MirroredStrategy()
            batch_size = len(
                tf_utils.devices_to_use()) * _FLAG_BATCH_SIZE.value
        else:
            raise RuntimeError(device_type)

    ##############################################################################
    # Load the dataset.
    ##############################################################################
    eli5 = {}
    keys = ["train", "eval", "test"]
    gpt2_tokenizer = transformers.GPT2TokenizerFast.from_pretrained("gpt2-xl")
    gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token

    with utils.log_duration(LOGGER, "main", "Loading the ELI5 datasets."):
        for split in tqdm.tqdm(keys):
            load_path = os.path.join(_FLAG_DATASET_ROOT.value,
                                     "HuggingfaceDatasets",
                                     f"{split}_kilt_eli5.hf")
            with tf.device("/job:localhost"):
                eli5[split] = datasets.load_from_disk(load_path)

    ##############################################################################
    # Load the dataset of the text that will be retrieved.
    ##############################################################################
    with utils.log_duration(LOGGER, "Main", "Load the textual dataset"):
        # Extract the appropriate text
        # The buffer_size is taken from the original ORQA code.
        blocks_dataset = tf.data.TFRecordDataset(retriever_config.text_records,
                                                 buffer_size=512 * 1024 * 1024)
        blocks_dataset = blocks_dataset.batch(
            retriever_config.num_block_records, drop_remainder=True)
        blocks = tf.data.experimental.get_single_element(blocks_dataset)

    ############################################################################
    # Prepare the output file.
    ############################################################################
    writers = {}

    all_paths = {}
    for split in keys:
        maybe_subset = "_subset" if _FLAG_USE_SUBSET.value else ""
        paths = [
            os.path.join(target_path + maybe_subset, f"{split}_{i}.tfr")
            for i in range(_FLAG_NUM_SHARDS.value)
        ]
        all_paths[split] = paths
        writers[split] = [tf.io.TFRecordWriter(filename) for filename in paths]

        with utils.log_duration(LOGGER, "main", "Loading the reference db."):
            checkpoint_path = os.path.join(
                retriever_config.query_embedder_path, "encoded",
                "encoded.ckpt")

            reference_db_device = tf_utils.device_mapping().CPUs[0].name
            with tf.device(reference_db_device):
                reference_db = tf_utils.load_reference_db(
                    checkpoint_path,
                    variable_name="block_emb",
                )

    ############################################################################
    # Prep the encoder and the tokenizer
    ############################################################################
    with utils.log_duration(LOGGER, "main",
                            "Loading the encoder model and the tokenizer."):
        with strategy.scope():
            query_encoder = hub.load(retriever_config.query_embedder_path,
                                     tags={})
        encode_fn = _make_encode_fn(query_encoder)
        encode_fn_strategy_run = make_encode_fn_strategy_run_fn(
            strategy=strategy,
            encode_fn=encode_fn,
        )

        vocab_file = os.path.join(retriever_config.query_embedder_path,
                                  "assets", "vocab.txt")
        utils.check_exists(vocab_file)
        do_lower_case = query_encoder.signatures["tokenization_info"](
        )["do_lower_case"]
        tokenization_info = dict(vocab_file=vocab_file,
                                 do_lower_case=do_lower_case)

        tokenizer, vocab_lookup_table = bert_utils.get_tf_tokenizer(
            query_encoder, tokenization_info)

    ############################################################################
    # Preprocess the dataset
    ############################################################################
    cls_token_id = tf.cast(vocab_lookup_table.lookup(tf.constant("[CLS]")),
                           tf.int32)
    sep_token_id = tf.cast(vocab_lookup_table.lookup(tf.constant("[SEP]")),
                           tf.int32)
    transform = _make_transform_fn(
        bert_tokenizer=tokenizer,
        bert_cls_token_id=cls_token_id,
        bert_sep_token_id=sep_token_id,
    )

    feature_dtypes = {
        constants.CTH5Fields.distances: tf.float32,
        constants.CTH5Fields.gpt2_retrieved_ids: tf.int32,
        constants.CTH5Fields.gpt2_answer_ids_inputs: tf.int32,
        constants.CTH5Fields.gpt2_question_ids_inputs: tf.int32,
    }

    with utils.log_duration(LOGGER, "main", "generating codes"):
        for split in keys:
            sample_count = 0
            eli5: Dict[str, datasets.Dataset]

            if split != "test":
                for_slices = dict(sample_id=eli5[split]["id"],
                                  question=eli5[split]["input"],
                                  answer=[
                                      sample["answer"][0]
                                      for sample in eli5[split]["output"]
                                  ])
            else:
                for_slices = dict(
                    sample_id=eli5[split]["id"],
                    question=eli5[split]["input"],
                )

            ds = tf.data.Dataset.from_tensor_slices(for_slices)
            ds = ds.map(transform,
                        num_parallel_calls=tf.data.experimental.AUTOTUNE)

            ds = ds.apply(
                tf.data.experimental.dense_to_ragged_batch(batch_size))
            ds = ds.map(_squeeze,
                        num_parallel_calls=tf.data.experimental.AUTOTUNE)

            tqdm_inner = tqdm.tqdm(enumerate(ds),
                                   total=len(eli5[split]["id"]) //
                                   _FLAG_BATCH_SIZE.value,
                                   desc=f"Split `{split}`: Batches")

            for i, batch in tqdm_inner:
                features = collections.defaultdict(list)

                ######################################################################
                # Enforce the current real batch size
                ######################################################################
                current_batch_size = batch["sample_id"].shape[0]
                for k, v in batch.items():
                    utils.check_equal(v.shape[0], current_batch_size)
                ######################################################################

                gpt2_question_ids_inputs = _prep_field(batch["question"],
                                                       gpt2_tokenizer)
                utils.check_equal(gpt2_question_ids_inputs.dtype, np.int32)
                utils.check_equal(gpt2_question_ids_inputs.shape[0],
                                  current_batch_size)

                if split != "test":
                    gpt2_answer_ids_inputs = _prep_field(
                        batch["answer"], gpt2_tokenizer)
                    utils.check_equal(gpt2_answer_ids_inputs.dtype, np.int32)
                    utils.check_equal(gpt2_answer_ids_inputs.shape[0],
                                      current_batch_size)

                    assert len(gpt2_answer_ids_inputs.shape) == 2, (
                        gpt2_answer_ids_inputs.shape)

                ######################################################################
                # Save the gpt2 tokenized question and answer
                ######################################################################

                features[constants.CTH5Fields.gpt2_question_ids_inputs].extend(
                    gpt2_question_ids_inputs)

                if split != "test":
                    features[
                        constants.CTH5Fields.gpt2_answer_ids_inputs].extend(
                            gpt2_answer_ids_inputs)

                ######################################################################
                # Encode the samples.
                ######################################################################
                batch = strategy.experimental_distribute_values_from_function(
                    tf_utils.make_dict_distribute_fn(batch))

                embeddings = encode_fn_strategy_run(batch)
                embeddings = tf_utils.process_strat_output(
                    embeddings, "embeddings", strategy, current_batch_size)
                utils.check_isinstance(embeddings, ops.EagerTensor)
                utils.check_equal(embeddings.shape[0], current_batch_size)

                # pytype doesn't seem to see that we check the type
                utils.check_equal(embeddings.shape[1],
                                  _FLAG_EMBEDDING_DEPTH.value)  # pytype: disable=attribute-error

                ######################################################################
                # Retrieve.
                ######################################################################
                with tf.device(reference_db_device):
                    top_k, inner_prods = tf_utils.mips_exact_search(
                        embeddings, _FLAG_NUM_RETRIEVALS.value, reference_db)
                top_k = tf_utils.process_strat_output(top_k, "top_k", strategy,
                                                      current_batch_size)
                utils.check_equal(
                    inner_prods.shape,
                    (current_batch_size, _FLAG_NUM_RETRIEVALS.value))
                utils.check_equal(
                    top_k.shape,
                    (current_batch_size, _FLAG_NUM_RETRIEVALS.value))

                features[constants.CTH5Fields.distances].extend(inner_prods)

                gathered = tf.gather(blocks, top_k).numpy()
                utils.check_equal(gathered.shape[0], current_batch_size)
                retrievals = []
                for j in range(gathered.shape[0]):
                    local_gathered = gathered[j].tolist()
                    utils.check_equal(len(local_gathered),
                                      _FLAG_NUM_RETRIEVALS.value)
                    local_gathered = [
                        sample.decode() for sample in local_gathered
                    ]
                    token_ids = np.array(
                        gpt2_tokenizer.batch_encode_plus(
                            local_gathered,
                            padding="max_length",
                            truncation=True,
                        ).input_ids)
                    for line in token_ids:
                        assert not np.all(line == 0), line

                    token_ids[token_ids == gpt2_tokenizer.eos_token_id] = -1
                    retrievals.append(token_ids)
                features[constants.CTH5Fields.gpt2_retrieved_ids] = retrievals

                utils.check_equal(
                    retrievals[0].shape,
                    (_FLAG_NUM_RETRIEVALS.value, _FLAG_CONTEXT_SIZE.value))

                for k, v in features.items():
                    utils.check_equal(len(v), current_batch_size)

                for batch_index in range(current_batch_size):
                    feature = tf.train.Features(
                        feature={
                            # `k` is the field name; index the per-field list
                            # with the batch index, not the field name.
                            k: _bytes_feature(
                                tf.io.serialize_tensor(
                                    tf.cast(v[batch_index],
                                            feature_dtypes[k])))
                            for k, v in features.items()
                        })

                    writers[split][
                        sample_count % _FLAG_NUM_SHARDS.value].write(
                            tf.train.Example(
                                features=feature).SerializeToString())
                    sample_count += 1
                if sample_count % 1000 == 0:
                    LOGGER.debug("Paths: %s", str(all_paths[split][0]))

    LOGGER.debug("Done.")
Example #28
0
File: cis4.py Project: ysoldak/syco
__credits__ = ["???"]
__license__ = "???"
__version__ = "1.0.0"
__status__ = "Production"

from utils import check_empty, check_equal, check_equal_re, check_equals, check_not_empty, check_return_code, print_header, view_output, print_warning, print_info

#
print_header("4 Network Configuration and Firewalls")

#
print_header("4.1 Modify Network Parameters (Host Only)")

#
print_header("4.1.1 Disable IP Forwarding (Scored)")
check_equal("/sbin/sysctl net.ipv4.ip_forward", "net.ipv4.ip_forward = 0")

#
print_header("4.1.2 Disable Send Packet Redirects (Scored)")
check_equal("/sbin/sysctl net.ipv4.conf.all.send_redirects",
            "net.ipv4.conf.all.send_redirects = 0")
check_equal("/sbin/sysctl net.ipv4.conf.default.send_redirects",
            "net.ipv4.conf.default.send_redirects = 0")

#
print_header("4.2 Modify Network Parameters (Host and Router)")

#
print_header("4.2.1 Disable Source Routed Packet Acceptance (Scored)")
check_equal("/sbin/sysctl net.ipv4.conf.all.accept_source_route",
            "net.ipv4.conf.all.accept_source_route = 0")
Example #29
0
def main(argv):
    if len(argv) > 1:
        raise RuntimeError(argv[1:])
    absl_logging.use_python_logging()
    utils.check_contained(_FLAG_APPROACH_TYPE.value, _ACCEPTABLE_APPROACHES)
    db_path = _FLAG_DB_PATH.value
    model_path = _FLAG_MODEL_PATH.value
    tpu_config = tf_utils.init_tpus()
    device_type = tf_utils.devices_to_use()[0].device_type
    if device_type == "TPU":
        assert isinstance(tpu_config, tf_utils.TpuConfigType)
        strategy = tf.distribute.TPUStrategy(tpu_config.resolver)
    elif device_type == "GPU" or "CPU":
        # MirroredStrategy automatically becomes OneDeviceStrategy if there is
        # just one device, like one GPU or only CPUs.
        strategy = tf.distribute.MirroredStrategy()
    else:
        raise RuntimeError()

    ##############################################################################
    # Load Model
    ##############################################################################
    with utils.log_duration(LOGGER, main.__name__, "All of model preparation"):

        def make_model_tf(path):
            with utils.log_duration(LOGGER, make_model_tf.__name__,
                                    "Load model."):
                if os.path.exists(path):
                    config_path = os.path.join(path, "config.json")
                    model_path = os.path.join(path, "tf_model.h5")
                    utils.check_exists(config_path)
                    utils.check_exists(model_path)
                    config = transformers.GPT2Config.from_pretrained(
                        config_path)
                    return transformers.TFGPT2LMHeadModel.from_pretrained(
                        model_path, config=config)
                else:
                    return transformers.TFGPT2LMHeadModel.from_pretrained(
                        path, )

        with strategy.scope():
            if model_path.startswith("gs://"):
                with utils.log_duration(LOGGER, main.__name__,
                                        "Download model from GS"):
                    with tempfile.TemporaryDirectory() as td:
                        td += os.path.sep

                        if os.path.exists("/root/google-cloud-sdk/bin/gsutil"):
                            exec_ = "/root/google-cloud-sdk/bin/gsutil"
                        else:
                            exec_ = "gsutil"

                        command = [
                            exec_,
                            "-m",
                            "cp",
                            "-r",
                            os.path.join(model_path, "*"),
                            td,
                        ]
                        LOGGER.debug("Running bash command: %s",
                                     " ".join(command))
                        subprocess.check_call(command)
                        LOGGER.debug("Files at the temp dir(%s): %s", td,
                                     str(os.listdir(td)))

                        model = make_model_tf(td)
            else:
                model = make_model_tf(model_path)

            model.__call__ = tf.function(
                model.__call__,
                experimental_relax_shapes=True,
                experimental_compile=True,
            )

    ##############################################################################
    # Load Dataset Pipeline
    ##############################################################################

    utils.check_contained(
        _FLAG_APPROACH_TYPE.value, {
            constants.ApproachTypeChoices.naked_lm,
            constants.ApproachTypeChoices.naked_lm
        })
    devices = tf_utils.devices_to_use()
    num_replicas = (len(devices)
                    if devices[0].device_type in {"GPU", "TPU"} else 1)
    # Only a batch size of 1 is currently supported; larger batches would
    # need proper attention masks.
    utils.check_equal(_FLAG_BATCH_SIZE.value, 1)
    batch_size = _FLAG_BATCH_SIZE.value * num_replicas
    approach_type = _FLAG_APPROACH_TYPE.value

    # Things that will never change:
    random_seed = 0
    use_helper_words = True
    retrieval_temperature = 1
    context_window_size = 1024

    logging.debug("Loading dataset.")
    tokenizer = transformers.GPT2TokenizerFast.from_pretrained("gpt2-xl")
    ds = task_specific.create_lm_ds_kilt_eli5(
        tokenizer=tokenizer,
        context_window_size=context_window_size,
        dataset_name="kilt_eli5",
        batch_size=1,  # >> We set our own batch size elsewhere
        db_path=db_path,
        random_seed=random_seed,
        use_subset=False,
        subset_size=-1,
        use_helper_words=use_helper_words,
        approach_type=approach_type,
        num_retrievals=5,  # Will never change
        retrieval_temperature=retrieval_temperature,
        retriever=None,  # Cached retrievals don't need a retriever
        repeat=False,  # Will never change
        split=_FLAG_SPLIT.value,
        enable_debug_checks=False,
        retrieval_bank_size=5,  # Will never change
        dataset_type=_FLAG_DATASET_TYPE.value,
        tfr_prefix=_FLAG_TFR_PREFIX.value,
        qty_shuffle=1,  # Will never change
        max_length_generation=_FLAG_GENERATION_LENGTH_LIMIT.value)

    def further_prep_generate_not_test(batch):
        batch = tf.boolean_mask(
            batch["input_ids"],
            tf.logical_and(batch["label_ids"] == -100,
                           batch["input_ids"] != tokenizer.eos_token_id))
        return batch

    @tf.function
    def further_prep_generate_test(batch):
        batch = tf.boolean_mask(batch["input_ids"],
                                batch["input_ids"] != tokenizer.eos_token_id)
        return batch

    if _FLAG_SPLIT.value == constants.SplitChoices.test:
        ds = ds.map(further_prep_generate_test)
    else:
        ds = ds.map(further_prep_generate_not_test)

    ds = ds.padded_batch(batch_size=batch_size,
                         padding_values=tokenizer.eos_token_id)
    ds = strategy.experimental_distribute_dataset(ds)

    ##############################################################################
    # Generate
    ##############################################################################
    LOGGER.debug("Generating.")
    generations = []
    counter = tqdm.tqdm(ds,
                        total=task_specific.DATASET_CARDINALITIES["kilt_eli5"][
                            _FLAG_SPLIT.value])

    for batch_no, batch in enumerate(counter):
        output = strategy.run(
            model.generate,
            kwargs=dict(input_ids=batch,
                        max_length=_FLAG_GENERATION_LENGTH_LIMIT.value,
                        use_cache=True,
                        attention_mask=batch == tokenizer.eos_token_id))

        LOGGER.debug("INPUT: %s", tokenizer.decode(batch[0]))
        output = tf_utils.process_strat_output(strategy_outputs=output,
                                               current_batch_size=batch_size,
                                               strategy=strategy,
                                               name="generations")

        with utils.log_duration(LOGGER, "main",
                                "all of tokenizer.decode for a batch."):
            for i in range(batch_size):
                text = tokenizer.decode(output.numpy()[i])
                LOGGER.debug("Batch %d Generation %d", batch_no, i)
                LOGGER.debug(text.replace("\n", " <\\n> "))
                generations.append(text)

        counter.update(batch.shape[0])

    utils.to_json_file(
        os.path.join(_FLAG_OUTPUT_PATH.value, _FLAG_SPLIT.value,
                     _FLAG_APPROACH_TYPE.value,
                     time.strftime("%Y%m%d-%H%M%S.json")),
        dict(flags={
            flag.name: flag.value
            for flag in flags.FLAGS.flags_by_module_dict()[argv[0]]
        },
             generations=generations))
    logging.debug("Saved to: %s", _FLAG_OUTPUT_PATH.value)
Example #30
0
def main(argv):
    ##############################################################################
    # Initial Setup. Logging, Flags, Random seeds.
    ##############################################################################
    if len(argv) > 1:
        raise app.UsageError("Too many command-line arguments.")
    absl_logging.use_python_logging()
    flags_dict = {
        flag.name: flag.value
        for flag in FLAGS.flags_by_module_dict()[argv[0]]
    }

    if FLAGS.use_subset:
        message = (f"{colorama.Back.RED}{colorama.Fore.WHITE}"
                   f"{colorama.Style.BRIGHT}USING A SUBSET OF THE DATASET"
                   f"{colorama.Style.RESET_ALL}")
        LOGGER.warning(message)

    utils.log_module_args(LOGGER, argv[0])
    if not FLAGS.output_dir.startswith("gs://"):
        utils.check_exists(FLAG_OUTPUT_DIR.value)
        if not tf.io.gfile.isdir(FLAG_OUTPUT_DIR.value):
            raise RuntimeError("Output dir needs to be a directory.")

    tf.random.set_seed(FLAG_RANDOM_SEED.value)
    np.random.seed(FLAG_RANDOM_SEED.value)

    # Prepare the instance output directory path and save the config there
    # Prepare the path
    folder_name = time.strftime(
        f"{FLAG_RUN_NAME.value}_{FLAG_APPROACH_TYPE.value}_%Y%m%d-%H%M%S")
    instance_output_dir = os.path.join(FLAG_OUTPUT_DIR.value,
                                       folder_name).strip()
    if not instance_output_dir.endswith("/"):
        instance_output_dir += "/"
    json_target = os.path.join(instance_output_dir, "training_params.json")

    # Make the folder if we're not on gcloud
    if not json_target.strip().startswith("gs://"):
        subprocess.check_call(["mkdir", "-p", instance_output_dir])

    # Save the config file
    utils.to_json_file(json_target, flags_dict)

    ##############################################################################
    # Initialization and Configuration of the Devices.
    ##############################################################################
    tpu_setup = None

    accel = tf_utils.current_accelerator_type()
    if FLAG_TPU_IS_LOCAL.value:
        assert accel == "TPU", accel
    if accel == "TPU":
        assert FLAG_TPU_IS_LOCAL.value, FLAG_TPU_IS_LOCAL.value

    if tf_utils.current_accelerator_type() in {"CPU", "TPU"}:
        tpu_setup = tf_utils.init_tpus(tpu_name=FLAG_TPU_NAME.value,
                                       local=FLAG_TPU_IS_LOCAL.value)

    LOGGER.debug("Devices we are computing on:\n%s",
                 utils.wrap_iterable(map(str, tf_utils.devices_to_use())))
    LOGGER.debug("All devices:")
    LOGGER.debug(tf_utils.device_mapping())

    if tf_utils.current_accelerator_type() == "GPU":
        tf.config.set_soft_device_placement(True)

    if tf_utils.current_accelerator_type() != "TPU":
        tf.debugging.set_log_device_placement(True)

    utils.check_operator(operator.ne, tf_utils.current_accelerator_type(),
                         "CPU")

    assert FLAG_TPU_NAME.value == socket.gethostname(), (
        "This is a configuration choice. You can remove this. "
        "There will be no side effects.")

    if FLAG_DISTRIBUTE_MODE.value in constants.PURE_DATA_PARALLEL_STRATEGIES:
        actual_num_replicas = len(tf_utils.devices_to_use())
    elif FLAG_DISTRIBUTE_MODE.value in constants.DATA_PARALLEL_DMC:
        actual_num_replicas = FLAG_NUM_REPLICAS.value
    else:
        actual_num_replicas = 1

    ##############################################################################
    # We load the retriever model if it is needed.
    ##############################################################################
    # Not currently used. See old commits.
    retriever = None

    ##############################################################################
    # Distributed training task
    ##############################################################################
    if FLAG_TASK.value == constants.TaskChoices.train:
        with utils.log_duration(LOGGER, "main", "Load model"):
            utils.print_mem("before loading model", LOGGER)
            model_specific = task_specific.load_model(
                FLAG_MODEL_KEY.value, FLAG_DISTRIBUTE_MODE.value, tpu_setup,
                FLAG_NUM_REPLICAS.value)
            utils.print_mem("after loading model", LOGGER)
            model = model_specific.model
            if isinstance(model, list):
                model: List[transformers.TFGPT2LMHeadModel]
            else:
                model: transformers.TFGPT2LMHeadModel

            tokenizer = model_specific.tokenizer

            def make_optimizer():
                if FLAG_OPTIMIZER_TYPE.value == constants.OptimizerTypes.adafactor:
                    return tensor2tensor.utils.adafactor.AdafactorOptimizer(
                        learning_rate=FLAG_LEARNING_RATE.value)
                elif FLAG_OPTIMIZER_TYPE.value == constants.OptimizerTypes.adam:
                    return tf.keras.optimizers.Adam(
                        learning_rate=FLAG_LEARNING_RATE.value)
                else:
                    raise ValueError(FLAG_OPTIMIZER_TYPE.value)

            if model_specific.strategy:
                with model_specific.strategy.scope():
                    optimizer = make_optimizer()
            else:
                optimizer = make_optimizer()

        ############################################################################
        # Prepare the dataset functions
        ############################################################################
        rg = np.random.default_rng(FLAG_RANDOM_SEED.value)

        def call_lm_preproc(repeat, split, random_seed):
            """Using functools.partial prevents the linter from doing its job."""
            if FLAG_DATASET_NAME.value == constants.DatasetNameChoices.kilt_eli5:
                return task_specific.create_lm_ds_kilt_eli5(
                    tokenizer=tokenizer,
                    context_window_size=model.config.n_positions,
                    dataset_name=FLAG_DATASET_NAME.value,
                    # Batches are split over the replicas:
                    batch_size=FLAG_BATCH_SIZE.value * actual_num_replicas,
                    db_path=FLAG_DB_PATH.value,
                    random_seed=random_seed,
                    use_subset=FLAG_USE_SUBSET.value,
                    subset_size=FLAG_SUBSET_SIZE.value,
                    use_helper_words=FLAG_USE_HELPER_WORDS.value,
                    approach_type=FLAG_APPROACH_TYPE.value,
                    num_retrievals=FLAG_NUM_RETRIEVALS.value,
                    retrieval_temperature=FLAG_RETRIEVAL_TEMPERATURE.value,
                    retriever=retriever,
                    repeat=repeat,
                    split=split,
                    enable_debug_checks=FLAG_DATASET_DEBUG.value,
                    retrieval_bank_size=FLAG_RETRIEVAL_BANK_SIZE.value,
                    dataset_type=FLAG_DATASET_TYPE.value,
                    qty_shuffle=FLAG_QTY_SHUFFLE.value,
                    tfr_prefix=FLAG_TFR_PREFIX.value,
                    max_length_generation=FLAG_MAX_LENGTH_GENERATION.value,
                )
            else:
                raise NotImplementedError(
                    f"FLAG_DATASET_NAME.value unsupported: `{FLAG_DATASET_NAME.value}`"
                )

        make_training_dataset: Callable[...,
                                        tf.data.Dataset] = functools.partial(
                                            call_lm_preproc,
                                            split="train",
                                            repeat=False,
                                        )
        make_eval_dataset: Callable[..., tf.data.Dataset] = functools.partial(
            call_lm_preproc,
            split="eval",
            repeat=True,
        )

        ############################################################################
        # Prepare the step functions
        ############################################################################
        utils.check_contained(FLAG_DISTRIBUTE_MODE.value,
                              constants.DistributeModeChoices.choices())
        tf_function_flags = dict(
            experimental_compile=FLAG_EXPERIMENTAL_COMPILE.value,
            experimental_relax_shapes=not FLAG_INPUT_FIXED_SIZE.value)

        training_step = build_regular_training_step(
            model,
            optimizer,
            strategy=model_specific.strategy,
            tf_function_kwargs=tf_function_flags)

        evaluation_step = build_evaluation_step(model, tf_function_flags)

        timestamp_last_ckpt_secs = time.time()
        # Model checkpoints are saved to the tmp_directory and then rsynced to GCS
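        # (Sketch only, not the actual `Saver` implementation: the idea is to
        #  write the checkpoint locally first and then mirror it to the bucket,
        #  e.g.
        #    checkpoint.save("/tmp/ckpt/model")
        #    subprocess.check_call(["gsutil", "-m", "rsync", "-r",
        #                           "/tmp/ckpt", instance_output_dir])
        #  The local path "/tmp/ckpt" is purely illustrative.)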

        ############################################################################
        # Prepare the statistics and the logging facilities.
        ############################################################################
        # Tensorboard
        with model_specific.strategy.scope():
            checkpoint = tf.train.Checkpoint(optimizer=optimizer, model=model)
        saver = Saver(instance_output_dir, checkpoint)
        train_log_dir = os.path.join(instance_output_dir, "tensorboard",
                                     "train")
        eval_log_dir = os.path.join(instance_output_dir, "tensorboard", "eval")
        flags_log_dir = os.path.join(instance_output_dir, "tensorboard",
                                     "params")
        writers = dict(train=tf.summary.create_file_writer(train_log_dir),
                       eval=tf.summary.create_file_writer(eval_log_dir),
                       flags=tf.summary.create_file_writer(flags_log_dir))
        with writers["flags"].as_default():
            tf.summary.text(
                "Flags",
                # Tensorboard takes Markdown:
                json.dumps(flags_dict, indent=4).replace("\n", "\n\n"),
                step=0)

        # Different information to log.
        ma_loss = dict(train=utils.MovingAverage(0.9),
                       eval=utils.MovingAverage(0.9))
        step_counters = dict(train=0, eval=0)
        batch_counters = dict(train=0, eval=0)
        prev_batch_end = time.time()

        ############################################################################
        # Create the Eval DS object.
        # ==========================================================================
        # The eval ds has no real concept of epoch, repeats forever, shuffling
        # each time it reaches its end.
        ############################################################################
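        # (Illustrative only; the real pipeline is built inside
        #  task_specific.create_lm_ds_kilt_eli5. A "repeat forever, reshuffle on
        #  every pass" tf.data pipeline typically looks like:
        #    ds = ds.shuffle(buffer, reshuffle_each_iteration=True).repeat())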
        # Create
        with utils.log_duration(LOGGER, "main", "All of make_eval_dataset"):
            eval_ds_instance = make_eval_dataset(random_seed=rg.integers(
                -2**63, 2**63 - 1), )
        # Maybe distribute
        LOGGER.debug("Distributing the eval dataset to the replicas.")
        if FLAG_DATASET_TYPE.value == "tfr":
            eval_ds_instance = (
                model_specific.strategy.experimental_distribute_dataset(
                    eval_ds_instance))
        # Start the iteration. We step by calling `next(...)`.
        LOGGER.debug("Done distributing the eval dataset to the replicas.")
        eval_ds_instance = iter(eval_ds_instance)
        step_function = dict(train=training_step, eval=evaluation_step)

        ############################################################################
        # Training Loop
        # ==========================================================================
        # Create a new training dataset object that lasts for one epoch.
        # This is different from the eval dataset object, which loops forever.
        ############################################################################
        for epoch in itertools.count():
            ##########################################################################
            # Epoch Setup
            ##########################################################################
            LOGGER.debug("EPOCH %d START", epoch)
            # Shuffle differently every epoch
            with utils.log_duration(LOGGER, "main",
                                    "All of make_training_dataset"):
                train_ds_instance = make_training_dataset(
                    random_seed=rg.integers(-2**63, 2**63 - 1), )
            LOGGER.debug(
                "Attempting to distribute the training dataset to the replicas."
            )
            if FLAG_DATASET_TYPE.value == "tfr":
                train_ds_instance = (
                    model_specific.strategy.experimental_distribute_dataset(
                        train_ds_instance))

            LOGGER.debug(
                "Done distributing the training dataset to the replicas.")
            train_ds_instance = iter(train_ds_instance)

            # To change splits, we use `itertools.islice` over the dataset generator.
            # When the training dataset generator is done, a new loop of the following
            # while loop occurs, but no training batch is done because we are taking
            # an `islice` of a generator that is done.
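            # (Tiny illustration of why this works, with hypothetical values:
            #    it = iter(range(5))
            #    list(toolz.take(3, it))  # -> [0, 1, 2]
            #    list(toolz.take(3, it))  # -> [3, 4]   resumes where it left off
            #    list(toolz.take(3, it))  # -> []       exhausted: no training
            #  batch runs, `did_at_least_one_training_batch` stays False and the
            #  while loop below exits.)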
            did_at_least_one_training_batch = True
            split = "eval"
            while did_at_least_one_training_batch:
                utils.check_operator(operator.ne,
                                     tf_utils.current_accelerator_type(),
                                     "CPU")

                # Invert split
                if split == "train":
                    split = "eval"
                else:
                    split = "train"

                # Prepare to test if we did at least one training batch
                if split == "train":
                    did_at_least_one_training_batch = False

                ########################################################################
                # Take slices from the dataset iterator
                # ======================================================================
                # We only want to do a certain number of batches before switching splits
                # We do this by using an `itertools.islice` of the dataset iterators.
                ########################################################################
                if split == "train":
                    dataset_iterator = toolz.take(
                        FLAG_BATCHES_BETWEEN_EVALS.value, train_ds_instance)
                else:
                    # The evaluation dataset generator is infinite; it reshuffles
                    # every time it reaches its end. Still, we take a fixed-size
                    # slice from that infinite generator.
                    dataset_iterator = toolz.take(
                        FLAG_NUMBER_EVAL_BATCHES.value, eval_ds_instance)

                LOGGER.debug("Batching")
                for batch in dataset_iterator:
                    if FLAG_LOG_SAMPLES.value:
                        ####################################################################
                        # Print elements of the dataset
                        ####################################################################
                        # Make ourselves resistant to values possibly being a PerReplica
                        # object
                        LOGGER.warning(
                            f"%(red)sLOGGING SAMPLES. THIS IS VERY SLOW.%(reset)s",
                            dict(
                                red=colorama.Fore.RED,
                                reset=colorama.Style.RESET_ALL,
                            ))
                        is_distributed = isinstance(batch["input_ids"],
                                                    values.PerReplica)
                        for in_batch_idx in range(FLAG_BATCH_SIZE.value):
                            for replica_idx in (range(actual_num_replicas)
                                                if is_distributed else [0]):
                                if is_distributed:
                                    sample = {
                                        k: batch[k].values[replica_idx]
                                        for k in batch
                                    }
                                else:
                                    sample = batch

                                # input_sentence = tokenizer.decode(
                                #   [x for x in sample["input_ids"][i] if x != tokenizer.eos_token_id]
                                # )

                                # LOGGER.debug(
                                #   "%sInput [%d / %d]%s:\n\"%s\"",
                                #   colorama.Fore.GREEN,
                                #   replica_idx + 1,
                                #   actual_num_replicas,
                                #   colorama.Style.RESET_ALL,
                                #   input_sentence,
                                # )
                                #
                                # answer = tokenizer.decode(
                                #   [(x if x != -100 else 0) for x in sample["label_ids"][i]]
                                # )
                                # LOGGER.debug(
                                #   "%sLabel [%d / %d]%s:\n\"%s\"",
                                #   colorama.Fore.GREEN,
                                #   replica_idx + 1,
                                #   actual_num_replicas,
                                #   colorama.Style.RESET_ALL,
                                #   answer,
                                # )

                                cons = console.Console()
                                sentences = table.Table()
                                sentences.add_column("BPE Index",
                                                     justify="center")
                                sentences.add_column("Inputs",
                                                     justify="center")
                                sentences.add_column("Labels",
                                                     justify="center")
                                for bpe_idx, (x, y) in enumerate(
                                        itertools.zip_longest(
                                            sample["input_ids"]
                                            [in_batch_idx].numpy(),
                                            sample["label_ids"]
                                            [in_batch_idx].numpy(),
                                            fillvalue=None,
                                        )):
                                    # `zip_longest` yields None once one of the
                                    # two sequences is exhausted.
                                    x_w = (tokenizer.decode([x])
                                           if x is not None and x >= 0
                                           else f"[ {x} ]")
                                    y_w = (tokenizer.decode([y])
                                           if y is not None and y >= 0
                                           else f"[ {y} ]")
                                    sentences.add_row(str(bpe_idx), x_w, y_w)

                                cons.print(sentences)

                    # We only care about training epochs: we obviously don't
                    # train on eval samples. The number of eval samples seen only
                    # helps lower the variance of the estimate used to decide
                    # when to do early stopping.
                    if split == "train":
                        did_at_least_one_training_batch = True

                    input_ids = batch["input_ids"]
                    label_ids = batch["label_ids"]

                    # Per split step counter
                    step_counters[
                        split] += FLAG_BATCH_SIZE.value * actual_num_replicas
                    batch_counters[split] += 1

                    ######################################################################
                    # Model step function.
                    ######################################################################
                    step_function_kwargs = dict(
                        input_ids=input_ids,
                        label_ids=label_ids,
                    )

                    utils.print_mem(f"[{split}] - Mem before `strategy.run`",
                                    LOGGER)
                    LOGGER.debug("[%s] - Calling `strategy.run`", split)
                    loss = model_specific.strategy.run(
                        step_function[split], kwargs=step_function_kwargs)
                    LOGGER.debug("[%s] - Done `strategy.run`", split)
                    utils.print_mem(f"[{split}] - Mem after `strategy.run`",
                                    LOGGER)

                    ####################################################################
                    # End of logging step code / Logging and saving the model.
                    ####################################################################
                    if (FLAG_DISTRIBUTE_MODE.value
                            in constants.PURE_DATA_PARALLEL_STRATEGIES):
                        utils.check_equal(len(loss.values),
                                          actual_num_replicas)
                        LOGGER.debug("[%s] - Real num replicas: %s", split,
                                     actual_num_replicas)
                        average_loss = float(
                            tf.math.reduce_mean(loss.values).numpy())

                        LOGGER.debug("[%s] - Loss: %s", str(split),
                                     str(average_loss))

                    else:
                        average_loss = float(loss.numpy())

                    tf.debugging.check_numerics(
                        loss.values if isinstance(loss, values.PerReplica) else
                        loss, "Numerics failed.")

                    now = time.time()
                    batch_duration = now - prev_batch_end
                    prev_batch_end = now
                    ma_loss[split].update(average_loss)

                    LOGGER.info("[%s] - Epoch: # %d", split, epoch)
                    LOGGER.info("[%s] - Tensorboard_dir: %s", split,
                                instance_output_dir)
                    LOGGER.info("[%s] - Batch: # %d", split,
                                batch_counters[split])
                    LOGGER.info("[%s] - Step:  # %d", split,
                                step_counters[split])
                    if FLAG_USE_SUBSET.value:
                        LOGGER.warning(">> USING A SUBSET OF THE DATASET <<")
                    LOGGER.info(
                        "[%(split)s] - Batch loss:           %(metric)f",
                        dict(split=split, metric=average_loss))
                    LOGGER.info(
                        "[%(split)s] - Moving average loss:  %(metric)f",
                        dict(split=split, metric=ma_loss[split].average))
                    LOGGER.info(
                        "[%(split)s] - Moving average ppl:   %(metric)f",
                        dict(split=split,
                             metric=np.exp(ma_loss[split].average)))
                    LOGGER.info(
                        "[%(split)s] - Batch duration:       %(duration)s",
                        dict(split=split,
                             duration=utils.TimeStamp.from_seconds(
                                 batch_duration).format()))

                    # Write to Tensorboard
                    with writers[split].as_default():
                        tf.summary.scalar(f"Loss/{split}", average_loss,
                                          step_counters[split])
                        tf.summary.scalar(f"PPL/{split}", np.exp(average_loss),
                                          step_counters[split])
                    writers[split].flush()

                    ######################################################################
                    # Save every `FLAG_SAVE_PERIOD_MIN.value` minutes.
                    ######################################################################
                    delta_sec = time.time() - timestamp_last_ckpt_secs
                    utils.check_operator(operator.gt, delta_sec, 0)
                    period_sec = 60 * FLAG_SAVE_PERIOD_MIN.value
                    utils.check_operator(operator.gt, period_sec, 0)
                    ratio = delta_sec / period_sec
                    LOGGER.info(
                        "[%(split)s] - RATIO:                  %(ratio)s",
                        dict(split=split, ratio=str(ratio)))
                    LOGGER.info(
                        "[%(split)s] - Target: %(target)s, Present: %(present)s",
                        dict(
                            split=split,
                            target=str(period_sec),
                            present=str(delta_sec),
                        ))

                    if ratio >= 1:
                        dur = delta_sec / 60
                        timestamp_last_ckpt_secs = time.time()
                        LOGGER.debug(
                            "SAVING MODEL - CAUSE: DURATION - %0.2f min", dur)
                        # checkpoint.save(ckpt_prefix)
                        saver.save_model(
                            train_steps=step_counters["train"],
                            model_or_replicas=model,
                            optimizer=optimizer,
                        )

        ############################################################################
        # Post Training Cleanup
        ############################################################################
        for writer in writers.values():
            writer.close()
Example #31
0
__credits__ = ["???"]
__license__ = "???"
__version__ = "1.0.0"
__status__ = "Production"

from utils import check_empty, check_equal, check_equal_re, check_equals, check_not_empty, check_return_code, print_header, view_output, print_warning, print_info

#
print_header("6 System Access, Authentication and Authorization")

#
print_header("6.1 Configure cron and anacron")

#
print_header("6.1.1 Enable anacron Daemon (Scored)")
check_equal("rpm -q anacron", "package anacron is not installed")
print_info("Not installed syco servers.")

print_header("6.1.2 Enable crond Daemon (Scored)")
check_equal_re("chkconfig --list crond",
               "crond.*0:off.*1:off.*2:on.*3:on.*4:on.*5:on.*6:off")

#
print_header(
    "6.1.3 Set User/Group Owner and Permission on /etc/anacrontab (Scored)")
check_equal('stat -c "%a %u %g" /etc/anacrontab | egrep "600 0 0"', "600 0 0")

#
print_header(
    "6.1.4 Set User/Group Owner and Permission on /etc/crontab (Scored)")
check_equal('stat -c "%a %u %g" /etc/crontab | egrep "600 0 0"', "600 0 0")
Example #32
0
File: cis5.py Project: ysoldak/syco
import config

#
print_header("5 Logging and Auditing")

#
print_header("5.1 Configure Syslog")

#
print_header("5.1.1 Install the rsyslog package (Scored)")
check_equal_re("rpm -q rsyslog", "rsyslog.*")

#
print_header("5.1.2 Activate the rsyslog Service (Scored)")
check_equal("rpm -q syslog", "package syslog is not installed")
check_empty("chkconfig --list | grep syslog")
check_equal_re("chkconfig --list rsyslog",
               "rsyslog.*0:off.*1:off.*2:on.*3:on.*4:on.*5:on.*6:off")

#
print_header("5.1.3 Configure /etc/rsyslog.conf (Not Scored)")
print_warning(
    "Manually review the contents of the /etc/rsyslog.conf file to ensure appropriate logging is set. "
)
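# (For reference, typical rules in /etc/rsyslog.conf look like:
#    auth,authpriv.*                                  /var/log/secure
#    *.info;mail.none;authpriv.none;cron.none         /var/log/messages
#  The exact facilities and destinations depend on the site's logging policy.)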
view_output("ls -l /var/log/")

#
print_header("5.1.4 Create and Set Permissions on rsyslog Log Files (Scored)")
print_header(" TODO - Ensure that the log files are logging information")
Example #33
0
File: cis7.py Project: Nemie/syco
__version__ = "1.0.0"
__status__ = "Production"


from utils import check_empty, check_equal, check_equal_re, check_equals, check_not_empty, check_return_code, print_header, view_output, print_warning, print_info

#
print_header("7. User Accounts and Environment")

#
print_header("7.1 Set Shadow Password Suite Parameters (/etc/login.defs)")

#
print_header("7.1.1 Set Password Expiration Days (Scored)")
check_equal(
    "grep ^PASS_MAX_DAYS /etc/login.defs",
    "PASS_MAX_DAYS\t90"
)

check_empty(
    'awk -F: \'($3 > 0) {print $1}\' /etc/passwd | xargs -I {} ' +
    'chage --list {}|' +
    'grep "^Maximum number of days between password change"|'+
    'grep -v ": 99$"'
)

#
print_header("7.1.2 Set Password Change Minimum Number of Days (Scored)")
check_equal(
    "grep ^PASS_MIN_DAYS /etc/login.defs",
    "PASS_MIN_DAYS\t7"
)
Example #34
0
File: cis9.py Project: ysoldak/syco
from utils import check_empty, check_equal, check_equal_re, check_equals, check_not_empty, check_return_code, print_header, view_output, print_warning, print_info

#
print_header("9 System Maintenance")

#
print_header("9.1 Verify System File Permissions)")

#
print_header("9.1.1 Verify System File Permissions (Not Scored)")
print_warning("Check manually for changed files.")
view_output("rpm -Va --nomtime --nosize --nomd5 --nolinkto")

#
print_header("9.1.2 Verify Permissions on /etc/passwd (Scored)")
check_equal('stat -c "%a %u %g" /etc/passwd | egrep "644 0 0"', "644 0 0")

#
print_header("9.1.3 Verify Permissions on /etc/shadow (Scored)")
check_equal('stat -c "%a %u %g" /etc/shadow | egrep "0 0 0"', "0 0 0")

#
print_header("9.1.4 Verify Permissions on /etc/gshadow (Scored)")
check_equal('stat -c "%a %u %g" /etc/gshadow | egrep "0 0 0"', "0 0 0")

#
print_header("9.1.5 Verify Permissions on /etc/group (Scored)")
check_equal('stat -c "%a %u %g" /etc/group | egrep "644 0 0"', "644 0 0")

#
print_header("9.1.6 Verify User/Group Ownership on /etc/passwd (Scored)")
Example #35
0
def main(argv):
    if len(argv) > 1:
        raise RuntimeError(argv[1:])
    absl_logging.use_python_logging()
    utils.check_contained(_FLAG_APPROACH_TYPE.value, _ACCEPTABLE_APPROACHES)

    utils.check_operator(operator.xor, bool(_FLAG_H5_MODEL_PATH.value),
                         bool(_FLAG_CKPT_MODEL_PATH.value))

    if _FLAG_H5_MODEL_PATH.value:
        model_path = _FLAG_H5_MODEL_PATH.value
        mode = constants.SaveModeChoices.hfh5
    elif _FLAG_CKPT_MODEL_PATH.value:
        model_path = _FLAG_CKPT_MODEL_PATH.value
        mode = constants.SaveModeChoices.ckpt
    else:
        raise RuntimeError("Logically should never happen.")

    utils.check_exists(model_path)
    device_type = tf_utils.devices_to_use()[0].device_type

    # ONLY GPU IS SUPPORTED
    utils.check_equal(device_type, "GPU")

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Build the distribution strategy
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    if device_type == "TPU":
        # ONLY LOCAL TPU IS "SUPPORTED"
        utils.check_isinstance(_FLAG_IS_LOCAL_TPU.value, bool)
        assert _FLAG_IS_LOCAL_TPU.value
        tpu_config = tf_utils.init_tpus(local=True)
        utils.check_isinstance(tpu_config, tf_utils.TpuConfigType)
        utils.check_not_none(tpu_config)
        strategy = tf.distribute.TPUStrategy(tpu_config.resolver)
    elif device_type == "GPU":
        strategy = tf.distribute.MirroredStrategy(
            devices=tf.config.experimental.list_logical_devices('GPU'))
    else:
        raise RuntimeError(device_type)

    # ONLY GPU IS SUPPORTED
    print(tf.config.list_logical_devices())
    utils.check_isinstance(strategy, tf.distribute.MirroredStrategy)

    ##############################################################################
    # Load Model
    ##############################################################################
    with utils.log_duration(LOGGER, main.__name__, "All of model preparation"):
        with strategy.scope():
            # HF isn't able to read directly from GCS
            if (model_path.startswith("gs://")
                    and mode == constants.SaveModeChoices.hfh5):
                with utils.log_duration(LOGGER, main.__name__,
                                        "Download model from GS"):
                    with tempfile.TemporaryDirectory() as td:
                        td += os.path.sep

                        if os.path.exists("/root/google-cloud-sdk/bin/gsutil"):
                            exec_ = "/root/google-cloud-sdk/bin/gsutil"
                        else:
                            exec_ = "gsutil"

                        command = [
                            exec_,
                            "-m",
                            "cp",
                            "-r",
                            os.path.join(model_path, "*"),
                            td,
                        ]
                        LOGGER.debug("Running bash command: %s",
                                     " ".join(command))
                        subprocess.check_call(command)
                        LOGGER.debug("Files at the temp dir(%s): %s", td,
                                     str(os.listdir(td)))

                        model = make_model_tf(td, mode=mode)
            else:
                model = make_model_tf(model_path, mode=mode)

    utils.check_not_none(model)

    ##############################################################################
    # Load Dataset Pipeline
    ##############################################################################
    utils.check_contained(
        _FLAG_APPROACH_TYPE.value, {
            constants.ApproachTypeChoices.naked_lm,
            constants.ApproachTypeChoices.cached_pretok
        })
    devices = tf_utils.devices_to_use()
    num_replicas = (len(devices)
                    if devices[0].device_type in {"GPU", "TPU"} else 1)
    utils.check_equal(devices[0].device_type, "GPU")

    # Only a batch size of 1 is currently supported. We need attention masks
    batch_size = _FLAG_BATCH_SIZE.value * num_replicas
    approach_type = _FLAG_APPROACH_TYPE.value

    logging.debug("Loading dataset.")
    tokenizer = transformers.GPT2TokenizerFast.from_pretrained("gpt2-xl")
    ds = prep_ds_for_generation(
        dict(
            tokenizer=tokenizer,
            context_window_size=1024,
            dataset_name="kilt_eli5",
            batch_size=1,  # >> We set our own batch size elsewhere
            db_path=None,  # None,
            random_seed=0,
            use_subset=False,
            subset_size=-1,
            use_helper_words=True,
            approach_type=approach_type,
            num_retrievals=5,  # Will never change
            retrieval_temperature=1.,
            retriever=None,  # Cached retrievals don't need a retriever
            repeat=False,  # Will never change
            split=_FLAG_SPLIT.value,
            enable_debug_checks=False,
            retrieval_bank_size=5,  # Will never change
            dataset_type=_FLAG_DATASET_TYPE.value,
            tfr_prefix=_FLAG_TFR_PREFIX.value,
            qty_shuffle=1,  # Will never change
            max_length_generation=350),
        tokenizer,
        _FLAG_SPLIT.value)

    ds = strategy.experimental_distribute_dataset(ds)

    ##############################################################################
    # Generate
    ##############################################################################
    LOGGER.debug("Generating.")
    generations = []
    num_entries_in_split = (
        task_specific.DATASET_CARDINALITIES["kilt_eli5"][_FLAG_SPLIT.value])

    entries_counter = tqdm.tqdm(total=num_entries_in_split)

    for batch_no, batch in enumerate(ds):
        # Calling model.generate. We should make a config file with the
        # hyperparameters for generation, or make a facility in the one we already
        # have. I feel like a separate one would be better, separating concerns.
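        # (Hypothetical sketch of what such a factored-out config might look
        #  like; it is not part of the current code:
        #    generation_config = dict(
        #        max_length=_FLAG_GENERATION_LENGTH_LIMIT.value,
        #        use_cache=True, repetition_penalty=2., num_beams=5)
        #    ... strategy.run(model.generate,
        #                     kwargs=dict(input_ids=batch, attention_mask=...,
        #                                 **generation_config)))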
        output = strategy.run(
            model.generate,
            kwargs=dict(
                input_ids=batch,
                max_length=_FLAG_GENERATION_LENGTH_LIMIT.value,
                use_cache=True,
                attention_mask=tf.cast(batch != tokenizer.eos_token_id,
                                       tf.int32),
                repetition_penalty=2.,
                num_beams=5,
            ))
        output = tf_utils.process_strat_output(strategy_outputs=output,
                                               current_batch_size=batch_size,
                                               strategy=strategy,
                                               name="generations")

        #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # Display the inputs and outputs.
        #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

        rich_console = rich.console.Console(color_system="256")
        print_sample = make_print_sample()

        with utils.log_duration(LOGGER, "main",
                                "all of tokenizer.decode for a batch."):
            for i in range(batch_size):
                input_text = tokenizer.decode(batch.numpy()[i])
                output_text = tokenizer.decode(output.numpy()[i])
                print("#" * 1000)
                print(f"Batch {batch_no} Generation {i}")
                print_sample(input_text, f"input batch_no {batch_no}",
                             rich_console)
                print_sample(output_text, f"output batch_no {batch_no}",
                             rich_console)
                generations.append(output_text)
            print("#" * 1000)
        entries_counter.update(batch.shape[0])

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Save the output to a JSON File.
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    utils.to_json_file(
        os.path.join(_FLAG_OUTPUT_PATH.value, _FLAG_SPLIT.value,
                     _FLAG_APPROACH_TYPE.value,
                     time.strftime("%Y%m%d-%H%M%S.json")),
        dict(flags={
            flag.name: flag.value
            for flag in flags.FLAGS.flags_by_module_dict()[argv[0]]
        },
             generations=generations))
    logging.debug("Saved to: %s", _FLAG_OUTPUT_PATH.value)
Example #36
0
File: cis8.py Project: ysoldak/syco
__license__ = "???"
__version__ = "1.0.0"
__status__ = "Production"


from utils import check_empty, check_equal, check_equal_re, check_equals, check_not_empty, check_return_code, print_header, view_output, print_warning, print_info
import app

#
print_header("8 Warning Banners")

#
print_header("8.1 Set Warning Banner for Standard Login Services (Scored)")
check_empty("diff %s/hardening/issue.net /etc/motd" % app.SYCO_VAR_PATH)
check_empty("diff %s/hardening/issue.net /etc/issue" % app.SYCO_VAR_PATH)
check_empty("diff %s/hardening/issue.net /etc/issue.net" % app.SYCO_VAR_PATH)

check_equal('stat -c "%a %u %g" /etc/motd | egrep "644 0 0"', "644 0 0")
check_equal('stat -c "%a %u %g" /etc/issue | egrep "644 0 0"', "644 0 0")
check_equal('stat -c "%a %u %g" /etc/issue.net | egrep "644 0 0"', "644 0 0")

#
print_header("8.2 Remove OS Information from Login Warning Banners (Scored)")
check_empty("egrep '(\\\\v|\\\\r|\\\\m|\\\\s)' /etc/issue")
check_empty("egrep '(\\\\v|\\\\r|\\\\m|\\\\s)' /etc/motd")
check_empty("egrep '(\\\\v|\\\\r|\\\\m|\\\\s)' /etc/issue.net")

#
print_header("8.3 Set GNOME Warning Banner (Not Scored)")
print_info("Not using gnome.")