Example #1
def encode_tokens_from_list(
    texts: List[str],
    eos_token: str,
    tokenizer: GPT2TokenizerFast,
    batch_size: int = 1024,
) -> List[int]:
    """
    Retrieves texts from a newline-delimited file/CSV and returns texts.
    """

    logger.info(f"Encoding {len(texts):,} texts.")

    pbar = tqdm(
        total=len(texts),
        smoothing=0,
        leave=True,
        dynamic_ncols=True,
    )
    tokens = []

    # Iterate over the texts in batches; the +1 picks up a final partial batch.
    for i_start in range(len(texts) // batch_size + 1):
        batch = [
            text + eos_token
            for text in texts[(i_start * batch_size):((i_start * batch_size) +
                                                      batch_size)]
        ]

        # Encode the batch and flatten the per-text token ID lists into a single list.
        tokens += list(
            itertools.chain.from_iterable(
                tokenizer.batch_encode_plus(
                    batch, add_special_tokens=False)["input_ids"]))

        pbar.update(len(batch))

    pbar.close()
    return tokens
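A minimal usage sketch for the list-based encoder above, assuming the transformers package, the GPT-2 tokenizer, and that the function's other dependencies (logger, tqdm, itertools) are already imported; the sample texts are hypothetical:

from transformers import GPT2TokenizerFast

tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
texts = ["Hello world.", "Another short document."]

# eos_token is appended to each text so documents stay separated after concatenation.
token_ids = encode_tokens_from_list(
    texts=texts,
    eos_token=tokenizer.eos_token,  # "<|endoftext|>" for GPT-2
    tokenizer=tokenizer,
)
print(len(token_ids))  # total number of token IDs across both texts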
Example #2
def encode_tokens_from_list(
    texts: List[str],
    eos_token: str,
    tokenizer: GPT2TokenizerFast,
    progress_bar_refresh_rate: int = 10,
    batch_size: int = 1024,
) -> List[int]:
    """
    Retrieves texts from a newline-delimited file/CSV and returns texts.
    """

    num_texts = len(texts)
    a_dtype = get_dtype(tokenizer.vocab_size)
    logger.info(f"Encoding {num_texts:,} texts.")

    pbar = tqdm(
        total=num_texts,
        smoothing=0,
        leave=True,
        dynamic_ncols=True,
    )
    # Pre-allocate a (num_texts, 1) array filled with -1; with an unsigned dtype the
    # -1 wraps to the dtype's maximum value, which serves as the padding sentinel.
    tokens = np.full((len(texts), 1), -1, dtype=a_dtype)

    for i_start in range(num_texts // batch_size + 1):
        batch = [
            text + eos_token
            for text in texts[(i_start * batch_size):((i_start * batch_size) +
                                                      batch_size)]
        ]

        encoded_texts = tokenizer.batch_encode_plus(
            batch,
            add_special_tokens=False,
            return_token_type_ids=False,
            return_attention_masks=False,
        )["input_ids"]

        for i, encoded_text in enumerate(encoded_texts):
            # Widen the token matrix whenever a text is longer than any seen so far.
            if len(encoded_text) > tokens.shape[1]:
                cols_to_add = len(encoded_text) - tokens.shape[1]
                tokens = np.concatenate(
                    (
                        tokens,
                        np.full(
                            (num_texts, cols_to_add),
                            -1,
                            dtype=a_dtype,
                        ),
                    ),
                    axis=1,
                )
            tokens[(i_start * batch_size) +
                   i, :len(encoded_text)] = encoded_text

        if i_start % progress_bar_refresh_rate == 0:
            pbar.update(batch_size * progress_bar_refresh_rate)

    # Snap the bar to the exact total, since the coarse updates above may not sum to it.
    pbar.n = num_texts
    pbar.refresh()
    pbar.close()
    # Flatten and drop padding: with an unsigned dtype the -1 fill value is the
    # dtype's maximum, so keeping values strictly below it removes all padding slots.
    tokens = tokens.flatten()
    return tokens[tokens < np.array(-1, dtype=a_dtype)]
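Examples #2 and #3 call a get_dtype() helper that is not shown on this page. A plausible sketch (an assumption, not necessarily the original implementation) picks the smallest unsigned integer dtype that can hold every token ID; the unsigned dtype is also what makes the -1 padding sentinel wrap to the maximum value:

import numpy as np

def get_dtype(vocab_size: int):
    # Hypothetical helper: choose the smallest unsigned dtype able to hold any token ID.
    # np.uint16 covers vocabularies up to 65,535 tokens (GPT-2 has 50,257);
    # larger vocabularies fall back to np.uint32.
    if vocab_size < 2 ** 16:
        return np.uint16
    return np.uint32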
Example #3
def encode_tokens_from_file(
    file_path: str,
    eos_token: str,
    tokenizer: GPT2TokenizerFast,
    newline: str,
    header: bool = True,
    progress_bar_refresh_rate: int = 10,
    batch_size: int = 1024,
) -> List[int]:
    """
    Retrieves texts from a newline-delimited file/CSV and returns texts.
    """

    is_csv = file_path.endswith(".csv")
    a_dtype = get_dtype(tokenizer.vocab_size)

    if is_csv:
        num_texts = get_lines_in_file_csv(file_path, header)
    else:
        num_texts = get_lines_in_file(file_path, newline)

    pbar = tqdm(
        total=num_texts,
        smoothing=0,
        leave=True,
        dynamic_ncols=True,
    )
    tokens = np.full((num_texts, 1), -1, dtype=a_dtype)
    num_batches = 0

    with open(file_path, "r", encoding="utf-8", newline=newline) as f_load:

        if header:
            f_load.readline()
        if is_csv:
            f_read = csv.reader(f_load)
            logger.info(f"Encoding {num_texts:,} rows from {file_path}.")
        else:
            f_read = f_load
            logger.info(
                f"Encoding {num_texts:,} sets of tokens from {file_path}.")

        # https://stackoverflow.com/a/6335876/9314418
        while True:
            if is_csv:
                batch = [
                    text[0] + eos_token
                    for text in list(itertools.islice(f_read, batch_size))
                ]
            else:
                batch = [
                    text + eos_token
                    for text in list(itertools.islice(f_read, batch_size))
                ]

            if not batch:
                break

            encoded_texts = tokenizer.batch_encode_plus(
                batch,
                add_special_tokens=False,
                return_token_type_ids=False,
                return_attention_masks=False,
            )["input_ids"]

            for i, encoded_text in enumerate(encoded_texts):
                if len(encoded_text) > tokens.shape[1]:
                    cols_to_add = len(encoded_text) - tokens.shape[1]
                    tokens = np.concatenate(
                        (
                            tokens,
                            np.full(
                                (num_texts, cols_to_add),
                                -1,
                                dtype=a_dtype,
                            ),
                        ),
                        axis=1,
                    )
                tokens[(num_batches * batch_size) +
                       i, :len(encoded_text)] = encoded_text

            num_batches += 1

            if num_batches % progress_bar_refresh_rate == 0:
                pbar.update(batch_size * progress_bar_refresh_rate)

    pbar.n = num_texts
    pbar.refresh()
    pbar.close()
    tokens = tokens.flatten()
    return tokens[tokens < np.array(-1, dtype=a_dtype)]
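The file-based encoders also rely on get_lines_in_file() and get_lines_in_file_csv(), which are likewise not shown here. A minimal sketch, assuming they only count the texts or rows to be encoded (excluding the header row for CSVs):

import csv

def get_lines_in_file(file_path: str, newline: str = None) -> int:
    # Hypothetical helper: count newline-delimited texts in a plain text file.
    with open(file_path, "r", encoding="utf-8", newline=newline) as f:
        return sum(1 for _ in f)

def get_lines_in_file_csv(file_path: str, header: bool = True) -> int:
    # Hypothetical helper: count data rows in a CSV, excluding the header if present.
    with open(file_path, "r", encoding="utf-8", newline="") as f:
        return sum(1 for _ in csv.reader(f)) - (1 if header else 0)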
Example #4
def encode_tokens_from_file(
    file_path: str,
    eos_token: str,
    tokenizer: GPT2TokenizerFast,
    newline: str,
    header: bool = True,
    batch_size: int = 1024,
) -> List[int]:
    """
    Retrieves texts from a newline-delimited file/CSV and returns texts.
    """

    is_csv = file_path.endswith(".csv")

    if is_csv:
        num_texts = get_lines_in_file_csv(file_path, header)
    else:
        num_texts = get_lines_in_file(file_path, newline)

    pbar = tqdm(
        total=num_texts,
        smoothing=0,
        leave=True,
        dynamic_ncols=True,
    )
    tokens = []

    with open(file_path, "r", encoding="utf-8", newline=newline) as f_load:

        if header:
            f_load.readline()
        if is_csv:
            f_read = csv.reader(f_load)
            logger.info(f"Encoding {num_texts:,} rows from {file_path}.")
        else:
            f_read = f_load
            logger.info(
                f"Encoding {num_texts:,} sets of tokens from {file_path}.")

        # https://stackoverflow.com/a/6335876/9314418
        while True:
            if is_csv:
                batch = [
                    text[0] + eos_token
                    for text in list(itertools.islice(f_read, batch_size))
                ]
            else:
                batch = [
                    text + eos_token
                    for text in list(itertools.islice(f_read, batch_size))
                ]

            if not batch:
                break

            tokens += list(
                itertools.chain.from_iterable(
                    tokenizer.batch_encode_plus(
                        batch, add_special_tokens=False)["input_ids"]))

            pbar.update(len(batch))

    pbar.close()
    return tokens
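A minimal usage sketch for the file-based encoder, assuming a newline-delimited plain text file (the file name corpus.txt is hypothetical) and the transformers package:

from transformers import GPT2TokenizerFast

tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
token_ids = encode_tokens_from_file(
    file_path="corpus.txt",
    eos_token=tokenizer.eos_token,
    tokenizer=tokenizer,
    newline="\n",
    header=False,  # plain text file with no header row
    batch_size=1024,
)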