import csv
import itertools
import logging
from typing import List

import numpy as np
from tqdm.auto import tqdm
from transformers import GPT2TokenizerFast

logger = logging.getLogger(__name__)


def encode_tokens_from_list(
    texts: List[str],
    eos_token: str,
    tokenizer: GPT2TokenizerFast,
    batch_size: int = 1024,
) -> List[int]:
    """
    Encodes a list of texts into a flat list of token IDs, appending
    the EOS token to each text. (List-accumulation variant.)
    """

    logger.info(f"Encoding {len(texts):,} texts.")

    pbar = tqdm(
        total=len(texts),
        smoothing=0,
        leave=True,
        dynamic_ncols=True,
    )
    tokens = []

    for i_start in range(len(texts) // batch_size + 1):
        batch = [
            text + eos_token
            for text in texts[(i_start * batch_size):((i_start * batch_size) + batch_size)]
        ]

        # Guard against an empty final batch when len(texts) divides evenly.
        if not batch:
            break

        tokens += list(
            itertools.chain.from_iterable(
                tokenizer.batch_encode_plus(batch, add_special_tokens=False)[
                    "input_ids"
                ]
            )
        )

        pbar.update(len(batch))

    pbar.close()
    return tokens
def encode_tokens_from_list(
    texts: List[str],
    eos_token: str,
    tokenizer: GPT2TokenizerFast,
    progress_bar_refresh_rate: int = 10,
    batch_size: int = 1024,
) -> np.ndarray:
    """
    Encodes a list of texts into a flat array of token IDs, appending
    the EOS token to each text. (NumPy preallocation variant: rows are
    padded with the dtype's maximum value, i.e. -1 cast to an unsigned
    dtype, and the padding is stripped when flattening.)
    """

    num_texts = len(texts)
    a_dtype = get_dtype(tokenizer.vocab_size)
    logger.info(f"Encoding {num_texts:,} texts.")

    pbar = tqdm(
        total=num_texts,
        smoothing=0,
        leave=True,
        dynamic_ncols=True,
    )
    tokens = np.full((num_texts, 1), -1, dtype=a_dtype)

    for i_start in range(num_texts // batch_size + 1):
        batch = [
            text + eos_token
            for text in texts[(i_start * batch_size):((i_start * batch_size) + batch_size)]
        ]

        # Guard against an empty final batch when num_texts divides evenly.
        if not batch:
            break

        encoded_texts = tokenizer.batch_encode_plus(
            batch,
            add_special_tokens=False,
            return_token_type_ids=False,
            # Current transformers releases use the singular kwarg;
            # older releases spelled it return_attention_masks.
            return_attention_mask=False,
        )["input_ids"]

        for i, encoded_text in enumerate(encoded_texts):
            # Widen the array whenever a text is longer than any seen so far.
            if len(encoded_text) > tokens.shape[1]:
                cols_to_add = len(encoded_text) - tokens.shape[1]
                tokens = np.concatenate(
                    (
                        tokens,
                        np.full((num_texts, cols_to_add), -1, dtype=a_dtype),
                    ),
                    axis=1,
                )
            tokens[(i_start * batch_size) + i, :len(encoded_text)] = encoded_text

        # Update the bar in coarse chunks; the count is corrected below.
        if i_start % progress_bar_refresh_rate == 0:
            pbar.update(batch_size * progress_bar_refresh_rate)

    pbar.n = num_texts
    pbar.refresh()
    pbar.close()

    tokens = tokens.flatten()
    # -1 wraps to the unsigned dtype's max value, so this drops the padding.
    return tokens[tokens < np.array(-1, dtype=a_dtype)]
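# The NumPy variants above call a get_dtype() helper that is not defined in
# this section. Below is a minimal sketch of what it presumably does, assuming
# it picks the smallest unsigned integer dtype that can hold every token ID
# while leaving the dtype's maximum value free to act as the padding sentinel
# (-1 cast to an unsigned dtype). Treat this as an illustration, not the
# canonical implementation.
def get_dtype(vocab_size: int):
    # uint16 suffices for GPT-2's 50,257-token vocabulary; reserve the
    # dtype's max value (65,535) for padding rather than a real token ID.
    if vocab_size < 2**16 - 1:
        return np.uint16
    return np.uint32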
def encode_tokens_from_file(
    file_path: str,
    eos_token: str,
    tokenizer: GPT2TokenizerFast,
    newline: str,
    header: bool = True,
    progress_bar_refresh_rate: int = 10,
    batch_size: int = 1024,
) -> np.ndarray:
    """
    Retrieves texts from a newline-delimited file/CSV and encodes them
    into a flat array of token IDs, appending the EOS token to each
    text. (NumPy preallocation variant.)
    """

    is_csv = file_path.endswith(".csv")
    a_dtype = get_dtype(tokenizer.vocab_size)

    if is_csv:
        num_texts = get_lines_in_file_csv(file_path, header)
    else:
        num_texts = get_lines_in_file(file_path, newline)

    pbar = tqdm(
        total=num_texts,
        smoothing=0,
        leave=True,
        dynamic_ncols=True,
    )
    tokens = np.full((num_texts, 1), -1, dtype=a_dtype)
    num_batches = 0

    with open(file_path, "r", encoding="utf-8", newline=newline) as f_load:
        if header:
            f_load.readline()

        if is_csv:
            f_read = csv.reader(f_load)
            logger.info(f"Encoding {num_texts:,} rows from {file_path}.")
        else:
            f_read = f_load
            logger.info(f"Encoding {num_texts:,} sets of tokens from {file_path}.")

        # https://stackoverflow.com/a/6335876/9314418
        while True:
            if is_csv:
                batch = [
                    text[0] + eos_token
                    for text in itertools.islice(f_read, batch_size)
                ]
            else:
                batch = [
                    text + eos_token
                    for text in itertools.islice(f_read, batch_size)
                ]

            if not batch:
                break

            encoded_texts = tokenizer.batch_encode_plus(
                batch,
                add_special_tokens=False,
                return_token_type_ids=False,
                # Current transformers releases use the singular kwarg;
                # older releases spelled it return_attention_masks.
                return_attention_mask=False,
            )["input_ids"]

            for i, encoded_text in enumerate(encoded_texts):
                # Widen the array whenever a text is longer than any seen so far.
                if len(encoded_text) > tokens.shape[1]:
                    cols_to_add = len(encoded_text) - tokens.shape[1]
                    tokens = np.concatenate(
                        (
                            tokens,
                            np.full((num_texts, cols_to_add), -1, dtype=a_dtype),
                        ),
                        axis=1,
                    )
                tokens[(num_batches * batch_size) + i, :len(encoded_text)] = encoded_text

            num_batches += 1

            # Update the bar in coarse chunks; the count is corrected below.
            if num_batches % progress_bar_refresh_rate == 0:
                pbar.update(batch_size * progress_bar_refresh_rate)

    pbar.n = num_texts
    pbar.refresh()
    pbar.close()

    tokens = tokens.flatten()
    # -1 wraps to the unsigned dtype's max value, so this drops the padding.
    return tokens[tokens < np.array(-1, dtype=a_dtype)]
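# The file-based variants rely on two line-counting helpers that also do not
# appear in this section. Minimal sketches follow, under the assumption that
# they simply count rows so the token array can be preallocated and the
# progress bar sized correctly; hypothetical, not the canonical versions.
def get_lines_in_file(file_path: str, newline: str = None) -> int:
    # Count newline-delimited texts in a plain text file.
    with open(file_path, "r", encoding="utf-8", newline=newline) as f:
        return sum(1 for _ in f)


def get_lines_in_file_csv(file_path: str, header: bool = True) -> int:
    # Count data rows in a CSV, skipping the header row if present.
    with open(file_path, "r", encoding="utf-8") as f:
        if header:
            f.readline()
        return sum(1 for _ in csv.reader(f))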
def encode_tokens_from_file(
    file_path: str,
    eos_token: str,
    tokenizer: GPT2TokenizerFast,
    newline: str,
    header: bool = True,
    batch_size: int = 1024,
) -> List[int]:
    """
    Retrieves texts from a newline-delimited file/CSV and encodes them
    into a flat list of token IDs, appending the EOS token to each
    text. (List-accumulation variant.)
    """

    is_csv = file_path.endswith(".csv")

    if is_csv:
        num_texts = get_lines_in_file_csv(file_path, header)
    else:
        num_texts = get_lines_in_file(file_path, newline)

    pbar = tqdm(
        total=num_texts,
        smoothing=0,
        leave=True,
        dynamic_ncols=True,
    )
    tokens = []

    with open(file_path, "r", encoding="utf-8", newline=newline) as f_load:
        if header:
            f_load.readline()

        if is_csv:
            f_read = csv.reader(f_load)
            logger.info(f"Encoding {num_texts:,} rows from {file_path}.")
        else:
            f_read = f_load
            logger.info(f"Encoding {num_texts:,} sets of tokens from {file_path}.")

        # https://stackoverflow.com/a/6335876/9314418
        while True:
            if is_csv:
                batch = [
                    text[0] + eos_token
                    for text in itertools.islice(f_read, batch_size)
                ]
            else:
                batch = [
                    text + eos_token
                    for text in itertools.islice(f_read, batch_size)
                ]

            if not batch:
                break

            tokens += list(
                itertools.chain.from_iterable(
                    tokenizer.batch_encode_plus(batch, add_special_tokens=False)[
                        "input_ids"
                    ]
                )
            )

            pbar.update(len(batch))

    pbar.close()
    return tokens
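# A short usage sketch. The file path and sample strings are placeholders for
# illustration; in practice only one variant of each function would be kept,
# since a later definition of the same name shadows the earlier one.
if __name__ == "__main__":
    tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")

    # Encode an in-memory list of texts.
    list_tokens = encode_tokens_from_list(
        texts=["Hello world.", "The quick brown fox."],
        eos_token=tokenizer.eos_token,  # "<|endoftext|>" for GPT-2
        tokenizer=tokenizer,
    )

    # Encode a newline-delimited text file (placeholder path);
    # header=False so the first text is not skipped.
    file_tokens = encode_tokens_from_file(
        file_path="input.txt",
        eos_token=tokenizer.eos_token,
        tokenizer=tokenizer,
        newline="\n",
        header=False,
    )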