def strip_token(str_, token, is_token_list=False, compat=True):
    """Returns a copy of strings with leading and trailing tokens removed.

    Note that besides :attr:`token`, all leading and trailing whitespace
    characters are also removed, and internal whitespace runs are collapsed
    to a single space character.

    If :attr:`is_token_list` is False, then the function assumes tokens in
    :attr:`str_` are separated with whitespace character.

    Args:
        str\\_: A `str`, or an `n`-D numpy array or (possibly nested)
            list of `str`.
        token (str): The token to strip, e.g., the '<PAD>' token defined in
            :class:`~texar.tf.data.SpecialTokens`.PAD
        is_token_list (bool): Whether each sentence in :attr:`str_` is a list
            of tokens. If False, each sentence in :attr:`str_` is assumed to
            contain tokens separated with space character.
        compat (bool): Whether to convert tokens into `unicode` (Python 2)
            or `str` (Python 3).

    Returns:
        The stripped strings of the same structure/shape as :attr:`str_`.

    Example:

        .. code-block:: python

            str_ = '<PAD> a sentence <PAD> <PAD>  '
            str_stripped = strip_token(str_, '<PAD>')
            # str_stripped == 'a sentence'

            str_ = ['<PAD>', 'a', 'sentence', '<PAD>', '<PAD>', '', '']
            str_stripped = strip_token(str_, '<PAD>', is_token_list=True)
            # str_stripped == 'a sentence'
    """
    def _strip_ends(s):
        # Operate on the token list so that only *leading and trailing*
        # occurrences of `token` are removed, as documented. (A plain
        # str.replace would also delete occurrences in the middle of the
        # sentence.)
        tokens = s.strip().split()
        begin = 0
        while begin < len(tokens) and tokens[begin] == token:
            begin += 1
        end = len(tokens)
        while end > begin and tokens[end - 1] == token:
            end -= 1
        return ' '.join(tokens[begin:end])

    def _recur_strip(s):
        if is_str(s):
            if token == "":
                # Nothing to strip but whitespace; normalize spacing.
                return ' '.join(s.strip().split())
            return _strip_ends(s)
        else:
            s_ = [_recur_strip(si) for si in s]
            return _maybe_list_to_array(s_, s)

    s = str_

    if compat:
        s = compat_as_text(s)

    if is_token_list:
        # Join token lists into sentences so stripping works uniformly,
        # then split back below.
        s = str_join(s, compat=False)

    strp_str = _recur_strip(s)

    if is_token_list:
        strp_str = _recur_split(strp_str, str_)

    return strp_str
def str_join(tokens, sep=' ', compat=True):
    """Concats :attr:`tokens` along the last dimension with intervening
    occurrences of :attr:`sep`.

    Args:
        tokens: An `n`-D numpy array or (possibly nested) list of `str`.
        sep (str): The string intervening between the tokens.
        compat (bool): Whether to convert tokens into `unicode` (Python 2)
            or `str` (Python 3).

    Returns:
        An `(n-1)`-D numpy array (or list) of `str`.
    """
    def _join(seq):
        # An empty sequence joins to the empty string.
        if len(seq) == 0:
            return ''
        # A sequence of strings is the innermost dimension: join it.
        if is_str(seq[0]):
            return sep.join(seq)
        # Otherwise recurse one level down and preserve the container type.
        joined = [_join(elem) for elem in seq]
        return _maybe_list_to_array(joined, seq)

    if compat:
        tokens = compat_as_text(tokens)

    return _join(tokens)
def strip_bos(str_, bos_token='<BOS>', is_token_list=False, compat=True):
    """Remove all leading BOS tokens.

    Note that besides :attr:`bos_token`, all leading and trailing whitespace
    characters are also removed, and internal whitespace runs are collapsed
    to a single space character.

    If :attr:`is_token_list` is False, then the function assumes tokens in
    :attr:`str_` are separated with whitespace character.

    Args:
        str\\_: A `str`, or an `n`-D numpy array or (possibly nested)
            list of `str`.
        bos_token (str): The BOS token. Default is '<BOS>' as defined in
            :class:`~texar.tf.data.SpecialTokens`.BOS
        is_token_list (bool): Whether each sentence in :attr:`str_` is a list
            of tokens. If False, each sentence in :attr:`str_` is assumed to
            contain tokens separated with space character.
        compat (bool): Whether to convert tokens into `unicode` (Python 2)
            or `str` (Python 3).

    Returns:
        Strings of the same structure/shape as :attr:`str_`.
    """
    def _recur_strip(s):
        if is_str(s):
            tokens = s.strip().split()
            if bos_token == '':
                return ' '.join(tokens)
            # Skip only *leading* occurrences of the BOS token, as
            # documented. (A plain str.replace would also delete BOS
            # tokens appearing later in the sentence.)
            begin = 0
            while begin < len(tokens) and tokens[begin] == bos_token:
                begin += 1
            return ' '.join(tokens[begin:])
        else:
            s_ = [_recur_strip(si) for si in s]
            return _maybe_list_to_array(s_, s)

    s = str_

    if compat:
        s = compat_as_text(s)

    if is_token_list:
        s = str_join(s, compat=False)

    strp_str = _recur_strip(s)

    if is_token_list:
        strp_str = _recur_split(strp_str, str_)

    return strp_str
def strip_eos(str_, eos_token='<EOS>', is_token_list=False, compat=True):
    """Remove the EOS token and all subsequent tokens.

    If :attr:`is_token_list` is False, then the function assumes tokens in
    :attr:`str_` are separated with whitespace character.

    Args:
        str\\_: A `str`, or an `n`-D numpy array or (possibly nested)
            list of `str`.
        eos_token (str): The EOS token. Default is '<EOS>' as defined in
            :class:`~texar.tf.data.SpecialTokens`.EOS
        is_token_list (bool): Whether each sentence in :attr:`str_` is a list
            of tokens. If False, each sentence in :attr:`str_` is assumed to
            contain tokens separated with space character.
        compat (bool): Whether to convert tokens into `unicode` (Python 2)
            or `str` (Python 3).

    Returns:
        Strings of the same structure/shape as :attr:`str_`.
    """
    def _recur_strip(s):
        if is_str(s):
            words = s.split()
            try:
                # Keep everything strictly before the first EOS token.
                cut = words.index(eos_token)
            except ValueError:
                # No EOS present: the sentence is returned untouched.
                return s
            return ' '.join(words[:cut])
        else:
            stripped = [_recur_strip(elem) for elem in s]
            return _maybe_list_to_array(stripped, s)

    s = str_

    if compat:
        s = compat_as_text(s)

    if is_token_list:
        s = str_join(s, compat=False)

    strp_str = _recur_strip(s)

    if is_token_list:
        strp_str = _recur_split(strp_str, str_)

    return strp_str
def map_ids_to_strs(ids, vocab, join=True, strip_pad='<PAD>',
                    strip_bos='<BOS>', strip_eos='<EOS>', compat=True):
    """Transforms `int` indexes to strings by mapping ids to tokens,
    concatenating tokens into sentences, and stripping special tokens, etc.

    Args:
        ids: An n-D numpy array or (possibly nested) list of `int` indexes.
        vocab: An instance of :class:`~texar.tf.data.Vocab`.
        join (bool): Whether to concat along the last dimension of the
            the tokens into a string separated with a space character.
        strip_pad (str): The PAD token to strip from the strings (i.e.,
            remove the leading and trailing PAD tokens of the strings).
            Default is '<PAD>' as defined in
            :class:`~texar.tf.data.SpecialTokens`.PAD.
            Set to `None` or `False` to disable the stripping.
        strip_bos (str): The BOS token to strip from the strings (i.e.,
            remove the leading BOS tokens of the strings).
            Default is '<BOS>' as defined in
            :class:`~texar.tf.data.SpecialTokens`.BOS.
            Set to `None` or `False` to disable the stripping.
        strip_eos (str): The EOS token to strip from the strings (i.e.,
            remove the EOS tokens and all subsequent tokens of the strings).
            Default is '<EOS>' as defined in
            :class:`~texar.tf.data.SpecialTokens`.EOS.
            Set to `None` or `False` to disable the stripping.
        compat (bool): Whether to convert tokens into `unicode` (Python 2)
            or `str` (Python 3).

    Returns:
        If :attr:`join` is True, returns a `(n-1)`-D numpy array (or list) of
        concatenated strings. If :attr:`join` is False, returns an `n`-D numpy
        array (or list) of str tokens.

    Example:

        .. code-block:: python

            text_ids = [[1, 9, 6, 2, 0, 0], [1, 28, 7, 8, 2, 0]]

            text = map_ids_to_strs(text_ids, data.vocab)
            # text == ['a sentence', 'parsed from ids']

            text = map_ids_to_strs(
                text_ids, data.vocab, join=False,
                strip_pad=None, strip_bos=None, strip_eos=None)
            # text == [['<BOS>', 'a', 'sentence', '<EOS>', '<PAD>', '<PAD>'],
            #          ['<BOS>', 'parsed', 'from', 'ids', '<EOS>', '<PAD>']]
    """
    tokens = vocab.map_ids_to_tokens_py(ids)
    if isinstance(ids, (list, tuple)):
        # map_ids_to_tokens_py returns a numpy array; mirror the input
        # container type when the caller passed a list/tuple.
        tokens = tokens.tolist()

    if compat:
        tokens = compat_as_text(tokens)

    # Join tokens into sentences, then strip special tokens from the
    # joined form (conversion was already handled above).
    sentences = str_join(tokens, compat=False)
    sentences = strip_special_tokens(
        sentences, strip_pad=strip_pad, strip_bos=strip_bos,
        strip_eos=strip_eos, compat=False)

    if join:
        return sentences
    # Split sentences back into token structure matching `ids`.
    return _recur_split(sentences, ids)
def strip_special_tokens(str_, strip_pad='<PAD>', strip_bos='<BOS>',
                         strip_eos='<EOS>', is_token_list=False, compat=True):
    """Removes special tokens in strings, including:

        - Removes EOS and all subsequent tokens
        - Removes leading and and trailing PAD tokens
        - Removes leading BOS tokens

    Note that besides the special tokens, all leading and trailing
    whitespace characters are also removed.

    This is a joint function of :func:`strip_eos`, :func:`strip_pad`, and
    :func:`strip_bos`

    Args:
        str\\_: A `str`, or an `n`-D numpy array or (possibly nested)
            list of `str`.
        strip_pad (str): The PAD token to strip from the strings (i.e.,
            remove the leading and trailing PAD tokens of the strings).
            Default is '<PAD>' as defined in
            :class:`~texar.tf.data.SpecialTokens`.PAD.
            Set to `None` or `False` to disable the stripping.
        strip_bos (str): The BOS token to strip from the strings (i.e.,
            remove the leading BOS tokens of the strings).
            Default is '<BOS>' as defined in
            :class:`~texar.tf.data.SpecialTokens`.BOS.
            Set to `None` or `False` to disable the stripping.
        strip_eos (str): The EOS token to strip from the strings (i.e.,
            remove the EOS tokens and all subsequent tokens of the strings).
            Default is '<EOS>' as defined in
            :class:`~texar.tf.data.SpecialTokens`.EOS.
            Set to `None` or `False` to disable the stripping.
        is_token_list (bool): Whether each sentence in :attr:`str_` is a list
            of tokens. If False, each sentence in :attr:`str_` is assumed to
            contain tokens separated with space character.
        compat (bool): Whether to convert tokens into `unicode` (Python 2)
            or `str` (Python 3).

    Returns:
        Strings of the same shape of :attr:`str_` with special tokens
        stripped.
    """
    def _enabled(token):
        # `None` and `False` both mean "skip this stripping step".
        return not (token is None or token is False)

    s = compat_as_text(str_) if compat else str_

    if is_token_list:
        # Work on joined sentences; split back at the end.
        s = str_join(s, compat=False)

    # Order matters: EOS first (drops trailing content), then PAD, then BOS.
    if _enabled(strip_eos):
        s = _strip_eos_(s, strip_eos, is_token_list=False, compat=False)
    if _enabled(strip_pad):
        s = strip_token(s, strip_pad, is_token_list=False, compat=False)
    if _enabled(strip_bos):
        s = _strip_bos_(s, strip_bos, is_token_list=False, compat=False)

    if is_token_list:
        s = _recur_split(s, str_)

    return s
def corpus_bleu_moses(list_of_references, hypotheses,
                      lowercase=False, return_all=False):
    """Calculates corpus-level BLEU score using the
    **MOSES multi-bleu.perl** script.

    Args:
        list_of_references: A list of lists of references for each hypothesis.
            Each reference can be either a string, or a list of string tokens.
            List can also be numpy array.
        hypotheses: A list of hypothesis sentences. Each hypothesis can be
            either a string, or a list of string tokens. List can also be
            numpy array.
        lowercase (bool): If `True`, pass the "-lc" flag to the multi-bleu
            script.
        return_all (bool): If `True`, returns BLEU and all n-gram precisions.

    Returns:
        If :attr:`return_all` is `False` (default), returns a float32
        BLEU score.

        If :attr:`return_all` is `True`, returns a list of 5 float32 scores:
        `[BLEU, 1-gram precision, ..., 4-gram precision]`.
    """
    list_of_references = compat_as_text(list_of_references)
    hypotheses = compat_as_text(hypotheses)

    if np.size(hypotheses) == 0:
        return np.float32(0.)   # pylint: disable=no-member

    # Get multi-bleu.perl (shipped under <repo>/bin/utils/).
    cur_dir = os.path.dirname(os.path.realpath(__file__))
    multi_bleu_path = os.path.abspath(
        os.path.join(cur_dir, "..", "..", "..", "bin", "utils",
                     "multi-bleu.perl"))

    # Create a temporary folder containing hypothesis and reference files.
    result_path = tempfile.mkdtemp()
    try:
        # Create hypothesis file, one sentence per line.
        hfile_path = os.path.join(result_path, 'hyp')
        hyps = [_maybe_list_to_str(h) for h in hypotheses]
        with open(hfile_path, 'w', encoding='utf-8') as hfile:
            text = "\n".join(hyps)
            hfile.write(text)
            hfile.write("\n")

        # Create reference files `ref0`, `ref1`, ... — one file per
        # reference slot, line-aligned with the hypothesis file.
        max_nrefs = max([len(refs) for refs in list_of_references])
        rfile_path = os.path.join(result_path, 'ref')
        for rid in range(max_nrefs):
            with open(rfile_path + '%d' % rid, 'w',
                      encoding='utf-8') as rfile:
                for refs in list_of_references:
                    if rid < len(refs):
                        ref = _maybe_list_to_str(refs[rid])
                        rfile.write(ref + "\n")
                    else:
                        # Empty line keeps alignment when this hypothesis
                        # has fewer references than max_nrefs.
                        rfile.write("\n")

        # Calculate BLEU: hypotheses are piped to the script's stdin.
        multi_bleu_cmd = [multi_bleu_path]
        if lowercase:
            multi_bleu_cmd += ["-lc"]
        multi_bleu_cmd += [rfile_path]
        # Read back with the same encoding the file was written with;
        # relying on the platform default can break on non-ASCII text.
        with open(hfile_path, "r", encoding='utf-8') as hyp_input:
            try:
                multi_bleu_ret = subprocess.check_output(
                    multi_bleu_cmd, stdin=hyp_input,
                    stderr=subprocess.STDOUT)
                multi_bleu_ret = multi_bleu_ret.decode("utf-8")
                bleu_score = _parse_multi_bleu_ret(multi_bleu_ret, return_all)
            except subprocess.CalledProcessError as error:
                if error.output is not None:
                    tf.logging.warning(
                        "multi-bleu.perl returned non-zero exit code")
                    tf.logging.warning(error.output)
                if return_all:
                    bleu_score = [np.float32(0.0)] * 5
                else:
                    bleu_score = np.float32(0.0)
    finally:
        # Always remove the temp folder, even when an exception is raised
        # above — the original code leaked it on any failure path.
        shutil.rmtree(result_path)

    return np.float32(bleu_score)
def corpus_bleu(list_of_references, hypotheses, max_order=4,
                lowercase=False, smooth=False, return_all=True):
    """Computes corpus-level BLEU score.

    Args:
        list_of_references: A list of lists of references for each hypothesis.
            Each reference can be either a list of string tokens, or a string
            containing tokenized tokens separated with whitespaces.
            List can also be numpy array.
        hypotheses: A list of hypothesis sentences. Each hypothesis can be
            either a list of string tokens, or a string containing tokenized
            tokens separated with whitespaces. List can also be numpy array.
        lowercase (bool): If `True`, lowercase reference and hypothesis
            tokens.
        max_order (int): Maximum n-gram order to use when computing
            BLEU score.
        smooth (bool): Whether or not to apply (Lin et al. 2004) smoothing.
        return_all (bool): If `True`, returns BLEU and all
            n-gram precisions.

    Returns:
        If :attr:`return_all` is `False`, returns a float32 BLEU score.

        If :attr:`return_all` is `True` (default), returns a list of float32
        scores: `[BLEU] + n-gram precisions`, which is of length
        :attr:`max_order` + 1.
    """
    list_of_references = compat_as_text(list_of_references)
    hypotheses = compat_as_text(hypotheses)

    matches_by_order = [0] * max_order
    possible_matches_by_order = [0] * max_order

    reference_length = 0
    hypothesis_length = 0
    for (references, hypothesis) in zip(list_of_references, hypotheses):
        # Tokenize (and lowercase) first so that lengths below are measured
        # in *tokens*. The original computed lengths on the raw inputs, which
        # counted characters for string inputs and made the brevity penalty
        # inconsistent with the token-based n-gram counts.
        references = [_maybe_str_to_list(r) for r in references]
        if lowercase:
            references = [_lowercase(r) for r in references]
        hypothesis = _maybe_str_to_list(hypothesis)
        if lowercase:
            hypothesis = _lowercase(hypothesis)

        # Closest-reference length convention: use the shortest reference.
        reference_length += min(len(r) for r in references)
        hypothesis_length += len(hypothesis)

        # Clipped n-gram counts: union (max) of all references' counts.
        merged_ref_ngram_counts = collections.Counter()
        for reference in references:
            merged_ref_ngram_counts |= _get_ngrams(reference, max_order)
        hypothesis_ngram_counts = _get_ngrams(hypothesis, max_order)

        overlap = hypothesis_ngram_counts & merged_ref_ngram_counts
        for ngram in overlap:
            matches_by_order[len(ngram) - 1] += overlap[ngram]
        for order in range(1, max_order + 1):
            possible_matches = len(hypothesis) - order + 1
            if possible_matches > 0:
                possible_matches_by_order[order - 1] += possible_matches

    precisions = [0] * max_order
    for i in range(0, max_order):
        if smooth:
            # Add-one smoothing (Lin et al. 2004).
            precisions[i] = ((matches_by_order[i] + 1.) /
                             (possible_matches_by_order[i] + 1.))
        elif possible_matches_by_order[i] > 0:
            precisions[i] = (float(matches_by_order[i]) /
                             possible_matches_by_order[i])
        else:
            precisions[i] = 0.0

    # Geometric mean of precisions; zero if any precision is zero.
    if min(precisions) > 0:
        p_log_sum = sum((1. / max_order) * math.log(p) for p in precisions)
        geo_mean = math.exp(p_log_sum)
    else:
        geo_mean = 0

    # Brevity penalty.
    ratio = float(hypothesis_length) / reference_length
    if ratio > 1.0:
        bp = 1.
    else:
        try:
            bp = math.exp(1 - 1. / ratio)
        except ZeroDivisionError:
            # Empty hypotheses: nudge the ratio to avoid division by zero.
            bp = math.exp(1 - 1. / (ratio + 1e-8))

    bleu = geo_mean * bp

    if return_all:
        return [bleu * 100] + [p * 100 for p in precisions]
    return bleu * 100