Example #1
def strip_token(str_, token, is_token_list=False, compat=True):
    """Returns a copy of strings with leading and trailing tokens removed.

    Note that besides :attr:`token`, all leading and trailing whitespace
    characters are also removed.

    If :attr:`is_token_list` is False, then the function assumes tokens in
    :attr:`str_` are separated with whitespace characters.

    Args:
        str\_: A `str`, or an `n`-D numpy array or (possibly nested)
            list of `str`.
        token (str): The token to strip, e.g., the '<PAD>' token defined in
            :class:`~texar.tf.data.SpecialTokens`.PAD
        is_token_list (bool): Whether each sentence in :attr:`str_` is a list
            of tokens. If False, each sentence in :attr:`str_` is assumed to
            contain tokens separated with space characters.
        compat (bool): Whether to convert tokens into `unicode` (Python 2)
            or `str` (Python 3).

    Returns:
        The stripped strings of the same structure/shape as :attr:`str_`.

    Example:

        .. code-block:: python

            str_ = '<PAD> a sentence <PAD> <PAD>  '
            str_stripped = strip_token(str_, '<PAD>')
            # str_stripped == 'a sentence'

            str_ = ['<PAD>', 'a', 'sentence', '<PAD>', '<PAD>', '', '']
            str_stripped = strip_token(str_, '<PAD>', is_token_list=True)
            # str_stripped == ['a', 'sentence']
    """
    def _recur_strip(s):
        if is_str(s):
            if token == "":
                return ' '.join(s.strip().split())
            else:
                return ' '.join(s.strip().split()).\
                    replace(' '+token, '').replace(token+' ', '')
        else:
            s_ = [_recur_strip(si) for si in s]
            return _maybe_list_to_array(s_, s)

    s = str_

    if compat:
        s = compat_as_text(s)

    if is_token_list:
        s = str_join(s, compat=False)

    strp_str = _recur_strip(s)

    if is_token_list:
        strp_str = _recur_split(strp_str, str_)

    return strp_str
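
Since :attr:`str_` may be arbitrarily nested, here is a minimal sketch of the
recursive case (hypothetical inputs; assumes strip_token above is importable):

nested = [['<PAD> a sentence <PAD>', 'another one <PAD>'],
          ['<PAD> third <PAD> <PAD>']]
stripped = strip_token(nested, '<PAD>')
# stripped == [['a sentence', 'another one'], ['third']]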
Example #2
def str_join(tokens, sep=' ', compat=True):
    """Concats :attr:`tokens` along the last dimension with intervening
    occurrences of :attr:`sep`.

    Args:
        tokens: An `n`-D numpy array or (possibly nested) list of `str`.
        sep (str): The string intervening between the tokens.
        compat (bool): Whether to convert tokens into `unicode` (Python 2)
            or `str` (Python 3).

    Returns:
        An `(n-1)`-D numpy array (or list) of `str`.
    """
    def _recur_join(s):
        if len(s) == 0:
            return ''
        elif is_str(s[0]):
            return sep.join(s)
        else:
            s_ = [_recur_join(si) for si in s]
            return _maybe_list_to_array(s_, s)

    if compat:
        tokens = compat_as_text(tokens)

    str_ = _recur_join(tokens)

    return str_
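
A minimal usage sketch for str_join (hypothetical inputs; sep defaults to a
single space):

tokens = [['a', 'sentence'], ['another', 'one']]
joined = str_join(tokens)
# joined == ['a sentence', 'another one']
joined_tab = str_join(tokens, sep='\t')
# joined_tab == ['a\tsentence', 'another\tone']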
Example #3
def strip_bos(str_, bos_token='<BOS>', is_token_list=False, compat=True):
    """Remove all leading BOS tokens.

    Note that besides :attr:`bos_token`, all leading and trailing whitespace
    characters are also removed.

    If :attr:`is_token_list` is False, then the function assumes tokens in
    :attr:`str_` are separated with whitespace characters.

    Args:
        str\_: A `str`, or an `n`-D numpy array or (possibly nested)
            list of `str`.
        bos_token (str): The BOS token. Default is '<BOS>' as defined in
            :class:`~texar.tf.data.SpecialTokens`.BOS
        is_token_list (bool): Whether each sentence in :attr:`str_` is a list
            of tokens. If False, each sentence in :attr:`str_` is assumed to
            contain tokens separated with space characters.
        compat (bool): Whether to convert tokens into `unicode` (Python 2)
            or `str` (Python 3).

    Returns:
        Strings of the same structure/shape as :attr:`str_`.
    """
    def _recur_strip(s):
        if is_str(s):
            if bos_token == '':
                return ' '.join(s.strip().split())
            else:
                return ' '.join(s.strip().split()).replace(bos_token + ' ', '')
        else:
            s_ = [_recur_strip(si) for si in s]
            return _maybe_list_to_array(s_, s)

    s = str_

    if compat:
        s = compat_as_text(s)

    if is_token_list:
        s = str_join(s, compat=False)

    strp_str = _recur_strip(s)

    if is_token_list:
        strp_str = _recur_split(strp_str, str_)

    return strp_str
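
A minimal usage sketch for strip_bos (hypothetical inputs):

str_ = '<BOS> a sentence'
str_stripped = strip_bos(str_)
# str_stripped == 'a sentence'

str_ = ['<BOS>', 'a', 'sentence']
str_stripped = strip_bos(str_, is_token_list=True)
# str_stripped == ['a', 'sentence']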
Example #4
def strip_eos(str_, eos_token='<EOS>', is_token_list=False, compat=True):
    """Remove the EOS token and all subsequent tokens.

    If :attr:`is_token_list` is False, then the function assumes tokens in
    :attr:`str_` are separated with whitespace characters.

    Args:
        str\_: A `str`, or an `n`-D numpy array or (possibly nested)
            list of `str`.
        eos_token (str): The EOS token. Default is '<EOS>' as defined in
            :class:`~texar.tf.data.SpecialTokens`.EOS
        is_token_list (bool): Whether each sentence in :attr:`str_` is a list
            of tokens. If False, each sentence in :attr:`str_` is assumed to
            contain tokens separated with space characters.
        compat (bool): Whether to convert tokens into `unicode` (Python 2)
            or `str` (Python 3).

    Returns:
        Strings of the same structure/shape as :attr:`str_`.
    """
    def _recur_strip(s):
        if is_str(s):
            s_tokens = s.split()
            if eos_token in s_tokens:
                return ' '.join(s_tokens[:s_tokens.index(eos_token)])
            else:
                return s
        else:
            s_ = [_recur_strip(si) for si in s]
            return _maybe_list_to_array(s_, s)

    s = str_

    if compat:
        s = compat_as_text(s)

    if is_token_list:
        s = str_join(s, compat=False)

    strp_str = _recur_strip(s)

    if is_token_list:
        strp_str = _recur_split(strp_str, str_)

    return strp_str
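
A minimal usage sketch for strip_eos (hypothetical input); note that tokens
after the first EOS are dropped as well:

str_ = 'a sentence <EOS> with trailing tokens'
str_stripped = strip_eos(str_)
# str_stripped == 'a sentence'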
Example #5
def map_ids_to_strs(ids,
                    vocab,
                    join=True,
                    strip_pad='<PAD>',
                    strip_bos='<BOS>',
                    strip_eos='<EOS>',
                    compat=True):
    """Transforms `int` indexes to strings by mapping ids to tokens,
    concatenating tokens into sentences, and stripping special tokens, etc.

    Args:
        ids: An n-D numpy array or (possibly nested) list of `int` indexes.
        vocab: An instance of :class:`~texar.tf.data.Vocab`.
        join (bool): Whether to concatenate the tokens along the last
            dimension into a string separated with a space character.
        strip_pad (str): The PAD token to strip from the strings (i.e., remove
            the leading and trailing PAD tokens of the strings). Default
            is '<PAD>' as defined in
            :class:`~texar.tf.data.SpecialTokens`.PAD.
            Set to `None` or `False` to disable the stripping.
        strip_bos (str): The BOS token to strip from the strings (i.e., remove
            the leading BOS tokens of the strings).
            Default is '<BOS>' as defined in
            :class:`~texar.tf.data.SpecialTokens`.BOS.
            Set to `None` or `False` to disable the stripping.
        strip_eos (str): The EOS token to strip from the strings (i.e., remove
            the EOS tokens and all subsequent tokens of the strings).
            Default is '<EOS>' as defined in
            :class:`~texar.tf.data.SpecialTokens`.EOS.
            Set to `None` or `False` to disable the stripping.
        compat (bool): Whether to convert tokens into `unicode` (Python 2)
            or `str` (Python 3).

    Returns:
        If :attr:`join` is True, returns a `(n-1)`-D numpy array (or list) of
        concatenated strings. If :attr:`join` is False, returns an `n`-D numpy
        array (or list) of str tokens.

    Example:

        .. code-block:: python

            text_ids = [[1, 9, 6, 2, 0, 0], [1, 28, 7, 8, 2, 0]]

            text = map_ids_to_strs(text_ids, data.vocab)
            # text == ['a sentence', 'parsed from ids']

            text = map_ids_to_strs(
                text_ids, data.vocab, join=False,
                strip_pad=None, strip_bos=None, strip_eos=None)
            # text == [['<BOS>', 'a', 'sentence', '<EOS>', '<PAD>', '<PAD>'],
            #          ['<BOS>', 'parsed', 'from', 'ids', '<EOS>', '<PAD>']]
    """
    tokens = vocab.map_ids_to_tokens_py(ids)
    if isinstance(ids, (list, tuple)):
        tokens = tokens.tolist()

    if compat:
        tokens = compat_as_text(tokens)

    str_ = str_join(tokens, compat=False)

    str_ = strip_special_tokens(str_,
                                strip_pad=strip_pad,
                                strip_bos=strip_bos,
                                strip_eos=strip_eos,
                                compat=False)

    if join:
        return str_
    else:
        return _recur_split(str_, ids)
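
A self-contained sketch of the id-to-string pipeline. `ToyVocab` below is a
hypothetical stand-in for :class:`~texar.tf.data.Vocab` that exposes only the
`map_ids_to_tokens_py` method this function relies on:

import numpy as np

class ToyVocab:  # hypothetical stand-in for texar.tf.data.Vocab
    def __init__(self, id_to_token):
        self._id_to_token = id_to_token

    def map_ids_to_tokens_py(self, ids):
        # object dtype avoids numpy's fixed-width string truncation
        return np.vectorize(self._id_to_token.get, otypes=[object])(ids)

vocab = ToyVocab({0: '<PAD>', 1: '<BOS>', 2: '<EOS>', 6: 'sentence', 9: 'a'})
text = map_ids_to_strs([[1, 9, 6, 2, 0, 0]], vocab)
# text == ['a sentence']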
Example #6
def strip_special_tokens(str_,
                         strip_pad='<PAD>',
                         strip_bos='<BOS>',
                         strip_eos='<EOS>',
                         is_token_list=False,
                         compat=True):
    """Removes special tokens in strings, including:

        - Removes EOS and all subsequent tokens
        - Removes leading and trailing PAD tokens
        - Removes leading BOS tokens

    Note that besides the special tokens, all leading and trailing whitespace
    characters are also removed.

    This is a joint function of :func:`strip_eos`, :func:`strip_token`, and
    :func:`strip_bos`.

    Args:
        str\_: A `str`, or an `n`-D numpy array or (possibly nested)
            list of `str`.
        strip_pad (str): The PAD token to strip from the strings (i.e., remove
            the leading and trailing PAD tokens of the strings). Default
            is '<PAD>' as defined in
            :class:`~texar.tf.data.SpecialTokens`.PAD.
            Set to `None` or `False` to disable the stripping.
        strip_bos (str): The BOS token to strip from the strings (i.e., remove
            the leading BOS tokens of the strings).
            Default is '<BOS>' as defined in
            :class:`~texar.tf.data.SpecialTokens`.BOS.
            Set to `None` or `False` to disable the stripping.
        strip_eos (str): The EOS token to strip from the strings (i.e., remove
            the EOS tokens and all subsequent tokens of the strings).
            Default is '<EOS>' as defined in
            :class:`~texar.tf.data.SpecialTokens`.EOS.
            Set to `None` or `False` to disable the stripping.
        is_token_list (bool): Whether each sentence in :attr:`str_` is a list
            of tokens. If False, each sentence in :attr:`str_` is assumed to
            contain tokens separated with space characters.
        compat (bool): Whether to convert tokens into `unicode` (Python 2)
            or `str` (Python 3).

    Returns:
        Strings of the same structure/shape as :attr:`str_` with special
        tokens stripped.
    """
    s = str_

    if compat:
        s = compat_as_text(s)

    if is_token_list:
        s = str_join(s, compat=False)

    if strip_eos is not None and strip_eos is not False:
        s = _strip_eos_(s, strip_eos, is_token_list=False, compat=False)

    if strip_pad is not None and strip_pad is not False:
        s = strip_token(s, strip_pad, is_token_list=False, compat=False)

    if strip_bos is not None and strip_bos is not False:
        s = _strip_bos_(s, strip_bos, is_token_list=False, compat=False)

    if is_token_list:
        s = _recur_split(s, str_)

    return s
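
A minimal usage sketch combining the three stripping steps (hypothetical
input):

str_ = ['<BOS> a sentence <EOS> <PAD> <PAD>']
str_stripped = strip_special_tokens(str_)
# str_stripped == ['a sentence']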
Example #7
def corpus_bleu_moses(list_of_references,
                      hypotheses,
                      lowercase=False,
                      return_all=False):
    """Calculates corpus-level BLEU score using the
    **MOSES multi-bleu.perl** script.

    Args:
        list_of_references: A list of lists of references for each hypothesis.
            Each reference can be either a string, or a list of string tokens.
            The list can also be a numpy array.
        hypotheses: A list of hypothesis sentences.
            Each hypothesis can be either a string, or a list of string
            tokens. The list can also be a numpy array.
        lowercase (bool): If `True`, pass the "-lc" flag to the multi-bleu
            script.
        return_all (bool): If `True`, returns BLEU and all n-gram precisions.

    Returns:
        If :attr:`return_all` is `False` (default), returns a float32
        BLEU score.

        If :attr:`return_all` is `True`, returns a list of 5 float32 scores:
        `[BLEU, 1-gram precision, ..., 4-gram precision]`.
    """
    list_of_references = compat_as_text(list_of_references)
    hypotheses = compat_as_text(hypotheses)

    if np.size(hypotheses) == 0:
        return np.float32(0.)  # pylint: disable=no-member

    # Get multi-bleu.perl
    cur_dir = os.path.dirname(os.path.realpath(__file__))
    multi_bleu_path = os.path.abspath(
        os.path.join(cur_dir, "..", "..", "..", "bin", "utils",
                     "multi-bleu.perl"))

    # Create a temporary folder containing hypothesis and reference files
    result_path = tempfile.mkdtemp()
    # Create the hypothesis file
    hfile_path = os.path.join(result_path, 'hyp')
    hyps = [_maybe_list_to_str(h) for h in hypotheses]
    with open(hfile_path, 'w', encoding='utf-8') as hfile:
        text = "\n".join(hyps)
        hfile.write(text)
        hfile.write("\n")
    # Create reference files
    max_nrefs = max([len(refs) for refs in list_of_references])
    rfile_path = os.path.join(result_path, 'ref')
    for rid in range(max_nrefs):
        with open(rfile_path + '%d' % rid, 'w', encoding='utf-8') as rfile:
            for refs in list_of_references:
                if rid < len(refs):
                    ref = _maybe_list_to_str(refs[rid])
                    rfile.write(ref + "\n")
                else:
                    rfile.write("\n")

    # Calculate BLEU
    multi_bleu_cmd = [multi_bleu_path]
    if lowercase:
        multi_bleu_cmd += ["-lc"]
    multi_bleu_cmd += [rfile_path]
    with open(hfile_path, "r") as hyp_input:
        try:
            multi_bleu_ret = subprocess.check_output(multi_bleu_cmd,
                                                     stdin=hyp_input,
                                                     stderr=subprocess.STDOUT)
            multi_bleu_ret = multi_bleu_ret.decode("utf-8")
            bleu_score = _parse_multi_bleu_ret(multi_bleu_ret, return_all)
        except subprocess.CalledProcessError as error:
            if error.output is not None:
                tf.logging.warning(
                    "multi-bleu.perl returned non-zero exit code")
                tf.logging.warning(error.output)
            if return_all:
                bleu_score = [np.float32(0.0)] * 5
            else:
                bleu_score = np.float32(0.0)

    shutil.rmtree(result_path)

    return np.float32(bleu_score)
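
A hedged usage sketch (hypothetical inputs; an actual run requires Perl and
the bundled multi-bleu.perl script at the path resolved above):

hyps = ['the cat is on the mat']
refs = [['the cat is on the mat', 'there is a cat on the mat']]
bleu = corpus_bleu_moses(refs, hyps)
# bleu ~= 100.0, since the hypothesis exactly matches the first reference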
Example #8
def corpus_bleu(list_of_references,
                hypotheses,
                max_order=4,
                lowercase=False,
                smooth=False,
                return_all=True):
    """Computes corpus-level BLEU score.

    Args:
        list_of_references: A list of lists of references for each hypothesis.
            Each reference can be either a list of string tokens, or a string
            containing tokenized tokens separated with whitespace.
            The list can also be a numpy array.
        hypotheses: A list of hypothesis sentences.
            Each hypothesis can be either a list of string tokens, or a
            string containing tokenized tokens separated with whitespace.
            The list can also be a numpy array.
        max_order (int): Maximum n-gram order to use when computing BLEU score.
        lowercase (bool): If `True`, lowercase reference and hypothesis tokens.
        smooth (bool): Whether or not to apply (Lin et al. 2004) smoothing.
        return_all (bool): If `True`, returns BLEU and all n-gram precisions.

    Returns:
        If :attr:`return_all` is `False`, returns a float32 BLEU score.

        If :attr:`return_all` is `True` (default), returns a list of float32
        scores: `[BLEU] + n-gram precisions`, which is of length
        :attr:`max_order` + 1.
    """
    list_of_references = compat_as_text(list_of_references)
    hypotheses = compat_as_text(hypotheses)

    matches_by_order = [0] * max_order
    possible_matches_by_order = [0] * max_order
    reference_length = 0
    hypothesis_length = 0
    for (references, hypothesis) in zip(list_of_references, hypotheses):
        # Tokenize (and optionally lowercase) before measuring lengths, so
        # that string inputs are counted in tokens rather than characters
        references = [_maybe_str_to_list(r) for r in references]
        hypothesis = _maybe_str_to_list(hypothesis)
        if lowercase:
            references = [_lowercase(r) for r in references]
            hypothesis = _lowercase(hypothesis)

        reference_length += min(len(r) for r in references)
        hypothesis_length += len(hypothesis)

        merged_ref_ngram_counts = collections.Counter()
        for reference in references:
            merged_ref_ngram_counts |= _get_ngrams(reference, max_order)

        hypothesis_ngram_counts = _get_ngrams(hypothesis, max_order)

        overlap = hypothesis_ngram_counts & merged_ref_ngram_counts
        for ngram in overlap:
            matches_by_order[len(ngram) - 1] += overlap[ngram]
        for order in range(1, max_order + 1):
            possible_matches = len(hypothesis) - order + 1
            if possible_matches > 0:
                possible_matches_by_order[order - 1] += possible_matches

    precisions = [0] * max_order
    for i in range(0, max_order):
        if smooth:
            precisions[i] = ((matches_by_order[i] + 1.) /
                             (possible_matches_by_order[i] + 1.))
        else:
            if possible_matches_by_order[i] > 0:
                precisions[i] = (float(matches_by_order[i]) /
                                 possible_matches_by_order[i])
            else:
                precisions[i] = 0.0

    if min(precisions) > 0:
        p_log_sum = sum((1. / max_order) * math.log(p) for p in precisions)
        geo_mean = math.exp(p_log_sum)
    else:
        geo_mean = 0

    ratio = float(hypothesis_length) / reference_length

    if ratio > 1.0:
        bp = 1.
    else:
        try:
            bp = math.exp(1 - 1. / ratio)
        except ZeroDivisionError:
            bp = math.exp(1 - 1. / (ratio + 1e-8))

    bleu = geo_mean * bp

    if return_all:
        return [bleu * 100] + [p * 100 for p in precisions]
    else:
        return bleu * 100
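
A worked sketch (hypothetical inputs): for an exact match, every n-gram
precision is 1.0 and the brevity penalty is 1, so BLEU = 100:

hyps = [['the', 'cat', 'sat', 'on', 'the', 'mat']]
refs = [[['the', 'cat', 'sat', 'on', 'the', 'mat']]]
scores = corpus_bleu(refs, hyps)
# scores == [100.0, 100.0, 100.0, 100.0, 100.0]
#            [BLEU, 1-gram, 2-gram, 3-gram, 4-gram precisions]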