# Example 1
    def score_line(self, line, vocabulary):
        """Scores a line of text.

        Start-of-sentence and end-of-sentence tags (``<s>`` and ``</s>``) will
        be inserted at the beginning and the end of the line, if they're
        missing. If the line is empty, ``None`` will be returned, instead of
        interpreting it as the empty sentence ``<s> </s>``.

        :type line: str
        :param line: a sequence of words

        :type vocabulary: Vocabulary
        :param vocabulary: vocabulary for converting the words to word IDs

        :rtype: float
        :returns: log probability of the word sequence, or None if the line is
                  empty
        """

        utterance = utterance_from_line(line)
        if not utterance:
            # Empty lines are skipped rather than scored as "<s> </s>".
            return None

        word_ids = vocabulary.words_to_ids(utterance)
        self.num_words += word_ids.size
        self.num_unks += numpy.count_nonzero(
            word_ids == vocabulary.word_to_id['<unk>'])

        class_ids = []
        probs = []
        for word_id in word_ids:
            class_ids.append(vocabulary.word_id_to_class_id[word_id])
            probs.append(vocabulary.get_word_prob(word_id))

        return self.score_sequence(word_ids, class_ids, probs)
# Example 2
    def __init__(self, input_files, vocabulary=None, count_type='int32'):
        """Reads word statistics from corpus file and creates the
        ``unigram_counts`` and ``bigram_counts`` attributes.

        Words that are not found in ``vocabulary`` are counted as ``<unk>``.
        Leaves the input files pointing to the beginning of the file.

        :type input_files: list of file or mmap objects
        :param input_files: input text files

        :type vocabulary: Vocabulary
        :param vocabulary: restrict to these words

        :type count_type: str
        :param count_type: NumPy dtype used for the count arrays

        :raises ValueError: if ``vocabulary`` is not given
        """

        # The default value is None, but the argument was dereferenced
        # unconditionally, producing an obscure AttributeError. Fail fast
        # with a clear message instead.
        if vocabulary is None:
            raise ValueError("A vocabulary is required for computing word "
                             "statistics.")

        vocabulary_size = vocabulary.num_words()
        unk_id = vocabulary.word_to_id['<unk>']

        self.unigram_counts = numpy.zeros(vocabulary_size, count_type)
        # A sparse matrix keeps the memory usage manageable, since most word
        # pairs never occur.
        self.bigram_counts = dok_matrix((vocabulary_size, vocabulary_size),
                                        dtype=count_type)

        for subset_file in input_files:
            for line in subset_file:
                # Map out-of-vocabulary words to <unk>.
                sequence = [vocabulary.word_to_id[word]
                            if word in vocabulary else unk_id
                            for word in utterance_from_line(line)]
                for word_id in sequence:
                    self.unigram_counts[word_id] += 1
                for left_word_id, right_word_id in zip(sequence[:-1],
                                                       sequence[1:]):
                    self.bigram_counts[left_word_id, right_word_id] += 1
            # Rewind so the caller can re-read the file.
            subset_file.seek(0)
# Example 3
    def score_line(self, line, vocabulary):
        """Scores a line of text.

        Start-of-sentence and end-of-sentence tags (``<s>`` and ``</s>``) will
        be inserted at the beginning and the end of the line, if they're
        missing. If the line is empty, ``None`` will be returned, instead of
        interpreting it as the empty sentence ``<s> </s>``.

        ``<unk>`` tokens will be excluded from the probability computation, if
        the constructor was given ``exclude_unk=True``. When using a shortlist,
        OOV words are always excluded, and if ``exclude_unk=True`` was given,
        OOS words are also excluded. Words with zero class membership
        probability are always excluded.

        :type line: str
        :param line: a sequence of words

        :type vocabulary: Vocabulary
        :param vocabulary: vocabulary for converting the words to word IDs

        :rtype: float
        :returns: log probability of the word sequence, or None if the line is
                  empty
        """

        words = utterance_from_line(line)
        if not words:
            # An empty line is skipped, not interpreted as "<s> </s>".
            return None

        word_ids = vocabulary.words_to_ids(words)
        unk_id = vocabulary.word_to_id['<unk>']

        # Update running statistics on processed and OOV words.
        self.num_words += word_ids.size
        self.num_unks += numpy.count_nonzero(word_ids == unk_id)

        class_ids = list(map(vocabulary.word_id_to_class_id.__getitem__,
                             word_ids))
        probs = list(map(vocabulary.get_word_prob, word_ids))

        return self.score_sequence(word_ids, class_ids, probs)
# Example 4
def compute_word_counts(input_files):
    """Computes word unigram counts using word strings.

    This method does not expect a vocabulary. Start and end of sentence
    markers are not added. Leaves the input files pointing to the beginning
    of the file.

    :type input_files: list of file or mmap objects
    :param input_files: input text files

    :rtype: dict
    :returns: a mapping from word strings to counts
    """

    result = dict()
    for subset_file in input_files:
        for line in subset_file:
            for word in utterance_from_line(line):
                # dict.get avoids the membership-test-then-index double
                # lookup of the original if/else.
                result[word] = result.get(word, 0) + 1
        # Rewind so the caller can re-read the file.
        subset_file.seek(0)
    return result
# Example 5
    def compute_probs(self, input_files):
        """Recomputes unigram class membership probabilities from text files.
        Probabilities are updated only for classes whose words occur in the
        text.

        Ensures that special tokens will always have nonzero probabilities.

        :type input_files: list of file or mmap objects
        :param input_files: input text files
        """

        counts = numpy.zeros(self.num_words(), dtype='int64')
        for subset_file in input_files:
            for line in subset_file:
                for word in utterance_from_line(line):
                    word_id = self.word_to_id.get(word)
                    if word_id is not None:
                        counts[word_id] += 1

        # Special tokens must never end up with zero probability.
        for token in ('<s>', '</s>', '<unk>'):
            token_id = self.word_to_id[token]
            counts[token_id] = max(counts[token_id], 1)

        for cls in self._word_classes:
            # A dict keyed by word ID, so duplicate entries collapse to one.
            cls_counts = {word_id: counts[word_id] for word_id, _ in cls}
            cls_total = sum(cls_counts.values())
            if cls_total > 0:
                # Membership probability proportional to occurrence count.
                for word_id, count in cls_counts.items():
                    cls.set_prob(word_id, float(count) / cls_total)
            else:
                # No occurrences; fall back to a uniform distribution.
                uniform_prob = 1.0 / len(cls)
                for word_id, _ in cls:
                    cls.set_prob(word_id, uniform_prob)
# Example 6
def _score_utterances(input_file, vocabulary, scorer, output_file,
                      log_base=None):
    """Reads utterances from ``input_file``, computes LM scores using
    ``scorer``, and writes one score per line to ``output_file``.

    Start-of-sentence and end-of-sentence tags (``<s>`` and ``</s>``) will be
    inserted at the beginning and the end of each utterance, if they're
    missing. Empty lines will be ignored, instead of interpreting them as the
    empty sentence ``<s> </s>``.

    :type input_file: file object
    :param input_file: a file that contains the input sentences in SRILM
                       n-best format

    :type vocabulary: Vocabulary
    :param vocabulary: vocabulary that provides mapping between words and word
                       IDs

    :type scorer: TextScorer
    :param scorer: a text scorer for rescoring the input sentences

    :type output_file: file object
    :param output_file: a file where to write the output n-best list in SRILM
                        format

    :type log_base: int
    :param log_base: if set to other than None, convert log probabilities to
                     this base
    """

    # Dividing a natural-log score by log(b) converts it to base b.
    log_scale = 1.0 if log_base is None else numpy.log(log_base)

    unk_id = vocabulary.word_to_id['<unk>']
    num_words = 0
    num_unks = 0
    for line_num, line in enumerate(input_file):
        words = utterance_from_line(line)
        if not words:
            # Ignore empty lines instead of scoring "<s> </s>".
            continue

        word_ids = vocabulary.words_to_ids(words)
        num_words += word_ids.size
        num_unks += numpy.count_nonzero(word_ids == unk_id)
        class_ids = [vocabulary.word_id_to_class_id[word_id]
                     for word_id in word_ids]
        probs = [vocabulary.get_word_prob(word_id)
                 for word_id in word_ids]

        lm_score = scorer.score_sequence(word_ids, class_ids, probs)
        lm_score /= log_scale
        output_file.write(str(lm_score) + '\n')

        if (line_num + 1) % 1000 == 0:
            print("{0} sentences scored.".format(line_num + 1))
            # Flush only when a progress message was actually printed; the
            # original flushed stdout after every single utterance.
            sys.stdout.flush()

    if num_words == 0:
        print("The input file contains no words.")
    else:
        # Scale the OOV ratio to an actual percentage; the original printed
        # the raw fraction next to a percent sign.
        print("{0} words processed, including start-of-sentence and "
              "end-of-sentence tags, and {1} ({2:.1f} %) out-of-vocabulary "
              "words".format(num_words, num_unks,
                             100.0 * num_unks / num_words))