Example #1
    def get_transcript_text(transcript: str) -> str:
        """
        Get preprocessed transcript as string

        :param transcript: Transcript string

        :return: Preprocessed transcript
        """
        return preprocess_string(transcript)
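A minimal usage sketch (hypothetical: it assumes preprocess_string lowercases the text and strips punctuation, which is not shown in these examples):

    from string import punctuation

    def preprocess_string(text: str) -> str:
        # Stand-in for the project's preprocess_string, for illustration only:
        # lowercase the text and drop punctuation characters.
        return "".join(c for c in text.lower() if c not in punctuation)

    def get_transcript_text(transcript: str) -> str:
        return preprocess_string(transcript)

    print(get_transcript_text("Hello, World!"))  # -> "hello world"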
Example #2
    def test_preprocess_string(self, input_data: str,
                               expected_output: str) -> None:
        """
        Tests preprocess_string function's behaviour.

        :param input_data:       Input string
        :param expected_output:  Expected output string

        :return: None
        """
        self.assertEqual(expected_output, preprocess_string(input_data))
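The (self, input_data, expected_output) signature suggests the test is driven by a parameterization decorator. A hypothetical, self-contained sketch using the third-party parameterized package and a stand-in preprocess_string (both assumptions; the expected values are illustrative only):

    import unittest
    from string import punctuation

    from parameterized import parameterized  # assumed helper; the original decorator is not shown

    def preprocess_string(text: str) -> str:
        # Stand-in implementation, for illustration only.
        return "".join(c for c in text.lower() if c not in punctuation)

    class TestPreprocessString(unittest.TestCase):

        @parameterized.expand([
            ("Hello, World!", "hello world"),
            ("already clean", "already clean"),
        ])
        def test_preprocess_string(self, input_data: str,
                                    expected_output: str) -> None:
            self.assertEqual(expected_output, preprocess_string(input_data))

    if __name__ == "__main__":
        unittest.main()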
Example #3
    def get_google_text(google_words: List[dict]) -> str:
        """
        Get a complete text out of Google output.

        :param google_words: List of Google output words, taken directly from the JSON response.

        :return: Google output as string
        """
        google_word_list = [
            w["word"] for w in google_words
            if not all(c in punctuation for c in w["word"])
        ]
        return preprocess_string(" ".join(google_word_list))
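A hypothetical illustration of the filter above: tokens consisting only of punctuation are dropped before the remaining words are joined and preprocessed:

    # Illustrative input shape only; real entries also carry startTime/endTime/confidence.
    words = [
        {"word": "Hello"},
        {"word": ","},      # punctuation-only token, filtered out
        {"word": "world"},
        {"word": "!"},      # filtered out
    ]
    # get_google_text(words) joins the surviving words into "Hello world" and returns
    # preprocess_string("Hello world"), e.g. "hello world" if preprocess_string
    # lowercases its input (an assumption).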
Example #4
    def get_google_words(google_output: object) -> List[dict]:
        """
        Preprocesses the Google output to further work with it.

        :param google_output: JSON object

        :return: List of dicts, one per word in the google_output.
        """
        words = []
        for result in google_output.results:
            alternative = result.alternatives[0]
            for word in alternative["words"]:
                words.append({
                    "word": preprocess_string(word["word"]),
                    "startTime": word["startTime"],
                    "endTime": word["endTime"],
                    "confidence": alternative["confidence"]
                })

        return words
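For reference, a sketch of the shape this returns. The time fields keep Google's string format with a trailing "s" (e.g. "0.400s"), which is why later code strips the suffix before converting to float; the values below are illustrative only:

    words = [
        {
            "word": "hello",         # already run through preprocess_string
            "startTime": "0.400s",   # parsed later via float(value.replace("s", ""))
            "endTime": "0.900s",
            "confidence": 0.94,      # confidence of the containing alternative
        },
        # ... one dict per recognized word
    ]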
Example #5
    def align_per_sentence(cls, sentences: List[Sentence],
                           transcript_alignment: str, google_alignment: str,
                           google_words: List[object],
                           alignment_parameters: Dict[str, Any],
                           alignment_score: int,
                           verbosity: int) -> List[Sentence]:
        """
        Assigns start and end times to sentences based on given alignments.

        :param sentences:            All sentences
        :param transcript_alignment: Aligned transcript
        :param google_alignment:     Aligned google output
        :param google_words:         Google words, to get startTime and endTime
        :param alignment_parameters: Dict of parameters loaded from a given YAML file. See README for full config.
        :param alignment_score:      Score of the alignment
        :param verbosity:            Verbosity of output

        :return: List of aligned sentences
        """
        last_end_point = 0
        last_end_time = 0.0

        sentence_index = 0

        # Iterate over a copy: the "delete" filtering method below removes items from
        # sentences, which would otherwise skip the following sentence mid-iteration.
        for sentence in list(sentences):
            start_time = time()

            sentence_characters = list(preprocess_string(sentence.sentence))

            sentence_regex = "-*".join(sentence_characters)

            try:
                alignment_match = re.search(
                    sentence_regex, transcript_alignment[last_end_point:])

                alignment_start_point = last_end_point + alignment_match.start()
                alignment_end_point = last_end_point + alignment_match.end()

                last_end_point = alignment_end_point
            except AttributeError as e:
                bin_print(
                    0, 0,
                    "--------------------------------------------------------------------------"
                )
                bin_print(0, 0, transcript_alignment[last_end_point:])
                bin_print(0, 0, "Attribute error", e,
                          "".join(sentence_characters), sentence_regex)
                # _Shouldn't_ happen, as the regex is basically part of the transcript we're
                # looking at. Characters don't vanish from the transcript, so there's always a match.
                cls.mark_sentence_not_appearing(sentence, alignment_parameters,
                                                last_end_time)
                last_end_time = last_end_time + alignment_parameters[
                    "no_appearance"]["interval_length"]
                continue

            # Mostly None values on either side indicate a false positive: mark the sentence
            # as not appearing and advance the end time by the configured interval length.
            if is_mostly_none(list(google_alignment[alignment_start_point:alignment_end_point])) \
                    or is_mostly_none(list(transcript_alignment[alignment_start_point:alignment_end_point])):
                cls.mark_sentence_not_appearing(sentence, alignment_parameters,
                                                last_end_time)
                last_end_time = last_end_time + alignment_parameters[
                    "no_appearance"]["interval_length"]
                continue

            # Count the non-gap, non-space characters up to each alignment point to map
            # alignment offsets onto positions in the concatenated Google words.
            google_sub_start = len([
                c for c in google_alignment[0:alignment_start_point]
                if c != "-" and c != " "
            ])
            google_sub_end = len([
                c for c in google_alignment[0:alignment_end_point]
                if c != "-" and c != " "
            ])

            character_count = 0
            found_start = False

            start_word_confidence = 0.0
            end_word_confidence = 0.0

            # Walk through the Google words, counting preprocessed characters, to map
            # the alignment's character offsets back to word start/end times.
            for word in google_words:
                character_count += len(preprocess_string(word["word"]))
                word_start_time = float(word["startTime"].replace("s", ""))

                # Guarantee that there are no overlapping sentences
                if character_count >= google_sub_start and last_end_time <= word_start_time and not found_start:
                    sentence.interval.start = word_start_time
                    start_word_confidence = word["confidence"]
                    found_start = True

                if found_start and character_count >= google_sub_end:
                    sentence.interval.end = float(word["endTime"].replace("s", ""))
                    last_end_time = sentence.interval.end
                    end_word_confidence = word["confidence"]
                    break

            sentence_confidence = get_sentence_confidence(
                start_word_confidence, end_word_confidence,
                transcript_alignment[
                    alignment_start_point:alignment_end_point],
                google_alignment[alignment_start_point:alignment_end_point],
                alignment_parameters["algorithm"]["match_reward"],
                alignment_parameters["algorithm"]["mismatch_penalty"],
                alignment_parameters["algorithm"]["gap_penalty"])

            # Share of gap characters within the aligned span, used below in the overall score.
            google_gaps_percentage = get_none_part(
                list(
                    google_alignment[alignment_start_point:alignment_end_point]
                ))
            transcript_gaps_percentage = get_none_part(
                list(transcript_alignment[
                    alignment_start_point:alignment_end_point]))

            sentence.additional_data = AdditionalData(
                sentence_confidence["average_google_confidence"],
                sentence_confidence["normalized_sentence_score"],
                google_gaps_percentage, transcript_gaps_percentage)

            overall_score = calculate_overall_score(
                google_gaps_percentage, transcript_gaps_percentage,
                sentence_confidence["average_google_confidence"],
                sentence_confidence["normalized_sentence_score"],
                alignment_parameters["score_weights"]["gaps_google"],
                alignment_parameters["score_weights"]["gaps_transcript"],
                alignment_parameters["score_weights"]["alignment_score"],
                alignment_parameters["score_weights"]["google_confidence"])

            if overall_score > alignment_parameters["filtering"]["threshold"]:
                if alignment_parameters["filtering"]["method"] == "mark":
                    sentence.sentence = "[BAD]" + sentence.sentence
                    sentence_index += 1
                else:
                    del sentences[sentence_index]
            else:
                sentence_index += 1

            end_time = time()
            cls.execution_times.append(end_time - start_time)

            bin_print(verbosity, 2, "Sentence confidence:",
                      str(sentence_confidence))

        return sentences
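A standalone sketch (not part of the original class) of the gap-tolerant sentence regex built in align_per_sentence: joining the sentence's characters with "-*" lets the pattern match the sentence even where the alignment inserted gap characters ("-"):

    import re

    # Illustrative aligned transcript; "-" marks alignment gaps.
    transcript_alignment = "the qui-ck brown f-ox"

    sentence_characters = list("quick brown fox")
    sentence_regex = "-*".join(sentence_characters)  # "q-*u-*i-*c-*k-* -*b-*..."

    alignment_match = re.search(sentence_regex, transcript_alignment)
    print(alignment_match.start(), alignment_match.end())                       # character span within the alignment
    print(transcript_alignment[alignment_match.start():alignment_match.end()])  # -> "qui-ck brown f-ox"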