def align_files(self, input_path: str, output_path: str, verbosity: int) -> None:
        """
        Aligns all given files in input_path and writes alignments into output_path

        :param input_path:  Where to look for transcript files
        :param output_path: Where to write alignment files
        :param verbosity:   Verbosity of debugging output

        :return: None
        """
        bin_print(verbosity, 1, "Reading files from", input_path)

        files = [f for f in listdir(input_path) if isfile(join(input_path, f)) and f.endswith(".wav")]
        bin_print(verbosity, 2, "WAVE files found:", "\n    -", "\n    - ".join(files))

        base_names = [f.rsplit(".", 1)[0] for f in files]
        file_pairs = [(base, join(input_path, base + ".txt"), join(input_path, base + ".wav")) for base in base_names]

        for file_pair in file_pairs:
            bin_print(verbosity, 2, "Creating alignment for " + file_pair[0] + ".*")
            alignment = self.aligner.align(file_pair[1], file_pair[2], verbosity)

            output_filename = join(output_path, file_pair[0] + "_audacity_" + self.alignment_type + ".txt")
            with open(output_filename, "w+", encoding="utf-8") as f:
                f.write("\n".join([sentence.to_audacity_label_format() for sentence in alignment]))
                bin_print(verbosity, 2, "Wrote " + output_filename)

        bin_print(verbosity, 1, "Writing files to", output_path)
def main(argv: list) -> None:
    title = "Get Google recognition"
    description = """
Gets the Speech Recognition result of Google Cloud API and stores it in a caching folder.

Usage:
    python get_google_recognition_raw.py --path=<path> --authpath=<path> --bucket=<bucket name> --outpath=<path> [-v|-vv|-vvv]

Args:
    --path:      Path to read transcript files from (needed to filter which files to actually transcript)
    --authpath:  Path containing the authentication files necessary to connect to Google Cloud API services
    --bucket:    Name of the bucket containing all FLAC files
    --outpath:   Path to write the raw JSON output to
    -v|-vv|-vvv: Verbosity level of the output
    -h:          Prints this help
        """
    args = ["path=", "config="]

    input_args = intro(title, description, args, argv)

    input_args = intro("Get Google recognition raw",
                       "Gets the Speech Recognition result of Google Cloud API and stores it in a caching folder.\n\nget_google_recognition_raw.py --path=<path> --authpath=<path> --bucket=<bucket name> --outpath=<path> [-v|-vv|-vvv]",
                       ["path=", "authpath=", "outpath=", "bucket="], argv)

    # Authenticate globally with specified client JSON
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = input_args["authpath"]

    start = time.time()
    get_and_save_raw(input_args["path"], input_args["bucket"], input_args["outpath"], input_args["verbosity"])
    end = time.time()

    bin_print(input_args["verbosity"], 0, "Done.")
    bin_print(input_args["verbosity"], 1, "Time elapsed:", (end - start))
def main(argv: list) -> None:
    title = "Fix hand alignments"
    description = """
Fix hand alignments: Reshuffle training data and/or assign `-` to nonexisting sentences.

Usage:
    python fix_hand_alignments.py --path=<path> [-v|-vv|-vvv] [--fix-nonexisting] [--reshuffle-training]

Args:
    --path:               Path to read alignment data
    -v|-vv|-vvv:          Verbosity level of the output
    --fix-nonexisting:    If non-existing sentences should be marked with `-` for interval start and end points
    --reshuffle-training: Select a new 70% of all sentences as training data
    -h:                   Prints this help
        """
    args = ["path=", "config=", "fix-nonexisting", "reshuffle-training"]
    input_args = intro(title, description, args, argv)

    input_args[
        "fix-nonexisting"] = True if "with-list" in input_args else False
    input_args[
        "reshuffle-training"] = True if "get-low-means" in input_args else False

    start = time.time()

    fix_hand_alignments(input_args["path"], input_args["fix-nonexisting"],
                        input_args["reshuffle-training"],
                        input_args["verbosity"])
    end = time.time()

    bin_print(input_args["verbosity"], 0, "Done.")
    bin_print(input_args["verbosity"], 1, "Time elapsed:", (end - start))
    def perform_alignment(transcript: str, wav_path: str,
                          verbosity: int) -> List[Sentence]:
        """
        Performs the alignment

        :param transcript: Transcript path
        :param wav_path:   Path to wav file
        :param verbosity:  Verbosity level

        :return: Aligned sentences
        """
        start_time = time.time()
        audio_segment = AudioSegment.from_wav(wav_path)
        duration = audio_segment.duration_seconds

        with open(transcript, encoding="utf-8") as f:
            transcript = f.read()
        transcript = transcript.replace("\n", " ")

        sentences = transcript_to_sentences(transcript)

        # Draw two random borders per sentence on a fine grid, sort them, and
        # scale them to the audio duration: consecutive pairs become each
        # sentence's (start, end) interval.
        precision = 1_000_000
        borders = random.sample(range(0, precision), len(sentences) * 2)
        borders.sort()
        borders = [border / precision * duration for border in borders]

        bin_print(verbosity, 3, "Borders for", wav_path, "are", borders)

        index = 0
        for sentence in sentences:
            sentence.interval.start = borders[index]
            sentence.interval.end = borders[index + 1]
            index += 2

        end_time = time.time()

        bin_print(verbosity, 2, "Time elapsed for", wav_path, ":",
                  (end_time - start_time))

        return sentences
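
# A small self-contained example of the border pairing above (invented,
# seeded values): two borders are drawn per sentence, so after sorting,
# consecutive pairs become ordered, non-overlapping (start, end) intervals.
import random

random.seed(0)
duration = 10.0
n_sentences = 2
precision = 1_000_000
borders = sorted(random.sample(range(0, precision), n_sentences * 2))
borders = [b / precision * duration for b in borders]
# borders[0:2] are sentence 1's (start, end), borders[2:4] sentence 2's.
print(borders)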
def main(argv: list) -> None:
    title = "Create alignment"
    description = """
Creates an alignment based on configuration. See README.md for setting up a correct configuration.

Usage:
    python create_alignment.py --path=<path> --config=<path> [-v|-vv|-vvv]
    
Args:
    --path:      Path to read raw data from and write alignments to
    --config:    Path to configuration
    -v|-vv|-vvv: Verbosity level of the output
    -h:          Prints this help
    """
    args = ["path=", "config="]

    input_args = intro(title, description, args, argv)

    start = time.time()
    config = load_config(input_args["config"])

    bin_print(input_args["verbosity"], 2, "Loaded configuration: ", config)

    aligner = get_aligner(config)

    aligner.align_files(input_args["path"], input_args["path"],
                        input_args["verbosity"])

    end = time.time()

    bin_print(input_args["verbosity"], 0, "Done.")
    bin_print(input_args["verbosity"], 1, "Time elapsed:", (end - start))
    def align(cls, google_output: object, transcript: str, verbosity: int,
              alignment_parameters: Dict[str, Any]) -> List["Sentence"]:
        """
        Adjusted way of actually aligning with Google output.

        :param google_output:        Google output as JSON object
        :param transcript:           Transcript as string
        :param verbosity:            Verbosity of output
        :param alignment_parameters: Dict of parameters loaded from a given YAML file. See README for full config.

        :return: List of aligned sentences
        """
        sentences = cls.get_sentences(transcript)
        transcript_text = cls.get_transcript_text(transcript)

        google_words = cls.get_google_words(google_output)
        google_text = cls.get_google_text(google_words)

        base_confidences = [
            r.alternatives[0]["confidence"] for r in google_output.results
        ]
        bin_print(verbosity, 2, "Confidences of all results:",
                  base_confidences)

        bin_print(verbosity, 3, "Preprocessed transcript text:",
                  transcript_text)
        bin_print(verbosity, 3, "Preprocessed google text:", google_text)

        start_time = time()
        # Call actual implementation of the alignment.
        alignment = cls.perform_alignment(transcript_text, google_text,
                                          verbosity, alignment_parameters)
        end_time = time()

        cls.alignment_times.append((end_time - start_time) / len(sentences))

        google_alignment = alignment["google"]
        transcript_alignment = alignment["transcript"]
        alignment_score = alignment["score"]

        bin_print(verbosity, 3,
                  prettify_alignment(google_alignment, transcript_alignment))

        return cls.align_per_sentence(sentences, transcript_alignment,
                                      google_alignment, google_words,
                                      alignment_parameters, alignment_score,
                                      verbosity)
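
# For orientation: the gap character "-" used throughout comes from a global
# character-level sequence alignment of the two texts. A toy illustration with
# invented strings (not this project's actual aligner output); both strings
# have equal length, and "-" marks a position where one side has no
# counterpart:
toy_transcript_alignment = "the quick brown fox"
toy_google_alignment     = "the qui-k brown fox"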
def main(argv: list) -> None:
    title = "Optimize alignments"
    description = """
Tries to find the best alignment parameters based on Bayesian optimization.

Usage:
    python optimize_parameters.py --path=<path> --config=<path> --convergence-plot-file=<path> [-v|-vv|-vvv]

Args:
    --path:                  Path to read alignment data from
    --config:                Path to configuration
    --convergence-plot-file: Filename for the plot of the convergence, PNG
    -v|-vv|-vvv:             Verbosity level of the output
    -h:                      Prints this help
    """
    args = ["path=", "config=", "convergence-plot-file="]

    input_args = intro(title, description, args, argv)

    start = time.time()
    config = load_config(input_args["config"])

    bin_print(input_args["verbosity"], 2, "Loaded configuration: ", config)

    aligner = get_aligner(config)

    optimize_parameters(
        input_args["path"],
        input_args["path"],
        aligner,
        config,
        input_args["convergence-plot-file"],
        input_args["verbosity"]
    )

    end = time.time()

    bin_print(input_args["verbosity"], 0, "Done.")
    bin_print(input_args["verbosity"], 1, "Time elapsed:", (end - start))
def main(argv: list) -> None:
    title = "Compare alignments"
    description = """
Compares two kinds of alignments

Usage:
    python compare_alignment.py --path=<path> --type1=basic,hand,random,google --type2=basic,hand,random,google --config=<path> [-v|-vv|-vvv] [--with-list] [--get-low-means] [--training-only]

Args:
    --path:          Path to read alignment data
    --type1:         First type to compare, one of basic, hand, random or google
    --type2:         Second type to compare, one of basic, hand, random or google
    --config:        Path to config file
    -v|-vv|-vvv:     Verbosity level of the output
    --with-list:     Include a list with all calculated IOUs for copy/paste (to use in an EXCEL sheet, for example)
    --get-low-means: Includes a list of wav files with a mean IOU < 0.3, for debugging purposes
    --training-only: Only ever compares sentences marked with [TRAINING] in the first type of the alignment
    -h:              Prints this help
    """
    args = [
        "path=", "type1=", "type2=", "config=", "with-list", "get-low-means",
        "training-only"
    ]
    input_args = intro(title, description, args, argv)

    config = load_config(input_args["config"])

    input_args["with-list"] = True if "with-list" in input_args else False
    input_args[
        "get-low-means"] = True if "get-low-means" in input_args else False
    input_args[
        "training-only"] = True if "training-only" in input_args else False

    start = time.time()
    results = compare_alignments(input_args["path"], input_args["verbosity"],
                                 input_args["type1"], input_args["type2"],
                                 input_args["training-only"], config)

    verbosity = input_args["verbosity"]

    for file in results["ious"]["per_file"].items():
        bin_print(verbosity, 3, "IOUs for", file[0], ":", file[1]["all"])
        bin_print(
            verbosity, 0, file[0],
            ", " + input_args["type1"] + " vs. " + input_args["type2"] + ":")
        bin_print(verbosity, 0, " - Mean IOU:   ", file[1]["mean"])
        bin_print(verbosity, 0, " - Median IOU: ", file[1]["median"])

    bin_print(verbosity, 3, "All IOUs:", results["ious"]["all"])
    bin_print(verbosity, 0, "========")
    bin_print(verbosity, 0,
              input_args["type1"] + " vs. " + input_args["type2"] + ":")
    bin_print(verbosity, 0, "Total number of sentences:",
              results["no_sentences"]["total"])
    bin_print(verbosity, 0, "--------")
    bin_print(verbosity, 0, "IOU")
    bin_print(verbosity, 0, " - Mean IOU:   ", results["ious"]["mean"])
    bin_print(verbosity, 0, " - Median IOU: ", results["ious"]["median"])
    bin_print(verbosity, 0, "--------")
    bin_print(verbosity, 0, "Deviation (absolute)")
    bin_print(verbosity, 0, " - Mean deviation:   ",
              results["scores"]["deviation"]["mean"])
    bin_print(verbosity, 0, " - Median deviation: ",
              results["scores"]["deviation"]["median"])
    bin_print(verbosity, 0, "--------")
    bin_print(verbosity, 0, "Calculated score")
    bin_print(verbosity, 0, " - Mean calculated score:   ",
              np.mean(results["scores"]["calculated"]["all"]))
    bin_print(verbosity, 0, " - Median calculated score: ",
              np.median(results["scores"]["calculated"]["all"]))
    bin_print(verbosity, 0, "--------")
    bin_print(verbosity, 0, "Number of sentences appearing: ",
              results["no_sentences"]["appearing"])

    t_precision_recall = PrettyTable()
    t_precision_recall.field_names = [
        "", "Condition positive", "Condition negative"
    ]
    t_precision_recall.add_row([
        "Predicted positive", results["appearance"]["true_positives"],
        results["appearance"]["false_positives"]
    ])
    t_precision_recall.add_row([
        "Predicted negative", results["appearance"]["false_negatives"],
        results["appearance"]["true_negatives"]
    ])

    bin_print(verbosity, 0, "Sentences appearing")
    bin_print(verbosity, 0, "\n" + str(t_precision_recall))
    bin_print(verbosity, 0, "Precision: ", results["appearance"]["precision"])
    bin_print(verbosity, 0, "Recall:    ", results["appearance"]["recall"])
    bin_print(verbosity, 0, "F1 score:  ", results["appearance"]["f1_score"])

    if input_args["with-list"]:
        bin_print(verbosity, 0, "Outputting all values as copy/pastable list:")
        print("\n".join([
            str(v[0]) for v in [v for v in results["ious"]["all"]]
            if v[0] <= 1.0
        ]))

    if input_args["get-low-means"]:
        bin_print(
            verbosity, 0,
            "Outputting copy/pastable list of low (<0.3) mean IOU files:")
        print(results["ious"]["low"])

    # Correlation series in table order; each row correlates one series
    # against every other (including itself).
    correlation_series = [
        ("IOU", results["ious"]["all_only"]),
        ("Deviation", results["scores"]["deviation"]["all"]),
        ("Alignment score", results["scores"]["alignment_scores"]["all"]),
        ("Google confidence", results["scores"]["google_confidence"]["all"]),
        ("Calculated confidence", results["scores"]["calculated"]["all"]),
        ("Google gaps percentage", results["scores"]["google_gaps"]["all"]),
        ("Transcript gaps percentage",
         results["scores"]["transcript_gaps"]["all"]),
        ("Calculated score", results["scores"]["calculated"]["all"]),
    ]

    t_pearson = PrettyTable()
    t_pearson.field_names = [""] + [name for name, _ in correlation_series]
    for row_name, row_values in correlation_series:
        t_pearson.add_row([row_name] + [
            pearsonr_lists(row_values, column_values)
            for _, column_values in correlation_series
        ])

    bin_print(verbosity, 0, "Score correlations")
    bin_print(verbosity, 0, "\n" + str(t_pearson))

    end = time.time()

    bin_print(input_args["verbosity"], 0, "Done.")
    bin_print(input_args["verbosity"], 1, "Time elapsed:", (end - start))
def get_and_save_raw(input_path: str, bucket_name: str, out_path: str,
                     verbosity: int) -> None:
    """
    Gets raw JSON from Google Cloud Speech-to-text API

    :param input_path:  Path to read files from
    :param bucket_name: Name of the GCS bucket
    :param out_path:    Path to write the raw JSON output to
    :param verbosity:   Verbosity level

    :return: None
    """
    bin_print(verbosity, 1, "Reading files from", input_path)

    bin_print(verbosity, 2, "Trying to find all .flac files...")
    flac_files = [
        f for f in listdir(input_path)
        if isfile(join(input_path, f)) and f.endswith(".flac")
    ]
    bin_print(verbosity, 3, "Found flac files:", flac_files)
    bin_print(verbosity, 3, "Total flac files:", len(flac_files))

    client = speech.SpeechClient()

    bin_print(verbosity, 1, "Running Google STT...")
    for flac_file in flac_files:
        # Hardcoded filter: only the "stadt_zuerich" recordings are processed.
        if "stadt_zuerich" in flac_file:
            bin_print(verbosity, 2, "Processing " + flac_file)
            try:
                raw_json = get_raw("gs://" + bucket_name + "/" + flac_file,
                                   client)
                json_path = out_path + "/" + flac_file.replace(
                    ".flac", "_google_output") + ".json"
                bin_print(verbosity, 2, "Writing " + json_path)
                with open(json_path, "w") as f:
                    f.write(raw_json)
            except _OperationNotComplete:
                bin_print(verbosity, 1, "Timeout for " + flac_file)
    def align_per_sentence(cls, sentences: List[Sentence],
                           transcript_alignment: str, google_alignment: str,
                           google_words: List[object],
                           alignment_parameters: Dict[str, Any],
                           alignment_score: int,
                           verbosity: int) -> List[Sentence]:
        """
        Assigns start and end times to sentences based on given alignments.

        :param sentences:            All sentences
        :param transcript_alignment: Aligned transcript
        :param google_alignment:     Aligned google output
        :param google_words:         Google words, to get startTime and endTime
        :param alignment_parameters: Dict of parameters loaded from a given YAML file. See README for full config.
        :param alignment_score:      Score of the alignment
        :param verbosity:            Verbosity of output

        :return: List of aligned sentences
        """
        last_end_point = 0
        last_end_time = 0.0

        sentence_index = 0

        # Iterate over a copy, as sentences may be deleted from the list below.
        for sentence in list(sentences):
            start_time = time()

            sentence_characters = list(preprocess_string(sentence.sentence))

            sentence_regex = "-*".join(sentence_characters)

            try:
                alignment_match = re.search(
                    sentence_regex, transcript_alignment[last_end_point:])

                alignment_start_point = last_end_point + alignment_match.start()
                alignment_end_point = last_end_point + alignment_match.end()

                last_end_point = last_end_point + alignment_match.end()
            except AttributeError as e:
                bin_print(
                    0, 0,
                    "--------------------------------------------------------------------------"
                )
                bin_print(0, 0, transcript_alignment[last_end_point:])
                bin_print(0, 0, "Attribute error", e,
                          "".join(sentence_characters), sentence_regex)
                # _Shouldn't_ happen, as the regex is essentially a substring of
                # the transcript we're searching: characters don't vanish from
                # the transcript, so there should always be a match.
                cls.mark_sentence_not_appearing(sentence, alignment_parameters,
                                                last_end_time)
                last_end_time = last_end_time + alignment_parameters[
                    "no_appearance"]["interval_length"]
                continue

            # Mostly None values on either side indicate a false positive;
            # mark the sentence as not appearing and move on.
            if is_mostly_none(list(google_alignment[alignment_start_point:alignment_end_point])) \
                    or is_mostly_none(list(transcript_alignment[alignment_start_point:alignment_end_point])):
                cls.mark_sentence_not_appearing(sentence, alignment_parameters,
                                                last_end_time)
                last_end_time = last_end_time + alignment_parameters[
                    "no_appearance"]["interval_length"]
                continue

            google_sub_start = len([
                c for c in google_alignment[0:alignment_start_point]
                if c != "-" and c != " "
            ])
            google_sub_end = len([
                c for c in google_alignment[0:alignment_end_point]
                if c != "-" and c != " "
            ])

            character_count = 0
            found_start = False

            start_word_confidence = 0.0
            end_word_confidence = 0.0

            for word in google_words:
                character_count += len(preprocess_string(word["word"]))
                word_start_time = float(word["startTime"].replace("s", ""))

                # Guarantee that sentences don't overlap: a sentence may only
                # start after the previous one has ended.
                if character_count >= google_sub_start and last_end_time <= word_start_time and not found_start:
                    sentence.interval.start = word_start_time
                    start_word_confidence = word["confidence"]
                    found_start = True

                if found_start and character_count >= google_sub_end:
                    sentence.interval.end = float(word["endTime"].replace(
                        "s", ""))
                    last_end_time = sentence.interval.end
                    end_word_confidence = word["confidence"]
                    break

            sentence_confidence = get_sentence_confidence(
                start_word_confidence, end_word_confidence,
                transcript_alignment[
                    alignment_start_point:alignment_end_point],
                google_alignment[alignment_start_point:alignment_end_point],
                alignment_parameters["algorithm"]["match_reward"],
                alignment_parameters["algorithm"]["mismatch_penalty"],
                alignment_parameters["algorithm"]["gap_penalty"])

            google_gaps_percentage = get_none_part(
                list(
                    google_alignment[alignment_start_point:alignment_end_point]
                ))
            transcript_gaps_percentage = get_none_part(
                list(transcript_alignment[
                    alignment_start_point:alignment_end_point]))

            sentence.additional_data = AdditionalData(
                sentence_confidence["average_google_confidence"],
                sentence_confidence["normalized_sentence_score"],
                google_gaps_percentage, transcript_gaps_percentage)

            overall_score = calculate_overall_score(
                google_gaps_percentage, transcript_gaps_percentage,
                sentence_confidence["average_google_confidence"],
                sentence_confidence["normalized_sentence_score"],
                alignment_parameters["score_weights"]["gaps_google"],
                alignment_parameters["score_weights"]["gaps_transcript"],
                alignment_parameters["score_weights"]["alignment_score"],
                alignment_parameters["score_weights"]["google_confidence"])

            if overall_score > alignment_parameters["filtering"]["threshold"]:
                if alignment_parameters["filtering"]["method"] == "mark":
                    sentence.sentence = "[BAD]" + sentence.sentence
                    sentence_index += 1
                else:
                    del sentences[sentence_index]
            else:
                sentence_index += 1

            end_time = time()
            cls.execution_times.append(end_time - start_time)

            bin_print(verbosity, 2, "Sentence confidence:",
                      str(sentence_confidence))

        return sentences
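
# A small worked example of the gap-tolerant sentence regex built above
# (invented strings): joining the sentence's characters with "-*" lets the
# search succeed even where the alignment inserted gap characters.
import re

sentence_regex = "-*".join(list("fox"))  # "f-*o-*x"
aligned_transcript = "the qui-k brown f--ox"
match = re.search(sentence_regex, aligned_transcript)
print(match.span())  # span of "f--ox" within the aligned transcript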
def compare_alignments(input_path: str, verbosity: int, type1: str, type2: str,
                       training_only: bool,
                       config: Dict[str, Any]) -> Dict[str, Any]:
    """
    Compares all found alignments

    :param input_path:    Input path
    :param verbosity:     Verbosity level
    :param type1:         First type for comparison
    :param type2:         Second type for comparison
    :param training_only: Determines if a sentence has to be prefixed with [TRAINING] in order to be considered.
    :param config:        Configuration dict, see README

    :return: Dict of all results
    """

    if input_path.endswith(os.sep):
        input_path = input_path[:-1]

    epsilon = config["no_appearance"]["interval_length"]

    bin_print(verbosity, 1, "Reading files from", input_path)

    bin_print(verbosity, 2, "Trying to find all .txt files...")
    txt_files = [
        input_path + os.sep + f for f in listdir(input_path)
        if isfile(join(input_path, f)) and f.endswith(".txt")
    ]
    bin_print(verbosity, 3, "Found txt files:", txt_files)

    bin_print(
        verbosity, 2,
        "Filtering found files by ones containing alignment by " + type1 +
        "...")
    type1_alignments = [f for f in txt_files if "audacity_" + type1 in f]
    bin_print(verbosity, 3,
              "Found txt files containing alignment via " + type1 + ":",
              type1_alignments)

    ious = []
    low_ious = []
    google_confidences = []
    sentence_scores = []
    deviations = []
    google_gaps = []
    transcript_gaps = []
    total_sentences = 0
    sentences_appearing_true_positives = 0
    sentences_appearing_false_positives = 0
    sentences_appearing_true_negatives = 0
    sentences_appearing_false_negatives = 0

    ious_per_file = {}

    bin_print(verbosity, 2, "Processing all " + type1 + " alignments...")
    for type1_alignment in type1_alignments:
        file_name = type1_alignment.replace("audacity_" + type1, "").replace(
            input_path, "").replace("_.txt", "")
        bin_print(verbosity, 3, "Processing", file_name)
        type1_aligned_sentences = load_alignment(type1_alignment)
        try:
            type2_aligned_sentences = load_alignment(
                type1_alignment.replace("audacity_" + type1,
                                        "audacity_" + type2))
        except FileNotFoundError:
            # Corresponding file doesn't exist, skip it completely
            continue

        sentence_pairs = [
            pair for pair in list(
                zip(type1_aligned_sentences, type2_aligned_sentences))
            if (not training_only or pair[0].sentence.startswith("[TRAINING]"))
            and not pair[1].sentence.startswith(
                "[BAD]")  # Filter out "bad" sentences.
        ]

        total_sentences += len(sentence_pairs)

        # Pairs where both sides have a real (longer than epsilon) interval.
        appearing_pairs = [
            pair for pair in sentence_pairs
            if (pair[0].interval.get_length() > epsilon
                and pair[1].interval.get_length() > epsilon)
        ]

        current_ious = [
            (intersection_over_union(pair[0].interval, pair[1].interval),
             pair[0].interval.get_length(), pair[1].interval.get_length(),
             pair[0].sentence, file_name) for pair in appearing_pairs
        ]

        current_google_confidence = [
            (pair[1].additional_data.google_confidence
             if pair[1].additional_data else 0.001)
            for pair in appearing_pairs
        ]

        current_sentence_scores = [
            (pair[1].additional_data.normalized_sentence_score
             if pair[1].additional_data else 0.001)
            for pair in appearing_pairs
        ]

        current_transcript_gaps = [
            (pair[1].additional_data.gaps_transcript
             if pair[1].additional_data else 0.001)
            for pair in appearing_pairs
        ]

        current_google_gaps = [
            (pair[1].additional_data.gaps_google
             if pair[1].additional_data else 0.001)
            for pair in appearing_pairs
        ]

        current_deviations = [
            pair[0].interval.get_deviation(pair[1].interval)
            for pair in appearing_pairs
        ]

        # Find sentences that are marked on either side as not appearing at all.
        pairs_sentence_not_appearing = [
            pair for pair in sentence_pairs
            if (pair[0].interval.get_length() <= epsilon
                or pair[1].interval.get_length() <= epsilon)
        ]

        # Count those sentences: which don't appear on either side, and which
        # appear on only one?
        for pair in pairs_sentence_not_appearing:
            appears_1 = does_sentence_appear(pair[0], epsilon)
            appears_2 = does_sentence_appear(pair[1], epsilon)
            if not appears_1 and not appears_2:
                sentences_appearing_true_negatives += 1
            elif not appears_1 and appears_2:
                sentences_appearing_false_positives += 1
            elif appears_1 and not appears_2:
                sentences_appearing_false_negatives += 1

        # All sentences appearing in both are considered true positives
        sentences_appearing_true_positives += len(current_ious)

        if len(current_ious) == 0:
            bin_print(verbosity, 2, "No sentences found, skipping...")
            continue

        mean_iou = np.mean([v[0] for v in current_ious])
        median_iou = np.median([v[0] for v in current_ious])

        ious_per_file[file_name] = {
            "mean": mean_iou,
            "median": median_iou,
            "all": current_ious
        }

        if mean_iou <= 0.3:
            low_ious.append(file_name + ".wav")

        ious += current_ious
        google_confidences += current_google_confidence
        sentence_scores += current_sentence_scores
        deviations += current_deviations
        google_gaps += current_google_gaps
        transcript_gaps += current_transcript_gaps

    try:
        precision = sentences_appearing_true_positives / (
            sentences_appearing_true_positives +
            sentences_appearing_false_positives)
    except ZeroDivisionError:
        precision = 0.0

    try:
        recall = sentences_appearing_true_positives / (
            sentences_appearing_true_positives +
            sentences_appearing_false_negatives)
    except ZeroDivisionError:
        recall = 0.0

    try:
        f1_score = 2 * ((precision * recall) / (precision + recall))
    except ZeroDivisionError:
        f1_score = 0.0

    return {
        "no_sentences": {
            "appearing": len(ious),
            "total": total_sentences,
        },
        "ious": {
            "all": ious,
            "all_only": [iou[0] for iou in ious],
            "low": low_ious,
            "mean": np.mean([v[0] for v in ious]) if len(ious) > 0 else np.nan,
            "median":
            np.median([v[0] for v in ious]) if len(ious) > 0 else np.nan,
            "per_file": ious_per_file
        },
        "scores": {
            "deviation": {
                "all": deviations,
                "mean": np.mean(deviations) if len(deviations) > 0 else np.nan,
                "median":
                np.median(deviations) if len(deviations) > 0 else np.nan,
            },
            "google_confidence": {
                "all":
                google_confidences,
                "mean":
                np.mean(google_confidences)
                if len(google_confidences) > 0 else np.nan,
                "median":
                np.median(google_confidences)
                if len(google_confidences) > 0 else np.nan
            },
            "alignment_scores": {
                "all":
                sentence_scores,
                "mean":
                np.mean(sentence_scores)
                if len(sentence_scores) > 0 else np.nan,
                "median":
                np.median(sentence_scores)
                if len(sentence_scores) > 0 else np.nan
            },
            "google_gaps": {
                "all":
                google_gaps,
                "mean":
                np.mean(google_gaps) if len(google_gaps) > 0 else np.nan,
                "median":
                np.median(google_gaps) if len(google_gaps) > 0 else np.nan
            },
            "transcript_gaps": {
                "all":
                transcript_gaps,
                "mean":
                np.mean(transcript_gaps)
                if len(transcript_gaps) > 0 else np.nan,
                "median":
                np.median(transcript_gaps)
                if len(transcript_gaps) > 0 else np.nan
            },
            "calculated": {
                "all": [
                    calculate_overall_score(
                        tuple[0],
                        tuple[1],
                        tuple[2],
                        tuple[3],
                        config["score_weights"]["gaps_google"],
                        config["score_weights"]["gaps_transcript"],
                        config["score_weights"]["alignment_score"],
                        config["score_weights"]["google_confidence"],
                    ) for tuple in zip(google_gaps, transcript_gaps,
                                       google_confidences, sentence_scores)
                ]
            }
        },
        "appearance": {
            "true_positives": sentences_appearing_true_positives,
            "false_positives": sentences_appearing_false_positives,
            "true_negatives": sentences_appearing_true_negatives,
            "false_negatives": sentences_appearing_false_negatives,
            "precision": precision,
            "recall": recall,
            "f1_score": f1_score,
        }
    }
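
# For reference, intersection_over_union on time intervals reduces to overlap
# length over union length. A minimal self-contained sketch using plain floats
# instead of the project's Interval class:
def interval_iou(start1: float, end1: float,
                 start2: float, end2: float) -> float:
    intersection = max(0.0, min(end1, end2) - max(start1, start2))
    union = (end1 - start1) + (end2 - start2) - intersection
    return intersection / union if union > 0 else 0.0

print(interval_iou(0.0, 2.0, 1.0, 3.0))  # 1/3: one shared second of three covered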
def optimize_parameters(input_path: str, output_path: str,
                        google_files_aligner: GoogleFilesAligner,
                        alignment_parameters: Dict[str, Any],
                        convergence_plot_file: str, verbosity: int) -> None:
    """
    Tries to find the best parameters for google alignment.

    :param input_path:            Path to load all alignments from
    :param output_path:           Path to write the alignments to
    :param google_files_aligner:  GoogleFilesAligner to re-align every epoch
    :param alignment_parameters:  Alignment parameters for comparison
    :param convergence_plot_file: Where to save the convergence plot
    :param verbosity:             Verbosity of the output

    :return: None
    """
    def optimize_function(params: List) -> float:
        """
        Function to optimize against

        :param params: Parameters given by BOpt

        :return: Calculated score
        """
        bin_print(verbosity, 1, "Starting new iteration...")

        google_files_aligner.alignment_parameters["algorithm"][
            "match_reward"] = params[0][0]
        google_files_aligner.alignment_parameters["algorithm"][
            "mismatch_penalty"] = params[0][1]
        google_files_aligner.alignment_parameters["algorithm"][
            "gap_penalty"] = params[0][2]

        bin_print(verbosity, 3, "Configured params: ",
                  google_files_aligner.alignment_parameters)

        google_files_aligner.align_files(input_path, output_path, 0)

        # Not "training_only", because we're using a further boiled down training set.
        result = compare_alignments(input_path, 0, "hand", "google", False,
                                    alignment_parameters)

        # Configurable, see config.example.yml
        score = eval(
            google_files_aligner.alignment_parameters["optimize_params_formula"],
            {"__builtins__": None}, {
                "deviation": result["scores"]["deviation"]["mean"],
                "iou": result["ious"]["mean"],
                "f1": result["appearance"]["f1_score"],
                "precision": result["appearance"]["precision"],
                "recall": result["appearance"]["recall"],
            })

        bin_print(verbosity, 1, "Parameters:                         ", params)
        bin_print(verbosity, 1, "Achieved score (smaller == better): ", score)

        return score

    domain = [
        {
            "name": "match_reward",
            "type": "continuous",
            "domain": (0, 100)
        },
        {
            "name": "mismatch_penalty",
            "type": "continuous",
            "domain": (-100, 0)
        },
        {
            "name": "gap_penalty",
            "type": "continuous",
            "domain": (-100, 0)
        },
    ]

    bopt = BayesianOptimization(f=optimize_function,
                                domain=domain,
                                model_type="GP",
                                acquisition_type="EI",
                                acquisition_jitter=0.05)

    bopt.run_optimization(max_iter=25)

    bopt.plot_convergence(filename=convergence_plot_file)

    bin_print(verbosity, 0, "Best values:", bopt.x_opt)
def optimize_score(input_path: str, alignment_parameters: Dict[str, Any],
                   convergence_plot_file: str, verbosity: int) -> None:
    """
    Tries to find the best parameters for overall score.

    :param input_path:            Path to load all alignments from
    :param alignment_parameters:  Alignment parameters for comparison
    :param convergence_plot_file: Where to save the convergence plot
    :param verbosity:             Verbosity of the output

    :return: None
    """
    def optimize_function(params: List) -> float:
        """
        Function to optimize against

        :param params: Parameters given by BOpt

        :return: Calculated score
        """
        bin_print(verbosity, 2, "Parameters: ", params)

        alignment_parameters["score_weights"]["gaps_google"] = params[0][0]
        alignment_parameters["score_weights"]["gaps_transcript"] = params[0][1]
        alignment_parameters["score_weights"]["alignment_score"] = params[0][2]
        alignment_parameters["score_weights"]["google_confidence"] = params[0][
            3]

        results = compare_alignments(input_path, 0, "hand", "google", True,
                                     alignment_parameters)

        correlation_ious = pearsonr_lists(
            results["ious"]["all_only"],
            results["scores"]["calculated"]["all"])
        correlation_deviation = pearsonr_lists(
            results["scores"]["deviation"]["all"],
            results["scores"]["calculated"]["all"])

        bin_print(verbosity, 1, "Correlation IOUs: ", correlation_ious)
        bin_print(verbosity, 1, "Correlation deviation: ",
                  correlation_deviation)

        # Only maximize correlation with IOU
        return abs(correlation_ious)

    domain = [
        {
            "name": "gaps_google",
            "type": "continuous",
            "domain": (-100, 100)
        },
        {
            "name": "gaps_transcript",
            "type": "continuous",
            "domain": (-100, 100)
        },
        {
            "name": "alignment_score",
            "type": "continuous",
            "domain": (-100, 100)
        },
        {
            "name": "google_confidence",
            "type": "continuous",
            "domain": (-100, 100)
        },
    ]

    bopt = BayesianOptimization(f=optimize_function,
                                domain=domain,
                                model_type="GP",
                                acquisition_type="EI",
                                acquisition_jitter=0.05,
                                maximize=True)

    bopt.run_optimization(max_iter=250)

    bopt.plot_convergence(filename=convergence_plot_file)

    bin_print(verbosity, 0, "Best values:", bopt.x_opt)
    def align_files(self, input_path: str, output_path: str, verbosity: int) -> None:
        """
        Aligns all given files in input_path and writes alignments into output_path

        :param input_path:           Where to look for transcript files
        :param output_path:          Where to write alignment files
        :param verbosity:            Verbosity of debugging output

        :return: None
        """
        bin_print(verbosity, 1, "Loading all transcript files from " + input_path + "...")
        file_names = [f.replace(".wav", "") for f in listdir(input_path) if
                      isfile(join(input_path, f)) and f.split(".")[1] == "wav"]

        bin_print(verbosity, 3, "Found files:", file_names)

        for file in file_names:
            bin_print(verbosity, 2, "Aligning " + file + "...")
            transcript_file = file + ".txt"

            with open(join(input_path, transcript_file), encoding="utf-8-sig") as read_file:
                transcript = read_file.read()

            with open(join(input_path, file + "_google_output.json"), "r", encoding="utf-8-sig") as read_file:
                # Convert back to an object-like structure, so the underlying
                # alignment function doesn't have to deal with non-object
                # structures such as dicts. This is particularly useful when
                # working with Google's output directly.
                google_output = load(read_file)
                google_output = Struct(**google_output)
                google_output.results = [Struct(**r) for r in google_output.results]

            alignment = self.aligner.align(google_output, transcript, verbosity, self.alignment_parameters)

            output_filename = join(output_path, file + "_audacity_" + self.alignment_type + ".txt")
            with open(output_filename, "w+", encoding="utf-8") as f:
                f.write("\n".join([sentence.to_audacity_label_format() for sentence in alignment]))
                bin_print(verbosity, 2, "Wrote " + output_filename)

        bin_print(verbosity, 0, "Execution time per sentence (mean): ", (np.mean(self.aligner.execution_times) + np.mean(self.aligner.alignment_times)))
        bin_print(verbosity, 0, "Execution time per sentence (max):  ", (np.max(self.aligner.execution_times) + np.max(self.aligner.alignment_times)))