Example #1
def compute_translation_statistics(tr, translation_lengths, long_trs,
                                   very_long_trs, translation_vocab):
    """
    Compute statistics related to translation
    :param tr: current translation
    :param translation_lengths: list of all translation lengths
    :param long_trs: counter for long translations
    :param very_long_trs: counter for very long translations
    :param translation_vocab: vocabulary of all the words in different translations
    :return: the last four parameters to the function, updated for the current translation
    """
    words = tr.split()
    translation_lengths.append(len(words))

    if len(words) > 50:
        long_trs += 1

    if len(words) > 200:
        very_long_trs += 1

    for word in words:
        # Strip punctuation; a token left with only dots (or nothing) is recorded as "..."
        word = word.replace(",", "").replace("!", "").replace("?", "").replace(
            ":", "").replace(";", "")
        if word.replace(".", "") == "":
            word = "..."
        else:
            word = word.replace(".", "")
        increment_count(translation_vocab, word)

    return translation_lengths, long_trs, very_long_trs, translation_vocab
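The examples in this file rely on an increment_count helper that is not shown here. Below is a minimal sketch of that helper (assumed behavior: a plain-dict counter) together with a hypothetical call to compute_translation_statistics; the sample translation string and the initial counter values are made up for illustration.

def increment_count(counts, key):
    """Add one occurrence of key to a plain-dict counter (assumed behavior)."""
    counts[key] = counts.get(key, 0) + 1


translation_lengths, long_trs, very_long_trs, translation_vocab = [], 0, 0, {}
sample_tr = "the king built a temple ... for the god"  # made-up translation
translation_lengths, long_trs, very_long_trs, translation_vocab = \
    compute_translation_statistics(sample_tr, translation_lengths, long_trs,
                                   very_long_trs, translation_vocab)
print(translation_lengths)        # [9]
print(translation_vocab["the"])   # 2
print(translation_vocab["..."])   # 1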
Example #2
def hmm_preprocess(train_sents):
    """
    train the HMM model
    :param train_sents: train sentences for the model
    :return: counts of unigrams, bigrams and trigrams
    """

    print("Start training")
    total_tokens = 0
    q_tri_counts, q_bi_counts, q_uni_counts, e_word_tag_counts, e_tag_counts = {}, {}, {}, {}, {}

    # e_tag_counts
    for sentence in train_sents:
        for token in sentence:
            key = token[1]
            increment_count(e_tag_counts, key)

    # e_word_tag_counts
    for sentence in train_sents:
        for token in sentence:
            key = token
            increment_count(e_word_tag_counts, key)

    # Precompute the most common tag for each word, plus a default tag (performance enhancement).
    most_common_tag = {}
    for word, tag in e_word_tag_counts:
        if word not in most_common_tag:
            most_common_tag[word] = (tag, e_word_tag_counts[word, tag])
        elif e_word_tag_counts[word, tag] > most_common_tag[word][1]:
            most_common_tag[word] = (tag, e_word_tag_counts[word, tag])
    most_common_tag["default"] = max(e_tag_counts, key=e_tag_counts.get)

    # Pad every sentence with two '<s>' start markers and a '</s>' end marker.
    adjusted_sents = []
    for sentence in train_sents:
        adjusted_sentence = []
        adjusted_sentence.append(('<s>', '<s>'))
        adjusted_sentence.append(('<s>', '<s>'))
        for token in sentence:
            adjusted_sentence.append(token)
        adjusted_sentence.append(('</s>', '</s>'))
        adjusted_sents.append(adjusted_sentence)

    # total_tokens: count every token except the two '<s>' start markers
    for sentence in adjusted_sents:
        total_tokens += (len(sentence) - 2)

    # q_uni_counts
    for sentence in adjusted_sents:
        for token in sentence:
            key = token[1]
            increment_count(q_uni_counts, key)

    # q_bi_counts
    for sentence in adjusted_sents:
        for i in range(1, len(sentence)):
            key = (sentence[i - 1][1], sentence[i][1])
            increment_count(q_bi_counts, key)

    # q_tri_counts
    for sentence in adjusted_sents:
        for i in range(2, len(sentence)):
            key = (sentence[i - 2][1], sentence[i - 1][1], sentence[i][1])
            increment_count(q_tri_counts, key)

    # possible_tags: the set of tags each word appears with in the training data
    possible_tags = {}
    for sentence in train_sents:
        for token in sentence:
            if token[0] in possible_tags:
                possible_tags[token[0]].add(token[1])
            else:
                possible_tags[token[0]] = {token[1]}

    return total_tokens, q_tri_counts, q_bi_counts, q_uni_counts, e_word_tag_counts, e_tag_counts, most_common_tag, \
           possible_tags
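A hypothetical call to hmm_preprocess on a made-up toy corpus; the (word, tag) tuple format is inferred from how the function indexes each token, and increment_count is assumed to behave as sketched above.

toy_train_sents = [
    [("the", "DT"), ("dog", "NN"), ("barks", "VBZ")],
    [("the", "DT"), ("cat", "NN"), ("sleeps", "VBZ")],
]

(total_tokens, q_tri_counts, q_bi_counts, q_uni_counts,
 e_word_tag_counts, e_tag_counts, most_common_tag, possible_tags) = \
    hmm_preprocess(toy_train_sents)

print(total_tokens)                # 8: original tokens plus one '</s>' per sentence
print(q_bi_counts[("DT", "NN")])   # 2
print(most_common_tag["dog"])      # ('NN', 1)
print(possible_tags["the"])        # {'DT'}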
Example #3
def add_translation_to_file(prev_signs,
                            signs_vocab,
                            prev_transcription,
                            transcription_vocab,
                            prev_tr,
                            translation_lengths,
                            long_trs,
                            very_long_trs,
                            translation_vocab,
                            prev_text,
                            prev_start_line,
                            prev_end_line,
                            signs_file,
                            transcription_file,
                            translation_file,
                            could_divide_by_three_dots,
                            could_not_divide,
                            metadata=False,
                            divide_by_three_dots=True):
    """
    Add a translation with corresponding signs and transliterations to files
    :param prev_signs: previous signs written to file
    :param signs_vocab: vocabulary of all the signs
    :param prev_transcription: previous transliterations written to file
    :param transcription_vocab: vocabulary of all the transliterations
    :param prev_tr: previous translation written to file
    :param translation_lengths: list of all translation lengths
    :param long_trs: counter for long translations
    :param very_long_trs: counter for very long translations
    :param translation_vocab: vocabulary of all the words in different translations
    :param prev_text: previous text written to file
    :param prev_start_line: previous start line written to file
    :param prev_end_line: previous end line written to file
    :param signs_file: file of all signs, being built as input for translation algorithms
    :param transcription_file: file of all transliterations, being built as input for translation algorithms
    :param translation_file: file of all translations, being built as input for translation algorithms
    :param could_divide_by_three_dots: counter for translations possible to divide based on three dots
    :param could_not_divide: counter for translations not possible to divide based on three dots
    :param metadata: whether to prefix each sample written to the files with its id
    :param divide_by_three_dots: whether to try splitting each sample on "..." boundaries
    :return: the updated vocabularies and counters (signs_vocab, transcription_vocab,
             translation_lengths, long_trs, very_long_trs, translation_vocab,
             could_divide_by_three_dots, could_not_divide)
    """
    signs = ""
    transcription = ""

    for sign in prev_signs:
        signs += sign
        increment_count(signs_vocab, sign)

    for t, delim in prev_transcription:
        transcription += t + delim
        increment_count(transcription_vocab, t)

    signs = clean_signs_transcriptions(signs, True)
    transcription = clean_signs_transcriptions(transcription, False)

    real_key = [
        prev_text + "." + str(prev_start_line),
        prev_text + "." + str(prev_end_line)
    ]

    splitted_signs = [s for s in signs.split("...") if s != "" and s != " "]
    splitted_transcription = [
        t for t in transcription.split("... ") if t != "" and t != " "
    ]
    splitted_translation = [
        tr for tr in prev_tr.split("... ") if tr != "" and tr != " "
    ]

    # Write to files: if the signs, transcription and translation split into the same
    # number of "..." chunks, write each chunk as a separate sample; otherwise write
    # the whole record as a single sample.
    if len(splitted_signs) == len(splitted_transcription) and len(splitted_transcription) == len(splitted_translation) \
            and divide_by_three_dots:
        could_divide_by_three_dots += 1

        for i in range(len(splitted_signs)):
            if metadata:
                signs_file.write(
                    str(real_key) + "[" + str(i + 1) + "]: " +
                    splitted_signs[i] + "\n")
                transcription_file.write(
                    str(real_key) + "[" + str(i + 1) + "]: " +
                    splitted_transcription[i] + "\n")
                translation_file.write(
                    str(real_key) + "[" + str(i + 1) + "]: " +
                    splitted_translation[i] + "\n")
            else:
                signs_file.write(splitted_signs[i] + "\n")
                transcription_file.write(splitted_transcription[i] + "\n")
                translation_file.write(splitted_translation[i] + "\n")

            translation_lengths, long_trs, very_long_trs, translation_vocab = \
                compute_translation_statistics(splitted_translation[i], translation_lengths, long_trs, very_long_trs,
                                               translation_vocab)

    else:
        could_not_divide += 1
        if metadata:
            signs_file.write(str(real_key) + ": " + signs + "\n")
            transcription_file.write(
                str(real_key) + ": " + transcription + "\n")
            translation_file.write(str(real_key) + ": " + prev_tr + "\n")
        else:
            signs_file.write(signs + "\n")
            transcription_file.write(transcription + "\n")
            translation_file.write(prev_tr + "\n")

        translation_lengths, long_trs, very_long_trs, translation_vocab = \
            compute_translation_statistics(prev_tr, translation_lengths, long_trs, very_long_trs, translation_vocab)

    return signs_vocab, transcription_vocab, translation_lengths, long_trs, very_long_trs, translation_vocab, \
           could_divide_by_three_dots, could_not_divide
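A hypothetical call to add_translation_to_file with made-up sign and transcription data; it assumes increment_count and clean_signs_transcriptions are importable from the surrounding module, and the file names and sample values are placeholders for illustration only.

signs_vocab, transcription_vocab, translation_vocab = {}, {}, {}
translation_lengths, long_trs, very_long_trs = [], 0, 0
could_divide, could_not_divide = 0, 0

with open("signs.txt", "w") as signs_f, \
        open("transcriptions.txt", "w") as transcr_f, \
        open("translations.txt", "w") as transl_f:
    (signs_vocab, transcription_vocab, translation_lengths, long_trs,
     very_long_trs, translation_vocab, could_divide, could_not_divide) = \
        add_translation_to_file(
            prev_signs=["AN", "EN"],                        # made-up sign list
            signs_vocab=signs_vocab,
            prev_transcription=[("an", " "), ("en", " ")],  # (reading, delimiter) pairs
            transcription_vocab=transcription_vocab,
            prev_tr="the god ... the lord",
            translation_lengths=translation_lengths,
            long_trs=long_trs,
            very_long_trs=very_long_trs,
            translation_vocab=translation_vocab,
            prev_text="P000001",                            # made-up text id
            prev_start_line=1,
            prev_end_line=2,
            signs_file=signs_f,
            transcription_file=transcr_f,
            translation_file=transl_f,
            could_divide_by_three_dots=could_divide,
            could_not_divide=could_not_divide,
            metadata=True)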
Example #4
def build_extra_decoding_arguments(train_sents):
    """
    Builds arguments for HMM, MEMM and BiLSTM decoding (unigram, bigram, trigram counts, etc.)
    :param train_sents: all sentences from the training set
    :return: all extra arguments the decoding procedures require
    """

    extra_decoding_arguments = {}

    START_WORD, STOP_WORD = '<st>', '</s>'
    START_TAG, STOP_TAG = '*', 'STOP'
    e_word_tag_counts, e_tag_counts = {}, {}

    possible_tags = {}
    for sentence in train_sents:
        for token in sentence:
            if token[0] in possible_tags:
                possible_tags[token[0]].add(token[1])
            else:
                possible_tags[token[0]] = {token[1]}

    extra_decoding_arguments['possible_tags'] = possible_tags

    # Populate the word-tag and tag counts; without this the loop below would
    # iterate over empty dictionaries and most_common_tag would stay empty.
    for sentence in train_sents:
        for token in sentence:
            increment_count(e_word_tag_counts, token)
            increment_count(e_tag_counts, token[1])

    # Precompute the most common tag for each word (performance enhancement).
    global most_common_tag
    most_common_tag = {}
    for word, tag in e_word_tag_counts:
        if word not in most_common_tag:
            most_common_tag[word] = (tag, e_word_tag_counts[word, tag])
        elif e_word_tag_counts[word, tag] > most_common_tag[word][1]:
            most_common_tag[word] = (tag, e_word_tag_counts[word, tag])
    most_common_tag["default"] = max(e_tag_counts, key=e_tag_counts.get)

    adjusted_sents = []
    for sentence in train_sents:
        adjusted_sentence = []
        adjusted_sentence.append((START_WORD, START_TAG))
        adjusted_sentence.append((START_WORD, START_TAG))
        for token in sentence:
            adjusted_sentence.append(token)
        adjusted_sentence.append((STOP_WORD, STOP_TAG))
        adjusted_sents.append(adjusted_sentence)

    q_tri_counts, q_bi_counts, q_uni_counts = {}, {}, {}
    # q_uni_counts
    for sentence in adjusted_sents:
        for token in sentence:
            key = token[1]
            increment_count(q_uni_counts, key)
    S = q_uni_counts.keys()  # the full tag set, including the start and stop tags

    # q_bi_counts
    for sentence in adjusted_sents:
        for i in range(1, len(sentence)):
            key = (sentence[i - 1][1], sentence[i][1])
            increment_count(q_bi_counts, key)

    # q_tri_counts
    for sentence in adjusted_sents:
        for i in range(2, len(sentence)):
            key = (sentence[i - 2][1], sentence[i - 1][1], sentence[i][1])
            increment_count(q_tri_counts, key)

    extra_decoding_arguments['S'] = S
    cache_probability = {}
    extra_decoding_arguments['cache'] = cache_probability

    return extra_decoding_arguments
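A hypothetical call to build_extra_decoding_arguments on the same made-up toy corpus used above; the comment on the cache entry is an assumption about how it is used downstream.

toy_train_sents = [
    [("the", "DT"), ("dog", "NN"), ("barks", "VBZ")],
    [("the", "DT"), ("cat", "NN"), ("sleeps", "VBZ")],
]

extra = build_extra_decoding_arguments(toy_train_sents)
print(sorted(extra['S']))               # ['*', 'DT', 'NN', 'STOP', 'VBZ']
print(extra['possible_tags']["dog"])    # {'NN'}
print(extra['cache'])                   # {} -- presumably filled during decoding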