Example #1
import re

from nltk.translate import Alignment


def parse_refs(filename):
    """Read gold alignments: sure links (refs) and sure + possible (poss)."""
    refs = []
    poss = []
    with open(filename) as f:
        for line in f:
            line = line.strip()
            # "ipj" tokens appear to mark possible links; drop them for refs
            refs.append(Alignment.fromstring(re.sub(r'[0-9]*p[0-9]*', "", line)))
            poss.append(Alignment.fromstring(line.replace('p', '-')))
    return refs, poss
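
A note on the gold-alignment format this function assumes: an "i-j" token marks a sure link, while an "ipj" token appears to mark a possible link (an assumption based on the regex above). A minimal sketch of both parses:

import re
from nltk.translate import Alignment

line = "0-0 1p2 2-1"
ref = Alignment.fromstring(re.sub(r'[0-9]*p[0-9]*', "", line))  # sure links only
pos = Alignment.fromstring(line.replace('p', '-'))              # sure + possible
print(sorted(ref))  # [(0, 0), (2, 1)]
print(sorted(pos))  # [(0, 0), (1, 2), (2, 1)]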
Example #2
File: ibm1.py  Project: aczapata/twitter
    def __align(self, sentence_pair):
        """
        Determines the best word alignment for one sentence pair from
        the corpus that the model was trained on.

        The best alignment will be set in ``sentence_pair`` when the
        method returns. In contrast with the internal implementation of
        IBM models, the word indices in the ``Alignment`` are zero-
        indexed, not one-indexed.

        :param sentence_pair: A sentence in the source language and its
            counterpart sentence in the target language
        :type sentence_pair: AlignedSent
        """
        best_alignment = []

        for j, trg_word in enumerate(sentence_pair.words):
            # Initialize trg_word to align with the NULL token
            best_prob = max(self.translation_table[trg_word][None],
                            IBMModel.MIN_PROB)
            best_alignment_point = None
            for i, src_word in enumerate(sentence_pair.mots):
                align_prob = self.translation_table[trg_word][src_word]
                if align_prob >= best_prob:  # prefer newer word in case of tie
                    best_prob = align_prob
                    best_alignment_point = i

            best_alignment.append((j, best_alignment_point))

        sentence_pair.alignment = Alignment(best_alignment)
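
For context, a minimal usage sketch (toy corpus, not from the source project): constructing nltk's IBMModel1 trains the model, which calls this method to set each sentence pair's alignment in place.

from nltk.translate import AlignedSent, IBMModel1

bitext = [
    AlignedSent(['klein', 'ist', 'das', 'haus'], ['the', 'house', 'is', 'small']),
    AlignedSent(['das', 'haus', 'ist', 'ja', 'gross'], ['the', 'house', 'is', 'big']),
    AlignedSent(['das', 'buch', 'ist', 'ja', 'klein'], ['the', 'book', 'is', 'small']),
]
IBMModel1(bitext, 5)        # training aligns every AlignedSent in place
print(bitext[0].alignment)  # zero-indexed (words_index, mots_index) pairs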
Example #3
File: 3.py  Project: CheshtaK/NLP-MT
    def train(self, parallel_corpus):
        counts = Model3Counts()
        for aligned_sentence in parallel_corpus:
            l = len(aligned_sentence.mots)
            m = len(aligned_sentence.words)

            # Sample the alignment space
            sampled_alignments, best_alignment = self.sample(aligned_sentence)
            # Record the most probable alignment
            aligned_sentence.alignment = Alignment(
                best_alignment.zero_indexed_alignment())
            # E step (a): Compute normalization factors to weigh counts
            total_count = self.prob_of_alignments(sampled_alignments)

            # E step (b): Collect counts
            for alignment_info in sampled_alignments:
                count = self.prob_t_a_given_s(alignment_info)
                normalized_count = count / total_count

                for j in range(1, m + 1):
                    counts.update_lexical_translation(normalized_count,
                                                      alignment_info, j)
                    counts.update_distortion(normalized_count, alignment_info,
                                             j, l, m)

                counts.update_null_generation(normalized_count, alignment_info)
                counts.update_fertility(normalized_count, alignment_info)

        # M step: Update probabilities with maximum likelihood estimates
        # If any probability is less than MIN_PROB, clamp it to MIN_PROB
        existing_alignment_table = self.alignment_table
        self.reset_probabilities()
        self.alignment_table = existing_alignment_table  # don't retrain

        self.maximize_lexical_translation_probabilities(counts)
        self.maximize_distortion_probabilities(counts)
        self.maximize_fertility_probabilities(counts)
        self.maximize_null_generation_probabilities(counts)
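
Similarly, a minimal usage sketch (toy corpus, not from the source project): constructing nltk's IBMModel3 runs this EM training loop for the requested number of iterations.

from nltk.translate import AlignedSent, IBMModel3

bitext = [
    AlignedSent(['das', 'haus'], ['the', 'house']),
    AlignedSent(['das', 'buch'], ['the', 'book']),
    AlignedSent(['ein', 'buch'], ['a', 'book']),
]
IBMModel3(bitext, 5)
print(bitext[0].alignment)  # best alignment found while sampling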
Example #4
from pathlib import Path
from typing import List

from nltk.translate import Alignment


def load_alignments(input_file_path: Path) -> List[Alignment]:
    alignments: List[Alignment] = []
    # load_corpus is a project helper assumed to yield one line per alignment
    for line in load_corpus(input_file_path):
        if line.startswith("#"):  # skip comment lines
            continue
        alignments.append(Alignment.fromstring(line))
    return alignments
Example #5
def evaluate_model(f_sents, e_sents_orig, nr_f_words, e_sents, e_dict_inv,
                   trans_probs, gold_alignments):
    """
    Returns the current model performance in terms of translation perplexity
    :param f_sents: the set of french sentences
    :param e_sents_orig:
    :param nr_f_words:
    :param e_sents:
    :param e_dict_inv:
    :param trans_probs:
    :param gold_alignments:
    :return:
    """
    sent_perplexities = np.zeros(len(f_sents))
    sent_likelihoods = np.zeros(len(f_sents))
    sent_aers = np.zeros(len(f_sents))

    model_output = align_sentences(e_sents, f_sents, trans_probs, nr_f_words)
    for index, pair in enumerate(model_output):
        e_pred_sent = pair[0]
        f_sent = f_sents[index]
        alignment = pair[1]

        if VERBOSE: print("Sentence: {}".format(f_sent))
        if VERBOSE:
            if INDEX_WORDS:
                print("Predicted translation: {}".format(
                    decode_sentence(e_pred_sent, e_dict_inv)))
            else:
                print("Predicted translation: {}".format(e_pred_sent))
        if VERBOSE: print("Actual translation: {}".format(e_sents_orig[index]))
        if VERBOSE: print("Alignment: {}".format(alignment))
        if VERBOSE:
            print("Gold standard alignment: {}".format(gold_alignments[index]))

        sent_perplexities[index] = get_perplexity(e_pred_sent, f_sent,
                                                  trans_probs, nr_f_words)
        sent_likelihoods[index] = get_likelihood(e_pred_sent, f_sent,
                                                 trans_probs, nr_f_words)
        sent_aers[index] = metrics.alignment_error_rate(
            Alignment(gold_alignments[index]), Alignment(alignment))

    return [
        -np.sum(sent_perplexities),
        sum(sent_likelihoods) / len(sent_likelihoods),
        sum(sent_aers) / len(sent_aers)
    ]
Example #6
    def read_block(self, stream):
        block = [self._word_tokenizer.tokenize(sent_str)
                 for alignedsent_str in self._alignedsent_block_reader(stream)
                 for sent_str in self._sent_tokenizer.tokenize(alignedsent_str)]
        if self._aligned:
            # kludge; we shouldn't have tokenized the alignment string
            block[2] = Alignment.fromstring(" ".join(block[2]))
            block = [AlignedSent(*block)]
        elif self._group_by_sent:
            block = [block[0]]
        else:
            block = block[0]

        return block
Example #7
def remove_nones(bitext):

    bitext_new = []
    regex1 = re.compile(r"\([0-9]+, None\), ", re.IGNORECASE)
    regex2 = re.compile(r"\(None, [0-9]+\), ", re.IGNORECASE)
    regex3 = re.compile(r"\([0-9]+, None\)", re.IGNORECASE)
    regex4 = re.compile(r"\(None, [0-9]+\)", re.IGNORECASE)

    for b in bitext:
        alignment_str = re.sub(regex1, "", b.alignment.unicode_repr())
        alignment_str = re.sub(regex2, "", alignment_str)
        alignment_str = re.sub(regex3, "", alignment_str)
        alignment_str = re.sub(regex4, "", alignment_str)

        # strip the "Alignment([(i, j), ...])" wrapper down to "i-j i-j ..."
        alignment_str = (alignment_str.replace("Alignment", "")
                                      .replace("), ", "#")
                                      .replace(", ", "-")
                                      .replace("#(", " ")
                                      .replace("[", "")
                                      .replace("]", "")
                                      .replace("(", "")
                                      .replace(")", "")
                                      .replace("#", ""))
        bitext_new.append(
            AlignedSent(b.words, b.mots, Alignment.fromstring(alignment_str)))
    return bitext_new
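
To see what the string surgery accomplishes, a minimal sketch (toy data; it assumes an NLTK version that still provides Alignment.unicode_repr, as the snippet itself does): IBM-style alignments may contain NULL links such as (0, None), which Alignment.fromstring cannot re-parse.

from nltk.translate import AlignedSent, Alignment

b = AlignedSent(['das', 'Haus'], ['the', 'house'],
                Alignment([(0, None), (1, 1)]))
cleaned = remove_nones([b])[0]
print(sorted(cleaned.alignment))  # [(1, 1)] -- the NULL link is gone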
Example #8
    def train(self, parallel_corpus):
        counts = Model4Counts()
        for aligned_sentence in parallel_corpus:
            m = len(aligned_sentence.words)

            # Sample the alignment space
            sampled_alignments, best_alignment = self.sample(aligned_sentence)
            # Record the most probable alignment
            aligned_sentence.alignment = Alignment(
                best_alignment.zero_indexed_alignment()
            )

            # E step (a): Compute normalization factors to weigh counts
            total_count = self.prob_of_alignments(sampled_alignments)

            # E step (b): Collect counts
            for alignment_info in sampled_alignments:
                count = self.prob_t_a_given_s(alignment_info)
                normalized_count = count / total_count

                for j in range(1, m + 1):
                    counts.update_lexical_translation(
                        normalized_count, alignment_info, j
                    )
                    counts.update_distortion(
                        normalized_count,
                        alignment_info,
                        j,
                        self.src_classes,
                        self.trg_classes,
                    )

                counts.update_null_generation(normalized_count, alignment_info)
                counts.update_fertility(normalized_count, alignment_info)

        # M step: Update probabilities with maximum likelihood estimates
        # If any probability is less than MIN_PROB, clamp it to MIN_PROB
        existing_alignment_table = self.alignment_table
        self.reset_probabilities()
        self.alignment_table = existing_alignment_table  # don't retrain

        self.maximize_lexical_translation_probabilities(counts)
        self.maximize_distortion_probabilities(counts)
        self.maximize_fertility_probabilities(counts)
        self.maximize_null_generation_probabilities(counts)
Example #9
File: 1.py  Project: CheshtaK/NLP-MT
    def align(self, sentence_pair):
        """Determines the best word alignment for one sentence pair."""
        best_alignment = []

        for j, trg_word in enumerate(sentence_pair.words):
            best_prob = max(self.translation_table[trg_word][None], IBMModel.MIN_PROB)
            best_alignment_point = None

            for i, src_word in enumerate(sentence_pair.mots):
                align_prob = self.translation_table[trg_word][src_word]

                if align_prob >= best_prob:
                    best_prob = align_prob
                    best_alignment_point = i

            best_alignment.append((j, best_alignment_point))

        sentence_pair.alignment = Alignment(best_alignment)
Example #10
def main(args):
    forward = torch.load(
        args.forward,
        map_location='cpu')  # list of ['src', 'tgt', 'weights', 'metrics']
    backward = torch.load(
        args.backward,
        map_location='cpu')  # list of ['src', 'tgt', 'weights', 'metrics']
    assert len(forward) == len(backward)
    res = []
    if args.bialign is not None:
        assert args.ref is not None
        refs, poss = parse_refs(args.ref)
        bi_aligns = [
            Alignment.fromstring(line.strip()) for line in open(args.bialign)
        ]
        bi_metrics = [
            alignment_merics([hyp], [ref], [pos])
            for hyp, ref, pos in zip(bi_aligns, refs, poss)
        ]
        assert len(forward) == len(backward) == len(bi_aligns) == len(
            bi_metrics)
        for f, b, bi_align, bi_metric in zip(forward, backward, bi_aligns,
                                             bi_metrics):
            res_t = {}
            assert f['src'] == b['src'] and f['tgt'] == b['tgt']
            res_t['src'] = f['src']
            res_t['tgt'] = f['tgt']
            res_t['weights'] = merge_dict(f['weights'], b['weights'], args)
            res_t['metrics'] = merge_dict(f['metrics'], b['metrics'], args)
            res_t['weights']['bi_align'] = align_to_weights(
                bi_align, bi_align, f['src'], f['tgt'])
            res_t['metrics']['bi_align'] = bi_metric
            res.append(res_t)
    else:
        for f, b in zip(forward, backward):
            res_t = {}
            res_t['src'] = f['src']
            res_t['tgt'] = f['tgt']
            res_t['weights'] = merge_dict(f['weights'], b['weights'], args)
            res_t['metrics'] = merge_dict(f['metrics'], b['metrics'], args)
            res.append(res_t)

    output = args.output or args.forward
    torch.save(res, output)
Example #11
    def get_direct_lexicon(self,
                           include_special_tokens: bool = False) -> Lexicon:
        lexicon = Lexicon()
        source: Iterable[str] = load_corpus(self.model_dir / "src.txt")
        target: Iterable[str] = load_corpus(self.model_dir / "trg.txt")
        alignments: Iterable[str] = filter(
            lambda a: not a.startswith("#"),
            load_corpus(self.model_dir / "alignments.txt"))

        for src_str, trg_str, alignment_str in zip(source, target, alignments):
            src_words = src_str.split()
            trg_words = trg_str.split()
            alignment = Alignment.fromstring(alignment_str)
            for src_index, trg_index in alignment:
                if src_index >= len(src_words) or trg_index >= len(trg_words):
                    continue
                src_word = src_words[src_index]
                trg_word = trg_words[trg_index]
                lexicon.increment(src_word, trg_word)
        lexicon.normalize()
        return lexicon
Example #12
def eval(test_alignments):
    f = open(test_alignments, "r")

    # initializing our "counters" used for the aggregate scores
    sentence_pairs = 0
    ibm1_precision_sum, ibm1_recall_sum, ibm1_aer_sum, ibm1_f1_sum = 0, 0, 0, 0
    ibm2_precision_sum, ibm2_recall_sum, ibm2_aer_sum, ibm2_f1_sum = 0, 0, 0, 0

    for line in f:
        sentence_pairs += 1

        strs = line.split("\t")

        print("-" * 47)
        print("Length of foreign sentence: ", len(strs[0].split()))
        print(strs[0])
        print(strs[1], "\n")

        ibm1_aligns = Alignment.fromstring(strs[2])
        ibm2_aligns = Alignment.fromstring(strs[3])
        hand_aligns = Alignment.fromstring(strs[4])
        '''
        Evaluate the sentence pair's precision and recall using the built-in
        nltk.metrics precision and recall functions. The functions'
        parameters are the following:
            1. Reference ("gold standard"): our hand alignments, which follow
            the same format as the system-produced alignments
            2. Test: the alignments produced by the model, which will be
            compared against the hand alignments
        '''

        ibm1_precision = precision(hand_aligns, ibm1_aligns)
        ibm1_recall = recall(hand_aligns, ibm1_aligns)
        ibm1_aer = alignment_error_rate(hand_aligns, ibm1_aligns)
        ibm1_f1 = f_measure(hand_aligns, ibm1_aligns)

        ibm2_precision = precision(hand_aligns, ibm2_aligns)
        ibm2_recall = recall(hand_aligns, ibm2_aligns)
        ibm2_aer = alignment_error_rate(hand_aligns, ibm2_aligns)
        ibm2_f1 = f_measure(hand_aligns, ibm2_aligns)

        # Add it to our aggregate calculations
        ibm1_precision_sum += ibm1_precision
        ibm1_recall_sum += ibm1_recall
        ibm1_aer_sum += ibm1_aer
        ibm1_f1_sum += ibm1_f1

        ibm2_precision_sum += ibm2_precision
        ibm2_recall_sum += ibm2_recall
        ibm2_aer_sum += ibm2_aer
        ibm2_f1_sum += ibm2_f1

        print("IBM1 Precision: ", ibm1_precision, "\t", "IBM2 Precision: ",
              ibm2_precision)
        print("IBM1 Recall: ", ibm1_recall, "\t", "IBM2 Recall: ", ibm2_recall)
        print("IBM1 AER:", ibm1_aer, "\t", "IBM2 AER: ", ibm2_aer)
        print("IBM1 F1: ", ibm1_f1, "\t", "IBM2 F1: ", ibm2_f1)
        print("-" * 47, "\n")
    f.close()

    # Prints out the total statistics of the dataset
    print("-" * 23, "AVERAGE STATS", "-" * 23)
    print("Average IBM1 Precision: ", ibm1_precision_sum / sentence_pairs,
          "\t" * 2, "Average IBM2 Precision: ",
          ibm2_precision_sum / sentence_pairs)
    print("Average IBM1 Recall: ", ibm1_recall_sum / sentence_pairs, "\t" * 2,
          "Average IBM2 Recall: ", ibm2_recall_sum / sentence_pairs)
    print("Average IBM1 AER:", ibm1_aer_sum / sentence_pairs, "\t" * 2,
          "Average IBM2 AER: ", ibm2_aer_sum / sentence_pairs)
    print("Average IBM1 F1: ", ibm1_f1_sum / sentence_pairs, "\t" * 2,
          "Average IBM2 F1: ", ibm2_f1_sum / sentence_pairs)
Example #13
				if word.isspace():
					continue  # whitespace tokens carry no information
				if word[0].isdigit():
					alignment.append(word.replace('\n', ''))
				elif word[0].isupper(): 
					pron.append(word)
			alignments.append((list(line[0]), pron, " ".join(alignment)))
	f.close()
	return alignments

	

align_sents = read_corpus(file)
bitext = []
for alignment in align_sents:
	bitext.append(AlignedSent(alignment[0], alignment[1], Alignment.fromstring(alignment[2])))

model = IBMModel1(bitext, 5)

test_sentence = bitext[2]
print(test_sentence.words)
print(test_sentence.mots)
print(test_sentence.alignment)

#print('{0:.3f}'.format(model.translation_table['a']['AA']))

# print(bitext[365])
Example #14
from pprint import pprint
import goslate
gsd = goslate.Goslate(service_urls=['http://translate.google.fr'])
gs = goslate.Goslate()
sun_fr = gsd.lookup_dictionary('sun', 'fr')

print('Goslate')
pprint(sun_fr)

from nltk.translate import AlignedSent, Alignment

algnsent = AlignedSent(
    ['klein', 'ist', 'das', 'Haus'],  # you need parallel
    ['the', 'house', 'is', 'small'],  # corpora
    Alignment.fromstring('0-2 1-3 2-1 3-0'))
# and alignments

print("nltk translate")
print(algnsent.words, algnsent.mots)