def setUp(self):
    """Build a trained SentencePairScore fixture from the bundled test data."""
    here = os.path.dirname(os.path.abspath(__file__))
    scores_csv = os.path.join(here, "data", "test_word_scores_big.csv")
    corpus_txt = os.path.join(here, "data", "parallel-en-es.txt")
    # Word-level scorer feeds the sentence-level classifier.
    word_pair_score = WordPairScore(scores_csv)
    A, B = parallel_corpus_to_documents(corpus_txt)
    self.alignments = list(training_alignments_from_documents(A, B))
    self.score = SentencePairScore()
    self.score.train(self.alignments, word_pair_score)
def default_sentence_pair_score():
    """Train a SentencePairScore on the small test fixtures and persist it.

    Returns a `(classifier, filepath)` tuple where `filepath` is a temp file
    holding the saved model.
    """
    here = os.path.dirname(os.path.abspath(__file__))
    scores_csv = os.path.join(here, "data", "test_word_scores.csv")
    training_csv = os.path.join(here, "data", "test_training.csv")
    # mkstemp gives us a unique on-disk location for the serialized model.
    _, classifier_filepath = tempfile.mkstemp()
    training_pairs = parse_training_file(training_csv)
    classifier = SentencePairScore()
    classifier.train(training_pairs, WordPairScore(scores_csv))
    classifier.save(classifier_filepath)
    return classifier, classifier_filepath
def test_correlation_values(self):
    """Check that every finite feature/label correlation lies in [-1, 1].

    Trains a SentencePairScore on the big test corpus and inspects the
    correlation of each classifier attribute, skipping NaN entries
    (attributes with zero variance produce NaN correlations).
    """
    base_path = os.path.dirname(os.path.abspath(__file__))
    parallel_corpus = os.path.join(base_path, "data", "parallel-en-es.txt")
    word_scores = os.path.join(base_path, "data", "test_word_scores_big.csv")
    A, B = parallel_corpus_to_documents(parallel_corpus)
    self.alignments = list(training_alignments_from_documents(A, B))
    # Word score
    word_pair_score = WordPairScore(word_scores)
    # Sentence Score
    sentence_pair_score = SentencePairScore()
    sentence_pair_score.train(self.alignments, word_pair_score)
    cor = correlation(sentence_pair_score.classifier)
    # `iteritems()` was Python-2-only; `items()` iterates the same pairs.
    for attr, value in cor.items():
        # `value is not numpy.nan` was an identity check against the
        # numpy.nan singleton — computed NaNs are distinct float objects,
        # so NaN values leaked through and failed the range assertion.
        # numpy.isnan is the correct by-value NaN test.
        if not numpy.isnan(value):
            self.assertTrue(-1 <= value <= 1)
def setUp(self):
    """Assemble a YalignModel fixture trained on a 30-sentence slice."""
    word_scores = os.path.join(data_path, "test_word_scores_big.csv")
    self.parallel_corpus = os.path.join(data_path, "parallel-en-es.txt")
    # Truncate both documents so training stays fast.
    A, B = parallel_corpus_to_documents(self.parallel_corpus)
    self.document_a, self.document_b = A[:30], B[:30]
    training = training_alignments_from_documents(self.document_a,
                                                  self.document_b)
    # Word-level scorer -> sentence-level classifier -> aligner -> model.
    sentence_pair_score = SentencePairScore()
    sentence_pair_score.train(training, WordPairScore(word_scores))
    document_aligner = SequenceAligner(sentence_pair_score, 0.49)
    self.model = YalignModel(document_aligner)
def basic_model(corpus_filepath, word_scores_filepath,
                lang_a=None, lang_b=None):
    """
    Creates and trains a `YalignModel` with the basic configuration and
    default values.

    `corpus_filepath` is the path to a parallel corpus used for training,
    it can be:
        - a csv file with two sentences and alignement information, or
        - a tmx file with correct alignments (a regular parallel corpus), or
        - a text file with interleaved sentences (one line in language A,
          the next in language B)

    `word_scores_filepath` is the path to a csv file (possibly gzipped) with
    word dictionary data. (for ex. "house,casa,0.91").

    `lang_a` and `lang_b` are requiered for the tokenizer in the case of a
    tmx file. In the other cases is not necesary because it's assumed that
    the words are already tokenized.
    """
    # Word score
    word_pair_score = WordPairScore(word_scores_filepath)

    # Only tmx files need language codes; other formats are pre-tokenized.
    if not corpus_filepath.endswith(".tmx"):
        A, B = parallel_corpus_to_documents(corpus_filepath)
    else:
        A, B = tmx_file_to_documents(corpus_filepath, lang_a, lang_b)

    # Sentence Score
    sentence_pair_score = SentencePairScore()
    sentence_pair_score.train(training_alignments_from_documents(A, B),
                              word_pair_score)

    # Yalign model with initial gap penalty / threshold, then tuned on a
    # scrambled sample of the training documents.
    gap_penalty = 0.49
    threshold = 1.0
    aligner = SequenceAligner(sentence_pair_score, gap_penalty)
    model = YalignModel(aligner, threshold,
                        metadata={"lang_a": lang_a, "lang_b": lang_b})
    sample_a, sample_b, correct = training_scrambling_from_documents(
        A[:OPTIMIZE_SAMPLE_SET_SIZE], B[:OPTIMIZE_SAMPLE_SET_SIZE])
    model.optimize_gap_penalty_and_threshold(sample_a, sample_b, correct)
    return model
def setUp(self):
    """Deterministic fixture: trained model plus a scrambled document pair."""
    # Fixed seed so the scrambling below is reproducible across runs.
    random.seed(hash("Y U NO?"))
    here = os.path.dirname(os.path.abspath(__file__))
    word_scores = os.path.join(here, "data", "test_word_scores_big.csv")
    parallel_corpus = os.path.join(here, "data", "parallel-en-es.txt")

    A, B = parallel_corpus_to_documents(parallel_corpus)
    A, B = A[:25], B[:25]
    self.alignments = list(training_alignments_from_documents(A, B))
    self.A, self.B, self.correct_alignments = \
        list(training_scrambling_from_documents(A, B))

    # Word score
    word_pair_score = WordPairScore(word_scores)
    # Sentence Score
    sentence_pair_score = SentencePairScore()
    sentence_pair_score.train(self.alignments, word_pair_score)

    # Yalign model: gap penalty midway between the score bounds.
    self.min_ = sentence_pair_score.min_bound
    self.max_ = sentence_pair_score.max_bound
    midpoint_penalty = (self.min_ + self.max_) / 2.0
    aligner = SequenceAligner(sentence_pair_score, midpoint_penalty)
    self.model = YalignModel(aligner, 1)