def setUp(self):
    """Build a trained SentencePairScore fixture from the bundled test data."""
    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
    scores_csv = os.path.join(data_dir, "test_word_scores_big.csv")
    corpus = os.path.join(data_dir, "parallel-en-es.txt")
    # Word-level scores feed the sentence-level classifier below.
    word_pair_score = WordPairScore(scores_csv)
    doc_a, doc_b = parallel_corpus_to_documents(corpus)
    self.alignments = list(training_alignments_from_documents(doc_a, doc_b))
    self.score = SentencePairScore()
    self.score.train(self.alignments, word_pair_score)
def classifier_precision(document_a, document_b, model):
    """Run a ten-fold cross validation of the model's classifier.

    `document_a` and `document_b` are parallel documents (sequences of
    sentences) used to build the training alignments.
    `model` must expose `sentence_pair_score.problem` (the trained
    classification problem used by the classifier).

    Returns a value between 0 and 100, rounded to two decimals.
    Higher is better.  Returns 0.0 when either document is empty,
    since no training alignments can be produced in that case.
    """
    # Fix: the original guard required *both* documents to be empty
    # (`and`), which let a single empty document fall through into
    # training with no usable sentence pairs.
    if not document_a or not document_b:
        return 0.0
    training = training_alignments_from_documents(document_a, document_b)
    problem = model.sentence_pair_score.problem
    score = kfold(training, problem, SVMClassifier)
    return round(score * 100, 2)
def basic_model(corpus_filepath, word_scores_filepath, lang_a=None, lang_b=None):
    """Create and train a `YalignModel` with the default configuration.

    `corpus_filepath` points to a parallel corpus used for training; it
    may be:
        - a csv file holding two sentences plus alignment information,
        - a tmx file with correct alignments (a regular parallel corpus),
        - a text file of interleaved sentences (one line in language A,
          the next one in language B).

    `word_scores_filepath` is a csv file (possibly gzipped) with word
    dictionary data, e.g. "house,casa,0.91".

    `lang_a` and `lang_b` are required by the tokenizer only for tmx
    input; the other formats are assumed to be already tokenized.
    """
    word_pair_score = WordPairScore(word_scores_filepath)
    # Pick the corpus parser that matches the input format.
    if corpus_filepath.endswith(".tmx"):
        A, B = tmx_file_to_documents(corpus_filepath, lang_a, lang_b)
    else:
        A, B = parallel_corpus_to_documents(corpus_filepath)
    # Train the sentence-level scorer on gold alignments.
    sentence_pair_score = SentencePairScore()
    sentence_pair_score.train(training_alignments_from_documents(A, B),
                              word_pair_score)
    # Assemble the model with provisional parameters, then tune them on
    # a scrambled sample of the training documents.
    document_aligner = SequenceAligner(sentence_pair_score, 0.49)
    model = YalignModel(document_aligner, 1.0,
                        metadata={"lang_a": lang_a, "lang_b": lang_b})
    sample_a, sample_b, correct = training_scrambling_from_documents(
        A[:OPTIMIZE_SAMPLE_SET_SIZE], B[:OPTIMIZE_SAMPLE_SET_SIZE])
    model.optimize_gap_penalty_and_threshold(sample_a, sample_b, correct)
    return model
def setUp(self):
    """Train a small model on 25 sentence pairs from the test corpus."""
    random.seed(hash("Y U NO?"))  # deterministic scrambling across runs
    here = os.path.dirname(os.path.abspath(__file__))
    word_scores = os.path.join(here, "data", "test_word_scores_big.csv")
    corpus = os.path.join(here, "data", "parallel-en-es.txt")
    A, B = parallel_corpus_to_documents(corpus)
    # Keep the fixture small so the test stays fast.
    A, B = A[:25], B[:25]
    self.alignments = list(training_alignments_from_documents(A, B))
    self.A, self.B, self.correct_alignments = \
        list(training_scrambling_from_documents(A, B))
    # Word score
    word_pair_score = WordPairScore(word_scores)
    # Sentence score
    sentence_pair_score = SentencePairScore()
    sentence_pair_score.train(self.alignments, word_pair_score)
    # Yalign model: gap penalty starts at the midpoint of the score bounds.
    self.min_ = sentence_pair_score.min_bound
    self.max_ = sentence_pair_score.max_bound
    aligner = SequenceAligner(sentence_pair_score,
                              (self.min_ + self.max_) / 2.0)
    self.model = YalignModel(aligner, 1)
def basic_model(corpus_filepath, word_scores_filepath, lang_a=None, lang_b=None):
    """Create and train a `YalignModel` using default values.

    `corpus_filepath` is the path to a parallel corpus used for
    training.  It can be a csv file with two sentences plus alignment
    information, a tmx file with correct alignments (a regular parallel
    corpus), or a text file with interleaved sentences (one line in
    language A, the next in language B).

    `word_scores_filepath` is the path to a csv file (possibly gzipped)
    with word dictionary data, for example "house,casa,0.91".

    `lang_a` and `lang_b` are only required for the tokenizer when a
    tmx file is given; the other formats are assumed pre-tokenized.
    """
    # Word-level score used to train the sentence-level classifier.
    word_pair_score = WordPairScore(word_scores_filepath)
    is_tmx = corpus_filepath.endswith(".tmx")
    if is_tmx:
        A, B = tmx_file_to_documents(corpus_filepath, lang_a, lang_b)
    else:
        A, B = parallel_corpus_to_documents(corpus_filepath)
    alignments = training_alignments_from_documents(A, B)
    sentence_pair_score = SentencePairScore()
    sentence_pair_score.train(alignments, word_pair_score)
    # Build the model with provisional gap penalty / threshold values.
    gap_penalty = 0.49
    threshold = 1.0
    aligner = SequenceAligner(sentence_pair_score, gap_penalty)
    model = YalignModel(aligner, threshold,
                        metadata={"lang_a": lang_a, "lang_b": lang_b})
    # Tune gap penalty and threshold on a scrambled training sample.
    sample = training_scrambling_from_documents(
        A[:OPTIMIZE_SAMPLE_SET_SIZE], B[:OPTIMIZE_SAMPLE_SET_SIZE])
    scrambled_a, scrambled_b, correct = sample
    model.optimize_gap_penalty_and_threshold(scrambled_a, scrambled_b,
                                             correct)
    return model
def setUp(self):
    """Prepare a YalignModel fixture trained on a 25-pair sample."""
    # Seed the RNG so the scrambled documents are reproducible.
    random.seed(hash("Y U NO?"))
    base_path = os.path.dirname(os.path.abspath(__file__))
    scores_path = os.path.join(base_path, "data", "test_word_scores_big.csv")
    corpus_path = os.path.join(base_path, "data", "parallel-en-es.txt")
    docs = parallel_corpus_to_documents(corpus_path)
    A = docs[0][:25]
    B = docs[1][:25]
    self.alignments = list(training_alignments_from_documents(A, B))
    self.A, self.B, self.correct_alignments = \
        list(training_scrambling_from_documents(A, B))
    # Train the sentence scorer from the word-level dictionary scores.
    sentence_pair_score = SentencePairScore()
    sentence_pair_score.train(self.alignments, WordPairScore(scores_path))
    self.min_ = sentence_pair_score.min_bound
    self.max_ = sentence_pair_score.max_bound
    # Start the gap penalty at the midpoint of the score bounds.
    midpoint = (self.min_ + self.max_) / 2.0
    self.model = YalignModel(SequenceAligner(sentence_pair_score, midpoint), 1)