def dict_and_bitext(lemmatizer, unigrams_path, phrases_path):
    """Build a nested lookup table and a bitext from two Excel workbooks.

    Both workbooks are read from their ``Sheet1`` sheet. For every usable
    row, column ``a`` and column ``c`` are whitespace-split into an
    ``AlignedSent(c_tokens, a_tokens)``, and a ``(lemma, tag, 'def', c)``
    entry is recorded, where ``lemma`` is ``lemmatize(lemmatizer, key, tag)``
    and ``key`` is column ``a`` with runs of spaces/dashes/underscore-like
    characters collapsed to a single ``_``. In the unigrams workbook the tag
    comes from column ``b``; in the phrases workbook it is always
    ``'phrase.'``.

    Args:
        lemmatizer: Passed through unchanged to ``lemmatize``.
        unigrams_path: Path of the workbook with columns ``a``, ``b``, ``c``.
        phrases_path: Path of the workbook with columns ``a`` and ``c``.

    Returns:
        A ``(table, bitext)`` tuple. ``table[lemma][tag]['def']`` is the list
        of distinct column-``c`` strings recorded for that lemma/tag, in
        first-seen order; ``bitext`` is the list of ``AlignedSent`` objects.
    """
    import os
    import re
    import sys
    from collections import defaultdict

    import pandas as pd
    from nltk.translate import AlignedSent

    # ``lemmatize`` lives next to this file; make it importable without
    # stacking a duplicate sys.path entry on every call.
    module_dir = os.path.dirname(os.path.realpath(__file__))
    if module_dir not in sys.path:
        sys.path.append(module_dir)
    from lemmatize import lemmatize

    # Compiled once instead of being re-parsed for every row.
    separator_run = re.compile(
        '( |-|\u002d|\u005f|\u00ad|\u0331|\u0332|\u0335|\u0336'
        '|\u2012|\u2013|\u2014|\u2015|\u2017|\u2212|\u2500)+'
    )

    bitext = []
    entries = []

    def _collect(workbook_path, label, fixed_tag=None):
        """Append the usable rows of one workbook to ``bitext``/``entries``."""
        sheet = pd.ExcelFile(workbook_path).parse('Sheet1')
        columns = ('a', 'c') if fixed_tag is not None else ('a', 'b', 'c')
        # Hoisted out of the loop; ``or 1`` avoids dividing by zero when
        # column 'a' has no non-empty cell.
        total = sheet['a'].count() or 1
        for i in sheet.index:
            sys.stdout.write(
                "dict_and_bitext %s progress: %f%% \r" % (label, 100 * i / total)
            )
            sys.stdout.flush()
            row = {column: sheet[column][i] for column in columns}
            # Empty cells are read back as float NaN; isinstance also catches
            # float subclasses such as numpy.float64.
            if any(isinstance(cell, float) for cell in row.values()):
                continue
            tag = fixed_tag if fixed_tag is not None else row['b'].strip()
            bitext.append(AlignedSent(row['c'].split(), row['a'].split()))
            key = separator_run.sub('_', row['a']).strip()
            entries.append(
                (lemmatize(lemmatizer, key, tag), tag, 'def', row['c'].strip())
            )
        sys.stdout.write("\n")

    _collect(unigrams_path, 'unigrams')
    _collect(phrases_path, 'phrases', fixed_tag='phrase.')

    table = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
    # dict.fromkeys drops duplicate entries while keeping first-seen order,
    # so the per-key lists are deterministic across runs.
    for lemma, tag, kind, definition in dict.fromkeys(entries):
        table[lemma][tag][kind].append(definition)
    return table, bitext
def test_vocabularies_are_initialized(self):
    """Vocabulary sizes are populated when an IBMModel is built from a corpus."""
    # The third pair deliberately has an empty first sentence.
    raw_pairs = (
        (['one', 'two', 'three', 'four'], ['un', 'deux', 'trois']),
        (['five', 'one', 'six'], ['quatre', 'cinq', 'six']),
        ([], ['sept']),
    )
    sentence_pairs = [AlignedSent(first, second) for first, second in raw_pairs]

    model = IBMModel(sentence_pairs)

    # The second sentences contain 7 distinct words, the first ones 6.
    # NOTE(review): the expected 8 presumably includes a NULL entry - confirm.
    self.assertEqual(len(model.src_vocab), 8)
    self.assertEqual(len(model.trg_vocab), 6)
def test_mini_nltk():
    """Return a three-pair toy bitext as a list of AlignedSent objects."""
    word_pairs = (
        (['das', 'haus'], ['the', 'house']),
        (['das', 'buch'], ['the', 'book']),
        (['ein', 'buch'], ['a', 'book']),
    )
    return [AlignedSent(first, second) for first, second in word_pairs]
def test_vocabularies_are_initialized(self):
    """Building an IBMModel from a corpus fills src_vocab and trg_vocab."""
    first_pair = AlignedSent(["one", "two", "three", "four"], ["un", "deux", "trois"])
    second_pair = AlignedSent(["five", "one", "six"], ["quatre", "cinq", "six"])
    # Pair whose first sentence is empty.
    third_pair = AlignedSent([], ["sept"])

    model = IBMModel([first_pair, second_pair, third_pair])

    # 7 distinct words appear in the second sentences and 6 in the first.
    # NOTE(review): the expected 8 presumably includes a NULL entry - confirm.
    observed_sizes = (len(model.src_vocab), len(model.trg_vocab))
    self.assertEqual(observed_sizes[0], 8)
    self.assertEqual(observed_sizes[1], 6)
def test_set_uniform_translation_probabilities_of_non_domain_values(self):
    """A translation-table lookup for a word outside the corpus is MIN_PROB."""
    # arrange
    training_pairs = [
        AlignedSent(['ham', 'eggs'], ['schinken', 'schinken', 'eier']),
        AlignedSent(['spam', 'spam', 'spam', 'spam'], ['spam', 'spam']),
    ]
    model = IBMModel1(training_pairs, 0)

    # act
    model.set_uniform_probabilities(training_pairs)

    # assert
    # 'parrot' never occurs in the training pairs above
    unseen_entry = model.translation_table['parrot']['eier']
    self.assertEqual(unseen_entry, IBMModel.MIN_PROB)
def test_set_uniform_translation_probabilities(self):
    """set_uniform_probabilities gives in-corpus word pairs the same value."""
    # arrange
    training_pairs = [
        AlignedSent(['ham', 'eggs'], ['schinken', 'schinken', 'eier']),
        AlignedSent(['spam', 'spam', 'spam', 'spam'], ['spam', 'spam']),
    ]
    model = IBMModel1(training_pairs, 0)

    # act
    model.set_uniform_probabilities(training_pairs)

    # assert
    # The first-argument sentences contain three distinct words
    # (ham, eggs, spam) and both checked entries are expected to be 1/3.
    # NOTE(review): the previous comment read
    # "1.0 / (target vocab size + 1)"; confirm what the "+ 1" refers to.
    table = model.translation_table
    self.assertEqual(table['ham']['eier'], 1.0 / 3)
    self.assertEqual(table['eggs'][None], 1.0 / 3)
def test_set_uniform_distortion_probabilities(self):
    """Uniform distortion entries equal 1 / (last index of the lookup)."""
    # arrange
    training_pairs = [
        AlignedSent(['ham', 'eggs'], ['schinken', 'schinken', 'eier']),
        AlignedSent(['spam', 'spam', 'spam', 'spam'], ['spam', 'spam']),
    ]
    model = IBMModel3(training_pairs, 0)

    # act
    model.set_uniform_probabilities(training_pairs)

    # assert
    # expected_prob = 1.0 / length of target sentence
    # (the first-argument sentences above have lengths 2 and 4)
    table = model.distortion_table
    self.assertEqual(table[1][0][3][2], 1.0 / 2)
    self.assertEqual(table[4][2][2][4], 1.0 / 4)
def test_set_uniform_translation_probabilities_of_non_domain_values(self):
    """Looking up a word that is absent from the corpus returns MIN_PROB."""
    # arrange
    ham_eggs = AlignedSent(["ham", "eggs"], ["schinken", "schinken", "eier"])
    spam = AlignedSent(["spam", "spam", "spam", "spam"], ["spam", "spam"])
    training_pairs = [ham_eggs, spam]
    model = IBMModel1(training_pairs, 0)

    # act
    model.set_uniform_probabilities(training_pairs)

    # assert
    # "parrot" does not appear anywhere in the training pairs
    self.assertEqual(
        model.translation_table["parrot"]["eier"],
        IBMModel.MIN_PROB,
    )
def test_set_uniform_translation_probabilities(self):
    """In-corpus translation entries are all set to the same uniform value."""
    # arrange
    ham_eggs = AlignedSent(["ham", "eggs"], ["schinken", "schinken", "eier"])
    spam = AlignedSent(["spam", "spam", "spam", "spam"], ["spam", "spam"])
    training_pairs = [ham_eggs, spam]
    model = IBMModel1(training_pairs, 0)

    # act
    model.set_uniform_probabilities(training_pairs)

    # assert
    # Three distinct words (ham, eggs, spam) occur on the first-argument
    # side, and both entries below are expected to equal 1/3.
    # NOTE(review): the previous comment read
    # "1.0 / (target vocab size + 1)"; confirm what the "+ 1" refers to.
    for first_word, second_word in (("ham", "eier"), ("eggs", None)):
        self.assertEqual(model.translation_table[first_word][second_word], 1.0 / 3)
def test_set_uniform_alignment_probabilities_of_non_domain_values(self):
    """Alignment-table lookups with out-of-range positions return MIN_PROB."""
    # arrange
    training_pairs = [
        AlignedSent(["ham", "eggs"], ["schinken", "schinken", "eier"]),
        AlignedSent(["spam", "spam", "spam", "spam"], ["spam", "spam"]),
    ]
    model = IBMModel2(training_pairs, 0)

    # act
    model.set_uniform_probabilities(training_pairs)

    # assert
    # 99 is far beyond any sentence position in the training pairs; it is
    # used once as the first index and once as the second.
    table = model.alignment_table
    self.assertEqual(table[99][1][3][2], IBMModel.MIN_PROB)
    self.assertEqual(table[2][99][2][4], IBMModel.MIN_PROB)
def test_set_uniform_alignment_probabilities(self):
    """Uniform alignment entries equal 1 / (third index + 1)."""
    # arrange
    training_pairs = [
        AlignedSent(['ham', 'eggs'], ['schinken', 'schinken', 'eier']),
        AlignedSent(['spam', 'spam', 'spam', 'spam'], ['spam', 'spam']),
    ]
    model = IBMModel2(training_pairs, 0)

    # act
    model.set_uniform_probabilities(training_pairs)

    # assert
    # expected_prob = 1.0 / (length of source sentence + 1)
    # (the second-argument sentences above have lengths 3 and 2)
    table = model.alignment_table
    self.assertEqual(table[0][1][3][2], 1.0 / 4)
    self.assertEqual(table[2][4][2][4], 1.0 / 3)
def compile_corpus(filename):
    '''
    Compile a corpus from a single data file containing English and
    Foreign sentence pairs that are tab-divided. All relevant information
    MUST BE contained within a single file.

    Each line is lowercased, split on tabs, and its first two fields
    (English, then Foreign) are tokenized with TweetTokenizer. Blank lines
    are skipped.

    Returns a list of AlignedSent(foreign_tokens, english_tokens).
    Raises IndexError if a non-blank line contains no tab.
    '''
    # One tokenizer is enough for the whole file.
    tokenizer = TweetTokenizer()
    corpus = []
    # The context manager closes the file even if tokenizing a line fails.
    with open(filename, "r") as f:
        for line in f:
            if not line.strip():
                # Tolerate empty lines (e.g. a trailing newline at EOF).
                continue
            # Lowercase the whole line -- this covers both sentences.
            fields = line.lower().split("\t")
            eng_words = tokenizer.tokenize(fields[0])
            for_words = tokenizer.tokenize(fields[1])
            corpus.append(AlignedSent(for_words, eng_words))
    return corpus
def test_prob_t_a_given_s(self):
    """IBMModel1.prob_t_a_given_s matches the product of six 0.98 entries."""
    # arrange
    src_words = ["ich", "esse", "ja", "gern", "räucherschinken"]
    trg_words = ["i", "love", "to", "eat", "smoked", "ham"]
    training_pairs = [AlignedSent(trg_words, src_words)]
    alignment_info = AlignmentInfo(
        (0, 1, 4, 0, 2, 5, 5),
        [None] + src_words,
        ["UNUSED"] + trg_words,
        None,
    )

    lexicon = defaultdict(lambda: defaultdict(float))
    aligned_words = (
        ("i", "ich"),
        ("love", "gern"),
        ("to", None),
        ("eat", "esse"),
        ("smoked", "räucherschinken"),
        ("ham", "räucherschinken"),
    )
    for trg_word, src_word in aligned_words:
        lexicon[trg_word][src_word] = 0.98

    model = IBMModel1(training_pairs, 0)
    model.translation_table = lexicon

    # act
    actual = model.prob_t_a_given_s(alignment_info)

    # assert
    # one 0.98 factor per target word
    expected = 0.98 * 0.98 * 0.98 * 0.98 * 0.98 * 0.98
    self.assertEqual(round(actual, 4), round(expected, 4))
def test_prob_t_a_given_s(self):
    """IBMModel3.prob_t_a_given_s matches a hand-computed product of factors."""
    # arrange
    src_words = ["ich", "esse", "ja", "gern", "räucherschinken"]
    trg_words = ["i", "love", "to", "eat", "smoked", "ham"]
    training_pairs = [AlignedSent(trg_words, src_words)]
    alignment_info = AlignmentInfo(
        (0, 1, 4, 0, 2, 5, 5),
        [None] + src_words,
        ["UNUSED"] + trg_words,
        [[3], [1], [4], [], [2], [5, 6]],
    )

    distortion_table = defaultdict(
        lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(float)))
    )
    distortion_keys = (
        (1, 1),  # i -> ich
        (2, 4),  # love -> gern
        (3, 0),  # to -> NULL
        (4, 2),  # eat -> esse
        (5, 5),  # smoked -> räucherschinken
        (6, 5),  # ham -> räucherschinken
    )
    for first, second in distortion_keys:
        distortion_table[first][second][5][6] = 0.97

    translation_table = defaultdict(lambda: defaultdict(float))
    word_links = (
        ("i", "ich"),
        ("love", "gern"),
        ("to", None),
        ("eat", "esse"),
        ("smoked", "räucherschinken"),
        ("ham", "räucherschinken"),
    )
    for trg_word, src_word in word_links:
        translation_table[trg_word][src_word] = 0.98

    fertility_table = defaultdict(lambda: defaultdict(float))
    fertility_entries = (
        (1, "ich", 0.99),
        (1, "esse", 0.99),
        (0, "ja", 0.99),
        (1, "gern", 0.99),
        (2, "räucherschinken", 0.999),
        (1, None, 0.99),
    )
    for fertility, word, value in fertility_entries:
        fertility_table[fertility][word] = value

    model = IBMModel3(
        training_pairs,
        0,
        {
            "p1": 0.167,
            "translation_table": translation_table,
            "distortion_table": distortion_table,
            "fertility_table": fertility_table,
            "alignment_table": None,
        },
    )

    # act
    actual = model.prob_t_a_given_s(alignment_info)

    # assert
    null_generation = 5 * pow(0.167, 1) * pow(0.833, 4)
    fertility = 1 * 0.99 * 1 * 0.99 * 1 * 0.99 * 1 * 0.99 * 2 * 0.999
    lexical_translation = 0.98 * 0.98 * 0.98 * 0.98 * 0.98 * 0.98
    distortion = 0.97 * 0.97 * 0.97 * 0.97 * 0.97 * 0.97
    expected = null_generation * fertility * lexical_translation * distortion
    self.assertEqual(round(actual, 4), round(expected, 4))
def create_corpus(english_list, hindi_list):
    """Pair up English and Hindi sentences as AlignedSent objects.

    Sentence ``i`` of ``english_list`` is paired with sentence ``i`` of
    ``hindi_list``; each sentence is split on single whitespace characters.

    Returns:
        A list of ``AlignedSent(english_tokens, hindi_tokens)``, one per
        entry of ``english_list``.

    Raises:
        IndexError: if ``hindi_list`` is shorter than ``english_list``.
    """
    # Raw string: "\s" in a normal string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    # NOTE(review): splitting on a single whitespace character yields empty
    # tokens for consecutive/trailing whitespace - confirm that is intended.
    whitespace = r"\s"
    return [
        AlignedSent(re.split(whitespace, english), re.split(whitespace, hindi_list[i]))
        for i, english in enumerate(english_list)
    ]
def test_prob_t_a_given_s(self):
    """IBMModel1.prob_t_a_given_s equals the product of the six table entries."""
    # arrange
    src_words = ["ich", 'esse', 'ja', 'gern', 'räucherschinken']
    trg_words = ['i', 'love', 'to', 'eat', 'smoked', 'ham']
    training_pairs = [AlignedSent(trg_words, src_words)]
    alignment_info = AlignmentInfo(
        (0, 1, 4, 0, 2, 5, 5),
        [None] + src_words,
        ['UNUSED'] + trg_words,
        None,
    )

    # Every listed word pair gets the same probability.
    word_links = [
        ('i', 'ich'),
        ('love', 'gern'),
        ('to', None),
        ('eat', 'esse'),
        ('smoked', 'räucherschinken'),
        ('ham', 'räucherschinken'),
    ]
    lexicon = defaultdict(lambda: defaultdict(float))
    for trg_word, src_word in word_links:
        lexicon[trg_word][src_word] = 0.98

    model = IBMModel1(training_pairs, 0)
    model.translation_table = lexicon

    # act
    actual = model.prob_t_a_given_s(alignment_info)

    # assert
    expected = 0.98 * 0.98 * 0.98 * 0.98 * 0.98 * 0.98
    self.assertEqual(round(actual, 4), round(expected, 4))
def test_best_model2_alignment(self):
    """best_model2_alignment picks the highest-scoring link for each word."""
    # arrange
    sentence_pair = AlignedSent(
        TestIBMModel.__TEST_TRG_SENTENCE, TestIBMModel.__TEST_SRC_SENTENCE
    )
    # None and 'bien' have zero fertility
    columns = ("j'", 'aime', 'bien', 'jambon', None)
    translation_table = {
        'i': dict(zip(columns, (0.9, 0.05, 0.02, 0.03, 0))),
        'love': dict(zip(columns, (0.05, 0.9, 0.01, 0.01, 0.03))),
        'ham': dict(zip(columns, (0, 0.01, 0, 0.99, 0))),
    }

    # Four-level table in which every lookup yields 0.2.
    def constant_rows():
        return defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: 0.2)))

    alignment_table = defaultdict(constant_rows)

    model = IBMModel([])
    model.translation_table = translation_table
    model.alignment_table = alignment_table

    # act
    a_info = model.best_model2_alignment(sentence_pair)

    # assert
    self.assertEqual(a_info.alignment[1:], (1, 2, 4))  # 0th element unused
    self.assertEqual(a_info.cepts, [[], [1], [2], [], [3]])
def test_prune(self):
    """IBMModel5.prune keeps 3 of the 5 candidate alignments scored below.

    ``IBMModel4.model4_prob_t_a_given_s`` is temporarily replaced by a
    lookup into ``scores`` and is always restored afterwards, even if the
    model construction, the call to ``prune`` or the assertion fails.
    """
    # arrange
    alignment_infos = [
        AlignmentInfo((1, 1), None, None, None),
        AlignmentInfo((1, 2), None, None, None),
        AlignmentInfo((2, 1), None, None, None),
        AlignmentInfo((2, 2), None, None, None),
        AlignmentInfo((0, 0), None, None, None),
    ]
    min_factor = IBMModel5.MIN_SCORE_FACTOR
    best_score = 0.9
    scores = {
        (1, 1): min(min_factor * 1.5, 1) * best_score,  # above threshold
        (1, 2): best_score,
        (2, 1): min_factor * best_score,  # at threshold
        (2, 2): min_factor * best_score * 0.5,  # low score
        (0, 0): min(min_factor * 1.1, 1) * 1.2,  # above threshold
    }
    corpus = [AlignedSent(['a'], ['b'])]
    original_prob_function = IBMModel4.model4_prob_t_a_given_s
    # mock static method
    IBMModel4.model4_prob_t_a_given_s = staticmethod(
        lambda a, model: scores[a.alignment]
    )
    try:
        model5 = IBMModel5(corpus, 0, None, None)

        # act
        pruned_alignments = model5.prune(alignment_infos)

        # assert
        self.assertEqual(len(pruned_alignments), 3)
    finally:
        # restore static method so a failure here cannot leak the mock
        # into other tests
        IBMModel4.model4_prob_t_a_given_s = original_prob_function
def test_best_model2_alignment_handles_fertile_words(self):
    """One source word may be linked to several target words."""
    # arrange
    trg_words = ['i', 'really', ',', 'really', 'love', 'ham']
    sentence_pair = AlignedSent(trg_words, TestIBMModel.__TEST_SRC_SENTENCE)
    # 'bien' produces 2 target words: 'really' and another 'really'
    columns = ("j'", 'aime', 'bien', 'jambon', None)
    translation_table = {
        'i': dict(zip(columns, (0.9, 0.05, 0.02, 0.03, 0))),
        'really': dict(zip(columns, (0, 0, 0.9, 0.01, 0.09))),
        ',': dict(zip(columns, (0, 0, 0.3, 0, 0.7))),
        'love': dict(zip(columns, (0.05, 0.9, 0.01, 0.01, 0.03))),
        'ham': dict(zip(columns, (0, 0.01, 0, 0.99, 0))),
    }

    # Four-level table in which every lookup yields 0.2.
    def constant_rows():
        return defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: 0.2)))

    alignment_table = defaultdict(constant_rows)

    model = IBMModel([])
    model.translation_table = translation_table
    model.alignment_table = alignment_table

    # act
    a_info = model.best_model2_alignment(sentence_pair)

    # assert
    self.assertEqual(a_info.alignment[1:], (1, 3, 0, 3, 2, 4))
    self.assertEqual(a_info.cepts, [[3], [1], [5], [2, 4], [6]])
def test_set_uniform_distortion_probabilities_of_non_domain_values(self):
    """Distortion lookups with out-of-range positions return MIN_PROB."""
    # arrange
    training_pairs = [
        AlignedSent(['ham', 'eggs'], ['schinken', 'schinken', 'eier']),
        AlignedSent(['spam', 'spam', 'spam', 'spam'], ['spam', 'spam']),
    ]
    model = IBMModel3(training_pairs, 0)

    # act
    model.set_uniform_probabilities(training_pairs)

    # assert
    # examine i and j values that are not in the training data domain
    table = model.distortion_table
    out_of_domain_keys = ((0, 0, 3, 2), (9, 2, 2, 4), (2, 9, 2, 4))
    for k0, k1, k2, k3 in out_of_domain_keys:
        self.assertEqual(table[k0][k1][k2][k3], IBMModel.MIN_PROB)
def test_prob_t_a_given_s(self):
    """IBMModel3.prob_t_a_given_s agrees with the hand-computed probability."""
    # arrange
    src_words = ["ich", 'esse', 'ja', 'gern', 'räucherschinken']
    trg_words = ['i', 'love', 'to', 'eat', 'smoked', 'ham']
    training_pairs = [AlignedSent(trg_words, src_words)]
    alignment_info = AlignmentInfo(
        (0, 1, 4, 0, 2, 5, 5),
        [None] + src_words,
        ['UNUSED'] + trg_words,
        [[3], [1], [4], [], [2], [5, 6]],
    )

    def four_level_float_table():
        return defaultdict(
            lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(float)))
        )

    distortion_table = four_level_float_table()
    for first, second in [
        (1, 1),  # i -> ich
        (2, 4),  # love -> gern
        (3, 0),  # to -> NULL
        (4, 2),  # eat -> esse
        (5, 5),  # smoked -> räucherschinken
        (6, 5),  # ham -> räucherschinken
    ]:
        distortion_table[first][second][5][6] = 0.97

    translation_table = defaultdict(lambda: defaultdict(float))
    for trg_word, src_word in [
        ('i', 'ich'),
        ('love', 'gern'),
        ('to', None),
        ('eat', 'esse'),
        ('smoked', 'räucherschinken'),
        ('ham', 'räucherschinken'),
    ]:
        translation_table[trg_word][src_word] = 0.98

    fertility_table = defaultdict(lambda: defaultdict(float))
    for fertility, word, value in [
        (1, 'ich', 0.99),
        (1, 'esse', 0.99),
        (0, 'ja', 0.99),
        (1, 'gern', 0.99),
        (2, 'räucherschinken', 0.999),
        (1, None, 0.99),
    ]:
        fertility_table[fertility][word] = value

    probabilities = dict(
        p1=0.167,
        translation_table=translation_table,
        distortion_table=distortion_table,
        fertility_table=fertility_table,
        alignment_table=None,
    )
    model = IBMModel3(training_pairs, 0, probabilities)

    # act
    actual = model.prob_t_a_given_s(alignment_info)

    # assert
    null_generation = 5 * pow(0.167, 1) * pow(0.833, 4)
    fertility = 1 * 0.99 * 1 * 0.99 * 1 * 0.99 * 1 * 0.99 * 2 * 0.999
    lexical_translation = 0.98 * 0.98 * 0.98 * 0.98 * 0.98 * 0.98
    distortion = 0.97 * 0.97 * 0.97 * 0.97 * 0.97 * 0.97
    expected = null_generation * fertility * lexical_translation * distortion
    self.assertEqual(round(actual, 4), round(expected, 4))
def nltk_ibm_one(data, iter=5):
    """Train an IBMModel1 on ``data`` and print one translation-table entry.

    ``data`` is indexed by position ``0 .. len(data) - 1``; each item must
    provide ``'fr'`` and ``'en'`` strings, which are tokenized with
    ``word_tokenize``. ``iter`` is passed to ``IBMModel1`` as its second
    argument. Nothing is returned.
    """
    # Index-based access is kept on purpose: only len() and integer
    # indexing are assumed of ``data``.
    sentence_pairs = [
        AlignedSent(
            word_tokenize(data[position]['fr']),
            word_tokenize(data[position]['en']),
        )
        for position in range(len(data))
    ]
    model = IBMModel1(sentence_pairs, iter)
    print("Probability score for the: ")
    print(model.translation_table['maison']['house'])
def tokData(data, target='fr'):
    """Tokenize every pair in ``data`` into an AlignedSent.

    Each item of ``data`` must provide an ``'en'`` entry and a ``target``
    entry (default ``'fr'``). Returns a list of
    ``AlignedSent(en_tokens, target_tokens)`` in input order.
    """
    return [
        AlignedSent(word_tokenize(item['en']), word_tokenize(item[target]))
        for item in data
    ]
def test_set_uniform_distortion_probabilities_of_non_domain_values(self):
    """Model 4 distortion lookups outside the training domain are MIN_PROB."""
    # arrange
    src_classes = {"schinken": 0, "eier": 0, "spam": 1}
    trg_classes = {"ham": 0, "eggs": 1, "spam": 2}
    training_pairs = [
        AlignedSent(["ham", "eggs"], ["schinken", "schinken", "eier"]),
        AlignedSent(["spam", "spam", "spam", "spam"], ["spam", "spam"]),
    ]
    model = IBMModel4(training_pairs, 0, src_classes, trg_classes)

    # act
    model.set_uniform_probabilities(training_pairs)

    # assert
    # examine displacement values that are not in the training data domain
    floor = IBMModel.MIN_PROB
    head_table = model.head_distortion_table
    self.assertEqual(head_table[4][0][0], floor)
    self.assertEqual(head_table[100][1][2], floor)
    non_head_table = model.non_head_distortion_table
    self.assertEqual(non_head_table[4][0], floor)
    self.assertEqual(non_head_table[100][2], floor)
def test_set_uniform_vacancy_probabilities_of_non_domain_values(self):
    """Model 5 vacancy lookups outside the training domain are MIN_PROB."""
    # arrange
    src_classes = {'schinken': 0, 'eier': 0, 'spam': 1}
    trg_classes = {'ham': 0, 'eggs': 1, 'spam': 2}
    training_pairs = [
        AlignedSent(['ham', 'eggs'], ['schinken', 'schinken', 'eier']),
        AlignedSent(['spam', 'spam', 'spam', 'spam'], ['spam', 'spam']),
    ]
    model = IBMModel5(training_pairs, 0, src_classes, trg_classes)

    # act
    model.set_uniform_probabilities(training_pairs)

    # assert
    # examine dv and max_v values that are not in the training data domain
    floor = IBMModel.MIN_PROB
    head_table = model.head_vacancy_table
    for k0, k1, k2 in ((5, 4, 0), (-4, 1, 2), (4, 0, 0)):
        self.assertEqual(head_table[k0][k1][k2], floor)
    non_head_table = model.non_head_vacancy_table
    for k0, k1, k2 in ((5, 4, 0), (-4, 1, 2)):
        self.assertEqual(non_head_table[k0][k1][k2], floor)
def test_best_model2_alignment_handles_empty_trg_sentence(self):
    """An empty first sentence yields no links and five empty cepts."""
    # arrange
    no_words = []
    sentence_pair = AlignedSent(no_words, TestIBMModel.__TEST_SRC_SENTENCE)
    model = IBMModel([])

    # act
    a_info = model.best_model2_alignment(sentence_pair)

    # assert
    links_after_slot_zero = a_info.alignment[1:]
    self.assertEqual(links_after_slot_zero, ())
    self.assertEqual(a_info.cepts, [[] for _ in range(5)])
def test_best_model2_alignment_handles_empty_src_sentence(self):
    """With an empty second sentence every word is linked to position 0."""
    # arrange
    no_words = []
    sentence_pair = AlignedSent(TestIBMModel.__TEST_TRG_SENTENCE, no_words)
    model = IBMModel([])

    # act
    a_info = model.best_model2_alignment(sentence_pair)

    # assert
    links_after_slot_zero = a_info.alignment[1:]
    self.assertEqual(links_after_slot_zero, (0, 0, 0))
    self.assertEqual(a_info.cepts, [[1, 2, 3]])
def _train_translation_model(self, X, y):
    """Fit an IBMModel1 on (title, [label]) pairs and store it on ``self.ibm1``.

    For every row index of ``X`` the title from ``_recompose_title`` is
    paired with each label from ``_get_labels_of_row``; the model is
    constructed with 5 as its second argument.
    """
    pairs = []
    for row_index in range(X.shape[0]):
        title_tokens = _recompose_title(X, row_index)
        pairs.extend(
            AlignedSent(title_tokens, [label])
            for label in _get_labels_of_row(row_index, y)
        )
    self.ibm1 = IBMModel1(pairs, 5)
def test_sample(self):
    """sample() returns 61 alignments when every alignment scores the same."""
    # arrange
    sentence_pair = AlignedSent(
        TestIBMModel.__TEST_TRG_SENTENCE,
        TestIBMModel.__TEST_SRC_SENTENCE,
    )
    model = IBMModel([])

    def constant_probability(alignment_info):
        # Same score regardless of the alignment passed in.
        return 0.001

    model.prob_t_a_given_s = constant_probability

    # act
    samples, _best_alignment = model.sample(sentence_pair)

    # assert
    self.assertEqual(len(samples), 61)
def translation_model_generation():
    """Train an IBMModel1 on the tokenized Chinese/English JSON files.

    Reads ``english_tokens.json`` and ``chinese_tokens.json`` from the
    current working directory, pairs their entries positionally (stopping
    at the shorter list) as ``AlignedSent(chinese, english)`` and builds
    ``ibm1.IBMModel1`` with 10 as its second argument.

    Returns:
        The constructed ``ibm1.IBMModel1`` instance.
    """
    # Context managers close both files even if parsing fails; JSON text is
    # read as UTF-8 instead of depending on the platform's default encoding.
    with open("english_tokens.json", "r", encoding="utf-8") as english_file:
        english_list = json.load(english_file)
    with open("chinese_tokens.json", "r", encoding="utf-8") as chinese_file:
        chinese_list = json.load(chinese_file)

    bilingual_text = [
        AlignedSent(chinese, english)
        for chinese, english in zip(chinese_list, english_list)
    ]
    return ibm1.IBMModel1(bilingual_text, 10)