def align_trans_to_html_sentences(transcript_sentences, html_sentences,
                                  match_criteria=WORD_LENGTH):
    """
    Does sentence-level alignment on the transcript and HTML file.
    Susceptible to inaccurate sentence boundaries.

    :param transcript_sentences: List of sentences in the transcript
    :param html_sentences: List of sentences in the HTML
    :param match_criteria: Length measure for alignment: number of words
        per sentence (WORD_LENGTH) or number of characters (CHAR_LENGTH)
    :return: List of (html_sentence, transcript_sentence) pairs
    """
    # Choose the length measure matching the requested criteria. (The original
    # branches used word counts for CHAR_LENGTH and vice versa, which inverted
    # the criteria; corrected here.)
    if match_criteria == CHAR_LENGTH:
        transcript_lengths = [s.character_count for s in transcript_sentences]
        html_sentence_lengths = [s.character_count for s in html_sentences]
    else:
        transcript_lengths = [s.word_count for s in transcript_sentences]
        html_sentence_lengths = [s.word_count for s in html_sentences]

    result = align_blocks(html_sentence_lengths, transcript_lengths)

    sentence_pairs = []
    for html_idx, trans_idx in result:
        sentence_pairs.append((html_sentences[html_idx],
                               transcript_sentences[trans_idx]))
    return sentence_pairs
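# Usage sketch: how the align_blocks call above is typically driven. This
# assumes the NLTK implementation (nltk.translate.gale_church); the module
# these snippets actually import align_blocks from is not shown here.
from nltk.translate.gale_church import align_blocks as _gc_align_blocks


def demo_align_blocks():
    # Per-sentence character counts for a hypothetical source/target pair.
    src_lengths = [120, 75, 200]
    tgt_lengths = [130, 60, 95, 110]
    # align_blocks returns (source_index, target_index) pairs; an index
    # repeats when one sentence aligns to several on the other side.
    for src_idx, tgt_idx in _gc_align_blocks(src_lengths, tgt_lengths):
        print(src_idx, tgt_idx)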
def __init__(self, par_level_bitext, filterlength=False):
    logging.debug("Creating sent level bitext from document pair")
    self.bi_sent = []
    if par_level_bitext:
        for bp in par_level_bitext.bi_par:
            src_lengths = bp[0].get_sentence_lengths()
            tgt_lengths = bp[1].get_sentence_lengths()
            if debug:
                print("Align sentences : %s %s" % (src_lengths, tgt_lengths))
            sent_alignments = single_alignments_to_sequence_pairs(
                align_blocks(src_lengths, tgt_lengths))
            for a in sent_alignments:
                logging.debug("SENT ALIGN a=%s", a)
                # TODO: the following test on the lengths of a[0] and a[1] was
                # added after a crash while aligning ABET eng-zul; a possible
                # bug may be hiding there.
                if len(a[0]) and len(a[1]):
                    # Merge all source sentences in this bead into one.
                    new_src_sent = bp[0].sentences[a[0][0]]
                    for i in range(1, len(a[0])):
                        new_src_sent += bp[0].sentences[a[0][i]]
                    # Merge all target sentences in this bead into one.
                    new_tgt_sent = bp[1].sentences[a[1][0]]
                    for i in range(1, len(a[1])):
                        new_tgt_sent += bp[1].sentences[a[1][i]]
                    # Optionally skip beads whose merged sides are too long.
                    if filterlength and (new_src_sent.token_length() > 30
                                         or new_tgt_sent.token_length() > 30):
                        continue
                    self.bi_sent.append((new_src_sent, new_tgt_sent))
    logging.info("%d sentence alignments", len(self.bi_sent))
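# Hypothetical reconstruction: single_alignments_to_sequence_pairs is called
# above but defined elsewhere in this code base. A minimal sketch of what it
# must do, given how a[0] and a[1] are consumed: group the 1-1 index pairs
# from align_blocks into ([source indices], [target indices]) beads, merging
# consecutive pairs that repeat an index on either side.
def single_alignments_to_sequence_pairs(alignments):
    pairs = []
    src, tgt = [], []
    for s, t in alignments:
        # A pair sharing no index with the current bead starts a new bead.
        if src and s not in src and t not in tgt:
            pairs.append((src, tgt))
            src, tgt = [], []
        if s not in src:
            src.append(s)
        if t not in tgt:
            tgt.append(t)
    if src or tgt:
        pairs.append((src, tgt))
    return pairs


# e.g. [(0, 0), (1, 1), (1, 2)] -> [([0], [0]), ([1], [1, 2])]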
def alignment(lang_source, lang_target, text_source, text_target):
    # Get spaCy models for language processing.
    sp_source = utils.getSpacy(lang_source)
    sp_target = utils.getSpacy(lang_target)
    doc_source = sp_source(text_source)
    doc_target = sp_target(text_target)

    sent_source = [sent for sent in doc_source.sents]
    sent_target = [sent for sent in doc_target.sents]

    # Align on character lengths, reversed so the alignment is anchored at the
    # end of the texts; indices are mapped back to document order afterwards.
    len_source = list(
        reversed([sent.end_char - sent.start_char for sent in sent_source]))
    len_target = list(
        reversed([sent.end_char - sent.start_char for sent in sent_target]))
    alignment = reversed(gale_church.align_blocks(len_source, len_target))
    ls = len(len_source)
    lt = len(len_target)
    alignment = [(ls - x - 1, lt - y - 1) for x, y in alignment]

    src_set = set()
    tgt_set = set()
    blocks = []
    last_source_end = 0
    last_target_end = 0
    for src_idx, tgt_idx in alignment:
        if src_idx in src_set or tgt_idx in tgt_set:
            src_set.add(src_idx)
            tgt_set.add(tgt_idx)
        else:
            if len(src_set) or len(tgt_set):
                # An empty side yields a zero-width span at the last end seen.
                src_bead = (last_source_end, last_source_end)
                tgt_bead = (last_target_end, last_target_end)
                if len(src_set):
                    last_source_end = sent_source[max(src_set)].end_char
                    src_bead = (sent_source[min(src_set)].start_char,
                                last_source_end)
                if len(tgt_set):
                    last_target_end = sent_target[max(tgt_set)].end_char
                    tgt_bead = (sent_target[min(tgt_set)].start_char,
                                last_target_end)
                blocks.append(src_bead + tgt_bead)
                src_set.clear()
                tgt_set.clear()
            src_set.add(src_idx)
            tgt_set.add(tgt_idx)
    # Flush the final bead.
    if len(src_set) or len(tgt_set):
        src_bead = (last_source_end, last_source_end)
        tgt_bead = (last_target_end, last_target_end)
        if len(src_set):
            src_bead = (sent_source[min(src_set)].start_char,
                        sent_source[max(src_set)].end_char)
        if len(tgt_set):
            tgt_bead = (sent_target[min(tgt_set)].start_char,
                        sent_target[max(tgt_set)].end_char)
        blocks.append(src_bead + tgt_bead)
    return blocks
def bitext_from_documents(db, doc1, lang1, doc2, lang2):
    myDB = database.izwiDB(db, "")
    doc1id = myDB.get_docid(doc1, lang1)
    doc2id = myDB.get_docid(doc2, lang2)
    p_lengths1 = myDB.get_all_paragraph_lengths(doc1id)
    p_lengths2 = myDB.get_all_paragraph_lengths(doc2id)
    bitext = []
    # First align paragraphs, then align sentences within each paragraph pair.
    for para1, para2 in align_blocks([x[1] for x in p_lengths1],
                                     [x[1] for x in p_lengths2]):
        # Convert from index in the list above to the ID in the DB:
        para1 = p_lengths1[para1][0]
        para2 = p_lengths2[para2][0]
        s_lengths1 = myDB.get_sentence_lengths(doc1id, para1)
        s_lengths2 = myDB.get_sentence_lengths(doc2id, para2)
        for s1, s2 in align_blocks([x[1] for x in s_lengths1],
                                   [x[1] for x in s_lengths2]):
            s1 = s_lengths1[s1][0]
            s2 = s_lengths2[s2][0]
            bitext.append((s1, s2))
    return [
        AlignedSent(s.split(), t.split())
        for (s, t) in myDB.get_sentence_pairs(bitext)
    ]
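# Note on the DB helpers above (assumed shape, not confirmed by this file):
# get_all_paragraph_lengths and get_sentence_lengths appear to return
# (row_id, length) tuples, so x[1] feeds the aligner while x[0] maps the
# aligner's positional indices back to DB ids, e.g.:
#   p_lengths1 = [(101, 240), (102, 515), (103, 88)]  # (paragraph_id, length)
#   [x[1] for x in p_lengths1]  ->  [240, 515, 88]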
def align_and_show(chap_en, chap_ba, en_output_path, ba_output_path,
                   write=True):
    """
    :param chap_en: list of paragraph texts in the English chapter
    :param chap_ba: list of paragraph texts in the Basque chapter
    :param en_output_path: output path for the aligned English text
    :param ba_output_path: output path for the aligned Basque text
    :param write: whether to write the aligned texts to the output paths
    :return: the aligned English and Basque text lists
    """

    def align_index(index_mapping):
        """
        :param index_mapping: [(0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (5, 5),
            (6, 6), (7, 7), (7, 8), (8, 9)]
        :return: [(0, [0]), (1, [1]), (2, [2]), (3, [3]), (4, [4]), (5, [5]),
            (6, [6]), (7, [7, 8]), (8, [9])]
        """
        al = {}
        for (e, g) in index_mapping:
            if e not in al:
                al[e] = []
            al[e].append(g)
        return [(k, al[k]) for k in sorted(al.keys())]

    # Length of each paragraph, e.g. [105, 184, 150, 60, 113, 218, 88, 354, 138]
    chap_en_leng = chapter_paras_length(chap_en)
    # e.g. [101, 154, 121, 45, 94, 192, 80, 159, 130, 116]
    chap_ba_leng = chapter_paras_length(chap_ba)
    index_mapping = gale_church.align_blocks(chap_en_leng, chap_ba_leng)
    aligned_indexes = align_index(index_mapping)
    en_aligned_text_list, ba_aligned_text_list = aligned_indexes2aligned_texts(
        aligned_indexes, chap_en, chap_ba, en_output_path, ba_output_path,
        write=write)
    return en_aligned_text_list, ba_aligned_text_list
def alignSentences(lang_source, lang_target, text_source, text_target):
    # Get spaCy models for language processing.
    sp_source = utils.getSpacy(lang_source)
    sp_target = utils.getSpacy(lang_target)
    doc_source = sp_source(text_source)
    doc_target = sp_target(text_target)
    # If one side is English, load the embeddings in a single step to improve
    # performance.
    eng_doc = None
    if lang_source == utils.Lang.ENG:
        eng_doc = doc_source
    elif lang_target == utils.Lang.ENG:
        eng_doc = doc_target
    if eng_doc is not None:
        words = {t.lemma_.lower() for t in eng_doc if t.is_alpha}
        glove.getVector(words)

    sent_source = [sent.text.strip() for sent in doc_source.sents]
    sent_target = [sent.text.strip() for sent in doc_target.sents]
    len_source = [len(sent) for sent in sent_source]
    len_target = [len(sent) for sent in sent_target]
    alignment = gale_church.align_blocks(len_source, len_target)

    # Merge consecutive 1-1 index pairs that share a source or target index
    # into many-to-many blocks.
    src_set = set()
    tgt_set = set()
    blocks = []
    for src_idx, tgt_idx in alignment:
        if src_idx in src_set or tgt_idx in tgt_set:
            src_set.add(src_idx)
            tgt_set.add(tgt_idx)
        else:
            if len(src_set) or len(tgt_set):
                src_block = ' '.join(sent_source[i] for i in sorted(src_set))
                tgt_block = ' '.join(sent_target[i] for i in sorted(tgt_set))
                blocks.append((src_block, tgt_block))
                src_set.clear()
                tgt_set.clear()
            src_set.add(src_idx)
            tgt_set.add(tgt_idx)
    # Flush the final block.
    if len(src_set) or len(tgt_set):
        src_block = ' '.join(sent_source[i] for i in sorted(src_set))
        tgt_block = ' '.join(sent_target[i] for i in sorted(tgt_set))
        blocks.append((src_block, tgt_block))
    return blocks
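# Worked example on hypothetical data: how the merging loop above turns
# Gale-Church 1-1 pairs into many-to-many blocks. Given
#   alignment   = [(0, 0), (1, 1), (2, 2), (2, 3), (3, 4)]
#   sent_source = ["s0", "s1", "s2", "s3"]
#   sent_target = ["t0", "t1", "t2", "t3", "t4"]
# the repeated source index 2 keeps (2, 3) in the open block, so the result is
#   [("s0", "t0"), ("s1", "t1"), ("s2", "t2 t3"), ("s3", "t4")]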
def __init__(self, doc_level_bitext):
    logging.debug("Creating paragraph level bitext from document pair")
    src_lengths = doc_level_bitext.src_doc.get_paragraph_lengths()
    tgt_lengths = doc_level_bitext.tgt_doc.get_paragraph_lengths()
    par_alignments = single_alignments_to_sequence_pairs(
        align_blocks(src_lengths, tgt_lengths))
    self.bi_par = []
    for a in par_alignments:
        if debug:
            print("my par align = " + str(a))
        # Merge all source paragraphs in this bead into one paragraph.
        new_src_par = doc_level_bitext.src_doc.paragraphs[a[0][0]]
        for i in range(1, len(a[0])):
            new_src_par += doc_level_bitext.src_doc.paragraphs[a[0][i]]
        # Merge all target paragraphs in this bead into one paragraph.
        new_tgt_par = doc_level_bitext.tgt_doc.paragraphs[a[1][0]]
        for i in range(1, len(a[1])):
            new_tgt_par += doc_level_bitext.tgt_doc.paragraphs[a[1][i]]
        self.bi_par.append((new_src_par, new_tgt_par))
    logging.info("%d paragraph alignments", len(self.bi_par))
import re
import sys


def count_sentence_lengths(file, lengths):
    # Append the character length of each line, whitespace removed.
    with open(file, 'r') as f:
        for line in f:
            line = line.rstrip()
            line = re.sub(" ", "", line)
            lengths.append(len(line))


# Takes two one-segment-per-line files as input.
if __name__ == '__main__':
    srclengths = []
    tgtlengths = []
    count_sentence_lengths(sys.argv[1], srclengths)
    count_sentence_lengths(sys.argv[2], tgtlengths)
    sent_alignments = align_blocks(srclengths, tgtlengths)
    for sp in sent_alignments:
        print(sp)
def chap_to_sent_align(chap):
    en_paras = []
    ba_paras = []
    with open('translation-dashboard/data/en-ba-para-align/en-chapter-'
              + str(chap) + '.txt', "r") as en_txt:
        for line in en_txt:  # each line is a paragraph
            en_paras.append(line)
    with open('translation-dashboard/data/en-ba-para-align/ba-chapter-'
              + str(chap) + '.txt', "r") as ba_txt:
        for line in ba_txt:  # each line is a paragraph
            ba_paras.append(line)

    # Sentence lengths in words, per language.
    en_sents_lens = []
    ba_sents_lens = []
    for para in en_paras:
        for sent in stokenizer(para):
            en_sents_lens.append(len(sent.split()))
    for para in ba_paras:
        for sent in stokenizer(para):
            ba_sents_lens.append(len(sent.split()))

    def align_index(index_mapping):
        """
        :param index_mapping: [(0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (5, 5),
            (6, 6), (7, 7), (7, 8), (8, 9)]
        :return: [(0, [0]), (1, [1]), (2, [2]), (3, [3]), (4, [4]), (5, [5]),
            (6, [6]), (7, [7, 8]), (8, [9])]
        """
        al = {}
        for (e, g) in index_mapping:
            if e not in al:
                al[e] = []
            al[e].append(g)
        return [(k, al[k]) for k in sorted(al.keys())]

    sent_align = gale_church.align_blocks(en_sents_lens, ba_sents_lens)
    aligned_indexes = align_index(sent_align)
    # Mostly 1-1 pairs with occasional merges,
    # e.g. [(0, [0]), ..., (29, [29, 30]), (30, [31]), ...]

    en_path = 'corpora/english-modified.txt'
    ba_folder_path = 'corpora/basque'
    chap_en = get_english_chapter(en_path, chap)
    chap_ba = get_basque_chapter(ba_folder_path, chap)
    en_output_path = ('translation-dashboard/data/en-ba-sent-align/en-chapter-'
                      + str(chap) + '.txt')
    ba_output_path = ('translation-dashboard/data/en-ba-sent-align/ba-chapter-'
                      + str(chap) + '.txt')
    en_aligned_text_list = []
    ba_aligned_text_list = []
    for para in chap_en:
        en_aligned_text_list.extend(stokenizer(para))  # all English sentences
    for para in chap_ba:
        ba_aligned_text_list.extend(stokenizer(para))  # all Basque sentences

    # Write one aligned English sentence per line, and the corresponding
    # Basque sentence (or '<sent>'-joined group) per line.
    with open(en_output_path, 'wt', encoding='utf-8') as en_file:
        for en, ba_list in aligned_indexes:
            en_file.write(en_aligned_text_list[en] + '\n')
    with open(ba_output_path, 'wt', encoding='utf-8') as ba_file:
        for en, ba_list in aligned_indexes:
            if len(ba_list) > 1:
                ba_txt_list = [ba_aligned_text_list[i] for i in ba_list]
                ba_file.write('<sent>'.join(str(v) for v in ba_txt_list)
                              + '\n')
            else:
                ba_file.write(ba_aligned_text_list[ba_list[0]] + '\n')
def align_block(self, block_n):
    # Align the sentences of block block_n using per-sentence character
    # lengths on both sides.
    fb = self.text_f.blocks[block_n]
    tb = self.text_t.blocks[block_n]
    self.aligned_blocks[block_n] = align_blocks(fb.stnc_lengths_char,
                                                tb.stnc_lengths_char)