Example #1
def align_trans_to_html_sentences(transcript_sentences,
                                  html_sentences,
                                  match_criteria=WORD_LENGTH):
    """
    Does sentence-level alignment on the transcript and html file.
    Susceptible to inaccurate sentence boundaries.
    :param transcript_sentences: List of sentences in the transcript
    :param html_sentences: List of sentences in the HTML
    :param match_criteria: length measure used for alignment: number of words
        per sentence (WORD_LENGTH) or number of characters (CHAR_LENGTH)
    :return: list of (html_sentence, transcript_sentence) pairs
    """
    if match_criteria == CHAR_LENGTH:
        transcript_lengths = list(
            map(lambda s: s.character_count, transcript_sentences))
        html_sentence_lengths = list(
            map(lambda s: s.character_count, html_sentences))
    else:
        transcript_lengths = list(
            map(lambda s: s.word_count, transcript_sentences))
        html_sentence_lengths = list(
            map(lambda s: s.word_count, html_sentences))

    result = align_blocks(html_sentence_lengths, transcript_lengths)
    print(transcript_lengths)
    print(html_sentence_lengths)
    print(result)
    sentence_pairs = []
    for sentence_index_pair in result:
        sentence_pair = (html_sentences[sentence_index_pair[0]],
                         transcript_sentences[sentence_index_pair[1]])
        sentence_pairs.append(sentence_pair)

    return sentence_pairs
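Every example on this page revolves around the same primitive: NLTK's Gale-Church aligner, exposed as nltk.translate.gale_church.align_blocks, which takes two lists of segment lengths and returns (source_index, target_index) pairs. A minimal sketch of that call, with made-up character lengths:

from nltk.translate.gale_church import align_blocks

# character lengths of three source and three target sentences (made-up values)
src_lens = [120, 95, 60]
tgt_lens = [118, 100, 58]

# for well-matched lengths this typically yields [(0, 0), (1, 1), (2, 2)]
print(align_blocks(src_lens, tgt_lens))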
Example #2
    def __init__(self, par_level_bitext, filterlength=False):
        logging.debug("Creating sent level bitext from document pair")
        self.bi_sent = []
        if par_level_bitext:
            for bp in par_level_bitext.bi_par:
                src_lengths = bp[0].get_sentence_lengths()
                tgt_lengths = bp[1].get_sentence_lengths()
                if debug:
                    print("Align sentences : " + str(src_lengths) + " " +
                          str(tgt_lengths))
                sent_alignments = single_alignments_to_sequence_pairs(
                    align_blocks(src_lengths, tgt_lengths))
                for a in sent_alignments:
                    logging.debug("SENT ALIGN a=" + str(a))
                    # TODO: the following test on lengths of a[0] and a[1] was added
                    # after a crash on aligning ABET eng-zul
                    # possible BUG hiding there
                    if len(a[0]) and len(a[1]):
                        new_src_sent = bp[0].sentences[a[0][0]]
                        for i in range(1, len(a[0])):
                            new_src_sent += bp[0].sentences[a[0][i]]
                        new_tgt_sent = bp[1].sentences[a[1][0]]
                        for i in range(1, len(a[1])):
                            new_tgt_sent += bp[1].sentences[a[1][i]]
                        if filterlength and (new_src_sent.token_length() > 30
                                             or new_tgt_sent.token_length() > 30):
                            continue
                        self.bi_sent.append((new_src_sent, new_tgt_sent))
            logging.info("%d sentence alignments", len(self.bi_sent))
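single_alignments_to_sequence_pairs is a project-local helper that is not shown on this page; judging from how its output is indexed above, it groups the 1:1 pairs from align_blocks into ([source_indexes], [target_indexes]) beads. A hypothetical reconstruction, for illustration only:

def single_alignments_to_sequence_pairs(pairs):
    # hypothetical sketch: merge consecutive pairs that share a source or
    # target index into ([source_indexes], [target_indexes]) beads
    seq_pairs, src, tgt = [], [], []
    for s, t in pairs:
        if src and s not in src and t not in tgt:
            seq_pairs.append((src, tgt))
            src, tgt = [], []
        if s not in src:
            src.append(s)
        if t not in tgt:
            tgt.append(t)
    if src or tgt:
        seq_pairs.append((src, tgt))
    return seq_pairs

print(single_alignments_to_sequence_pairs([(0, 0), (1, 1), (2, 1)]))
# -> [([0], [0]), ([1, 2], [1])]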
Example #3
def alignment(lang_source, lang_target, text_source, text_target):
    # get spacy models for language processing
    sp_source = utils.getSpacy(lang_source)
    sp_target = utils.getSpacy(lang_target)
    doc_source = sp_source(text_source)
    doc_target = sp_target(text_target)

    # split both documents into sentences and compute character lengths
    sent_source = [sent for sent in doc_source.sents]
    sent_target = [sent for sent in doc_target.sents]
    len_source = list(
        reversed([sent.end_char - sent.start_char for sent in sent_source]))
    len_target = list(
        reversed([sent.end_char - sent.start_char for sent in sent_target]))
    alignment = reversed(gale_church.align_blocks(len_source, len_target))
    ls = len(len_source)
    lt = len(len_target)
    alignment = [(ls - x - 1, lt - y - 1) for x, y in alignment]
    src_set = set()
    tgt_set = set()
    blocks = []
    last_source_end = 0
    last_target_end = 0
    for src_idx, tgt_idx in alignment:
        if src_idx in src_set or tgt_idx in tgt_set:
            src_set.add(src_idx)
            tgt_set.add(tgt_idx)
        else:
            if len(src_set) or len(tgt_set):
                src_bead = (last_source_end, last_source_end)
                tgt_bead = (last_target_end, last_target_end)
                if len(src_set):
                    last_source_end = sent_source[max(src_set)].end_char
                    src_bead = (sent_source[min(src_set)].start_char,
                                last_source_end)
                if len(tgt_set):
                    last_target_end = sent_target[max(tgt_set)].end_char
                    tgt_bead = (sent_target[min(tgt_set)].start_char,
                                last_target_end)
                blocks.append(src_bead + tgt_bead)
            src_set.clear()
            tgt_set.clear()
            src_set.add(src_idx)
            tgt_set.add(tgt_idx)
    if len(src_set) or len(tgt_set):
        src_bead = (last_source_end, last_source_end)
        tgt_bead = (last_target_end, last_target_end)
        if len(src_set):
            src_bead = (sent_source[min(src_set)].start_char,
                        sent_source[max(src_set)].end_char)
        if len(tgt_set):
            tgt_bead = (sent_target[min(tgt_set)].start_char,
                        sent_target[max(tgt_set)].end_char)
        blocks.append(src_bead + tgt_bead)
    return blocks
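The function above aligns the reversed length lists and then maps the indices back to the original order; a quick check of that re-mapping with made-up values:

src_n, tgt_n = 5, 4                  # made-up sentence counts
rev_pairs = [(4, 3), (3, 2)]         # made-up aligner output on the reversed inputs
print([(src_n - x - 1, tgt_n - y - 1) for x, y in rev_pairs])
# -> [(0, 0), (1, 1)]: reversed index i corresponds to original index n - 1 - i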
Example #4
def bitext_from_documents(db, doc1, lang1, doc2, lang2):
    myDB = database.izwiDB(db, "")
    doc1id = myDB.get_docid(doc1, lang1)
    doc2id = myDB.get_docid(doc2, lang2)
    p_lengths1 = myDB.get_all_paragraph_lengths(doc1id)
    p_lengths2 = myDB.get_all_paragraph_lengths(doc2id)
    bitext = []
    for para1, para2 in align_blocks([x[1] for x in p_lengths1],
                                     [x[1] for x in p_lengths2]):
        # convert from index in the list above to the ID in the DB:
        para1 = p_lengths1[para1][0]
        para2 = p_lengths2[para2][0]
        s_lengths1 = myDB.get_sentence_lengths(doc1id, para1)
        s_lengths2 = myDB.get_sentence_lengths(doc2id, para2)
        for s1, s2 in align_blocks([x[1] for x in s_lengths1],
                                   [x[1] for x in s_lengths2]):
            s1 = s_lengths1[s1][0]
            s2 = s_lengths2[s2][0]
            bitext.append((s1, s2))

    return [
        AlignedSent(s.split(), t.split())
        for (s, t) in myDB.get_sentence_pairs(bitext)
    ]
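Both align_blocks and AlignedSent ship with NLTK, so the snippet above presumably starts from imports along these lines (database is a project-local module):

from nltk.translate import AlignedSent
from nltk.translate.gale_church import align_blocks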
Example #5
def align_and_show(chap_en,
                   chap_ba,
                   en_output_path,
                   ba_output_path,
                   write=True):
    """

    :param write:
    :param ba_output_path:
    :param en_output_path:
    :param chap: chapter number (int)
    :param chap_en: list of texts in the corresponding English Chapter
    :param chap_ba: ... Basque ...
    :return:
    """
    def align_index(index_mapping):
        """

        :param index_mapping:   [(0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (5, 5), (6, 6), (7, 7), (7, 8), (8, 9)]
        :return:                [(0, [0]), (1, [1]), (2, [2]), (3, [3]), (4, [4]), (5, [5]), (6, [6]), (7, [7, 8]), (8, [9])]
        """
        al = {}
        for (e, g) in index_mapping:
            if e not in al: al[e] = []
            al[e].append(g)

        aligned_indexes = [(k, al[k]) for k in sorted(al.keys())]
        return aligned_indexes

    chap_en_leng = chapter_paras_length(
        chap_en
    )  # [105, 184, 150, 60, 113, 218, 88, 354, 138], length of each paragraph
    chap_ba_leng = chapter_paras_length(
        chap_ba)  # [ 101, 154, 121, 45, 94, 192, 80, 159, 130, 116]
    index_mapping = gale_church.align_blocks(chap_en_leng, chap_ba_leng)

    aligned_indexes = align_index(index_mapping)
    en_aligned_text_list, ba_aligned_text_list = aligned_indexes2aligned_texts(
        aligned_indexes,
        chap_en,
        chap_ba,
        en_output_path,
        ba_output_path,
        write=write)
    return en_aligned_text_list, ba_aligned_text_list
Example #6
def alignSentences(lang_source, lang_target, text_source, text_target):
    # get spacy models for language processing
    sp_source = utils.getSpacy(lang_source)
    sp_target = utils.getSpacy(lang_target)
    doc_source = sp_source(text_source)
    doc_target = sp_target(text_target)

    # if either side is English, load the embeddings in a single step to improve performance
    eng_doc = None
    if lang_source == utils.Lang.ENG:
        eng_doc = doc_source
    elif lang_target == utils.Lang.ENG:
        eng_doc = doc_target
    if eng_doc is not None:
        words = {t.lemma_.lower() for t in eng_doc if t.is_alpha}
        glove.getVector(words)

    sent_source = [sent.string.strip() for sent in doc_source.sents]
    sent_target = [sent.string.strip() for sent in doc_target.sents]
    len_source = [len(sent) for sent in sent_source]
    len_target = [len(sent) for sent in sent_target]
    alignment = gale_church.align_blocks(len_source, len_target)
    src_set = set()
    tgt_set = set()
    blocks = []
    for src_idx, tgt_idx in alignment:
        if src_idx in src_set or tgt_idx in tgt_set:
            src_set.add(src_idx)
            tgt_set.add(tgt_idx)
        else:
            if len(src_set) or len(tgt_set):
                src_block = ' '.join([sent_source[i] for i in sorted(src_set)])
                tgt_block = ' '.join([sent_target[i] for i in sorted(tgt_set)])
                blocks.append((src_block, tgt_block))
            src_set.clear()
            tgt_set.clear()
            src_set.add(src_idx)
            tgt_set.add(tgt_idx)
    if len(src_set) or len(tgt_set):
        src_block = ' '.join([sent_source[i] for i in sorted(src_set)])
        tgt_block = ' '.join([sent_target[i] for i in sorted(tgt_set)])
        blocks.append((src_block, tgt_block))
    return blocks
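The set-based loop is the key idiom here: align_blocks only ever returns 1:1 index pairs, so consecutive pairs that share a source or target index accumulate in src_set/tgt_set, and the first pair that shares neither flushes the accumulated sets as one 1-many or many-1 block. The trailing if after the loop flushes the final block.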
Example #7
    def __init__(self, doc_level_bitext):
        logging.debug("Creating paragraph level bitext from document pair")
        src_lengths = doc_level_bitext.src_doc.get_paragraph_lengths()
        tgt_lengths = doc_level_bitext.tgt_doc.get_paragraph_lengths()
        par_alignments = single_alignments_to_sequence_pairs(
            align_blocks(src_lengths, tgt_lengths))
        self.bi_par = []
        for a in par_alignments:
            if debug:
                print "my par align = " + str(a)
            new_src_par = doc_level_bitext.src_doc.paragraphs[a[0][0]]
            #print "current number of sentences in new src par = "+str(len(new_src_par.sentences))
            for i in range(1, len(a[0])):
                new_src_par += doc_level_bitext.src_doc.paragraphs[a[0][i]]

            new_tgt_par = doc_level_bitext.tgt_doc.paragraphs[a[1][0]]
            for i in range(1, len(a[1])):
                new_tgt_par += doc_level_bitext.tgt_doc.paragraphs[a[1][i]]
            self.bi_par.append((new_src_par, new_tgt_par))
        logging.info("%d paragraph alignments", len(self.bi_par))
Example #8
        s_lengths2 = myDB.get_sentence_lengths(doc2id, para2)
        for s1, s2 in align_blocks([x[1] for x in s_lengths1],
                                   [x[1] for x in s_lengths2]):
            s1 = s_lengths1[s1][0]
            s2 = s_lengths2[s2][0]
            bitext.append((s1, s2))

    return [
        AlignedSent(s.split(), t.split())
        for (s, t) in myDB.get_sentence_pairs(bitext)
    ]


def count_sentence_lengths(file, lengths):
    with open(file, 'r') as f:
        for line in f:
            line = line.rstrip()
            line = re.sub(" ", "", line)
            lengths.append(len(line))


# takes two one-segment-per-line files as input
if __name__ == '__main__':
    srclengths = []
    tgtlengths = []
    count_sentence_lengths(sys.argv[1], srclengths)
    count_sentence_lengths(sys.argv[2], tgtlengths)
    sent_alignments = align_blocks(srclengths, tgtlengths)
    for sp in sent_alignments:
        print(sp)
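Run as a script, the file above takes two one-segment-per-line files and prints one (source_index, target_index) pair per line, e.g. python align_sents.py source.txt target.txt (the script name here is hypothetical).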
Example #9
def chap_to_sent_align(chap):
    en_paras = []
    ba_paras = []
    with open(
            'translation-dashboard/data/en-ba-para-align/en-chapter-' +
            str(chap) + '.txt', "r") as en_txt:
        for line in en_txt:  # each line is a paragraph
            en_paras.append(line)
            '''
            if("<para>" in line):
                loc = line.find("<para>")
                en_paras.append(line[:loc])
                en_paras.append(line[loc+6:])
            else:
                en_paras.append(line)
            '''
    with open(
            'translation-dashboard/data/en-ba-para-align/ba-chapter-' +
            str(chap) + '.txt', "r") as ba_txt:
        for line in ba_txt:  # each line is a paragraph
            ba_paras.append(line)
            '''
            if ("<para>" in line):
                loc = line.find("<para>")
                ba_paras.append(line[:loc])
                ba_paras.append(line[loc + 6:])
            else:
                ba_paras.append(line)
            '''

    en_sents_lens = []
    ba_sents_lens = []
    for para in en_paras:
        for sent in stokenizer(para):
            # number of words in the sentence
            en_sents_lens.append(len(sent.split()))
    for para in ba_paras:
        for sent in stokenizer(para):
            # number of words in the sentence
            ba_sents_lens.append(len(sent.split()))

    def align_index(index_mapping):
        """

        :param index_mapping:   [(0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (5, 5), (6, 6), (7, 7), (7, 8), (8, 9)]
        :return:                [(0, [0]), (1, [1]), (2, [2]), (3, [3]), (4, [4]), (5, [5]), (6, [6]), (7, [7, 8]), (8, [9])]
        """
        al = {}
        for (e, g) in index_mapping:
            if e not in al: al[e] = []
            al[e].append(g)

        aligned_indexes = [(k, al[k]) for k in sorted(al.keys())]
        return aligned_indexes

    sent_align = gale_church.align_blocks(en_sents_lens, ba_sents_lens)
    aligned_indexes = align_index(sent_align)
    # print(aligned_indexes)  # e.g. [(0, [0]), (1, [1]), ..., (29, [29, 30]), ..., (70, [74])]

    en_path = 'corpora/english-modified.txt'
    ba_folder_path = 'corpora/basque'
    chap_en = get_english_chapter(en_path, chap)
    chap_ba = get_basque_chapter(ba_folder_path, chap)
    en_output_path = 'translation-dashboard/data/en-ba-sent-align/en-chapter-' + str(
        chap) + '.txt'
    ba_output_path = 'translation-dashboard/data/en-ba-sent-align/ba-chapter-' + str(
        chap) + '.txt'

    en_aligned_text_list = []
    ba_aligned_text_list = []

    for para in chap_en:
        en_aligned_text_list.extend(
            stokenizer(para))  # produces list of all sentences
    for para in chap_ba:
        ba_aligned_text_list.extend(
            stokenizer(para))  # produces list of all sentences

    # write the aligned sentence pairs to the output files
    with open(en_output_path, 'wt', encoding='utf-8') as en_file:
        for en, ba_list in aligned_indexes:
            en_file.write(en_aligned_text_list[en] + '\n')

    with open(ba_output_path, 'wt', encoding='utf-8') as ba_file:
        for en, ba_list in aligned_indexes:
            ba_txt_list = []
            if len(ba_list) > 1:
                for i in ba_list:
                    ba_txt_list.append(ba_aligned_text_list[i])
                ba_file.write('<sent>'.join(str(v)
                                            for v in ba_txt_list) + '\n')
            else:
                ba_file.write(ba_aligned_text_list[ba_list[0]] + '\n')
Example #10
    def align_block(self, block_n):
        fb = self.text_f.blocks[block_n]
        tb = self.text_t.blocks[block_n]
        self.aligned_blocks[block_n] = align_blocks(fb.stnc_lengths_char,
                                                    tb.stnc_lengths_char)