Пример #1
0
def words_location(workbook, dict_of_frequencies):
    """Write two worksheets listing every word followed by its origins.

    Each row holds the word in the first column and then one element of
    the word's mapped value per following column.

    :param workbook: object exposing ``add_worksheet(name)``; the returned
        sheet must expose ``write(row, col, value)``.
    :param dict_of_frequencies: mapping of word -> sequence of origins
        (despite the parameter name, the values are iterated element by
        element here, not treated as counts).
    """
    def write_words(sorted_list, sheet_name):
        # One row per (word, origins) pair, in the order given.
        sheet = workbook.add_worksheet(sheet_name)
        for row_idx, (word, origin) in enumerate(sorted_list):
            sheet.write(row_idx, 0, word)
            for col_idx, source in enumerate(origin, start=1):
                sheet.write(row_idx, col_idx, source)

    def syllable_count(pair):
        # Number of pieces obtained by splitting the word on the tsek.
        return len(pair[0].split('་'))

    # a. words in the order produced by tib_sort
    by_alphabet = [
        (word, dict_of_frequencies[word])
        for word in tib_sort(dict_of_frequencies)
    ]
    write_words(by_alphabet, 'བྱུང་ཁུངས་ལ་ཀ་ཁའི་གོ་རིམ།')

    # b. longest words first (stable sort, so ties keep dict order)
    by_length = sorted(
        dict_of_frequencies.items(), key=syllable_count, reverse=True)
    write_words(by_length, 'བྱུང་ཁུངས་ལ་རིང་ཐུང་གི་གོ་རིམ།')
Пример #2
0
def add_sorted_data(workbook, dict_of_frequencies, data_name):
    """Write a frequency-sorted and an alpha-sorted worksheet.

    :param workbook: object exposing ``add_worksheet(name)``; the returned
        sheet must expose ``write(row, col, value)``.
    :param dict_of_frequencies: mapping of word -> frequency.
    :param data_name: prefix prepended to both worksheet names.

    NOTE(review): the two sheets use opposite column orders — the
    frequency sheet is (frequency, word) while the alphabetical sheet is
    (word, frequency). This mirrors the existing behaviour; confirm it is
    intended.
    """
    def write_data(data, sheet_name):
        # Each pair fills the first two columns of one row.
        sheet = workbook.add_worksheet(sheet_name)
        for row_idx, (first, second) in enumerate(data):
            sheet.write(row_idx, 0, first)
            sheet.write(row_idx, 1, second)

    # a. highest frequency first (stable sort, so ties keep dict order)
    by_frequency = sorted(
        ((count, word) for word, count in dict_of_frequencies.items()),
        key=lambda pair: pair[0],
        reverse=True,
    )
    write_data(by_frequency, data_name + 'ཚིག་རྒྱུག་ཆེ་ཆུང་།')

    # b. words in the order produced by tib_sort
    by_alphabet = [
        (word, dict_of_frequencies[word])
        for word in tib_sort(dict_of_frequencies)
    ]
    write_data(by_alphabet, data_name+'ཀ་ཁའི་གོ་རིམ།')
def format_footnote(note, chosen, ref):
    """Build the text of one critical-apparatus footnote.

    :param note: mapping of edition key ('སྡེ་', 'ཅོ་', 'པེ་', 'སྣར་', ...)
        to that edition's reading, given as an iterable of string pieces
        that are joined together.
    :param chosen: which reading was adopted:
        'K' - keep the base text and list the variant readings;
        'b' - correct following both Peking and Narthang (the Narthang
              reading is the one quoted);
        'n' - correct following Narthang;
        'p' - correct following Peking.
    :param ref: reference label placed at the start of the footnote.
    :return: the formatted footnote, or ``None`` when ``chosen`` is not one
        of the values above.
    :raises KeyError: when ``note`` lacks an edition the chosen mode needs,
        or (mode 'K') contains a variant edition with no known full name.
    """
    # Trailing characters removed from readings before they are quoted.
    punct = ''.join([
        '༄', '༅', '༆', '༇', '༈', '།', '༎', '༏', '༐', '༑', '༔', '་', ' '
    ])

    def agree_zhas(chosen_ed):
        # Form of the particle 'ཞེས' agreeing with the reading's last syllable.
        last_syl = pre_process(chosen_ed, mode='syls')[-1]
        return Agreement().part_agreement(last_syl, 'ཞེས')

    def strip_punct(string):
        return string.rstrip(punct)

    def clean_ed_text(ed):
        # Drop literal spaces and '#' markers; '_' stands for a real space.
        return ''.join(ed).replace(' ', '').replace('#', '').replace('_', ' ')

    def strip_similar_to_chosen(note, chosen_ed):
        # Keep only the editions whose reading differs from chosen_ed's.
        reference = note[chosen_ed]
        return {k: v for k, v in note.items() if v != reference}

    def correction(source_ed, template):
        # Shared shape of the 'b' / 'n' / 'p' footnotes: Derge reading,
        # adopted reading, and the agreeing quotative particle.
        derge = strip_punct(clean_ed_text(note['སྡེ་']))
        adopted = strip_punct(clean_ed_text(note[source_ed]))
        zhas = agree_zhas(adopted)
        return template.format(ref, derge, adopted, zhas)

    if chosen == 'K':
        # Derge is the base text; fall back to Peking when Derge is absent.
        # The same key is reused below so the fallback no longer crashes on
        # a hard-coded Derge lookup.
        base_ed = 'སྡེ་' if 'སྡེ་' in note else 'པེ་'
        stripped_note = strip_similar_to_chosen(note, base_ed)

        # Group the differing editions by their cleaned reading.
        ordered = defaultdict(list)
        for k, v in stripped_note.items():
            ordered[strip_punct(clean_ed_text(v))].append(k)

        # No edition offers a non-empty variant: the base text adds
        # something that the other editions do not have.
        if not any(text != '' for text in ordered):
            return '{}: {}། ཞེས་པར་མ་གཞན་ནང་མེད།'.format(
                ref, strip_punct(''.join(note[base_ed])))

        full_names = {
            'སྡེ་': 'སྡེ་དགེ',
            'ཅོ་': 'ཅོ་ནེ',
            'པེ་': 'པེ་ཅིན',
            'སྣར་': 'སྣར་ཐང་'
        }
        final = []
        for text in tib_sort(ordered.keys()):
            if text != '':
                final.append(text)
                # The 'ཞོལ་' edition is never named in the footnote.
                final.extend([
                    full_names[ed] for ed in tib_sort(ordered[text])
                    if ed != 'ཞོལ་'
                ])
        return '{}: {}།'.format(ref, '། '.join(final))

    if chosen == 'b':
        return correction(
            'སྣར་',
            '{}: མ་དཔེར་{}། བྱུང་ཡང་པེ་ཅིན་དང་སྣར་ཐང་བཞིན། {}། {}་བཅོས།')
    if chosen == 'n':
        return correction(
            'སྣར་',
            '{}: མ་དཔེར་{}། བྱུང་ཡང་སྣར་ཐང་བཞིན། {}། {}་བཅོས།')
    if chosen == 'p':
        return correction(
            'པེ་',
            '{}: མ་དཔེར་{}། བྱུང་ཡང་པེ་ཅིན་བཞིན། {}། {}་བཅོས།')
    return None
    # Replace every substring listed in `to_delete` with a space, then
    # collapse whitespace runs, collecting the cleaned lines in `text`.
    # NOTE(review): `content`, `to_delete` and `new` are defined outside this
    # fragment, as is the enclosing block this indentation belongs to.
    text = []
    # The final element of `content` is skipped — presumably an empty
    # string left by a trailing newline; confirm against the caller.
    for r in range(len(content)-1):
        line = content[r]
        for t in to_delete:
            line = line.replace(t, ' ')
        # Squeeze any run of whitespace down to a single space.
        text.append(re.sub(r'\s+', r' ', line))

    # Split each cleaned line on spaces and normalise every non-empty token
    # so that it has no leading tsek and ends with exactly one.
    lexicon = []
    for t in text:
        lexicon.extend([u.strip('་')+'་' for u in t.split(' ') if u.strip('་') != ''])
    # Accumulate this batch of entries into the running `new` list.
    new.extend(lexicon)
# Drop duplicate entries; ordering is re-established by tib_sort on output.
new = list(set(new))

# Gather every entry already present in the lexicon sources, and find the
# highest index used so far by the files in new_entries/.
oral_corpus_num = 0
extant_lexicon = []
extant_lexicon.extend(open_file('../updateJs/src/TDC.txt').split('\n'))
extant_lexicon.extend(open_file('../updateJs/src/verbs.txt').split('\n'))
extant_lexicon.extend(open_file('../updateJs/src/particles.txt').split('\n'))
for f in os.listdir('../updateJs/src/new_entries/'):
    extant_lexicon.extend(open_file('../updateJs/src/new_entries/'+f).split('\n'))
    # The index is the third '_'-separated field of the name before the
    # first '.', e.g. 'oral_corpus_3.txt' -> 3 (the pattern written below).
    number = int(f.split('.')[0].split('_')[2])
    if number > oral_corpus_num:
        oral_corpus_num = number

# Build the lookup set once: testing each new entry against the list made
# this step quadratic in the size of the lexicon.
known_entries = set(extant_lexicon)
new_entries = [n for n in new if n not in known_entries]

# Always write the full sorted list; write a new_entries file only when
# something is actually missing from the existing lexicon.
write_file(out_path+'all_entries{}.txt'.format(oral_corpus_num+1), '\n'.join(tib_sort(new)))
if new_entries:
    write_file('../updateJs/src/new_entries/oral_corpus_{}.txt'.format(oral_corpus_num+1), '\n'.join(tib_sort(new_entries)))