def words_location(workbook, dict_of_frequencies):
    """Add two worksheets listing every word followed by its stored values.

    Each key of ``dict_of_frequencies`` is written in column 0 of its own
    row, and the items of the value stored under that key are written one
    per cell in the columns to its right.

    The first sheet follows the order returned by ``tib_sort``; the second
    is ordered by how many pieces the word yields when split on '་', with
    the largest count first.

    Args:
        workbook: object exposing ``add_worksheet(name)``; the returned
            sheet must expose ``write(row, col, value)``.
        dict_of_frequencies: mapping from a word to an indexable collection
            (presumably where the word was found; confirm with the caller).
    """

    def write_words(sorted_list, sheet_name):
        """Fill a new sheet with one row per ``(word, origin)`` pair."""
        sheet = workbook.add_worksheet(sheet_name)
        for row_idx, (word, origin) in enumerate(sorted_list):
            sheet.write(row_idx, 0, word)
            # The origin items occupy columns 1, 2, ... of the same row.
            for col_idx, item in enumerate(origin, start=1):
                sheet.write(row_idx, col_idx, item)

    # Sheet 1: order given by tib_sort.
    by_alphabet = [
        (word, dict_of_frequencies[word])
        for word in tib_sort(dict_of_frequencies)
    ]
    write_words(by_alphabet, 'བྱུང་ཁུངས་ལ་ཀ་ཁའི་གོ་རིམ།')

    # Sheet 2: words with the most '་'-separated pieces come first.
    by_size = sorted(
        dict_of_frequencies.items(),
        key=lambda pair: len(pair[0].split('་')),
        reverse=True,
    )
    write_words(by_size, 'བྱུང་ཁུངས་ལ་རིང་ཐུང་གི་གོ་རིམ།')
def add_sorted_data(workbook, dict_of_frequencies, data_name):
    """Add two two-column worksheets built from ``dict_of_frequencies``.

    The first sheet lists ``(value, key)`` rows sorted by value, largest
    first. The second lists ``(key, value)`` rows in the key order returned
    by ``tib_sort``; note that its column order is the reverse of the first
    sheet's.

    Args:
        workbook: object exposing ``add_worksheet(name)``; the returned
            sheet must expose ``write(row, col, value)``.
        dict_of_frequencies: mapping from a word to its frequency.
        data_name: prefix prepended to both sheet names.
    """

    def write_data(data, sheet_name):
        """Write each pair of ``data`` into columns 0 and 1 of a new sheet."""
        sheet = workbook.add_worksheet(sheet_name)
        for row_idx, (first, second) in enumerate(data):
            sheet.write(row_idx, 0, first)
            sheet.write(row_idx, 1, second)

    # a. sorted by frequency, highest first (ties keep the mapping's order)
    ranked = sorted(
        dict_of_frequencies.items(),
        key=lambda pair: pair[1],
        reverse=True,
    )
    by_frequency = [(count, word) for word, count in ranked]
    write_data(by_frequency, data_name + 'ཚིག་རྒྱུག་ཆེ་ཆུང་།')

    # b. sorted with tib_sort; here the word goes first, then its frequency
    by_alphabet = [
        (word, dict_of_frequencies[word])
        for word in tib_sort(dict_of_frequencies)
    ]
    write_data(by_alphabet, data_name + 'ཀ་ཁའི་གོ་རིམ།')
def format_footnote(note, chosen, ref):
    """Build the footnote text for one note.

    Args:
        note: mapping from an edition key (e.g. 'སྡེ་', 'པེ་', 'སྣར་') to
            that edition's text. The text is passed through ``''.join``, so
            it may be a string or a sequence of strings.
        chosen: selects the footnote layout.
            'K' lists the readings that differ from the base edition
            ('སྡེ་', or 'པེ་' when 'སྡེ་' is absent), each followed by the
            full names of the editions carrying it.
            'b', 'n' and 'p' produce a correction note quoting the 'སྡེ་'
            text and the replacement reading taken from 'སྣར་' ('b', 'n')
            or 'པེ་' ('p').
        ref: value placed at the start of the footnote.

    Returns:
        The formatted footnote string, or None when ``chosen`` is not one
        of the four codes above.

    Raises:
        KeyError: when an edition needed by the selected layout is missing
            from ``note``, or (layout 'K') when a differing edition other
            than 'ཞོལ་' has no entry in the full-name table.
    """
    punct = (
        '༄', '༅', '༆', '༇', '༈', '།', '༎', '༏', '༐', '༑', '༔', '་', ' ',
    )
    trailing_chars = ''.join(punct)

    def agree_zhas(chosen_ed):
        """Return Agreement().part_agreement for the last syllable and 'ཞེས'.

        The last syllable is taken from ``pre_process(..., mode='syls')``.
        Presumably the result is the particle form agreeing with that
        syllable; confirm against the Agreement class.
        """
        last_syl = pre_process(chosen_ed, mode='syls')[-1]
        return Agreement().part_agreement(last_syl, 'ཞེས')

    def strip_punct(string):
        """Drop every trailing character of ``string`` listed in ``punct``."""
        return string.rstrip(trailing_chars)

    def clean_ed_text(ed):
        """Join ``ed``, delete spaces and '#', then turn '_' into a space."""
        return ''.join(ed).replace(' ', '').replace('#', '').replace('_', ' ')

    def correction(source_ed, template):
        """Fill ``template`` with ref, the 'སྡེ་' text, the reading, the particle."""
        derge = strip_punct(clean_ed_text(note['སྡེ་']))
        reading = strip_punct(clean_ed_text(note[source_ed]))
        return template.format(ref, derge, reading, agree_zhas(reading))

    if chosen == 'K':
        # Remember which edition served as the base so that the "absent
        # elsewhere" message below quotes that same edition instead of
        # failing on a missing 'སྡེ་' key.
        base_ed = 'སྡེ་' if 'སྡེ་' in note else 'པེ་'
        base_text = note[base_ed] if note else None
        stripped_note = {
            ed: text for ed, text in note.items() if text != base_text
        }

        # Group the differing editions by their cleaned reading.
        ordered = defaultdict(list)
        for ed, text in stripped_note.items():
            ordered[strip_punct(clean_ed_text(text))].append(ed)

        # Only empty variants left: emit the fixed 'ནང་མེད།' note
        # (originally described as "for all notes where Derge adds something").
        if all(text == '' for text in ordered):
            return '{}: {}། ཞེས་པར་མ་གཞན་ནང་མེད།'.format(
                ref, strip_punct(''.join(note[base_ed])))

        full_names = {
            'སྡེ་': 'སྡེ་དགེ',
            'ཅོ་': 'ཅོ་ནེ',
            'པེ་': 'པེ་ཅིན',
            'སྣར་': 'སྣར་ཐང་',
        }
        final = []
        for text in tib_sort(ordered.keys()):
            if text != '':
                final.append(text)
                # 'ཞོལ་' is never named in the footnote.
                final.extend([
                    full_names[ed]
                    for ed in tib_sort(ordered[text])
                    if ed != 'ཞོལ་'
                ])
        return '{}: {}།'.format(ref, '། '.join(final))

    if chosen == 'b':
        return correction(
            'སྣར་',
            '{}: མ་དཔེར་{}། བྱུང་ཡང་པེ་ཅིན་དང་སྣར་ཐང་བཞིན། {}། {}་བཅོས།')
    if chosen == 'n':
        return correction(
            'སྣར་',
            '{}: མ་དཔེར་{}། བྱུང་ཡང་སྣར་ཐང་བཞིན། {}། {}་བཅོས།')
    if chosen == 'p':
        return correction(
            'པེ་',
            '{}: མ་དཔེར་{}། བྱུང་ཡང་པེ་ཅིན་བཞིན། {}། {}་བཅོས།')
    return None
# Replace every marker listed in to_delete with a space, then collapse each
# run of whitespace into a single space.
# NOTE(review): the last element of content is left out, exactly as the
# original loop bounds did (presumably a trailing empty line; confirm).
text = []
for line in content[:-1]:
    for marker in to_delete:
        line = line.replace(marker, ' ')
    text.append(re.sub(r'\s+', r' ', line))

# Every non-empty space-separated unit becomes an entry ending in one '་'.
lexicon = [
    unit.strip('་') + '་'
    for cleaned in text
    for unit in cleaned.split(' ')
    if unit.strip('་') != ''
]
new.extend(lexicon)
new = list(set(new))  # de-duplicate; order is restored by tib_sort below

# Gather every entry already present in the lexicon source files.
oral_corpus_num = 0
extant_lexicon = []
for lexicon_path in (
    '../updateJs/src/TDC.txt',
    '../updateJs/src/verbs.txt',
    '../updateJs/src/particles.txt',
):
    extant_lexicon.extend(open_file(lexicon_path).split('\n'))
for f in os.listdir('../updateJs/src/new_entries/'):
    extant_lexicon.extend(
        open_file('../updateJs/src/new_entries/'+f).split('\n'))
    # File names are expected to look like '<a>_<b>_<number>.<ext>'; keep the
    # highest number seen so the next output file gets number + 1.
    number = int(f.split('.')[0].split('_')[2])
    if number > oral_corpus_num:
        oral_corpus_num = number

# Test membership against a set built once rather than scanning the list for
# every candidate entry.
known_entries = set(extant_lexicon)
new_entries = [n for n in new if n not in known_entries]

write_file(out_path+'all_entries{}.txt'.format(oral_corpus_num+1),
           '\n'.join(tib_sort(new)))
if new_entries:
    write_file(
        '../updateJs/src/new_entries/oral_corpus_{}.txt'.format(
            oral_corpus_num+1),
        '\n'.join(tib_sort(new_entries)))