def generate_editions(editions, out_dir, work_name):
    # write every edition in its respective folder
    for ed in editions:
        path = out_dir / 'editions' / ed.replace('་', '།')
        file_name = work_name + '_' + ed + '.txt'
        # keep only the syllables (first element of each entry) and restore spaces
        content = ''.join([syl[0] for syl in editions[ed]]).replace('_', ' ')
        write_file(path / file_name, content)
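# A sketch of the data shape generate_editions() expects, inferred from the
# indexing above: each edition name maps to a list of entries whose first
# element is a syllable, with '_' marking a space to restore. The values
# below are toy data for illustration only.
example_editions = {
    'སྡེ་': [('བཀྲ་', None), ('ཤིས་', None)],
    'པེ་': [('བཀྲ་', None), ('ཤིས་', None)],
}
# generate_editions(example_editions, Path('output'), 'example_work') would
# write output/editions/<edition>/example_work_<edition>.txt for each
# edition (out_dir being a pathlib.Path, since it is joined with /).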
def export_unified_structure(editions, text_name, out_dir=outDir / 'unified_structure'):
    unified = generate_unified_version(editions)
    out = yaml.dump(unified, allow_unicode=True, default_flow_style=False,
                    width=float("inf"))
    write_file(out_dir / f'{text_name}_unified_structure.yaml', out)
def copy_derge_layout(derge_layout):
    for f in os.listdir('output'):
        no_layout = True
        if f in [a.replace('_raw_page_reinserted.txt', '') for a in os.listdir(derge_layout)]:
            content = open_file('{}/{}_raw_page_reinserted.txt'.format(derge_layout, f))
            # strip the page separators and the escaping backslashes
            reformatted = re.sub(r'\n-+', '', content).replace('\\', '')
            write_file('output/{}/{}'.format(f, f + '_derge_layout.txt'), reformatted)
            no_layout = False
        if no_layout and f in [a.replace('_with_a.txt', '') for a in os.listdir('../4-a-final_formatting/output/2-0-with_a')]:
            content = open_file('../4-a-final_formatting/output/2-0-with_a/{}_with_a.txt'.format(f))
            # 'a' marks the line breaks of the original layout
            reformatted = content.replace('\n', ' ').replace('a', '\n')
            write_file('output/{}/{}'.format(f, f + '_derge_lines.txt'), reformatted)
def reconstruct_version_texts(in_path):
    for f in os.listdir(in_path):
        text_name = f.replace('_updated_structure.txt', '')
        # only process texts that already have an output folder
        if text_name in os.listdir('output'):
            current_out_folder = 'output/' + text_name
            # open the structure file
            from_structure = yaml.load(open_file('{}/{}'.format(in_path, f)),
                                       Loader=yaml.SafeLoader)
            # reconstruct the editions
            editions = reconstruct_edition_versions(from_structure)
            # write them in the corresponding folder
            for ed, version in editions.items():
                version = version.replace('_', ' ')  # reconstruct spaces
                write_file('{}/{}_{}_layer.txt'.format(current_out_folder, text_name, ed),
                           version)
def process(in_path, file_origin, name_end, out_path):
    for f in os.listdir(in_path):
        work_name = f.replace(name_end, '')
        # raw_content = open_file(file_origin.format(work_name.replace('_', ' ')))
        try:
            raw_content = open_file(file_origin.format(work_name))
        except FileNotFoundError:
            continue
        # remove the "1. "-style numbering, then put each space-separated unit on its own line
        content = re.sub(r'\n?[0-9]+\.\s+', '', raw_content)
        content = re.sub(r' ', '\n', content)
        write_file(out_path[0].format(work_name), content)
        # also write a copy without the 'a' line markers
        content = content.replace('a', '')
        write_file(out_path[1].format(work_name), content)
def reinsert_raw(in_path, out_path, patterns):
    print('raw reinsertion')
    for f in os.listdir(in_path):
        work_name = f.replace('_with_a.txt', '')
        print(work_name)
        if work_name in patterns:
            print('\t', work_name)
            content = open_file('{}/{}'.format(in_path, f))
            # 'a' marks the original line breaks
            lines = deque(content.replace('\n', ' ').split('a'))
            pages = []
            text_pattern = patterns[work_name][2:]
            counter = patterns[work_name][0][1]
            side = patterns[work_name][0][2]
            # beginning pages
            for num in text_pattern[0]:
                pages.append(create_page(lines, num, counter, side))
                counter, side = increment_counter(counter, side)
            # body of the text
            while len(lines) > 0:
                if len(lines) >= text_pattern[1]:
                    pages.append(
                        create_page(lines, text_pattern[1], counter, side))
                    counter, side = increment_counter(counter, side)
                else:
                    # last page: consume whatever lines remain
                    # (the expected count is text_pattern[2])
                    pages.append(create_page(lines, len(lines), counter, side))
                    counter, side = increment_counter(counter, side)
            output = '\n{}\n'.format('-' * 100).join(pages)
            write_file(
                '{}/{}_raw_page_reinserted.txt'.format(out_path, work_name),
                output)
    print('2-2!')
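# A sketch of the `patterns` layout read by reinsert_raw() above, inferred
# from the indexing: patterns[work][0][1] is the starting page counter,
# patterns[work][0][2] the starting side, and patterns[work][2:] the text
# pattern. Toy values, for illustration only; the first tuple element and
# item [1] are not used here.
example_patterns = {
    'example_work': [
        (1, 1, 'a'),  # (unused, starting page counter, starting side)
        None,         # unused by reinsert_raw()
        [4, 5],       # text_pattern[0]: lines on each beginning page
        7,            # text_pattern[1]: lines per regular page
        3,            # text_pattern[2]: expected lines on the last page
    ],
}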
# replace them with spaces
text = []
for r in range(len(content) - 1):  # note: the last line is not processed
    line = content[r]
    for t in to_delete:
        line = line.replace(t, ' ')
    text.append(re.sub(r'\s+', r' ', line))

# split into syllables and normalise the trailing tsek
lexicon = []
for t in text:
    lexicon.extend([u.strip('་') + '་' for u in t.split(' ') if u.strip('་') != ''])
new.extend(lexicon)
new = list(set(new))

# gather the existing lexicon and find the highest oral corpus number
oral_corpus_num = 0
extant_lexicon = []
extant_lexicon.extend(open_file('../updateJs/src/TDC.txt').split('\n'))
extant_lexicon.extend(open_file('../updateJs/src/verbs.txt').split('\n'))
extant_lexicon.extend(open_file('../updateJs/src/particles.txt').split('\n'))
for f in os.listdir('../updateJs/src/new_entries/'):
    extant_lexicon.extend(open_file('../updateJs/src/new_entries/' + f).split('\n'))
    number = int(f.split('.')[0].split('_')[2])
    if number > oral_corpus_num:
        oral_corpus_num = number

# keep only the entries missing from the existing lexicon
new_entries = [n for n in new if n not in extant_lexicon]
write_file(out_path + 'all_entries{}.txt'.format(oral_corpus_num + 1),
           '\n'.join(tib_sort(new)))
if new_entries:
    write_file('../updateJs/src/new_entries/oral_corpus_{}.txt'.format(oral_corpus_num + 1),
               '\n'.join(tib_sort(new_entries)))
def generate_context_versions(editions, file_name, out_dir, left=5, right=5, base_ed='སྡེ་'):
    def calculate_contexts(unified_version, left=5, right=5, base_ed='སྡེ་'):
        all_versions = []
        c = 0
        for num, syl in enumerate(unified_version):
            if type(syl) == dict:
                versions = {}
                for ed in syl:
                    # add left context
                    n_l = num - left
                    if n_l < 0:
                        n_l = 0
                    left_context = unified_version[n_l:num]
                    # add note
                    note = syl[ed]
                    # add right context
                    n_r = num + right + 1
                    if n_r > len(unified_version) - 1:
                        n_r = len(unified_version) - 1
                    right_context = unified_version[num + 1:n_r]
                    version = left_context + note + right_context
                    # if a context syllable is itself a note (a dict),
                    # keep the base_ed reading
                    no_note_version = []
                    for v in version:
                        if type(v) == dict:
                            for base_syl in v[base_ed]:
                                no_note_version.append(base_syl)
                        else:
                            no_note_version.append(v)
                    # add the version for this edition
                    versions[ed] = ''.join(no_note_version).replace('_', ' ')
                c += 1
                versions[str(c)] = ''
                all_versions.append(versions)
        return all_versions

    unified = generate_unified_version(editions)
    with_context = calculate_contexts(unified, left=left, right=right, base_ed=base_ed)
    for i in range(len(with_context)):
        with_context[i] = [[a, with_context[i][a]] for a in sorted(with_context[i])]
    output = yaml.dump_all(with_context, allow_unicode=True,
                           default_flow_style=False, width=float("inf"))
    # reformat the page number
    output = re.sub(r'\n- -([^\n]+)\n  -', r'\n\1: ', output)
    output = re.sub(r"---\n '([0-9]+)': ''", r'-\1-', output)
    output = re.sub(r"- - '1'\n  - ''", r'-1-', output).replace(" '", '').replace("'", '')
    output = re.sub(r'\n', r',,,,,,,,,,,,,,,\n', output)
    # Todo
    write_file(out_dir / 'conc_yaml' / f'{file_name}_conc.txt', output)
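# A sketch of the `unified_version` structure consumed by
# calculate_contexts() above, inferred from the type checks: plain strings
# are syllables shared by all editions, dicts are notes mapping an edition
# name to its list of syllables. Toy values, for illustration only.
example_unified = [
    'བཀྲ་',
    'ཤིས་',
    {'སྡེ་': ['བདེ་'], 'པེ་': ['བདེས་'], 'ཅོ་': ['བདེ་'], 'སྣར་': ['བདེ་']},
    'ལེགས་',
]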
def copy_cat_json_file(json_path):
    for f in os.listdir('output'):
        write_file('output/{}/{}'.format(f, f + '_cats.json'),
                   open_file('{}/{}_cats.json'.format(json_path, f)))
            output.append(note)
            notes.append(format_footnote(s, decision, ref))
            note_map.append('K')
            stats[decision] += 1
            if grouped_unified[num] == s:
                similar_notes += 1
        else:
            note_map.append('0')
            stats[decision] += 1
            if grouped_unified[num] == s:
                similar_notes += 1
            continue

    prepared = ''.join(output).replace(' ', '').replace('#', '').replace(
        '_', ' ').replace(' ', '\n')
    write_file('output/0-1-formatted/{}_formatted.txt'.format(work_name),
               prepared + '\n\n' + '\n'.join(notes))
    write_file('output/0-3-corrected/{}_corrected.txt'.format(work_name),
               prepared + '\n\n' + '\n'.join(notes))

    # Stats
    total = 0
    for kind, value in stats.items():
        total += value
    percentages = {}
    for kind, value in stats.items():
        percentages[kind] = (value, value * 100 / total)
    discarted_notes = percentages['D'][0] + percentages['U'][0]
    kept_notes = percentages['C'][0] + percentages['K'][0] + percentages['?'][0]
    statistics = []
missing_space = r'^([^ -])'
missing_space_rpl = r' \1'
missing_tsek = r'(ཅོ|སྣར|སྡེ|པེ):'
missing_tsek_rpl = r'\1་:'

files = [a for a in os.listdir('.') if a != 'conc_sanity_check.py']
for f in files:
    print(f)
    raw = open_file('./' + f)
    raw = re.sub(missing_space, missing_space_rpl, raw)
    raw = re.sub(missing_tsek, missing_tsek_rpl, raw)
    raw = raw.replace('-1-,,,,,,,,,,,,,,,', '-1-,,,,,,,,,,,,,,,')
    write_file('./' + f, raw)
    lines = raw.split('\n')
    for num, line in enumerate(lines):
        toprint = False
        if line.startswith('-'):
            pass
        elif line.startswith(' ཅོ་:'):
            pass
        elif line.startswith(' སྣར་:'):
            pass
        elif line.startswith(' སྡེ་:'):
            pass
        elif line.startswith(' པེ་:'):
            pass
        elif num == len(lines) - 1 and line.strip() == '':
            pass
def create_base_text(raw_path):
    for f in os.listdir('output'):
        content = open_file('{}/{}_raw.txt'.format(raw_path, f))
        # put back in a single line
        content = content.replace('\n', ' ')
        write_file('output/{}/{}_base.txt'.format(f, f), content)
def reinsert(in_path, out_path1, out_path2, patterns):
    print('reinsertion with notes')
    for f in os.listdir(in_path):
        work_name = f.replace('_a_reinserted.txt', '')
        if work_name in patterns:
            print('\t', work_name)
            content = open_file('{}/{}'.format(in_path, f))
            # split the text from the footnotes, if any
            if not re.findall(r'\n\[\^[0-9A-Z]+\]\:', content):
                text = content
                notes = ''
            else:
                text, notes = [
                    a for a in re.split(
                        r'((?:\n?\[\^[0-9A-Z]+\]\:[^\n]+\n?)+)', content)
                    if a != ''
                ]
            lines = deque(text.replace('\n', ' ').split('a'))
            pages = []
            text_pattern = patterns[work_name][2:]
            counter = patterns[work_name][0][1]
            side = patterns[work_name][0][2]
            # beginning pages
            for num in text_pattern[0]:
                pages.append(create_page(lines, num, counter, side))
                counter, side = increment_counter(counter, side)
            # body of the text
            while len(lines) > 0:
                if len(lines) >= text_pattern[1]:
                    pages.append(
                        create_page(lines, text_pattern[1], counter, side))
                    counter, side = increment_counter(counter, side)
                elif text_pattern[2] == len(lines):
                    pages.append(create_page(lines, len(lines), counter, side))
                    counter, side = increment_counter(counter, side)
                else:
                    print(
                        'There is a line number issue: only {} lines were left for the last page.'
                        .format(len(lines)))
                    pages.append(create_page(lines, len(lines), counter, side))
                    counter, side = increment_counter(counter, side)
            output = '\n{}\n'.format('-' * 100).join(pages) + '\n\n' + notes
            write_file(
                '{}/{}_page_reinserted.txt'.format(out_path1, work_name),
                output)
            # write the file to 3-2-compared if it is not yet there
            existing = [
                g.replace('_compared.txt', '') for g in os.listdir(out_path2)
                if g.endswith('.txt')
            ]
            # if work_name not in existing:
            write_file('{}/{}_compared.txt'.format(out_path2, work_name), output)
            text_path = '{}/extra_copies/{}'.format(out_path2, work_name)
            if not os.path.exists(text_path):
                os.makedirs(text_path)
# write individual files for each text, presenting the mistakes
# in total frequency order
len_ordered_mistakes = sorted(total, key=lambda x: len(total[x]), reverse=True)
for f in os.listdir(in_path):
    if f.endswith('txt'):
        current_text = f.replace('.txt', '')
        # filter the mistakes of the current file
        output = []
        for mis in len_ordered_mistakes:
            tmp = []
            for occ in total[mis]:
                if current_text == occ[0]:
                    tmp.append(''.join(occ[1][0]) + mis + ''.join(occ[1][1]))
            if tmp:
                output.append('\n'.join([mis, '\n'.join(tmp)]))
        write_file('segmented/{}_segmented.txt'.format(current_text),
                   '\n\n'.join(output))

# write the totals over all texts
total_formatted = []
for mis in len_ordered_mistakes:
    tmp = []
    for occ in total[mis]:
        tmp.append(''.join(occ[1][0]) + mis + ''.join(occ[1][1]))
    if tmp:
        total_formatted.append('\n'.join(
            [' {} {}'.format(mis, len(total[mis])), '\n'.join(tmp)]))
total_len = ', '.join([m + str(len(total[m]))
                       for m in len_ordered_mistakes]).replace('#', '')
write_file('total_mistakes.txt', total_len + '\n' + '\n\n'.join(total_formatted))
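# A sketch of the `total` mapping consumed above, inferred from the
# indexing: each mistake maps to a list of occurrences, each occurrence
# being (text name, (left context syllables, right context syllables)).
# Toy values, for illustration only.
example_total = {
    'མཁྱེན་': [
        ('text_A', (['ཀུན་'], ['པ་', 'ལ་'])),
        ('text_B', (['ཐམས་ཅད་'], ['པས་'])),
    ],
}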
import sys, os

grandParentDir = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
sys.path.append(grandParentDir)
from PyTib.common import open_file, write_file

in_path = 'output/antconc_format'
out_path = '../3-b-reviewed_texts'
for f in os.listdir(in_path):
    name = f.replace('_antconc_format.txt', '')
    print(name)
    content = open_file('{}/{}'.format(in_path, f)).strip()
    lines = content.split('\n')
    output = [
        'Left,p,c,d,n,right,new,min_mod,particles,spelling_mistake,sskrt,verb,?,empty,double,profile,ngram_freq,file name,note_num'
    ]
    for line in lines:
        columns = line.split('\t')
        columns[6] = 'K'  # set the 'new' column (index 6) to 'K'
        output.append(','.join(columns))
    write_file('{}/{}_DUCKed.csv'.format(out_path, name), '\n'.join(output))
def process(in_path, template_path, total_stats):
    global collection_eds, file, debug
    raw_template = open_file(template_path)
    verbs = jp.decode(open_file('./resources/monlam_verbs.json'))
    all_ngrams = open_ngrams()
    files = find_file_path(in_path, '../1-a-reinsert_notes/output/conc_yaml')
    # print(files)
    for filename in files:
        # if 'N5000' not in filename:
        #     continue
        f = filename.split('/')[-1]
        print(f)
        if debug and f != file:
            continue
        work_name = f.replace('_conc.txt', '').replace('.txt', '')
        raw = open_file(filename)
        # set collection_eds for the current file
        collection_eds = list({a for a in re.findall(r' ([^ ]+): ', raw)})
        if len(collection_eds) > 4:
            print(collection_eds)
        data = prepare_data(raw)
        profiles, profile_cats = find_profiles(data)
        # prepare
        prepared = find_all_parts(data)
        # categorise
        categorised_notes = jp.decode(raw_template)
        # find ngram frequencies
        frequencies = ngram_frequency(prepared, all_ngrams)
        if debug:
            if file == f and note_num != 0:
                for note in prepared:
                    if note[0] == note_num:
                        categorise(note, categorised_notes, verbs)
            elif file == f:
                for note in prepared:
                    categorise(note, categorised_notes, verbs)
        else:
            for note in prepared:
                categorise(note, categorised_notes, verbs)
        # finally, write the json file
        # count the notes per category, down to four levels of nesting
        stats = {}
        total = 0
        for key1, item1 in sorted(categorised_notes.items()):
            if type(item1) == list:
                if len(item1) != 0:
                    stats[key1] = len(item1)
                    total += len(item1)
            else:
                stats[key1] = {}
                for key2, item2 in sorted(item1.items()):
                    if type(item2) == list:
                        if len(item2) != 0:
                            stats[key1][key2] = len(item2)
                            total += len(item2)
                    else:
                        stats[key1][key2] = {}
                        for key3, item3 in sorted(item2.items()):
                            if type(item3) == list:
                                if len(item3) != 0:
                                    stats[key1][key2][key3] = len(item3)
                                    total += len(item3)
                            else:
                                stats[key1][key2][key3] = {}
                                for key4, item4 in sorted(item3.items()):
                                    if type(item4) == list:
                                        if len(item4) != 0:
                                            stats[key1][key2][key3][key4] = len(item4)
                                            total += len(item4)
        stats['Notes’ total'] = total
        # the 'dunno' subcategories do not count as categorised
        categorised = total
        if 'long_diff' in stats['dunno'].keys():
            categorised -= stats['dunno']['long_diff']
        if 'short_diff' in stats['dunno'].keys():
            categorised -= stats['dunno']['short_diff']
        if 'no_diff' in stats['dunno'].keys():
            categorised -= stats['dunno']['no_diff']
        if total == 0:
            percentage = 0
            print('the notes were not processed!')
        else:
            percentage = categorised * 100 / total
        stats['Categorised'] = '{} notes ({:02.2f}%)'.format(categorised, percentage)
        stats['Profiles'] = profile_cats
        total_stats.append('{}\n{}'.format(work_name, jp.encode(stats)))
        encoded = jp.encode(categorised_notes)
        if encoded != raw_template:
            categorised_notes['Stats'] = stats
            categorised_notes['profile'] = profiles
            categorised_notes['ngram_freq'] = frequencies
            write_file('output/{}_cats.json'.format(work_name),
                       jp.encode(categorised_notes))
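# The four nested counting loops above could be collapsed into a recursive
# walk. A minimal sketch, assuming the same nesting of dicts and lists as
# the template (count_categories is a hypothetical helper, not part of the
# original code):
def count_categories(node, stats):
    """Recursively count the notes per category; returns the grand total."""
    total = 0
    for key, item in sorted(node.items()):
        if isinstance(item, list):
            if item:  # empty categories are left out, as above
                stats[key] = len(item)
                total += len(item)
        else:
            stats[key] = {}
            total += count_categories(item, stats[key])
    return total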
if __name__ == '__main__':
    debug = False
    # file = '563_རྒྱུད་ཀྱི་རྒྱལ་པོ་ཆེན་པོ་དཔལ་དགྱེས་པའི་རྡོ་རྗེའི་དཀའ་འགྲེལ་སྤྱན་འབྱེད།_conc.txt'
    file = ''
    note_num = 0
    in_path = '../1-b-manually_corrected_conc/notes_formatted'
    template = 'resources/template.json'
    total_stats = []
    process(in_path, template, total_stats)
    write_file('total_stats.txt', '\n\n'.join(total_stats))
def copy_final_version(final_path):
    for f in os.listdir('output'):
        if f + '_final.txt' in os.listdir('../4-a-final_formatting/output/3-3-final'):
            write_file('output/{}/{}'.format(f, f + '_final.txt'),
                       open_file('{}/{}_final.txt'.format(final_path, f)))