def copy_derge_layout(derge_layout):
    for f in os.listdir('output'):
        no_layout = True
        if f in [a.replace('_raw_page_reinserted.txt', '') for a in os.listdir(derge_layout)]:
            content = open_file('{}/{}_raw_page_reinserted.txt'.format(derge_layout, f))
            reformated = re.sub(r'\n-+', '', content).replace('\\', '')
            write_file('output/{}/{}'.format(f, f + '_derge_layout.txt'), reformated)
            no_layout = False
        if no_layout and f in [a.replace('_with_a.txt', '') for a in os.listdir('../4-a-final_formatting/output/2-0-with_a')]:
            content = open_file('../4-a-final_formatting/output/2-0-with_a/{}_with_a.txt'.format(f))
            reformated = content.replace('\n', ' ').replace('a', '\n')
            write_file('output/{}/{}'.format(f, f + '_derge_lines.txt'), reformated)
def open_ngrams():
    ngrams = {}
    for i in range(1, 13):
        lines = open_file('./resources/kangyur_ngrams/{}-grams_raw.txt'.format(i)).strip().split('\n')
        for line in lines:
            parts = line.split(' ')
            text = ''.join(parts[:-1]).strip().strip('་')
            ngrams[text] = int(parts[-1])
    return ngrams
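# Hedged illustration (hypothetical content): each line of the '{i}-grams_raw.txt'
# files is expected to hold the n-gram's syllables separated by spaces, with the raw
# frequency as the last space-separated field; the syllables are then joined and
# stripped of surrounding tseks to form the dict key.
# Example line: "བདེ་ གཤེགས་ 1234"  ->  ngrams['བདེ་གཤེགས'] = 1234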
def raw_folder_content(path, num=10):
    files_content = {}
    for f in os.listdir(path):
        full_path = '{}/{}'.format(path, f)
        # open file
        raw = open_file(full_path)
        if num != 0:
            raw = first_parts(raw, num)
        files_content[full_path] = raw
    return files_content
def generate_outputs(text_name, notes_name, context, in_dir=inDir, out_dir=outDir):
    # extract text and reinsert notes
    editions = reinsert_notes(open_file(in_dir / text_name),
                              open_file(in_dir / notes_name).replace(';', ','))
    work_name = text_name.split('.')[0].replace(' ', '_')
    print(work_name)
    generate_editions(editions, out_dir, work_name)
    export_unified_structure(editions, work_name)
    generate_context_versions(editions, work_name, out_dir, left=context, right=context)
def reconstruct_version_texts(in_path):
    for f in os.listdir(in_path):
        text_name = f.replace('_updated_structure.txt', '')
        # only process texts that already have a folder for the layered files
        if text_name in os.listdir('output'):
            current_out_folder = 'output/' + text_name
            # open structure file
            from_structure = yaml.load(open_file('{}/{}'.format(in_path, f)))
            # reconstruct the editions
            editions = reconstruct_edition_versions(from_structure)
            # write them in the corresponding folder
            for ed, version in editions.items():
                version = version.replace('_', ' ')  # reconstruct spaces
                write_file('{}/{}_{}_layer.txt'.format(current_out_folder, text_name, ed), version)
def process(in_path, file_origin, name_end, out_path):
    for f in os.listdir(in_path):
        work_name = f.replace(name_end, '')
        # raw_content = open_file(file_origin.format(work_name.replace('_', ' ')))
        try:
            raw_content = open_file(file_origin.format(work_name))
        except FileNotFoundError:
            continue
        content = re.sub(r'\n?[0-9]+\.\s+', '', raw_content)
        content = re.sub(r' ', '\n', content)
        write_file(out_path[0].format(work_name), content)
        content = content.replace('a', '')
        write_file(out_path[1].format(work_name), content)
def find_string(in_path, string):
    """Search every file in in_path for the given string, after cleaning and unsegmenting the content."""
    found = []
    for f in os.listdir(in_path):
        raw = open_file('{}/{}'.format(in_path, f))
        raw = clean_string(raw, strip=True, del_returns=True)
        # unsegment the files
        raw = clean_string(raw, del_dashes=True, del_spaces=True)
        if string in raw:
            found.append(f)
    if found == []:
        return 'Nothing found.\n'
    else:
        return 'The following files were found:\n{}\n'.format('\n'.join(found))
def reinsert_raw(in_path, out_path, patterns):
    print('raw reinsertion')
    for f in os.listdir(in_path):
        work_name = f.replace('_with_a.txt', '')
        print(work_name)
        if work_name in patterns:
            print('\t', work_name)
            content = open_file('{}/{}'.format(in_path, f))
            lines = deque(content.replace('\n', ' ').split('a'))
            pages = []
            text_pattern = patterns[work_name][2:]
            counter = patterns[work_name][0][1]
            side = patterns[work_name][0][2]
            # beginning pages
            for num in text_pattern[0]:
                pages.append(create_page(lines, num, counter, side))
                counter, side = increment_counter(counter, side)
            # body of the text
            while len(lines) > 0:
                if len(lines) >= text_pattern[1]:
                    pages.append(create_page(lines, text_pattern[1], counter, side))
                    counter, side = increment_counter(counter, side)
                elif text_pattern[2] == len(lines):
                    pages.append(create_page(lines, len(lines), counter, side))
                    counter, side = increment_counter(counter, side)
                else:
                    pages.append(create_page(lines, len(lines), counter, side))
                    counter, side = increment_counter(counter, side)
            output = '\n{}\n'.format('-' * 100).join(pages)
            write_file('{}/{}_raw_page_reinserted.txt'.format(out_path, work_name), output)
    print('2-2!')
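# Hedged sketch of the page/side bookkeeping used by reinsert_raw() above. The real
# create_page() and increment_counter() are defined elsewhere in the project; the
# name _increment_counter_sketch and the logic below are assumptions: after a folio's
# front side ('ན') comes its back side ('བ'), and after the back side the folio
# number is incremented.
def _increment_counter_sketch(counter, side):
    if side == 'ན':
        return counter, 'བ'  # front side done: same folio, back side
    return counter + 1, 'ན'  # back side done: next folio, front side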
            else:
                out.append(t.content)
        else:
            out.append(t.content.replace(' ', '_'))
    return ' '.join(out)


in_path = 'out'
out_path = 'segmented'

# populate total with the mistakes of all files in in_path
total = defaultdict(list)
for f in sorted(os.listdir(in_path)):
    if f.endswith('txt'):
        work_name = f.replace('.txt', '')
        print(work_name)
        content = open_file('{}/{}'.format(in_path, f))
        content = rawify(content)
        # segmented = PyTib.Segment().segment(content)
        pybo_segmented = pybo_segment(content)
        mistakes = mistake_conc(pybo_segmented, work_name)
        for k, v in mistakes.items():
            if not contains_sskrt(k):
                total[k].extend(v)

# write individual files for each text, presenting the mistakes in total frequency order
len_ordered_mistakes = sorted(total, key=lambda x: len(total[x]), reverse=True)
for f in os.listdir(in_path):
    if f.endswith('txt'):
        current_text = f.replace('.txt', '')
        # filter mistakes of the current file
        output = []
reviewed_path = '../3-b-reviewed_texts'
structure_path = '../3-a-revision_format/output/updated_structure'

limit = False
for f in sorted(os.listdir(reviewed_path)):
    print(f)
    if f:  # == '1-1_ཆོས་ཀྱི་དབྱིངས་སུ་བསྟོད་པ།_DUCKed.csv':
        work_name = f.replace('_DUCKed.csv', '')
        if work_name == 'N3118':
            limit = True
        if not limit:
            continue
        # parse the file to keep only the decision and the note number
        note_choice = parse_decisions(open_file('{}/{}'.format(reviewed_path, f)))
        try:
            updated_structure = yaml.load(open_file('{}/{}_updated_structure.txt'.format(structure_path, work_name)))
        except FileNotFoundError:
            continue
        try:
            unified_structure = yaml.load(open_file('../1-a-reinsert_notes/output/unified_structure/{}_unified_structure.yaml'.format(work_name)))
        except FileNotFoundError:
            continue
import os
import re

from PyTib.common import open_file, write_file

missing_space = r'^([^ -])'
missing_space_rpl = r' \1'
missing_tsek = r'(ཅོ|སྣར|སྡེ|པེ)(?=):'
missing_tsek_rpl = r'\1་:'

files = [a for a in os.listdir('.') if a != 'conc_sanity_check.py']
for f in files:
    print(f)
    raw = open_file('./' + f)
    raw = re.sub(missing_space, missing_space_rpl, raw)
    raw = re.sub(missing_tsek, missing_tsek_rpl, raw)
    raw = raw.replace('-1-,,,,,,,,,,,,,,,', '-1-,,,,,,,,,,,,,,,')
    write_file('./' + f, raw)
    lines = raw.split('\n')
    for num, line in enumerate(lines):
        toprint = False
        if line.startswith('-'):
            pass
        elif line.startswith(r' ཅོ་:'):
            pass
        elif line.startswith(r' སྣར་:'):
            pass
        elif line.startswith(' སྡེ་:'):
            pass
import os

from PyTib.common import open_file

bad = []
for f in os.listdir('./input'):
    if f.endswith('Csv'):
        content = open_file('./input/' + f)
        if content.startswith('མཚན་བྱང་།,པར་གྲངས།,ཤོག་གྲངས།,1,མཚན་གྲངས།,པར་མ།,མཆན།') \
                or content.startswith('མཚན་བྱང་།,པར་གྲངས།,ཤོག་གྲངས།,1,མཆན་གྲངས།,པར་མ།,མཆན།'):
            bad.append(f)

for b in sorted(bad):
    print(b)
print(f'No. of Bad files: {len(bad)}')
def reinsert(in_path, out_path1, out_path2, patterns):
    print('reinsertion with notes')
    for f in os.listdir(in_path):
        work_name = f.replace('_a_reinserted.txt', '')
        if work_name in patterns:
            print('\t', work_name)
            content = open_file('{}/{}'.format(in_path, f))
            if not re.findall(r'\n\[\^[0-9A-Z]+\]\:', content):
                text = content
                notes = ''
            else:
                text, notes = [a for a in re.split(r'((?:\n?\[\^[0-9A-Z]+\]\:[^\n]+\n?)+)', content) if a != '']
            lines = deque(text.replace('\n', ' ').split('a'))
            pages = []
            text_pattern = patterns[work_name][2:]
            counter = patterns[work_name][0][1]
            side = patterns[work_name][0][2]
            # beginning pages
            for num in text_pattern[0]:
                pages.append(create_page(lines, num, counter, side))
                counter, side = increment_counter(counter, side)
            # body of the text
            while len(lines) > 0:
                if len(lines) >= text_pattern[1]:
                    pages.append(create_page(lines, text_pattern[1], counter, side))
                    counter, side = increment_counter(counter, side)
                elif text_pattern[2] == len(lines):
                    pages.append(create_page(lines, len(lines), counter, side))
                    counter, side = increment_counter(counter, side)
                else:
                    print('There is a line number issue: only {} lines were left for the last page.'.format(len(lines)))
                    pages.append(create_page(lines, len(lines), counter, side))
                    counter, side = increment_counter(counter, side)
            output = '\n{}\n'.format('-' * 100).join(pages) + '\n\n' + notes
            write_file('{}/{}_page_reinserted.txt'.format(out_path1, work_name), output)
            # write the file to 3-2-compared if it is not yet there
            existing = [g.replace('_compared.txt', '') for g in os.listdir(out_path2) if g.endswith('.txt')]
            # if work_name not in existing:
            write_file('{}/{}_compared.txt'.format(out_path2, work_name), output)
            text_path = '{}/extra_copies/{}'.format(out_path2, work_name)
            if not os.path.exists(text_path):
                os.makedirs(text_path)
in_path = './output/2-1-a_reinserted'
out_path1 = './output/3-1-page_reinserted'
out_path2 = './output/3-2-compared'
raw_in_path = './output/2-0-with_a'
raw_out_path = './output/2-2-raw_page_reinserted'

# '': [
#     ('start', 0, ''),  # page start + front/back
#     ('end', 0, ''),    # idem
#     [0],               # list of lines per page for the beginning of the text
#     0,                 # general number of lines per page
#     0                  # number of lines pertaining to the current text on the last page
# ]
patterns_raw = open_file(
    'C:/Users/trinley/github/canon_notes/4-a-final_formatting/resources/དཀར་ཆག་ཀུན་གསལ་མེ་ལོང་།.csv'
).strip().split('\n')
patterns = {}
for line in patterns_raw[1:]:
    parts = line.split('\t')
    names = patterns_raw[0].split('\t')
    # 'ཆོས་ཚན།'
    title = re.sub(r'_conc.*', '', parts[1])
    # 'དབུ།'
    b_page = int(parts[2])
    sides = {'a': 'ན', 'b': 'བ'}
    # 'རྒྱབ་འདུན།' (of 'དབུ།')
    b_side = sides[parts[3]]
    # 'ཞབས།'
    e_page = int(parts[6])
    # 'རྒྱབ་འདུན།' (of 'ཞབས།')
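# Hedged illustration of a finished patterns entry, following the schema commented
# above; the title key and all values below are hypothetical.
# patterns['some_title'] = [
#     ('start', 3, 'ན'),   # the text starts on folio 3, front side
#     ('end', 21, 'བ'),    # and ends on folio 21, back side
#     [4, 6],              # lines belonging to this text on its first two pages
#     7,                   # regular number of lines per page
#     5                    # lines belonging to this text on the last, shared page
# ]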
import os
import re

import PyTib
from PyTib.common import non_tib_chars, open_file, write_file, tib_sort, bisect_left

in_path = 'input/'
out_path = 'output/'

new = []
for f in os.listdir(in_path):
    content = open_file(in_path + f).replace('༌', '་').split('\n')
    content = [a.strip() for a in content if a != '']
    # find all non-tibetan characters
    to_delete = []
    for c in content:
        for char in c:
            if char not in to_delete and non_tib_chars(char):
                to_delete.append(char)
    # add punctuation to be deleted
    to_delete.extend(['།', '༎', '༄', '༅', '༑'])
    # replace them with spaces
    text = []
    for r in range(len(content) - 1):
        line = content[r]
        for t in to_delete:
            line = line.replace(t, ' ')
        text.append(re.sub(r'\s+', r' ', line))
    lexicon = []
    for t in text:
        lexicon.extend([u.strip('་') + '་' for u in t.split(' ') if u.strip('་') != ''])
def processing_corpus(corpus_path):
    print('processing corpus')
    # variables for the main xlsx
    corpus_frequencies = {}
    corpus_total_frequency = defaultdict(int)
    corpus_origin = defaultdict(list)
    # variables for individual xlsx files
    persons_frequencies = {}
    persons_total_frequency = {}
    persons_origin = {}
    for f in os.listdir(corpus_path):
        # A. finding the names for the current file
        file_name = f
        # finding the name of the section
        section = ''
        for c in corpus_sections:
            if c in f:
                section = c
                file_name = file_name.replace(c, '')
        if section == '':
            section = 'ཁུངས་མེད།'
        # finding the name of the person
        person = re.sub(r'[0-9]* *\([0-9]+\).txt', '', file_name).strip()
        # B. initiating the entries in dicts if missing for the current file
        # creating an entry in corpus_frequencies{} if it does not exist
        if section not in corpus_frequencies.keys():
            corpus_frequencies[section] = defaultdict(int)
        # creating an entry in persons_frequencies{} if it does not exist
        if person not in persons_frequencies.keys():
            persons_frequencies[person] = {}
        if section not in persons_frequencies[person].keys():
            persons_frequencies[person][section] = defaultdict(int)
        # creating an entry in persons_total_frequency if it does not exist
        if person not in persons_total_frequency.keys():
            persons_total_frequency[person] = defaultdict(int)
        # creating an entry in persons_origin{} if it does not exist
        if person not in persons_origin.keys():
            persons_origin[person] = defaultdict(list)
        # C. processing the current file
        content = open_file(corpus_path + '/' + f).replace('༌', '་').split('\n')
        content = [a.strip() for a in content if a != '']
        # find all non-tibetan characters
        to_delete = chars_to_delete(content)
        # replace them with spaces
        text = []
        for r in range(len(content)):
            line = content[r]
            for t in to_delete:
                line = line.replace(t, ' ')
            text.append(re.sub(r'\s+', r' ', line).strip())
        # D. filling in the corpus and personal variables
        # split the line into words and add them to persons_frequencies and to the newspaper dict
        for t in text:
            split_line = [u.rstrip('་') + '་' for u in t.split(' ') if u.rstrip('་') != '']
            for word in split_line:
                clean_word = word.lstrip('་')
                # corpus stats
                corpus_frequencies[section][clean_word] += 1
                corpus_total_frequency[clean_word] += 1
                if f not in corpus_origin[clean_word]:
                    corpus_origin[clean_word].append(f)
                # persons’ stats
                persons_frequencies[person][section][clean_word] += 1
                persons_total_frequency[person][clean_word] += 1
                if f not in persons_origin[person][clean_word]:
                    persons_origin[person][clean_word].append(f)
    return corpus_frequencies, corpus_total_frequency, corpus_origin, persons_frequencies, persons_total_frequency, persons_origin
def process(in_path, template_path, total_stats):
    global collection_eds, file, debug
    raw_template = open_file(template_path)
    verbs = jp.decode(open_file('./resources/monlam_verbs.json'))
    all_ngrams = open_ngrams()
    files = find_file_path(in_path, '../1-a-reinsert_notes/output/conc_yaml')
    # print(files)
    for filename in files:
        # if 'N5000' not in filename:
        #     continue
        f = filename.split('/')[-1]
        print(f)
        if debug and f != file:
            continue
        work_name = f.replace('_conc.txt', '').replace('.txt', '')
        raw = open_file(filename)
        # setting collection_eds for the current file
        collection_eds = list({a for a in re.findall(r' ([^ ]+): ', raw)})
        if len(collection_eds) > 4:
            print(collection_eds)
        data = prepare_data(raw)
        profiles, profile_cats = find_profiles(data)
        # prepare
        prepared = find_all_parts(data)
        # categorise
        categorised_notes = jp.decode(raw_template)
        # find ngram frequencies
        frequencies = ngram_frequency(prepared, all_ngrams)
        if debug:
            if file == f and note_num != 0:
                for note in prepared:
                    if note[0] == note_num:
                        categorise(note, categorised_notes, verbs)
            elif file == f:
                for note in prepared:
                    categorise(note, categorised_notes, verbs)
        else:
            for note in prepared:
                categorise(note, categorised_notes, verbs)
        # finally write the json file
        stats = {}
        total = 0
        for key1, item1 in sorted(categorised_notes.items()):
            if type(item1) == list:
                if len(item1) != 0:
                    stats[key1] = len(item1)
                    total += len(item1)
            else:
                stats[key1] = {}
                for key2, item2 in sorted(item1.items()):
                    if type(item2) == list:
                        if len(item2) != 0:
                            stats[key1][key2] = len(item2)
                            total += len(item2)
                    else:
                        stats[key1][key2] = {}
                        for key3, item3 in sorted(item2.items()):
                            if type(item3) == list:
                                if len(item3) != 0:
                                    stats[key1][key2][key3] = len(item3)
                                    total += len(item3)
                            else:
                                stats[key1][key2][key3] = {}
                                for key4, item4 in sorted(item3.items()):
                                    if type(item4) == list:
                                        if len(item4) != 0:
                                            stats[key1][key2][key3][key4] = len(item4)
                                            total += len(item4)
        stats['Notes’ total'] = total
        categorised = total
        if 'long_diff' in stats['dunno'].keys():
            categorised -= stats['dunno']['long_diff']
        if 'short_diff' in stats['dunno'].keys():
            categorised -= stats['dunno']['short_diff']
        if 'no_diff' in stats['dunno'].keys():
            categorised -= stats['dunno']['no_diff']
        if total == 0:
            percentage = 0
            print('the notes were not processed!')
        else:
            percentage = categorised * 100 / total
        stats['Categorised'] = '{} notes ({:02.2f}%)'.format(categorised, percentage)
        stats['Profiles'] = profile_cats
        total_stats.append('{}\n{}'.format(work_name, jp.encode(stats)))
        encoded = jp.encode(categorised_notes)
        if encoded != raw_template:
            categorised_notes['Stats'] = stats
            categorised_notes['profile'] = profiles
            categorised_notes['ngram_freq'] = frequencies
            write_file('output/{}_cats.json'.format(work_name), jp.encode(categorised_notes))
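# Hedged illustration of the nesting the stats walk above expects from the JSON
# template: each category maps either to a list of notes or to sub-dicts, up to four
# levels deep. Only 'dunno' and its 'long_diff'/'short_diff'/'no_diff' keys appear in
# the code above; the variable name and any other keys are hypothetical.
_template_shape_example = {
    'dunno': {'long_diff': [], 'short_diff': [], 'no_diff': []},
}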
def create_base_text(raw_path):
    for f in os.listdir('output'):
        content = open_file('{}/{}_raw.txt'.format(raw_path, f))
        # put back in a single line
        content = content.replace('\n', ' ')
        write_file('output/{}/{}_base.txt'.format(f, f), content)
def copy_final_version(final_path):
    for f in os.listdir('output'):
        if f + '_final.txt' in os.listdir('../4-a-final_formatting/output/3-3-final'):
            write_file('output/{}/{}'.format(f, f + '_final.txt'),
                       open_file('{}/{}_final.txt'.format(final_path, f)))
def copy_cat_json_file(json_path):
    for f in os.listdir('output'):
        write_file('output/{}/{}'.format(f, f + '_cats.json'),
                   open_file('{}/{}_cats.json'.format(json_path, f)))
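# Hedged usage sketch: copy_derge_layout(), create_base_text(), copy_final_version()
# and copy_cat_json_file() all iterate the per-text folders in 'output', so they can
# be chained once those folders exist. Every path below is an assumption, not the
# project's actual configuration.
# copy_derge_layout('../4-a-final_formatting/output/2-2-raw_page_reinserted')
# create_base_text('path/to/raw_texts')
# copy_final_version('../4-a-final_formatting/output/3-3-final')
# copy_cat_json_file('path/to/cats_json')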