Example #1
def copy_derge_layout(derge_layout):
    for f in os.listdir('output'):
        no_layout = True
        if f in [a.replace('_raw_page_reinserted.txt', '') for a in os.listdir(derge_layout)]:
            content = open_file('{}/{}_raw_page_reinserted.txt'.format(derge_layout, f))
            reformated = re.sub(r'\n-+', '', content).replace('\\', '')
            write_file('output/{}/{}'.format(f, f+'_derge_layout.txt'), reformated)
            no_layout = False
        if no_layout and f in [a.replace('_with_a.txt', '') for a in os.listdir('../4-a-final_formatting/output/2-0-with_a')]:
            content = open_file('../4-a-final_formatting/output/2-0-with_a/{}_with_a.txt'.format(f))
            reformated = content.replace('\n', ' ').replace('a', '\n')
            write_file('output/{}/{}'.format(f, f + '_derge_lines.txt'), reformated)
Example #2
def open_ngrams():
    ngrams = {}
    for i in range(1, 13):
        lines = open_file('./resources/kangyur_ngrams/{}-grams_raw.txt'.format(
            i)).strip().split('\n')
        for line in lines:
            parts = line.split(' ')
            text = ''.join(parts[:-1]).strip().strip('་')
            ngrams[text] = int(parts[-1])
    return ngrams
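A minimal usage sketch (not part of the original); the keys are unsegmented strings with the trailing tsek stripped, exactly as built above:

# invented example key; .get() returns 0 when the n-gram is absent
ngrams = open_ngrams()
print(ngrams.get('བྱང་ཆུབ་སེམས', 0))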
Example #3
def raw_folder_content(path, num=10):
    files_content = {}
    for f in os.listdir(path):
        full_path = '{}/{}'.format(path, f)
        # open file
        raw = open_file(full_path)
        if num != 0:
            raw = first_parts(raw, num)
        files_content[full_path] = raw
    return files_content
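first_parts() is defined elsewhere in the repository; a plausible sketch, assuming it keeps only the first num whitespace-separated chunks of the raw text:

# hypothetical helper sketch, not the original implementation
def first_parts(raw, num):
    return ' '.join(raw.split(' ')[:num])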
Example #4
def generate_outputs(text_name,
                     notes_name,
                     context,
                     in_dir=inDir,
                     out_dir=outDir):

    # extract text and reinsert notes
    editions = reinsert_notes(open_file(in_dir / text_name),
                              open_file(in_dir / notes_name).replace(';', ','))

    work_name = text_name.split('.')[0].replace(' ', '_')
    print(work_name)

    generate_editions(editions, out_dir, work_name)

    export_unified_structure(editions, work_name)

    generate_context_versions(editions,
                              work_name,
                              out_dir,
                              left=context,
                              right=context)
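The `/` operator on in_dir implies the defaults are pathlib.Path objects; a sketch of how inDir and outDir might be declared (folder and file names below are assumptions):

from pathlib import Path

inDir = Path('./input')    # assumed default input folder
outDir = Path('./output')  # assumed default output folder
# hypothetical call: generate_outputs('N5000.txt', 'N5000_notes.txt', context=5)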
Example #5
def reconstruct_version_texts(in_path):
    for f in os.listdir(in_path):
        text_name = f.replace('_updated_structure.txt', '')
        # only process texts that already have a folder in output/
        if text_name in os.listdir('output'):
            current_out_folder = 'output/' + text_name
            # open structure file
            from_structure = yaml.load(open_file('{}/{}'.format(in_path, f)))
            # reconstruct the editions
            editions = reconstruct_edition_versions(from_structure)
            # write them in the corresponding folder
            for ed, version in editions.items():
                version = version.replace('_', ' ')  # reconstruct spaces
                write_file('{}/{}_{}_layer.txt'.format(current_out_folder, text_name, ed), version)
Example #6
def process(in_path, file_origin, name_end, out_path):
    for f in os.listdir(in_path):
        work_name = f.replace(name_end, '')
        # raw_content = open_file(file_origin.format(work_name.replace('_', ' ')))
        try:
            raw_content = open_file(file_origin.format(work_name))
        except FileNotFoundError:
            continue

        content = re.sub(r'\n?[0-9]+\.\s+', '', raw_content)
        content = re.sub(r' ', '\n', content)
        write_file(out_path[0].format(work_name), content)

        content = content.replace('a', '')
        write_file(out_path[1].format(work_name), content)
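A call sketch for process(): file_origin and both out_path entries are format strings that receive the work name (every path below is invented for illustration):

# process('input/with_a',                                  # in_path
#         'resources/{}_raw.txt',                          # file_origin
#         '_with_a.txt',                                   # name_end
#         ['output/{}_lines.txt', 'output/{}_no_a.txt'])   # out_path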
Example #7
def find_string(in_path, string):
    """wrapper function """
    found = []
    for f in os.listdir(in_path):
        raw = open_file('{}/{}'.format(in_path, f))
        raw = clean_string(raw, strip=True, del_returns=True)

        # unsegment the files
        raw = clean_string(raw, del_dashes=True, del_spaces=True)

        if string in raw:
            found.append(f)

    if found == []:
        return 'Nothing found.\n'
    else:
        return 'The following files were found:\n{}\n'.format('\n'.join(found))
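clean_string() comes from elsewhere in the repository; a hypothetical sketch of the behaviour its flags imply, judging only from how it is called above:

def clean_string(text, strip=False, del_returns=False, del_dashes=False, del_spaces=False):
    # sketch only, not the original implementation
    if del_returns:
        text = text.replace('\n', ' ')
    if del_dashes:
        text = text.replace('-', '')
    if del_spaces:
        text = text.replace(' ', '')
    return text.strip() if strip else text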
Example #8
def reinsert_raw(in_path, out_path, patterns):
    print('raw reinsertion')
    for f in os.listdir(in_path):
        work_name = f.replace('_with_a.txt', '')
        print(work_name)
        if work_name in patterns:
            print('\t', work_name)
            content = open_file('{}/{}'.format(in_path, f))
            lines = deque(content.replace('\n', ' ').split('a'))

            pages = []
            # [lines on the first pages, lines per full page, lines on the last page]
            text_pattern = patterns[work_name][2:]
            counter = patterns[work_name][0][1]   # starting folio number
            side = patterns[work_name][0][2]      # starting side (front/back)

            # beginning pages
            for num in text_pattern[0]:
                pages.append(create_page(lines, num, counter, side))
                counter, side = increment_counter(counter, side)

            # body of the text
            while len(lines) > 0:
                if len(lines) >= text_pattern[1]:
                    pages.append(
                        create_page(lines, text_pattern[1], counter, side))
                    counter, side = increment_counter(counter, side)
                else:
                    # last page: fewer lines are left than a full page holds
                    pages.append(create_page(lines, len(lines), counter, side))
                    counter, side = increment_counter(counter, side)

            output = '\n{}\n'.format('-' * 100).join(pages)

            write_file(
                '{}/{}_raw_page_reinserted.txt'.format(out_path, work_name),
                output)
            print('2-2!')
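create_page() and increment_counter() live elsewhere in the repository; a minimal sketch of the behaviour the loop above assumes (the page-header format and side markers are assumptions):

def create_page(lines, num, counter, side):
    # sketch: pop `num` lines from the deque and join them under a folio header
    page_lines = [lines.popleft() for _ in range(min(num, len(lines)))]
    return '{}{})\n{}'.format(counter, side, '\n'.join(page_lines))

def increment_counter(counter, side):
    # sketch: the front side ('ན') flips to the back ('བ'); after the back side the folio number grows
    if side == 'ན':
        return counter, 'བ'
    return counter + 1, 'ན'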
Example #9
            else:
                out.append(t.content)
        else:
            out.append(t.content.replace(' ', '_'))
    return ' '.join(out)


in_path = 'out'
out_path = 'segmented'
# populate total with the mistakes of all files in in_path
total = defaultdict(list)
for f in sorted(os.listdir(in_path)):
    if f.endswith('txt'):
        work_name = f.replace('.txt', '')
        print(work_name)
        content = open_file('{}/{}'.format(in_path, f))
        content = rawify(content)
        # segmented = PyTib.Segment().segment(content)
        pybo_segmented = pybo_segment(content)
        mistakes = mistake_conc(pybo_segmented, work_name)
        for k, v in mistakes.items():
            if not contains_sskrt(k):
                total[k].extend(v)

# write individual files for each text, presenting the mistakes in total frequency order
len_ordered_mistakes = sorted(total, key=lambda x: len(total[x]), reverse=True)
for f in os.listdir(in_path):
    if f.endswith('txt'):
        current_text = f.replace('.txt', '')
        # filter mistakes of the current file
        output = []

Example #10
reviewed_path = '../3-b-reviewed_texts'
structure_path = '../3-a-revision_format/output/updated_structure'
limit = False
for f in sorted(os.listdir(reviewed_path)):
    print(f)
    if f:  # == '1-1_ཆོས་ཀྱི་དབྱིངས་སུ་བསྟོད་པ།_DUCKed.csv':
        work_name = f.replace('_DUCKed.csv', '')
        if work_name == 'N3118':
            limit = True

        if not limit:
            continue
        note_choice = parse_decisions(
            open_file('{}/{}'.format(reviewed_path, f)))

        # parse the file to keep only the decision and the note number
        try:
            updated_structure = yaml.load(
                open_file('{}/{}_updated_structure.txt'.format(
                    structure_path, work_name)))
        except FileNotFoundError:
            continue
        try:
            unified_structure = yaml.load(
                open_file(
                    '../1-a-reinsert_notes/output/unified_structure/{}_unified_structure.yaml'
                    .format(work_name)))
        except FileNotFoundError:
            continue
Example #11
import os
import re
from PyTib.common import open_file, write_file

missing_space = r'^([^ -])'
missing_space_rpl = r' \1'

missing_tsek = r'(ཅོ|སྣར|སྡེ|པེ)(?=):'
missing_tsek_rpl = r'\1་:'

files = [a for a in os.listdir('.') if a != 'conc_sanity_check.py']

for f in files:
    print(f)
    raw = open_file('./' + f)
    raw = re.sub(missing_space, missing_space_rpl, raw, flags=re.M)  # anchor ^ on every line
    raw = re.sub(missing_tsek, missing_tsek_rpl, raw)
    raw = raw.replace('-1-,,,,,,,,,,,,,,,', '-1-,,,,,,,,,,,,,,,')
    write_file('./' + f, raw)
    lines = raw.split('\n')
    for num, line in enumerate(lines):
        toprint = False
        if line.startswith('-'):
            pass
        elif line.startswith(r' ཅོ་:'):
            pass
        elif line.startswith(r' སྣར་:'):
            pass
        elif line.startswith(' སྡེ་:'):
            pass
Example #12
import sys, os
from PyTib.common import open_file

bad = []
for f in os.listdir('./input'):
    if f.endswith('Csv'):
        content = open_file('./input/' + f)
        if content.startswith(
                'མཚན་བྱང་།,པར་གྲངས།,ཤོག་གྲངས།,1,མཚན་གྲངས།,པར་མ།,མཆན།'
        ) or content.startswith(
                'མཚན་བྱང་།,པར་གྲངས།,ཤོག་གྲངས།,1,མཆན་གྲངས།,པར་མ།,མཆན།'):
            bad.append(f)
for b in sorted(bad):
    print(b)
print(f'No. of Bad files: {len(bad)}')
Example #13
def reinsert(in_path, out_path1, out_path2, patterns):
    print('reinsertion with notes')
    for f in os.listdir(in_path):
        work_name = f.replace('_a_reinserted.txt', '')
        if work_name in patterns:
            print('\t', work_name)
            content = open_file('{}/{}'.format(in_path, f))
            if not re.findall(r'\n\[\^[0-9A-Z]+\]\:', content):
                text = content
                notes = ''
            else:
                text, notes = [
                    a for a in re.split(
                        r'((?:\n?\[\^[0-9A-Z]+\]\:[^\n]+\n?)+)', content)
                    if a != ''
                ]
            lines = deque(text.replace('\n', ' ').split('a'))

            pages = []
            text_pattern = patterns[work_name][2:]
            counter = patterns[work_name][0][1]
            side = patterns[work_name][0][2]

            # beginning pages
            for num in text_pattern[0]:
                pages.append(create_page(lines, num, counter, side))
                counter, side = increment_counter(counter, side)

            # body of the text
            while len(lines) > 0:
                if len(lines) >= text_pattern[1]:
                    pages.append(
                        create_page(lines, text_pattern[1], counter, side))
                    counter, side = increment_counter(counter, side)
                elif text_pattern[2] == len(lines):
                    pages.append(create_page(lines, len(lines), counter, side))
                    counter, side = increment_counter(counter, side)
                else:
                    print(
                        'There is a line number issue: only {} lines were left for the last page.'
                        .format(len(lines)))
                    pages.append(create_page(lines, len(lines), counter, side))
                    counter, side = increment_counter(counter, side)

            output = '\n{}\n'.format('-' * 100).join(pages) + '\n\n' + notes

            write_file(
                '{}/{}_page_reinserted.txt'.format(out_path1, work_name),
                output)

            # write to the file to 3-2-compared if it is not yet there
            existing = [
                g.replace('_compared.txt', '') for g in os.listdir(out_path2)
                if g.endswith('.txt')
            ]
            #if work_name not in existing:
            write_file('{}/{}_compared.txt'.format(out_path2, work_name),
                       output)
            text_path = '{}/extra_copies/{}'.format(out_path2, work_name)
            if not os.path.exists(text_path):
                os.makedirs(text_path)
Example #14
in_path = './output/2-1-a_reinserted'
out_path1 = './output/3-1-page_reinserted'
out_path2 = './output/3-2-compared'
raw_in_path = './output/2-0-with_a'
raw_out_path = './output/2-2-raw_page_reinserted'

# '': [
#     ('start', 0, ''),     # page start + front/back
#     ('end', 0, ''),       # idem
#     [0],                  # list of lines per page for the beginning of the text
#     0,                    # general number of lines per page
#     0                     # number of lines pertaining to the current text on the last page
# ]

patterns_raw = open_file(
    'C:/Users/trinley/github/canon_notes/4-a-final_formatting/resources/དཀར་ཆག་ཀུན་གསལ་མེ་ལོང་།.csv'
).strip().split('\n')
patterns = {}
for line in patterns_raw[1:]:
    parts = line.split('\t')
    names = patterns_raw[0].split('\t')
    # 'ཆོས་ཚན།'
    title = re.sub(r'_conc.*', '', parts[1])
    # 'དབུ།'
    b_page = int(parts[2])
    sides = {'a': 'ན', 'b': 'བ'}
    # 'རྒྱབ་འདུན།' (of 'དབུ།')
    b_side = sides[parts[3]]
    # 'ཞབས།'
    e_page = int(parts[6])
    # 'རྒྱབ་འདུན།' (of 'ཞབས།')
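A hypothetical filled-in entry matching the commented structure above (all values invented for illustration):

# patterns['example_text'] = [
#     ('start', 3, 'ན'),   # text begins on folio 3, front side
#     ('end', 121, 'བ'),   # text ends on folio 121, back side
#     [4, 6],              # lines per page on the opening pages
#     7,                   # regular number of lines per page
#     5                    # lines of this text on its last page
# ]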
Example #15
import PyTib
from PyTib.common import non_tib_chars, open_file, write_file, tib_sort, bisect_left
import os
import re
in_path = 'input/'
out_path = 'output/'

new = []
for f in os.listdir(in_path):
    content = open_file(in_path+f).replace('༌', '་').split('\n')
    content = [a.strip() for a in content if a != '']
    # find all non-tibetan characters
    to_delete = []
    for c in content:
        for char in c:
            if char not in to_delete and non_tib_chars(char):
                to_delete.append(char)
    # add punctuation to be deleted
    to_delete.extend(['།', '༎', '༄', '༅', '༑'])

    # replace them with spaces
    text = []
    for r in range(len(content)):  # include the last line as well
        line = content[r]
        for t in to_delete:
            line = line.replace(t, ' ')
        text.append(re.sub(r'\s+', r' ', line))

    lexicon = []
    for t in text:
        lexicon.extend([u.strip('་')+'་' for u in t.split(' ') if u.strip('་') != ''])
Example #16
def processing_corpus(corpus_path):
    print('processing corpus')
    # variables for the main xslx
    corpus_frequencies = {}
    corpus_total_frequency = defaultdict(int)
    corpus_origin = defaultdict(list)

    # variables for individual xslx files
    persons_frequencies = {}
    persons_total_frequency = {}
    persons_origin = {}

    for f in os.listdir(corpus_path):
        # A. finding the names for the current file
        file_name = f
        # finding the name of the section
        section = ''
        for c in corpus_sections:
            if c in f:
                section = c
                file_name = file_name.replace(c, '')
        if section == '':
            section = 'ཁུངས་མེད།'
        # finding the name of the person
        person = re.sub(r'[0-9]* *\([0-9]+\).txt', '', file_name).strip()

        # B. initiating the entries in dicts if missing for the current file
        # creating an entry in corpus_frequencies{} if it does not exist
        if section not in corpus_frequencies.keys():
            corpus_frequencies[section] = defaultdict(int)

        # creating an entry in persons_frequencies{} if it does not exist
        if person not in persons_frequencies.keys():
            persons_frequencies[person] = {}
        if section not in persons_frequencies[person].keys():
            persons_frequencies[person][section] = defaultdict(int)

        # creating an entry in persons_total_frequency if it does not exist
        if person not in persons_total_frequency.keys():
            persons_total_frequency[person] = defaultdict(int)

        # creating an entry in persons_origin{} if it does not exist
        if person not in persons_origin.keys():
            persons_origin[person] = defaultdict(list)

        # C. processing the current file
        content = open_file(corpus_path+'/'+f).replace('༌', '་').split('\n')
        content = [a.strip() for a in content if a != '']

        # find all non-tibetan characters
        to_delete = chars_to_delete(content)
        # replace them with spaces
        text = []
        for r in range(len(content)):
            line = content[r]
            for t in to_delete:
                line = line.replace(t, ' ')
            text.append(re.sub(r'\s+', r' ', line).strip())

        # D. filling in the corpus and personal variables
        # split the line in words and add it to the persons_frequencies and to the newspaper dict
        for t in text:
            split_line = [u.rstrip('་')+'་' for u in t.split(' ') if u.rstrip('་') != '']
            for word in split_line:
                clean_word = word.lstrip('་')
                # corpus stats
                corpus_frequencies[section][clean_word] += 1
                corpus_total_frequency[clean_word] += 1
                if f not in corpus_origin[clean_word]:
                    corpus_origin[clean_word].append(f)

                # persons’ stats
                persons_frequencies[person][section][clean_word] += 1
                persons_total_frequency[person][clean_word] += 1
                if f not in persons_origin[person][clean_word]:
                    persons_origin[person][clean_word].append(f)

    return corpus_frequencies, corpus_total_frequency, corpus_origin, persons_frequencies, persons_total_frequency, persons_origin
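A usage sketch (the corpus path is invented); the six dictionaries come back in the order they are built above:

# corpus_freq, corpus_total, corpus_origin, \
#     persons_freq, persons_total, persons_origin = processing_corpus('corpus')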
Example #17
def process(in_path, template_path, total_stats):
    global collection_eds, file, debug
    raw_template = open_file(template_path)
    verbs = jp.decode(open_file('./resources/monlam_verbs.json'))
    all_ngrams = open_ngrams()
    files = find_file_path(in_path, '../1-a-reinsert_notes/output/conc_yaml')
    # print(files)
    for filename in files:
        # if 'N5000' not in filename:
        #     continue
        f = filename.split('/')[-1]
        print(f)
        if debug and f != file:
            continue
        work_name = f.replace('_conc.txt', '').replace('.txt', '')

        raw = open_file(filename)
        # setting collection_eds for the current file
        collection_eds = list({a for a in re.findall(r' ([^ ]+): ', raw)})
        if len(collection_eds) > 4:
            print(collection_eds)
        data = prepare_data(raw)
        profiles, profile_cats = find_profiles(data)

        # prepare
        prepared = find_all_parts(data)

        # categorise
        categorised_notes = jp.decode(raw_template)

        # find ngram frequencies
        frequencies = ngram_frequency(prepared, all_ngrams)

        if debug:
            if file == f and note_num != 0:
                for note in prepared:
                    if note[0] == note_num:
                        categorise(note, categorised_notes, verbs)
            elif file == f:
                for note in prepared:
                    categorise(note, categorised_notes, verbs)
        else:
            for note in prepared:
                categorise(note, categorised_notes, verbs)

        # finally write the json file
        stats = {}
        total = 0
        for key1, item1 in sorted(categorised_notes.items()):
            if type(item1) == list:
                if len(item1) != 0:
                    stats[key1] = len(item1)
                    total += len(item1)
            else:
                stats[key1] = {}
                for key2, item2 in sorted(item1.items()):
                    if type(item2) == list:
                        if len(item2) != 0:
                            stats[key1][key2] = len(item2)
                            total += len(item2)
                    else:
                        stats[key1][key2] = {}
                        for key3, item3 in sorted(item2.items()):
                            if type(item3) == list:
                                if len(item3) != 0:
                                    stats[key1][key2][key3] = len(item3)
                                    total += len(item3)
                            else:
                                stats[key1][key2][key3] = {}
                                for key4, item4 in sorted(item3.items()):
                                    if type(item4) == list:
                                        if len(item4) != 0:
                                            stats[key1][key2][key3][
                                                key4] = len(item4)
                                            total += len(item4)
        stats['Notes’ total'] = total
        categorised = total
        if 'long_diff' in stats['dunno'].keys():
            categorised -= stats['dunno']['long_diff']
        if 'short_diff' in stats['dunno'].keys():
            categorised -= stats['dunno']['short_diff']
        if 'no_diff' in stats['dunno'].keys():
            categorised -= stats['dunno']['no_diff']
        if total == 0:
            percentage = 0
            print('the notes were not processed!')
        else:
            percentage = categorised * 100 / total
        stats['Categorised'] = '{} notes ({:02.2f}%)'.format(
            categorised, percentage)
        stats['Profiles'] = profile_cats
        total_stats.append('{}\n{}'.format(work_name, jp.encode(stats)))

        encoded = jp.encode(categorised_notes)
        if encoded != raw_template:
            categorised_notes['Stats'] = stats
            categorised_notes['profile'] = profiles
            categorised_notes['ngram_freq'] = frequencies
            write_file('output/{}_cats.json'.format(work_name),
                       jp.encode(categorised_notes))
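The four nested loops above only mirror categorised_notes, replacing every non-empty list by its length; an equivalent recursive helper (a sketch, not the repository's code) handles any nesting depth:

def count_lists(node, stats):
    # recursively record len() of every non-empty list and return the grand total
    total = 0
    for key, item in sorted(node.items()):
        if isinstance(item, list):
            if item:
                stats[key] = len(item)
                total += len(item)
        else:
            stats[key] = {}
            total += count_lists(item, stats[key])
    return total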
Example #18
def create_base_text(raw_path):
    for f in os.listdir('output'):
        content = open_file('{}/{}_raw.txt'.format(raw_path, f))
        # put back in a single line
        content = content.replace('\n', ' ')
        write_file('output/{}/{}_base.txt'.format(f, f), content)
Example #19
def copy_final_version(final_path):
    for f in os.listdir('output'):
        if f+'_final.txt' in os.listdir('../4-a-final_formatting/output/3-3-final'):
            write_file('output/{}/{}'.format(f, f+'_final.txt'), open_file('{}/{}_final.txt'.format(final_path, f)))
Example #20
def copy_cat_json_file(json_path):
    for f in os.listdir('output'):
        write_file('output/{}/{}'.format(f, f+'_cats.json'), open_file('{}/{}_cats.json'.format(json_path, f)))