Example #1
def __init__(self, parent):
    self.parent = parent
    self.lang = "bo"
    self.mode = "default"
    self.tagger = None
    self.tokenizer = None
    self._words = []

    # lazy initialization: load pybo's POS profile on first use
    if not self.tokenizer:
        self.tokenizer = pybo.BoTokenizer('POS')
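A minimal usage sketch of the pattern above; the class name `Wrapper`, the instance, the input string, and the Token attribute `.content` are illustrative assumptions, not part of the original:

import pybo

class Wrapper:
    # hypothetical stand-in for the class the __init__ above belongs to
    def __init__(self, parent):
        self.parent = parent
        self.tokenizer = pybo.BoTokenizer('POS')

w = Wrapper(parent=None)
tokens = w.tokenizer.tokenize('བོད་སྐད་དུ།')
print([t.content for t in tokens])  # .content is assumed from pybo's Token API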
Example #2
def check_pybo_bo_tokenizer(main):
    # instantiate the GMD tokenizer once and cache it on the main object
    if 'pybo_bo_tokenizer' not in main.__dict__:
        main.pybo_bo_tokenizer = pybo.BoTokenizer('GMD')
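A short sketch of the caching behaviour; the `Main` class below is an illustrative stand-in for the application object the tokenizer is cached on:

import pybo

class Main:
    pass

main = Main()
check_pybo_bo_tokenizer(main)  # first call creates and caches the tokenizer
check_pybo_bo_tokenizer(main)  # later calls reuse main.pybo_bo_tokenizer
tokens = main.pybo_bo_tokenizer.tokenize('བོད་སྐད་དུ།')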
Example #3
from collections import Counter
import pybo as bo


class ValidityError(Exception):
    '''Raised when a validity check is not passed'''


# 1. PREPARATION

# 1.1. Initializing the tokenizer
tok = bo.BoTokenizer('POS')

# 1.2. Loading in text
input_str = '༄༅། །རྒྱ་གར་སྐད་དུ། བོ་དྷི་སཏྭ་ཙརྻ་ཨ་བ་ཏ་ར། བོད་སྐད་དུ། བྱང་ཆུབ་སེམས་དཔའི་སྤྱོད་པ་ལ་འཇུག་པ། །སངས་རྒྱས་དང་བྱང་ཆུབ་སེམས་དཔའ་ཐམས་ཅད་ལ་ཕྱག་འཚལ་ལོ། །བདེ་གཤེགས་ཆོས་ཀྱི་སྐུ་མངའ་སྲས་བཅས་དང༌། །ཕྱག་འོས་ཀུན་ལའང་གུས་པར་ཕྱག་འཚལ་ཏེ། །བདེ་གཤེགས་སྲས་ཀྱི་སྡོམ་ལ་འཇུག་པ་ནི། །ལུང་བཞིན་མདོར་བསྡུས་ནས་ནི་བརྗོད་པར་བྱ། །'

# -------------------------

# 2. CREATING THE OBJECTS

# 2.1. creating pre_processed object
pre_processed = bo.PyBoTextChunks(input_str)

# 2.2. creating tokens object
tokens = tok.tokenize(input_str)

# -------------------------

# 3. TESTING ALL CLASS OBJECT ATTRIBUTES
# (these checks should verify accuracy, not merely that the calls run)

# 3.1. testing pre-processed attributes
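The example is truncated here; a minimal sketch of what one such check could look like, assuming pybo's Token objects expose a `.content` attribute (an assumption about the API), with `ValidityError` and `tokens` taken from earlier in the file:

# sketch of a token-level validity check; `.content` is an assumed
# attribute of pybo's Token objects
for token in tokens:
    if not token.content:
        raise ValidityError('token with empty content: {}'.format(token))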
Example #4
import os
import sys

# make the grandparent directory importable (for PyTib)
grandParentDir = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
sys.path.append(grandParentDir)

from PyTib.common import open_file, write_file, tib_sort, is_sskrt, pre_process
import PyTib
import pybo
import re
from collections import defaultdict

tok = pybo.BoTokenizer('GMD')
# lex_path = '../PyTib/data/uncompound_lexicon.txt'
# lexicon = open_file(lex_path).strip().split('\n')
# lexicon = '\n'.join(tib_sort(list(set(lexicon))))
# write_file(lex_path, lexicon)


def rawify(string):
    # strip list numbering like '1. ', stray 'a' characters and newlines
    return re.sub(r'[0-9]+\.\s?\s?', '',
                  string.replace('a', '').replace('\n', ''))


def contains_sskrt(string):
    # True if any syllable of the string (minus '#' markers) is Sanskrit
    string = string.replace('#', '')
    syls = pre_process(string, mode='syls')
    return any(is_sskrt(syl) for syl in syls)
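A quick illustration of the two helpers; the input strings are made up for the example:

# illustrative calls with made-up inputs
print(rawify('1. བོད་སྐད་\n2. ལེགས་སོ།'))  # -> 'བོད་སྐད་ལེགས་སོ།'
print(contains_sskrt('བོ་དྷི་སཏྭ'))  # expected True: སཏྭ is a Sanskrit-style stack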
Example #5
import argparse
from pathlib import Path

import pybo

# NOTE: generate_mistake_concs, write_file_concs, write_total_concs and
# find_total_types are defined earlier in the original script
parser = argparse.ArgumentParser()
parser.add_argument('-o', required=True, help='folder to contain the output')
parser.add_argument('-c',
                    help='generates a conc file for each input file if "true"')

if __name__ == '__main__':
    # args are hard-coded for debugging; restore parse_args() to use the CLI
    # args = parser.parse_args()

    in_dir = 'out'  # args.i
    out_dir = 'segmented'  # args.o
    gen_concs = False  # bool(args.c)
    if not in_dir or not out_dir:
        parser.print_help()
        exit()

    pybo_mode = 'GMD'
    in_files = sorted(Path(in_dir).glob('*.txt'))
    out_dir = Path(out_dir)

    tok = pybo.BoTokenizer(
        pybo_mode)  # GMD includes all available wordlists + sanskrit

    concs, sorted_types = generate_mistake_concs(in_files)

    if gen_concs:
        write_file_concs(concs, sorted_types, in_files, out_dir)

    write_total_concs(concs, sorted_types, out_dir)

    types = find_total_types(concs, sep='\t')
    (out_dir / 'mistake_types.txt').write_text('\n'.join(types))
Example #6
import os
from pathlib import Path
import pybo


def instanciate_tokenizer():
    # parse_vocab_folder is defined elsewhere in the original module
    vocab_file = Path('input/lists/full-vocab.txt')
    parse_vocab_folder(str(vocab_file))
    tok = pybo.BoTokenizer('POS', user_word_list=[str(vocab_file)])
    os.remove(str(vocab_file))
    return tok
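Usage would presumably be a single call followed by normal tokenization; the input string is illustrative:

# illustrative usage; assumes the vocab files under input/lists/ exist
tok = instanciate_tokenizer()
tokens = tok.tokenize('བྱང་ཆུབ་སེམས་དཔའ།')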