Python BoPipeline.BoPipeline примеры использования

Язык программирования: Python

Пространство имен/Пакет: pybo

Класс/Тип: BoPipeline

Метод/Функция: BoPipeline

Примеров на hotexamples.com: 8

Python BoPipeline.BoPipeline — найдено 8 примеров. Это лучшие примеры кода на Python для pybo.BoPipeline.BoPipeline, взятые из проектов с открытым исходным кодом. Вы можете оценить каждый пример, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать Скрыть

BoPipeline(8)

pipe_file(4)

pipe_str(4)

prof(1)

Пример #1

Показать файл

def test_add_custom_pipes():
    """Check that a user-supplied processor pipe can be injected into BoPipeline.

    A custom ``proc`` pipe is registered through ``new_pipes`` and selected by
    an inline profile dict; the pipeline output for a fixed sample string is
    then compared against the expected ``word/POS`` rendering.
    """

    def pybo_pos(tokens: List[PyboToken]) -> List[str]:
        """Render each token as ``<cleaned_content>/<pos>``."""
        return [f'{t["cleaned_content"]}/{t["pos"]}' for t in tokens]

    # Pipes are grouped by stage name; only the 'proc' stage gets a new entry.
    custom_pipes = {'proc': {'pybo_pos': pybo_pos}}

    # Inline profile selecting the custom processor.
    # Per the original author's note, passing the profile this way is
    # equivalent to declaring it in pybo.yaml and passing its name.
    stage_choices = {
        'pre': 'pre_basic',
        'tok': 'pybo',
        'pybo_profile': 'POS',
        'proc': 'pybo_pos',
        'frm': 'plaintext',
    }
    custom_profile = {'pybo_pos': stage_choices}

    sample = ' ཤི་བཀྲ་ཤིས་  tr བདེ་ལེགས། བཀྲ་ཤིས་བདེ་ལེགས་ཀཀ x  བཀྲ་ཤིས་'
    expected = (
        'ཤི་/VERB བཀྲ་ཤིས་/NOUN /non-bo བདེ་ལེགས་/NOUN /punct བཀྲ་ཤིས་/NOUN '
        'བདེ་ལེགས་/NOUN ཀཀ་/non-word /non-bo བཀྲ་ཤིས་/NOUN'
    )

    pipeline = BoPipeline(profile=custom_profile, new_pipes=custom_pipes)
    actual = pipeline.pipe_str(sample)

    assert expected == actual

Пример #2

Показать файл

def test_pipeline():
    """Run the 'pybo_raw_types' profile on a sample string and check its output.

    The expected value is a block of ``<text><TAB><number>`` lines, one per
    line, which must match the pipeline output exactly.
    """
    sample = ' ཤི་བཀྲ་ཤིས་  tr བདེ་ལེགས། བཀྲ་ཤིས་བདེ་ལེགས་ཀཀ x  བཀྲ་ཤིས་'
    expected = """བཀྲ་ཤིས་	2
ཤི་	1
བཀྲ་ཤིས་ 	1
tr 	1
བདེ་ལེགས	1
། 	1
བདེ་ལེགས་	1
ཀཀ 	1
x 	1"""

    pipeline = BoPipeline(profile='pybo_raw_types')
    # Per the original author's note, this replaces the GMD profile that
    # 'pybo_raw_types' would otherwise use.
    pipeline.prof = 'POS'

    actual = pipeline.pipe_str(sample)
    assert expected == actual

Пример #3

Показать файл

    return sent_str


def format_output(sentences, mode='join'):
    """Format sentences as newline-separated ``<first>,<sentence string>`` lines.

    Args:
        sentences: iterable of indexable items. Element ``[0]`` is written
            before the comma; element ``[1]`` is handed to the helper selected
            by ``mode`` to build the sentence string.
        mode: ``'join'`` (default, the previously hard-coded behaviour) builds
            the sentence string with ``join_affixed_particles``; ``'mark'``
            builds it with ``add_affixed_marker``.

    Returns:
        A single string with one ``first,sentence`` line per input sentence,
        joined with ``'\\n'``. Empty input yields an empty string.

    Raises:
        ValueError: if ``mode`` is neither ``'join'`` nor ``'mark'``.
    """
    # Validate once, up front: a bad mode is an argument error (ValueError),
    # and should be reported even when there are no sentences to format.
    if mode not in ('join', 'mark'):
        raise ValueError('mode should either be "join" or "mark"')

    lines = []
    for sent in sentences:
        if mode == 'join':
            sent_str = join_affixed_particles(sent[1])
        else:
            sent_str = add_affixed_marker(sent[1])
        lines.append(f'{sent[0]},{sent_str}')

    return '\n'.join(lines)


if __name__ == '__main__':
    # Positional stages: preprocessor 'dummy', tokenizer 'pybo', a named
    # processor (label, callable) and format_output as the formatter.
    sentence_processor = ('pybo_sentences', sentencify)
    pipeline = BoPipeline('dummy', 'pybo', sentence_processor, format_output,
                          pybo_profile='GMD')

    # Pipe every .txt file found directly under ./input, passing 'output'
    # as the second argument of pipe_file.
    input_dir = Path('input')
    for txt_file in input_dir.glob('*.txt'):
        pipeline.pipe_file(txt_file, 'output')

Пример #4

Показать файл

        if t.type == 'syl':
            end = start + t.syls[-1][-1] + 1
            if t.affixed:
                pos = t.pos
            else:
                pos = t.pos
            output.append([pos, start, end])

        idx = start + len(t.content)
    return output


# Module-level pipeline; segment() pipes its input string through it.
# Positional stages, in order: preprocessor ('dummy'), tokenizer (bo_tok),
# processor (pos_suggestions), formatter ('dummy').
pipeline = BoPipeline('dummy', bo_tok, pos_suggestions, 'dummy', pybo_profile='GMD')


def segment(string, tagset):
    """Pipe ``string`` through the module-level pipeline and remap its tags.

    The first element of every suggestion returned by the pipeline is
    replaced with ``tagset[<that element>]``; when the element is not a key
    of ``tagset``, ``tagset['X']`` is used instead. Suggestions are modified
    in place and the same collection is returned.
    """
    suggestions = pipeline.pipe_str(string)
    for entry in suggestions:
        try:
            mapped = tagset[entry[0]]
        except KeyError:
            # Unknown tag: fall back to the catch-all 'X' entry.
            mapped = tagset['X']
        entry[0] = mapped
    return suggestions


def generate_suggestions(examples, tagset):

Пример #5

Показать файл

        para_str = ''.join([token.content for token in para[1]])
        for key_type, keys in keys_dict.items():
            for key in keys:
                idx = para_str.find(key)
                if idx >= 0:
                    output.append({'ex': para_str, 'order': i, 'type': key_type, 'key': key, 'pos': idx})
                    break

    output = unique_objects(output, 'ex')
    return jp.dumps(output)


if __name__ == "__main__":
    pipeline = BoPipeline('dummy',                            # preprocessor
                          'pybo',                             # tokenizer
                          ('pybo_paragraphs', paragraphify),  # processor
                          filter_and_format_para,             # formatter
                          pybo_profile='GMD')                 # pybo_profile
   
    ap = argparse.ArgumentParser(add_help=False)                  
    ap.add_argument("--path", type=str, help="path to corpus")    
    args = ap.parse_args()                               
                                                              
    fns = sorted(glob.glob(os.path.join(args.path, "*")))         
    for fn in tqdm(fns):
        pipeline.pipe_file(fn, 'data/toupload')

    # convert all the ext file to json
    out_dir = 'data/toupload'
    out_fns = sorted(glob.glob(os.path.join(out_dir, "*")))
    for fn in out_fns:

Пример #6

Показать файл

        if t.affix:
            sent_str += ' ' + t.content
        else:
            sent_str += ' ' + t.content

    return sent_str


def format_to_csv(sentences):
    """Return one ``<first>,<sentence string>`` line per sentence.

    For each item of ``sentences``, element ``[1]`` is converted with
    ``get_sent_str`` and written after element ``[0]`` and a comma. Lines are
    joined with ``'\\n'``; empty input gives an empty string.
    """
    # Build the sentence string before reading element [0], as the original did.
    rendered = [(get_sent_str(sent[1]), sent[0]) for sent in sentences]
    return '\n'.join(f'{head},{text}' for text, head in rendered)


if __name__ == '__main__':
    # Positional stages: preprocessor 'dummy', tokenizer 'pybo', a named
    # processor (label, callable) and format_sentences_for_lighttag as the
    # formatter.
    sentence_processor = ('pybo_sentences', sentencify)
    pipeline = BoPipeline('dummy', 'pybo', sentence_processor,
                          format_sentences_for_lighttag, pybo_profile='GMD')

    # Smoke run on an inline sample string instead of a corpus file.
    sample_text = 'ལག་པ་གཡས་པ་བརྐྱང་ནས་འདི་སྐད་ཅེས་ཁོ་བོ་ཡུན་རིང་པོ་ནས་མངོན་པར་འདོད་པ་བུའི་ངོ་མཐོང་བ་དང་། ཁོ་བོ་དང་མཐུན་པར་སྐྱེ་ཞིང་མི་མཐུན་པར་མི་སྐྱེ་བ་དང་། ཁོ་བོའི་བྱ་བ་དག་བྱེད་པ་དང་། གསོས་པས་ཕྱིར་གསོ་བ་དང་། བགོ་སྐལ་ལ་སྤྱོད་པ་དང་། ཁོ་བོའི་རིགས་རྒྱུད་རིང་དུ་གནས་པར་སྒྲུབ་པ་དང་། བདག་ཅག་ཤི་ཞིང་དུས་ལ་བབ་པ་དག་ལ་ཉུང་ངུའམ། མང་པོ་ཡང་རུང་བའི་སྦྱིན་པ་དག་བྱིན་ཞིང་བསོད་ནམས་དག་བྱས་ནས་འདི་དེ་གཉིས་གང་དུ་སྐྱེས་ཤིང་འགྲོ་བའི་རྗེས་སུ་འགྲོ་བར་གྱུར་ཅིག་ཅེས་མིང་གིས་ཡོན་སྔོ་བ་བྱེད་པར་གྱུར་ཅིག་ཅེས་ཆེད་དུ་བརྗོད་པ་ཆེད་དུ་རྗོད་པར་བྱེད་དོ། །དེ་ལ་སེམས་ཅན་ཞུགས་པར་རིག་ནས*་འདི་ལྟ་སྟེ།'
    piped = pipeline.pipe_str(sample_text)
    print(piped)
    print('ok')

Пример #7

Показать файл

Файл: SOAS_2_lighttag.py Проект: agutkin/ie-datasets

def lighttag_suggestion_pipeline():
    """Build and return the 'suggestion' BoPipeline.

    Stages, in positional order:
      pre  -- keep_returns (described by the original author as replacing
              every newline with a space; not verifiable from this block)
      tok  -- 'syls'
      proc -- process
      frm  -- json_maker
    """
    preprocessor, tokenizer = keep_returns, 'syls'
    processor, formatter = process, json_maker
    return BoPipeline(preprocessor, tokenizer, processor, formatter)

Пример #8

Показать файл

Файл: SOAS_2_lighttag.py Проект: agutkin/ie-datasets

def lighttag_base_pipeline():
    """Build and return the 'base' BoPipeline.

    Stages, in positional order:
      pre  -- basic_cleanup (described by the original author as removing
              every newline; not verifiable from this block)
      tok  -- 'syls'
      proc -- merge_spaces
      frm  -- lighttag_raw

    NOTE(review): the earlier comments named bo_syl_tok,
    spaces_plain_fulltext and plaintext for tok/proc/frm, which do not match
    the arguments actually passed -- confirm which is intended.
    """
    stages = (basic_cleanup, 'syls', merge_spaces, lighttag_raw)
    return BoPipeline(*stages)