def test_sl_pretokenized_conllu():
    classla.download('sl', dir=TEST_MODELS_DIR)
    nlp = classla.Pipeline('sl', tokenize_pretokenized='conllu', dir=TEST_MODELS_DIR)
    conllu_pretokenized = """
# newpar id = 1
# sent_id = 1.1
# text = France Prešeren je rojen v Vrbi.
1	France	France	_	_	_	_	_	_	_
2	Prešeren	Prešeren	_	_	_	_	_	_	_
3	je	biti	_	_	_	_	_	_	_
4	rojen	rojen	_	_	_	_	_	_	_
5	v	v	_	_	_	_	_	_	_
6	Vrbi	Vrba	_	_	_	_	_	_	SpaceAfter=No
7	.	.	_	_	_	_	_	_	_

"""
    doc = nlp(conllu_pretokenized)
    assert doc.to_conll().strip() == SL_STANDARD_CONLL
    if args.output is None:
        output_file_path = args.text_file + '.out'
    else:
        output_file_path = args.output
    # map language code to treebank shorthand
    if args.treebank is not None:
        treebank_shorthand = args.treebank
    else:
        treebank_shorthand = default_treebanks[args.language]
    # check for models
    print('checking for models...')
    lang_models_dir = '%s/%s_models' % (args.models_dir, treebank_shorthand)
    if not os.path.exists(lang_models_dir):
        print('could not find: ' + lang_models_dir)
        download(treebank_shorthand,
                 resource_dir=args.models_dir,
                 force=args.force_download)
    # set up pipeline
    pipeline_config = \
        dict([(k, v) for k, v in vars(args).items() if k in PROCESSOR_SETTINGS_LIST and v is not None])
    pipeline = Pipeline(processors=args.processors,
                        treebank=treebank_shorthand,
                        models_dir=args.models_dir,
                        **pipeline_config)
    # build document
    print('running pipeline...')
    doc = pipeline(open(args.text_file).read())
    # write conll to file
    doc.write_conll_to_file(output_file_path)
    print('done.')
    print('results written to: ' + output_file_path)
Example #3
    sentence_df = sentence_df[[
        'docId', 'sentenceId', 'tokenId', 'text', 'lemma', 'calcLemma', 'upos',
        'xpos', 'ner', 'clID'
    ]]  # leaving out 'misc' for now
    return sentence_df, warnings


if __name__ == '__main__':
    datasets_files = json.load(open('./data/results/dataset_pairs.json'))
    languages = set([
        lang for dataset in datasets_files
        for lang in datasets_files[dataset].keys()
    ])
    print(languages)
    processors = 'tokenize,pos,lemma'
    if DOWNLOAD_RESOURCES:  # do it once on a new system
        for lang in languages:
            lang = lang if lang != 'ua' else 'uk'
            print(f'Downloading {lang}...')
            stanza.download(lang, processors=processors)
        classla.download('sl')
        classla.download('bg')
    tokenizers = {
        lang: stanza.Pipeline(lang=lang if lang != 'ua' else 'uk',
                              processors=processors)
        for lang in languages
    }
    tokenizers['sl'] = classla.Pipeline('sl', processors=processors)
    tokenizers['bg'] = classla.Pipeline('bg', processors=processors)
    split_documents(datasets_files, tokenizers)
Example #4
    example_sentences = {
        "sl": "France Prešeren je rojen v Vrbi.",
        "hr": "Ante Starčević rođen je u Velikom Žitniku.",
        "sr": "Slobodan Jovanović rođen je u Novom Sadu.",
        "bg": "Алеко Константинов е роден в Свищов."
    }

    if args.lang not in example_sentences:
        print(
            f'Sorry, but we don\'t have a demo sentence for "{args.lang}" for the moment. Try one of these languages: {list(example_sentences.keys())}'
        )
        sys.exit(1)

    # download the models
    classla.download(args.lang, args.models_dir, confirm_if_exists=True)
    # set up a pipeline
    print('---')
    print('Building pipeline...')
    pipeline = classla.Pipeline(models_dir=args.models_dir,
                                lang=args.lang,
                                use_gpu=(not args.cpu))
    # process the document
    doc = pipeline(example_sentences[args.lang])
    # access nlp annotations
    print('')
    print('Input: {}'.format(example_sentences[args.lang]))
    print("The tokenizer split the input into {} sentences.".format(
        len(doc.sentences)))
    print('---')
    print('tokens of first sentence: ')
Example #5
    example_sentences = {
        "sl": "France Prešeren je rojen v Vrbi.",
        "hr": "Ante Starčević rođen je u Velikom Žitniku.",
        "sr": "Slobodan Jovanović rođen je u Novom Sadu.",
        "bg": "Алеко Константинов е роден в Свищов."
    }

    if args.lang not in example_sentences:
        print(
            f'Sorry, but we don\'t have a demo sentence for "{args.lang}" for the moment. Try one of these languages: {list(example_sentences.keys())}'
        )
        sys.exit(1)

    # download the models
    classla.download(args.lang, args.models_dir)
    # set up a pipeline
    print('---')
    print('Building pipeline...')
    pipeline = classla.Pipeline(dir=args.models_dir,
                                lang=args.lang,
                                use_gpu=(not args.cpu))
    # process the document
    doc = pipeline(example_sentences[args.lang])
    # access nlp annotations
    print('')
    print('Input: {}'.format(example_sentences[args.lang]))
    print("The tokenizer split the input into {} sentences.".format(
        len(doc.sentences)))
    print('---')
    print('tokens of first sentence: ')
def test_sl_inflectional():
    classla.download('sl', dir=TEST_MODELS_DIR)
    nlp = classla.Pipeline('sl', pos_use_lexicon=True, dir=TEST_MODELS_DIR)
    doc = nlp(SL_STANDARD)
    assert doc.to_conll().strip() == SL_STANDARD_CONLL
def test_sl_standard_jos():
    classla.download('sl', type='standard_jos', dir=TEST_MODELS_DIR)
    nlp = classla.Pipeline('sl', type='standard_jos', dir=TEST_MODELS_DIR)
    doc = nlp(SL_STANDARD_JOS)
    assert doc.to_conll().strip() == SL_STANDARD_JOS_CONLL
def test_mk_standard():
    classla.download('mk', dir=TEST_MODELS_DIR)
    nlp = classla.Pipeline('mk', dir=TEST_MODELS_DIR)
    doc = nlp(MK_STANDARD)
    assert doc.to_conll().strip() == MK_STANDARD_CONLL
def test_sr_nonstandard():
    classla.download('sr', type='nonstandard', dir=TEST_MODELS_DIR)
    nlp = classla.Pipeline('sr', type='nonstandard', dir=TEST_MODELS_DIR)
    doc = nlp(SR_NONSTANDARD)
    assert doc.to_conll().strip() == SR_NONSTANDARD_CONLL
Example #10
def test_all_downloads():
    classla.download('sl', dir=TEST_MODELS_DIR)
    classla.download('sl', type='standard_jos', dir=TEST_MODELS_DIR)
    classla.download('sl', type='nonstandard', dir=TEST_MODELS_DIR)
    classla.download('hr', dir=TEST_MODELS_DIR)
    classla.download('hr', type='nonstandard', dir=TEST_MODELS_DIR)
    classla.download('sr', dir=TEST_MODELS_DIR)
    classla.download('sr', type='nonstandard', dir=TEST_MODELS_DIR)
    classla.download('bg', dir=TEST_MODELS_DIR)
    classla.download('mk', dir=TEST_MODELS_DIR)