Example #1
    def __init__(self):
        self.name = "stanfordnlp"
        with Timer() as self.model_load_time:
            from stanfordnlp import Pipeline, Document
            from stanfordnlp.models.common.conll import CoNLLFile

            self.pipeline = Pipeline(
                lang="de",
                tokenize_pretokenized=True,
                processors="depparse",
                # lower batch size so our GPU can cope
                depparse_batch_size=1000,
            )

            def myprocessor(myinput):
                # run input through converter to hide fields, etc.
                # (common.Document, HIDDEN_FIELDS and doc2string are
                # project-local helpers, not part of stanfordnlp)
                self.input_doc = common.Document(myinput,
                                                 hidden_fields=HIDDEN_FIELDS)
                modified_input = doc2string(self.input_doc)
                self.snlp_doc = Document("")
                self.snlp_doc.conll_file = CoNLLFile(input_str=modified_input)
                self.snlp_doc.load_annotations()
                return self.pipeline(self.snlp_doc)

            self.processor = myprocessor
Example #2
    def __init__(self):
        self.name = "stanfordnlp"
        with Timer() as self.model_load_time:
            from stanfordnlp import Pipeline

            self.processor = Pipeline(
                lang="de",
                tokenize_pretokenized=True,
                processors="tokenize,mwt,pos,lemma",
            )
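
A minimal usage sketch for a pipeline configured like the one above (not part of the original example; it assumes the German models are already downloaded): with tokenize_pretokenized=True, stanfordnlp expects one sentence per line with tokens separated by spaces.

from stanfordnlp import Pipeline

nlp = Pipeline(lang="de", tokenize_pretokenized=True,
               processors="tokenize,mwt,pos,lemma")
# pretokenized input: sentences on separate lines, tokens split by spaces
doc = nlp("Das ist ein Test .\nEs funktioniert auch .")
for sentence in doc.sentences:
    for word in sentence.words:
        print(word.text, word.upos, word.lemma)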
Example #3
    def __init__(self):
        with Timer() as self.model_load_time:
            from stanfordnlp import Pipeline

            self.processor = Pipeline(
                lang="de",
                # 'mwt' processor would expand things
                # like 'am' to 'an dem'
                processors="tokenize",
            )
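
To illustrate the comment above, a small sketch (not from the original source; the printed output is the expected behavior, assuming the German models are downloaded): with only the tokenize processor, contractions such as 'am' remain single tokens.

from stanfordnlp import Pipeline

nlp = Pipeline(lang="de", processors="tokenize")
doc = nlp("Ich sitze am See.")
# 'am' stays a single token here; with processors="tokenize,mwt"
# it would be expanded to 'an' + 'dem'
print([token.text for token in doc.sentences[0].tokens])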
Example #4
    def __init__(self):
        # load stopwords (stopwords comes from nltk.corpus)
        self.stop_words = stopwords.words('english')
        # conjunctions to consider
        self.conjugation_tokens = ['and', 'or']
        # create lemmatizer (nltk's WordNetLemmatizer) and NLP pipeline
        self.lemmatizer = WordNetLemmatizer()
        self.tokenizer = Pipeline(lang='en', processors="tokenize")
        self.nlp = Pipeline(lang='en',
                            processors="tokenize,pos,depparse",
                            tokenize_pretokenized=True)
        # map dependency relation to its handler function
        self.func_map = {
            'nsubj': nsubject,
            'det': det,
            'dep': dep,
            'dobj': dobj,
            'acomp': acomp,
            'amod': amod,
            'aux': aux,
            'nn': nn,
            'neg': neg,
            'prep': prep,
        }
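
A hedged sketch of how this dispatch table might be used (not from the original source: `extractor` stands for an instance of the surrounding class, and the handler signature handler(governor, dependent) is an assumption). In stanfordnlp, sentence.dependencies yields (governor, relation, dependent) triples.

# hedged usage sketch; `extractor` is a hypothetical instance of the class above
doc = extractor.nlp("the food was good")
for sentence in doc.sentences:
    for governor, relation, dependent in sentence.dependencies:
        handler = extractor.func_map.get(relation)
        if handler is not None:
            # assumed handler signature: handler(governor, dependent)
            handler(governor, dependent)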
Example #5
                        default='en')
    parser.add_argument(
        '-p',
        '--processors',
        help=
        'list of processors to run | default: "tokenize,mwt,pos,lemma,depparse"',
        default='tokenize,mwt,pos,lemma,depparse')
    parser.add_argument('text_file')
    args = parser.parse_args()
    # set output file path
    output_file_path = args.text_file + '.out'
    # map language code to treebank shorthand
    treebank_shorthand = default_treebanks[args.language]
    # check for models
    print('checking for models...')
    lang_models_dir = '%s/%s_models' % (args.models_dir, treebank_shorthand)
    if not os.path.exists(lang_models_dir):
        print('could not find: ' + lang_models_dir)
        download(args.language, resource_dir=args.models_dir)
    # set up pipeline
    pipeline = Pipeline(processors=args.processors,
                        lang=args.language,
                        models_dir=args.models_dir)
    # build document
    print('running pipeline...')
    with open(args.text_file) as f:
        doc = pipeline(f.read())
    # write conll to file
    doc.write_conll_to_file(output_file_path)
    print('done.')
    print('results written to: ' + output_file_path)
Example #6
        output_file_path = args.text_file + '.out'
    else:
        output_file_path = args.output
    # map language code to treebank shorthand
    if args.treebank is not None:
        treebank_shorthand = args.treebank
    else:
        treebank_shorthand = default_treebanks[args.language]
    # check for models
    print('checking for models...')
    lang_models_dir = '%s/%s_models' % (args.models_dir, treebank_shorthand)
    if not os.path.exists(lang_models_dir):
        print('could not find: ' + lang_models_dir)
        download(treebank_shorthand,
                 resource_dir=args.models_dir,
                 force=args.force_download)
    # set up pipeline
    pipeline_config = {
        k: v for k, v in vars(args).items()
        if k in PROCESSOR_SETTINGS_LIST and v is not None
    }
    pipeline = Pipeline(processors=args.processors,
                        treebank=treebank_shorthand,
                        models_dir=args.models_dir,
                        **pipeline_config)
    # build document
    print('running pipeline...')
    with open(args.text_file) as f:
        doc = pipeline(f.read())
    # write conll to file
    doc.write_conll_to_file(output_file_path)
    print('done.')
    print('results written to: ' + output_file_path)
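
To make the filtering step concrete, a runnable sketch with assumed values (the contents of PROCESSOR_SETTINGS_LIST and the argument values are illustrative): only arguments that are known processor settings and were actually supplied survive into pipeline_config.

# illustrative sketch with assumed values
PROCESSOR_SETTINGS_LIST = ['pos_batch_size', 'depparse_batch_size']  # assumed subset
args_dict = {'language': 'de', 'pos_batch_size': 500, 'depparse_batch_size': None}
pipeline_config = {
    k: v for k, v in args_dict.items()
    if k in PROCESSOR_SETTINGS_LIST and v is not None
}
print(pipeline_config)  # {'pos_batch_size': 500}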
Example #7
#%% [markdown]
# # Chapter 5: Dependency Parsing
# Use CaboCha to run dependency parsing on the text of Natsume Soseki's novel "I Am a Cat" (neko.txt), and save the result to a file named neko.txt.cabocha. Then use that file to implement programs answering the questions below.
#%% [markdown]
# `neko.txt` is stored in the data directory.
# Here the file is parsed with stanfordnlp (rather than CaboCha), and the result is written to `data/neko.txt.snd`.
#%%
import time

from stanfordnlp import Pipeline

start = time.time()
nlp = Pipeline(lang='ja')
print(f'Init done. {time.time() - start} sec.')
#%%
start = time.time()
line_count = 0
sent_count = 0
with open('data/neko.txt',
          encoding='utf-8') as f, open('data/neko.txt.snd',
                                       'w',
                                       encoding='utf-8') as wf:
    for line in f:
        line_count += 1
        if line_count < 3:
            # the main text starts on line 3
            continue
        sentence = line.strip()
        if len(sentence) == 0:
            continue
        sent_count += 1
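        # hedged continuation sketch (the original excerpt ends above):
        # parse the sentence and append its CoNLL-U output, assuming
        # stanfordnlp 0.2's doc.conll_file.conll_as_string() accessor
        doc = nlp(sentence)
        wf.write(doc.conll_file.conll_as_string())
print(f'Parse done. {time.time() - start} sec. ({sent_count} sentences)')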