def __init__(self): self.name = "stanfordnlp" with Timer() as self.model_load_time: from stanfordnlp import Pipeline, Document from stanfordnlp.models.common.conll import CoNLLFile self.pipeline = Pipeline( lang="de", tokenize_pretokenized=True, processors="depparse", # lower batch size so our GPU can cope depparse_batch_size=1000, ) def myprocessor(myinput): # run input through converter to hide fields, etc. self.input_doc = common.Document(myinput, hidden_fields=HIDDEN_FIELDS) modified_input = doc2string(self.input_doc) self.snlp_doc = Document("") self.snlp_doc.conll_file = CoNLLFile(input_str=modified_input) self.snlp_doc.load_annotations() return self.pipeline(self.snlp_doc) self.processor = myprocessor
def __init__(self): self.name = "stanfordnlp" with Timer() as self.model_load_time: from stanfordnlp import Pipeline self.processor = Pipeline( lang="de", tokenize_pretokenized=True, processors="tokenize,mwt,pos,lemma", )
def __init__(self):
    with Timer() as self.model_load_time:
        from stanfordnlp import Pipeline

        self.processor = Pipeline(
            lang="de",
            # 'mwt' processor would expand things
            # like 'am' to 'an dem'
            processors="tokenize",
        )

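
# Usage sketch for the tokenize-only pipeline above (illustrative, not part of the
# original snippet): running just the tokenizer still yields sentence and token
# segmentation; because 'mwt' is skipped, contractions such as 'am' stay single
# tokens instead of being expanded to 'an dem'.
from stanfordnlp import Pipeline

tokenizer = Pipeline(lang="de", processors="tokenize")
doc = tokenizer("Wir treffen uns am Montag. Es regnet.")
for sentence in doc.sentences:
    print([token.text for token in sentence.tokens])
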
def __init__(self):
    # load stopwords
    self.stop_words = stopwords.words('english')
    # conjunctions to consider
    self.conjugation_tokens = ['and', 'or']
    # create lemmatizer and nlp-pipeline
    self.lemmatizer = WordNetLemmatizer()
    self.tokenizer = Pipeline(lang='en', processors="tokenize")
    self.nlp = Pipeline(lang='en', processors="tokenize,pos,depparse",
                        tokenize_pretokenized=True)
    # map dep-type to function
    self.func_map = {
        'nsubj': nsubject,
        'det': det,
        'dep': dep,
        'dobj': dobj,
        'acomp': acomp,
        'amod': amod,
        'aux': aux,
        'nn': nn,
        'neg': neg,
        'prep': prep,
    }

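
# Dispatch sketch for a relation-to-handler map like self.func_map above
# (illustrative; 'report' is a stand-in for the project's real handlers, whose
# signatures are not shown in the snippet). Each parsed word carries a
# dependency_relation label that selects a handler; unmapped relations are skipped.
from stanfordnlp import Pipeline

def report(word):
    # hypothetical handler: print the relation, the word, and its governor index
    print(word.dependency_relation, word.text, word.governor)

func_map = {'nsubj': report, 'det': report, 'amod': report}

nlp = Pipeline(lang='en', processors="tokenize,pos,lemma,depparse")
doc = nlp("The quick brown fox jumps over the lazy dog.")
for sentence in doc.sentences:
    for word in sentence.words:
        handler = func_map.get(word.dependency_relation)
        if handler is not None:
            handler(word)
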
    default='en')
parser.add_argument(
    '-p', '--processors',
    help='list of processors to run | default: "tokenize,mwt,pos,lemma,depparse"',
    default='tokenize,mwt,pos,lemma,depparse')
parser.add_argument('text_file')
args = parser.parse_args()

# set output file path
output_file_path = args.text_file + '.out'
# map language code to treebank shorthand
treebank_shorthand = default_treebanks[args.language]

# check for models
print('checking for models...')
lang_models_dir = '%s/%s_models' % (args.models_dir, treebank_shorthand)
if not os.path.exists(lang_models_dir):
    print('could not find: ' + lang_models_dir)
    download(args.language, resource_dir=args.models_dir)

# set up pipeline
pipeline = Pipeline(processors=args.processors, lang=args.language,
                    models_dir=args.models_dir)

# build document
print('running pipeline...')
doc = pipeline(open(args.text_file).read())

# write conll to file
doc.write_conll_to_file(output_file_path)
print('done.')
print('results written to: ' + output_file_path)

    output_file_path = args.text_file + '.out'
else:
    output_file_path = args.output

# map language code to treebank shorthand
if args.treebank is not None:
    treebank_shorthand = args.treebank
else:
    treebank_shorthand = default_treebanks[args.language]

# check for models
print('checking for models...')
lang_models_dir = '%s/%s_models' % (args.models_dir, treebank_shorthand)
if not os.path.exists(lang_models_dir):
    print('could not find: ' + lang_models_dir)
    download(treebank_shorthand, resource_dir=args.models_dir,
             force=args.force_download)

# set up pipeline
pipeline_config = \
    dict([(k, v) for k, v in vars(args).items()
          if k in PROCESSOR_SETTINGS_LIST and v is not None])
pipeline = Pipeline(processors=args.processors, treebank=treebank_shorthand,
                    models_dir=args.models_dir, **pipeline_config)

# build document
print('running pipeline...')
doc = pipeline(open(args.text_file).read())

# write conll to file
doc.write_conll_to_file(output_file_path)
print('done.')
print('results written to: ' + output_file_path)

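
# Programmatic sketch of the same flow as the demo script above (illustrative; the
# treebank shorthand 'en_ewt' and the file paths are placeholders): download models
# for a treebank if needed, build a Pipeline for it, and dump the parse as CoNLL-U.
from stanfordnlp import Pipeline, download

download('en_ewt', force=False)  # fetches models; resource_dir can override the default location
nlp = Pipeline(processors='tokenize,mwt,pos,lemma,depparse', treebank='en_ewt')
doc = nlp(open('example.txt').read())
doc.write_conll_to_file('example.txt.out')
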
#%% [markdown]
# # Chapter 5: Dependency parsing
# Run the text of Natsume Soseki's novel "I Am a Cat" (neko.txt) through the CaboCha
# dependency parser and save the result to a file named neko.txt.cabocha. Using this
# file, implement programs that answer the questions below.
#%% [markdown]
# `neko.txt` is stored in the data directory.
# Parse this file with stanfordnlp and write the result to `data/neko.txt.snd`.
#%%
import time

from stanfordnlp import Pipeline

start = time.time()
nlp = Pipeline(lang='ja')
print(f'Init done. {time.time() - start} sec.')

#%%
start = time.time()
line_count = 0
sent_count = 0
with open('data/neko.txt', encoding='utf-8') as f, \
        open('data/neko.txt.snd', 'w', encoding='utf-8') as wf:
    for line in f:
        line_count = line_count + 1
        if line_count < 3:  # the body text starts on line 3
            continue
        sentence = line.strip()
        if len(sentence) == 0:
            continue
        sent_count = sent_count + 1