import re

import requests
from bs4 import BeautifulSoup

import unify


def getTextStihi(textlink):
    # headers is assumed to be defined elsewhere in the script
    # (a User-Agent dict passed to requests).
    r = requests.get(textlink, headers=headers)
    # Keep only the contents of the <div class="text"> block.
    text = re.split("</div>", re.split('<div class="text">', r.text)[1])[0]
    # Strip the remaining tags.
    beaux_text = BeautifulSoup(text, "lxml")
    n_text = beaux_text.get_text()
    n_text = re.sub('\xa0', '', n_text)
    n_text = unify.unify_sym(n_text)
    return n_text
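# A minimal usage sketch (the URL and User-Agent are hypothetical placeholders,
# not part of the original crawler): fetch one stihi.ru page and print the
# cleaned text.
if __name__ == '__main__':
    headers = {'User-Agent': 'Mozilla/5.0'}
    poem = getTextStihi('https://www.stihi.ru/2015/01/01/1')
    print(poem[:200])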
# coding: utf-8

# In[1]:

import os
import re

import unify

path = r'/home/tsha/Taiga/social/vk/texts/vktexts.txt'
path200 = r'/home/tsha/Taiga/social/vk/text200.txt'
taggedpath = r'/home/tsha/Taiga/social/vk/texts_tagged/vktexts.conll'

# Normalize the raw VK texts into an intermediate file.
file = open(path, 'r', encoding='utf8')
text200 = unify.unify_sym(file.read())
file200 = open(path200, 'w', encoding='utf8')
file200.write(text200)
file.close()
file200.close()

# Tokenize the normalized file with UDPipe.
os.system(
    '/home/tsha/udpipe/src/udpipe --tokenize /home/tsha/models/syntagrus-default.udpipe'
    ' --tokenizer="normalized_spaces" ' + path200 + ' --outfile=' + taggedpath)
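# The same UDPipe call, sketched with subprocess.run instead of os.system so the
# paths are passed as separate arguments without shell quoting (an alternative,
# not the original pipeline):
import subprocess

subprocess.run([
    '/home/tsha/udpipe/src/udpipe',
    '--tokenize', '--tokenizer=normalized_spaces',
    '/home/tsha/models/syntagrus-default.udpipe',
    path200,
    '--outfile=' + taggedpath,
], check=True)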
taggedpath = r'/home/tsha/Taiga/Fontanka/texts_tagged'

# folders, fl (the file list), step (the batch size), path and path200 are
# assumed to be defined earlier in the script.
for fold in folders[0]:
    os.mkdir(os.path.join(taggedpath, fold))

# Process the files in batches of `step`; the last batch takes the remainder.
steps = [i for i in range(len(fl) // step)]
steps += [steps[-1] + 1]
for i in steps:
    if i != steps[-1]:
        filenames = fl[i * step:i * step + step]
    else:
        filenames = fl[i * step:i * step + len(fl) % step]
    text200 = []
    id200 = []
    for f in filenames:
        fpath = os.path.join(path, f)
        file = open(fpath, 'r', encoding='utf8')
        text200.append(unify.unify_sym(file.read()))
        id200.append(f)
        file.close()
    # Join the batch into one file, separated by a ++++ delimiter, and run
    # UDPipe (tokenize, tag, parse) over it.
    outfile = open(os.path.join(path200, str(i) + '.txt'), 'w', encoding='utf8')
    outfile.write("\n\n++++\n\n".join(text200))
    outfile.close()
    os.system(
        '/home/tsha/udpipe/src/udpipe --tokenize /home/tsha/models/syntagrus-default.udpipe'
        ' --tokenizer="normalized_spaces" --tag --parse '
        + os.path.join(path200, str(i) + '.txt')
        + ' --outfile=' + os.path.join(path200, str(i) + '.conll'))
    outfile = open(os.path.join(path200,
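# A hypothetical follow-up step (not from the source): splitting the batched
# .conll output back into per-document files using id200 and the ++++ delimiter.
# It assumes UDPipe's CoNLL-U output carries a "# text = ..." comment per
# sentence, so the delimiter surfaces as a sentence of its own.
def split_conll_batch(conll_path, ids, outdir):
    with open(conll_path, 'r', encoding='utf8') as f:
        sentences = f.read().strip().split('\n\n')
    docs, current = [], []
    for sent in sentences:
        if '# text = ++++' in sent:
            docs.append(current)
            current = []
        else:
            current.append(sent)
    docs.append(current)
    for doc_id, doc in zip(ids, docs):
        with open(os.path.join(outdir, doc_id + '.conll'), 'w', encoding='utf8') as out:
            out.write('\n\n'.join(doc) + '\n')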
import os
import re

import ufal.udpipe
from nltk.tokenize import sent_tokenize
from tqdm import tqdm

import unify


def ensure_dir(path):
    # Assumed helper (not defined in the source): create the directory if
    # needed and return its path.
    if not os.path.exists(path):
        os.makedirs(path)
    return path


def tag(text, model):
    # Only the return line of this function survives in the source; the rest of
    # the body is assumed, following the standard ufal.udpipe recipe: tokenize
    # the sentence, tag and parse it, serialize it as CoNLL-U.
    tokenizer.setText(text)
    tokenizer.nextSentence(sentence, error)
    model.tag(sentence, model.DEFAULT)
    model.parse(sentence, model.DEFAULT)
    return conlluOutput.writeSentence(sentence)


model = ufal.udpipe.Model.load('/home/tsha/models/syntagrus-default.udpipe')
tokenizer = model.newTokenizer(model.DEFAULT)
conlluOutput = ufal.udpipe.OutputFormat.newOutputFormat("conllu")
sentence = ufal.udpipe.Sentence()
error = ufal.udpipe.ProcessingError()

WDIR = r'/home/tsha/stihi_ru'
wallpath = os.path.join(WDIR, 'texts')
taggedpath = ensure_dir(os.path.join(WDIR, 'tagged_texts'))

# Walk the raw texts, tag every .txt file sentence by sentence, and mirror the
# directory layout under tagged_texts.
for path, subdirs, files in tqdm(os.walk(wallpath)):
    for name in files:
        file = os.path.join(path, name)
        if r".txt" in file:
            print(file)
            f = open(file, 'r', encoding='utf8').read()
            newpath = ensure_dir(re.sub('texts', 'tagged_texts', path))
            print(os.path.join(newpath, name))
            out = open(os.path.join(newpath, name), 'w', encoding='utf8')
            text = unify.unify_sym(f)
            slist = sent_tokenize(text)
            for s in slist:
                s = tag(s, model)
                out.write(s)
            out.close()
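# Quick sanity check of the tag() helper on a single sentence; the example text
# is illustrative only:
print(tag('Мама мыла раму.', model))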
os.mkdir(os.path.join(taggedpath, fold))

# As above: process fl in batches of `step`. Here each source file is
# tab-separated and the text lives in the fourth column.
steps = [i for i in range(len(fl) // step)]
steps += [steps[-1] + 1]
for i in steps:
    if i != steps[-1]:
        filenames = fl[i * step:i * step + step]
    else:
        filenames = fl[i * step:i * step + len(fl) % step]
    text200 = []
    id200 = []
    for f in filenames:
        fpath = os.path.join(path, f)
        file = open(fpath, 'r', encoding='utf8')
        texts = file.readlines()
        texts = "\n".join([t.split('\t')[3] for t in texts])
        text200.append(unify.unify_sym(texts))
        id200.append(f)
        file.close()
    outfile = open(os.path.join(path200, str(i) + '.txt'), 'w', encoding='utf8')
    outfile.write("\n\n++++\n\n".join(text200))
    outfile.close()
    os.system(
        '/home/tsha/udpipe/src/udpipe --tokenize /home/tsha/models/syntagrus-default.udpipe'
        ' --tokenizer="normalized_spaces" --tag --parse '
        + os.path.join(path200, str(i) + '.txt')
        + ' --outfile=' + os.path.join(path200, str(i) + '.conll'))
    outfile = open(os.path.join(path200,
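# A more defensive version of the column extraction above (a hypothetical
# helper, not from the source): read the tab-separated file with the csv module
# and skip rows that have fewer than four fields.
import csv

def read_text_column(fpath, column=3):
    with open(fpath, 'r', encoding='utf8', newline='') as f:
        reader = csv.reader(f, delimiter='\t', quoting=csv.QUOTE_NONE)
        return "\n".join(row[column] for row in reader if len(row) > column)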