# 示例#1 (example #1 — separator from the source dump)
# 0
def getTextStihi(textlink):
    """Fetch a poem page and return the normalized plain text of its
    <div class="text"> block.

    Relies on module-level ``headers``, ``requests``, ``BeautifulSoup``
    and ``unify`` being in scope.
    """
    r = requests.get(textlink, headers=headers)
    # Plain str.split is enough: neither delimiter contains regex
    # metacharacters, so re.split was overkill here.
    # NOTE(review): raises IndexError if the page lacks a
    # '<div class="text">' marker — confirm callers expect that.
    text = r.text.split('<div class="text">')[1].split('</div>')[0]
    # Strip the remaining HTML tags.
    beaux_text = BeautifulSoup(text, "lxml")
    n_text = beaux_text.get_text()
    # Drop non-breaking spaces, then apply the project-wide normalizer.
    n_text = n_text.replace('\xa0', '')
    n_text = unify.unify_sym(n_text)
    return n_text
# coding: utf-8

# In[1]:

import os
import re
import unify

# Input/output locations for the VK social-media subcorpus.
path = r'/home/tsha/Taiga/social/vk/texts/vktexts.txt'
path200 = r'/home/tsha/Taiga/social/vk/text200.txt'
taggedpath = r'/home/tsha/Taiga/social/vk/texts_tagged/vktexts.conll'

# Normalize the raw text and write it out. `with` guarantees both
# handles are closed even if unify_sym() raises (the original closed
# them manually, leaking on error).
with open(path, 'r', encoding='utf8') as file:
    text200 = unify.unify_sym(file.read())
with open(path200, 'w', encoding='utf8') as file200:
    file200.write(text200)

# Tokenize the normalized file with UDPipe.
# NOTE(review): os.system with a concatenated shell string breaks on
# paths containing spaces/metacharacters; subprocess.run([...]) with an
# argument list would be safer.
os.system(
    '/home/tsha/udpipe/src/udpipe --tokenize /home/tsha/models/syntagrus-default.udpipe  --tokenizer="normalized_spaces" '
    + path200 + ' --outfile=' + taggedpath)
# 示例#3 (example #3 — separator from the source dump)
# 0
# Split the file list `fl` (defined earlier in the file, not visible
# here — as are `folders`, `step`, `path` and `path200`) into chunks of
# `step` files, join each chunk into one text file with '++++'
# separators, and run UDPipe tokenize/tag/parse over every chunk.
taggedpath = r'/home/tsha/Taiga/Fontanka/texts_tagged'
# NOTE(review): iterating folders[0] walks the *characters* of the first
# folder name if `folders` is a list of strings — confirm against the
# definition of `folders` earlier in the file.
for fold in folders[0]:
    os.mkdir(os.path.join(taggedpath, fold))
# One index per full chunk, plus one extra index for the remainder.
# NOTE(review): steps[-1] raises IndexError when len(fl) < step
# (range is empty); the extra chunk is empty when len(fl) % step == 0.
steps = [i for i in range(len(fl) // step)]
steps += [steps[-1] + 1]
for i in steps:
    if i != steps[-1]:
        filenames = fl[i * step:i * step + step]
    else:
        # Last chunk: only the len(fl) % step leftover files.
        filenames = fl[i * step:i * step + len(fl) % step]
    text200 = []
    id200 = []
    # Read and normalize every file of the chunk; `id200` keeps the
    # original file names aligned with `text200`.
    for f in filenames:
        fpath = os.path.join(path, f)
        file = open(fpath, 'r', encoding='utf8')
        text200.append(unify.unify_sym(file.read()))
        id200.append(f)
        file.close()
    # Write the chunk as <i>.txt with '++++' markers between documents.
    outfile = open(os.path.join(path200,
                                str(i) + '.txt'),
                   'w',
                   encoding='utf8')
    outfile.write("\n\n++++\n\n".join(text200))
    outfile.close()
    # Tokenize, tag and parse the chunk into <i>.conll via UDPipe.
    os.system(
        '/home/tsha/udpipe/src/udpipe --tokenize /home/tsha/models/syntagrus-default.udpipe  --tokenizer="normalized_spaces" --tag --parse '
        + os.path.join(path200,
                       str(i) + '.txt') + ' --outfile=' +
        os.path.join(path200,
                     str(i) + '.conll'))
    outfile = open(os.path.join(path200,
# 示例#4 (example #4 — separator from the source dump)
# 0
    return conlluOutput.writeSentence(sentence)


# Load the UDPipe model and set up tokenizer / CoNLL-U output objects.
model = ufal.udpipe.Model.load('/home/tsha/models/syntagrus-default.udpipe')

tokenizer = model.newTokenizer(model.DEFAULT)
conlluOutput = ufal.udpipe.OutputFormat.newOutputFormat("conllu")
sentence = ufal.udpipe.Sentence()
error = ufal.udpipe.ProcessingError()

WDIR = r'/home/tsha/stihi_ru'
wallpath = os.path.join(WDIR, 'texts')
taggedpath = ensure_dir(os.path.join(WDIR, 'tagged_texts'))

# Walk the corpus tree and tag every .txt file into a mirrored
# 'tagged_texts' directory.
for path, subdirs, files in tqdm(os.walk(wallpath)):
    for name in files:
        file = os.path.join(path, name)

        # NOTE(review): the substring test also matches names such as
        # 'a.txt.bak'; file.endswith('.txt') is probably the intent —
        # confirm before tightening.
        if r".txt" in file:
            print(file)
            # `with` fixes the original's leaked handle:
            # open(...).read() never closed the input file.
            with open(file, 'r', encoding='utf8') as infile:
                f = infile.read()
            newpath = ensure_dir(re.sub('texts', 'tagged_texts', path))
            print(os.path.join(newpath, name))
            text = unify.unify_sym(f)
            slist = sent_tokenize(text)
            # Tag sentence by sentence and stream results to the output
            # file; `with` guarantees the handle is closed on error too.
            with open(os.path.join(newpath, name), 'w', encoding='utf8') as out:
                for s in slist:
                    s = tag(s, model)
                    out.write(s)
# 示例#5 (example #5 — separator from the source dump)
# 0
    os.mkdir(os.path.join(taggedpath, fold))
# Chunked UDPipe pipeline over the file list `fl` (defined earlier, not
# visible here — as are `step`, `path` and `path200`). Unlike the plain
# variant, each input file is tab-separated and the text to process is
# taken from column index 3 of every line.
# NOTE(review): steps[-1] raises IndexError when len(fl) < step
# (range is empty); the extra chunk is empty when len(fl) % step == 0.
steps = [i for i in range(len(fl) // step)]
steps += [steps[-1] + 1]
for i in steps:
    if i != steps[-1]:
        filenames = fl[i * step:i * step + step]
    else:
        # Last chunk: only the len(fl) % step leftover files.
        filenames = fl[i * step:i * step + len(fl) % step]
    text200 = []
    id200 = []
    for f in filenames:
        fpath = os.path.join(path, f)
        file = open(fpath, 'r', encoding='utf8')
        texts = file.readlines()
        # Keep only the 4th tab-separated column of each line.
        # NOTE(review): assumes every line has at least 4 columns —
        # an IndexError here means a malformed input row.
        texts = "\n".join([t.split('\t')[3] for t in texts])
        text200.append(unify.unify_sym(texts))
        id200.append(f)
        file.close()
    # Write the chunk as <i>.txt with '++++' markers between documents.
    outfile = open(os.path.join(path200,
                                str(i) + '.txt'),
                   'w',
                   encoding='utf8')
    outfile.write("\n\n++++\n\n".join(text200))
    outfile.close()
    # Tokenize, tag and parse the chunk into <i>.conll via UDPipe.
    os.system(
        '/home/tsha/udpipe/src/udpipe --tokenize /home/tsha/models/syntagrus-default.udpipe  --tokenizer="normalized_spaces" --tag --parse '
        + os.path.join(path200,
                       str(i) + '.txt') + ' --outfile=' +
        os.path.join(path200,
                     str(i) + '.conll'))
    outfile = open(os.path.join(path200,