示例#1
0
 def assemble_text(doc, ordered=None):
     """Recursively flatten a parsed document tree into plain text.

     :param doc: either a list of child nodes, or a dict node with key
         'e' (element type) plus type-specific keys: 'c' (children),
         't' (text), 'u' (url), 'o' (ordered-list flag).
     :param ordered: when assembling a list's children, truthy means
         items get "1." / "2." ... prefixes instead of "- ".
     :return: the assembled text, or a ``(text, True)`` tuple when the
         node was an 'li' item, so the enclosing list can prefix it.
     """
     res, isli = None, None
     if isinstance(doc, list):
         # Assemble each child; children returned as (text, True)
         # tuples are list items and get a bullet/number prefix here.
         lines, li_no = [], 1
         for line in doc:
             line = assemble_text(line)
             if line:
                 if isinstance(line, tuple):
                     line = line[0]
                     if line:
                         if ordered:
                             prefix = str(li_no) + '.'
                             li_no += 1
                         else:
                             prefix = '-'
                         line = prefix + ' ' + line
                 if line:
                     lines.append(line)
         # Join the fragments, inserting a single space only between two
         # adjacent fragments that are not already separated by '\n'.
         res, need_space = '', False
         for line in lines:
             need_space_ = line[-1] != '\n'
             res += ' ' * (need_space and need_space_) + line
             need_space = need_space_
     else:
         e = doc['e']
         if e in ['blockquote', 'code', 'spoilertext']:
             # These element types are dropped entirely.
             res = ''
         elif e in ['br', 'hr', 'table']:
             res = '\n'
         elif e == 'li':
             # Flag list items via isli so the parent 'list' node can
             # add the proper prefix (see the list branch above).
             res = utils.norm_text2(assemble_text(doc['c'])) + '\n'
             isli = True
         elif e == 'link':
             # Append the URL in parentheses unless the link text
             # already contains it.
             res = utils.norm_text2(doc['t'])
             link = doc['u']
             if res.find(link) < 0:
                 res += ' (' + link + ')'
         elif e == 'list':
             res = assemble_text(doc['c'], ordered=doc['o'])
         elif e in ['par', 'h']:
             res = utils.norm_text2(assemble_text(doc['c'])) + '\n'
         elif e in ['text', 'raw'] \
         or (len(e) == 2 and e[1] == '/' and e[0] >= 'a' and e[0] <= 'z'):
             # 'text'/'raw' nodes, plus two-character types of the form
             # "<lowercase letter>/" — treated as plain text.
             res = utils.norm_text2(doc['t'])
         else:
             # Unknown node type: dump the node for debugging, then
             # abort (NOTE(review): assert is stripped under -O).
             from pprint import pprint
             with open('1111', 'wt', encoding='utf-8') as f:
                 pprint(doc, stream=f)
             assert 0, 'ERROR: Unknown type "{}"'.format(e)
     if res:
         # Final cleanup via module-level regexes re2/re3 (defined
         # elsewhere in this file): re3 matches become '\n', then re2
         # matches become ' '.
         res = re2.sub(' ', re3.sub('\n', res))
     return (res, isli) if isli is not None else res
示例#2
0
 def norm_(text):
     """Normalize HTML-ish text: existing newlines become spaces,
     ``<br>`` tags become real line breaks, ``re6`` matches are
     removed, and the result is returned as non-blank,
     whitespace-collapsed lines joined with '\\n'."""
     # Flatten original newlines first, then let the explicit <br>
     # variants define the real line structure.
     flat = text.replace('\n', ' ')
     for br_tag in ('<br>', '<br/>', '<br />'):
         flat = flat.replace(br_tag, '\n')
     flat = re6.sub('', flat)
     cleaned = []
     for raw_line in utils.norm_text2(flat).split('\n'):
         collapsed = ' '.join(raw_line.split()).strip()
         if collapsed:
             cleaned.append(collapsed)
     return '\n'.join(cleaned)
示例#3
0
def parse_page(page):
    """Strip the HTML from *page* and return its plain text: one
    trimmed, whitespace-collapsed line per ``<br>``/newline, empty
    lines dropped."""
    # <br> marks a line break; every other tag is simply removed.
    markup_free = re.sub(r'<[^>]*>', '', re.sub(r'<br>', '\n', page))
    kept = []
    for raw in markup_free.split('\n'):
        normalized = utils.norm_text2(raw)
        if normalized:
            kept.append(re.sub(r'\s+', ' ', normalized))
    return '\n'.join(kept)
示例#4
0
             f.write(page)
     if page.find('<title>Пожалуйста, войдите под своим именем пользователя') > 0 \
     or page.find('<h1 class="not-found-title">') > 0:
         continue
 else:
     if not os.path.isfile(page_fn):
         continue
     if os.path.isfile(text_fn):
         texts_total += 1
         continue
     with open(page_fn, 'rt', encoding='utf-8') as f:
         link = f.readline().rstrip()
         page = f.read()
 match = re10a.search(page)
 assert match, "ERROR: Can't find header1 on page {}".format(link)
 header = utils.norm_text2(match.group(1))
 match = re10.search(page)
 assert match, "ERROR: Can't find header2 on page {}".format(link)
 header += '\n' + utils.norm_text2(match.group(1))
 match = re11.search(page)
 assert match, "ERROR: Can't find review on page {}".format(link)
 text = match.group(1)
 text = text.replace('\n', '') \
            .replace('<em>', '').replace('</em>', '') \
            .replace('<strong>', '').replace('</strong>', '')
 text = re12a.sub(' ', text)
 if DUMP:
     with open('12a.html', 'wt', encoding='utf-8') as f:
         f.write(text)
 text = re12b.sub('</p>', text).replace('<p></p>', '') \
                               .replace('<ul>', '') \
示例#5
0
                     if len(page_fns) > 0 else \
                 0
# Number of texts successfully extracted so far in this run.
texts_total = 0

# re0a / re0 capture the article summary and body <div> contents;
# re1 captures <p>...</p> contents; re2 matches any single HTML tag
# (used for stripping).
re0a = re.compile(r'<div class="article__summary article__summary_article-page'
                  r'[^">]*">(.+?)</div>')
re0 = re.compile(r'<div class="article__text article__text_article-page'
                 r'[^">]*">(.+?)</div>')
re1 = re.compile(r'<p>((?:.|\n)*?)</p>')
re2 = re.compile(r'<.*?>')
# Whether a trailing newline is still owed to the progress output.
need_enter = False
for link_no, link in enumerate(links, start=1):
    link, header = link.split('\t')
    #header = unescape(header).replace('\u200b', '') \
    #                         .replace('\ufeff', '').strip()
    header = utils.norm_text2(header)
    if texts_total >= utils.TEXTS_FOR_SOURCE:
        break
    #link = 'https://www.interfax.ru/interview/374150'
    page_fn = utils.get_data_path(utils.PAGES_DIR, num_links, link_no)
    text_fn = utils.get_data_path(utils.TEXTS_DIR, num_links, link_no)
    page = None
    if link_no > start_link_idx:
        time.sleep(1)
        res = utils.get_url(link)
        page = res.text
    else:
        if not os.path.isfile(page_fn):
            continue
        if os.path.isfile(text_fn):
            texts_total += 1
示例#6
0
 if pos > 0:
     res = res[:pos]
     res = res.replace('\n', ' ')
     res = re1.sub('{img}', res)
     res = re2.sub('', res)
     res = re3.sub(
         lambda x: re3a.sub(' ', x.group(1).upper()) + ':',
         res
     )
     res = res.replace('\r', '') \
              .replace('<br>', '\n').replace('</p>', '\n')
     res = re4.sub(' ', '<' + res)
     #txt = unescape(res).replace('\u200b', '') \
     #                   .replace('\ufeff', '') \
     #                   .replace('й', 'й').replace('ё', 'ё')
     txt = utils.norm_text2(res)
     lines = []
     maybe_caption = False
     for line in [x.strip() for x in txt.split('\n')]:
         if '{img}' in line:
             maybe_caption = True
             continue
         if line and (not line.isupper() or '.' in line) \
        and not (len(line) >= 2
             and ((line[0] == '(' and line[-1] == ')')
               or (line[0] == '[' and line[-1] == ']')
               or (line[0] == '«' and line[-1] == '»'))) \
        and (not maybe_caption or not line[-1].isalnum()):
             lines.append(line.split())
         maybe_caption = False
     lines = normalize_text(lines)
示例#7
0
         continue
     if os.path.isfile(text_fn):
         texts_total += 1
         continue
     with open(page_fn, 'rt', encoding='utf-8') as f:
         link = f.readline().rstrip()
         page = f.read()
 res = re0.findall(page)
 lines, key_lines = [], 0
 issent = False
 prev_speaker, prev_strong, curr_speaker = None, None, None
 for line in res:
     #line = unescape(line).replace('\u200b', '').replace('\ufeff', '') \
     #                     .replace('й', 'й').replace('ё', 'ё') \
     #                     .replace('</strong><strong>', '')
     line = utils.norm_text2(line).replace('</strong><strong>', '')
     line = re1.sub(r'{\g<1>}', line)
     line = re2.sub('', line)
     line = re2a.sub(' ', line).strip()
     sents = [
         x.strip() for x in line.split('{strong')
         for x in x.split('/strong}')
     ]
     for sent in sents:
         if sent.startswith('}') and sent.endswith('{'):
             sent = sent[1:-1].strip()
             speaker, strong = SPEAKER_A, True
         else:
             speaker, strong = SPEAKER_B, False
         if curr_speaker:
             speaker = curr_speaker
示例#8
0
                 print(res, file=f)
                 print('===', file=f)
                 print(lines, file=f)
             #exit()
         break
 res = res[pos:]
 pos = res.find('>')
 attr = res[:pos]
 res = res[pos + 1:]
 pos = res.find(end_token)
 lines_ = res[:pos] if pos >= 0 else res
 if 'right' in attr or 'center' in attr:
     continue
 lines_ = [x for x in lines_.split('<br />') for x in x.split('<br>')]
 for line_no, line in enumerate(lines_):
     if not utils.norm_text2(line):
         continue
     #print(line)
     if line.startswith('<') and line.endswith('>'):
         if not lines:
             isbold = True
         elif not isbold:
             lines = []
             #print('== delete ==')
             continue
     elif isbold:
         lines = []
         #print('== delete ==')
         isbold = False
     line = line.replace('<strong>', '').replace('</strong>', '') \
                .replace('<em>', '').replace('</em>', '') \
示例#9
0
 # Everything before the current position is the book's URL; the rest
 # of the markup is scanned token by token below, advancing `book`
 # past each match and asserting that every expected token is found.
 book_url = book[:pos]
 book = book[pos:]
 # Author link: an anchor whose href starts with /authors/.
 token = "<a class='uline' href='/authors/"
 pos = book.find(token)
 assert pos >= 0, \
     'ERROR: Not found: {}\n{}\n{}'.format(url, token, book)
 book = book[pos + len(token):]
 pos = book.find("'>")
 assert pos >= 0, \
     'ERROR: Not found: {}\n{}\n{}'.format(url, token, book)
 author_url = '/authors/' + book[:pos]
 # The author's display name is the text up to the next tag.
 book = book[pos + 2:]
 pos = book.find('<')
 assert pos >= 0, \
     'ERROR: Not found: {}\n{}\n{}'.format(url, token, book)
 author_name = utils.norm_text2(book[:pos]).strip()
 book = book[pos:]
 # The genre is the text content of the first <div class="desc2">.
 token = '<div class="desc2">'
 pos = book.find(token)
 assert pos >= 0, \
     'ERROR: Not found: {}\n{}\n{}'.format(url, token, book)
 book = book[pos + len(token):]
 pos = book.find('<')
 assert pos >= 0, \
     'ERROR: Not found: {}\n{}\n{}'.format(url, token, book)
 genre = book[:pos]
 book = book[pos:]
 #pos = genre.find(',')
 #if pos > 0:
 #    genre = genre[:pos]
 # Next marker: "Язык оригинала: " ("Original language: ").
 token = "<div class='desc2'>Язык оригинала: "
示例#10
0
page_fns = utils.get_file_list(utils.PAGES_DIR, num_links)
# Resume from the highest-numbered page file already saved on disk;
# start from 0 when no pages have been downloaded yet.
start_link_idx = int(os.path.split(sorted(page_fns)[-1])[-1]
                         .replace(utils.DATA_EXT, '')) \
                     if len(page_fns) > 0 else \
                 0
# Number of texts successfully extracted so far in this run.
texts_total = 0

# re0 captures the article body <div>; re1 captures <p>...</p>
# contents; re2 matches any tag or any parenthesized span.
re0 = re.compile(r'<div class="article__text">((?:.|\n)*?)</div>')
re1 = re.compile(r'<p>((?:.|\n)*?)</p>')
re2 = re.compile(r'<.*?>|\(.*?\)')
# Whether a trailing newline is still owed to the progress output.
need_enter = False
for link_no, link in enumerate(links, start=1):
    link, header = link.split('\t')
    #header = unescape(header).replace('\u200b', '').replace('\ufeff', '') \
    #                         .replace('й', 'й').replace('ё', 'ё').strip()
    header = utils.norm_text2(header)
    if texts_total >= utils.TEXTS_FOR_SOURCE:
        break
    #link = 'https://www.interfax.ru/interview/374150'
    page_fn = utils.get_data_path(utils.PAGES_DIR, num_links, link_no)
    text_fn = utils.get_data_path(utils.TEXTS_DIR, num_links, link_no)
    page = None
    if link_no > start_link_idx:
        res = utils.get_url(link)
        page = res.text
    else:
        if not os.path.isfile(page_fn):
            continue
        if os.path.isfile(text_fn):
            texts_total += 1
            continue
示例#11
0
 res = res[:pos].strip()
 assert res.endswith(';'), \
        'ERROR: No state end on page {}'.format(link)
 res = res[:-1]
 if DUMP:
     with open('1111.json', 'wt', encoding='utf-8') as f:
         f.write(res)
 state = json.loads(res)
 if DUMP:
     from pprint import pprint
     with open('1111.json', 'wt', encoding='utf-8') as f:
         pprint(state, stream=f)
 products = state.get('entities', {}).get('products')
 assert products, 'ERROR: No products in state on page {}'.format(link)
 product = products[0]
 header = utils.norm_text2(product['name'])
 text = utils.norm_text2(product['description'])
 if DUMP:
     with open('1111.txt', 'at', encoding='utf-8') as f:
         f.write(text)
 lines = [header
          ] + [x for x in (x.strip() for x in text.split('\n')) if x]
 res, text = False, None
 while len(lines) >= _utils.MIN_TEXT_LINES:
     text = '\n'.join(lines)
     text0 = re0.sub('', text)
     text1 = re1.sub('', text0)
     if any(x in 'ЀЂЃЄЅІЇЈЉЊЋЌЍЎЏѐђѓєѕіїјљњћќѝўџѠѡѢѣѤѥѦѧѨѩѪѫѬѭѮѯѰѱѲѳѴѵ'
            'ѶѷѸѹѺѻѼѽѾѿҀҁ҂҃҄҅҆҇҈҉ҊҋҌҍҎҏҐґҒғҔҕҖҗҘҙҚқҜҝҞҟҠҡҢңҤҥҦҧҨҩ'
            'ҪҫҬҭҮүҰұҲҳҴҵҶҷҸҹҺһҼҽҾҿӀӁӂӃӄӅӆӇӈӉӊӋӌӍӎӏӐӑӒӓӔӕӖӗӘәӚӛӜӝ'
            'ӞӟӠӡӢӣӤӥӦӧӨөӪӫӬӭӮӯӰӱӲӳӴӵӶӷӸӹӺӻӼӽӾӿ' for x in text0):
示例#12
0
         continue
     with open(page_fn, 'rt', encoding='utf-8') as f:
         link, p_link = f.readline().rstrip().split()
         p_link = p_link[1:-1]
         page = f.read()
 if page:
     # Convert <br> tags to newlines, strip every remaining tag, then
     # keep only normalized, whitespace-collapsed non-empty lines.
     text = re.sub(r'<br>', '\n', page)
     text = re.sub(r'<[^>]*>', '', text)
     text0 = []
     for line in text.split('\n'):
         #line = unescape(line).replace('\u00a0', ' ') \
         #                     .replace('\u200b', '') \
         #                     .replace('\ufeff', '') \
         #                     .replace('й', 'й').replace('ё', 'ё') \
         #                     .strip()
         line = utils.norm_text2(line)
         if line:
             text0.append(re.sub(r'\s+', ' ', line))
     text = '\n'.join(text0)
 if text:
     # Persist both the raw page and the extracted text, each prefixed
     # with its source link, then report running progress in place.
     texts_total += 1
     with open(page_fn, 'wt', encoding='utf-8') as f:
         print('{} ({})'.format(link, p_link), file=f)
         f.write(page)
     with open(text_fn, 'wt', encoding='utf-8') as f:
         print('{} ({})'.format(link, p_link), file=f)
         f.write(text)
     print('\r{} (of {})'.format(
         texts_total, min(utils.TEXTS_FOR_SOURCE, num_page_links)),
           end='')
     need_enter = True
示例#13
0
         page = f.read()
     check_ignore = False
 if DUMP:
     with open('1111.html', 'wt', encoding='utf-8') as f:
         f.write(page)
 if page.find('class="item-closed-warning"') > 0 \
 or page.find('<title>Ошибка 404') > 0:
     continue
 token = '<span class="title-info-title-text" itemprop="name">'
 pos = page.find(token)
 if pos < 0:
     print('WARNING: No author on page {} (a list?)"'.format(link))
     continue
 res = page[pos + len(token):]
 pos = res.find('<')
 header = utils.norm_text2(res[:pos])
 token = '<div class="item-description-text" itemprop="description">'
 pos = res.find(token)
 if pos < 0:
     token = '<div class="item-description-html" itemprop="description">'
     pos = res.find(token)
 assert pos > 0, "ERROR: Can't find text on page {}".format(link)
 res = res[pos + len(token):]
 pos = res.find('</div>')
 text = res[:pos]
 res = res[pos:]
 if DUMP:
     with open('1111.txt', 'wt', encoding='utf-8') as f:
         f.write(text)
 text = '\n'.join([
     x for i, x in enumerate(x for x in text.split('<p>')
示例#14
0
start_link_idx = int(os.path.split(sorted(page_fns)[-1])[-1]
                         .replace(utils.DATA_EXT, '')) \
                     if len(page_fns) > 0 else \
                 0
texts_total = 0

re2 = re.compile(r'<div itemprop="articleBody" class="article-text-body">'
                 r'((?:.|\n)+?)</div>')
re0 = re.compile(r'<p>((?:.|\n)*?)</p>')
re1 = re.compile(r'<.*?>')
need_enter = False
for link_no, link in enumerate(links, start=1):
    link, header = link.split('\t')
    #header = unescape(header).replace('\u200b', '').replace('\ufeff', '') \
    #                         .replace('й', 'й').replace('ё', 'ё').strip()
    header = utils.norm_text2(header)
    if texts_total >= utils.TEXTS_FOR_SOURCE:
        break
    #link = 'https://www.interfax.ru/interview/374150'
    page_fn = utils.get_data_path(utils.PAGES_DIR, num_links, link_no)
    text_fn = utils.get_data_path(utils.TEXTS_DIR, num_links, link_no)
    page = None
    if link_no > start_link_idx:
        res = utils.get_url(link)
        page = res.text
    else:
        if not os.path.isfile(page_fn):
            continue
        if os.path.isfile(text_fn):
            texts_total += 1
            continue
示例#15
0
                 r'\n\g<1>\n', text0)
             text0 = re.sub(
                 r'<div class="\w+ cxmmr5t8 oygrvhab hcukyx3x c1et5uql ii04i59q">([^>]*)</div>',
                 r'\n\g<1>\n', text0)
             text0 = re.sub(r'<div[^>]*>([^>]*)</div>', r'\g<1>', text0)
         text0 = []
         for line in text.split('\n'):
             line = line.strip()
             if line:
                 text0.append(re.sub(r'\s+', ' ', line))
         text = '\n'.join(text0)
         #text = unescape(text).replace('\u200b', '') \
         #                     .replace('\ufeff', '') \
         #                     .replace('й', 'й').replace('ё', 'ё') \
         #                     .replace('\n\n', '\n').strip()
         text = utils.norm_text2(text).replace('\n\n', '\n')
     if text:
         texts_total += 1
         with open(page_fn, 'wt', encoding='utf-8') as f:
             print(link, file=f)
             f.write(page)
         with open(text_fn, 'wt', encoding='utf-8') as f:
             print(link, file=f)
             f.write(text)
         print('\r{} (of {})'.format(
             texts_total, min(utils.TEXTS_FOR_SOURCE, num_page_links)),
               end='')
         need_enter = True
     #exit()
 if driver:
     driver.quit()
示例#16
0
==========================================================================='''
page_fns = utils.get_file_list(utils.PAGES_DIR, num_links)
# Resume from the highest-numbered page file already saved on disk;
# start from 0 when no pages have been downloaded yet.
start_link_idx = int(os.path.split(sorted(page_fns)[-1])[-1]
                         .replace(utils.DATA_EXT, '')) \
                     if len(page_fns) > 0 else \
                 0
# Number of texts successfully extracted so far in this run.
texts_total = 0

# re0 captures <p>...</p> contents (newlines included); re1 matches
# any single HTML tag (used for stripping).
re0 = re.compile(r'<p>((?:.|\n)*?)</p>')
re1 = re.compile(r'<.*?>')
# Whether a trailing newline is still owed to the progress output.
need_enter = False
for link_no, link in enumerate(links, start=1):
    link, header = link.split('\t')
    #header = unescape(header).replace('\u200b', '') \
    #                         .replace('\ufeff', '').strip()
    header = utils.norm_text2(header)
    if texts_total >= utils.TEXTS_FOR_SOURCE:
        break
    #link = 'https://www.interfax.ru/interview/374150'
    page_fn = utils.get_data_path(utils.PAGES_DIR, num_links, link_no)
    text_fn = utils.get_data_path(utils.TEXTS_DIR, num_links, link_no)
    page = None
    if link_no > start_link_idx:
        res = utils.get_url(link)
        page = res.text
    else:
        if not os.path.isfile(page_fn):
            continue
        if os.path.isfile(text_fn):
            texts_total += 1
            continue
示例#17
0
     wikipedia_utils.Wikipedia().articles()):
 file_no = article_nos.get(article_no)
 if file_no or file_nos:
     if file_no:
         file_nos.append(file_no)
     id_, title, page = article
     if page:
         lines = page.split('\n')
         text_lines = []
         for line in lines:
             if line and (line[-1] != '.' or line == 'См. также:'):
                 break
             text_lines.append(line)
         res = False
         while True:
             text = utils.norm_text2('\n'.join(text_lines).strip())
             text0 = re0.sub('', text)
             text1 = re1.sub('', text0)
             if text0 and len(text1) / len(text0) >= .9:
                 num_words = len(
                     [x for x in text.split() if re5.sub('', x)])
                 if num_words > MAX_CHUNK_WORDS:
                     text_lines = text_lines[:-1]
                     continue
                 if num_words >= MIN_CHUNK_WORDS:
                     res = True
             break
         if res:
             if file_no:
                 file_nos.pop()
             else: