Example #1
        #                                  .replace('ё', 'ё') \
        #                                  .strip()
        line = utils.norm_text2(re1.sub('', line))
        if line:
            lines.append(' '.join(line.split()))
    if len(lines) >= _utils.MIN_TEXT_LINES:
        texts_total += 1
        if link_no > start_link_idx:
            with open(page_fn, 'wt', encoding='utf-8') as f:
                print(link, file=f)
                f.write(page)
        with open(text_fn, 'wt', encoding='utf-8') as f:
            print(link, file=f)
            print(header, file=f)
            f.write('\n'.join(lines))
        print('\r{} (of {})'.format(texts_total,
                                    min(utils.TEXTS_FOR_SOURCE, num_links)),
              end='')
        need_enter = True
    #exit()
if need_enter:
    print()
'''===========================================================================
Chunks creation
==========================================================================='''
_utils.make_chunks(num_links)
'''===========================================================================
Tokenization
==========================================================================='''
utils.tokenize(num_links, isdialog=False)
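
The commented-out chain above hints that the source pages contain decomposed Cyrillic letters (е plus a combining diaeresis instead of 'ё') and stray zero-width characters, which utils.norm_text2 presumably folds away. A minimal self-contained sketch of that kind of normalization (the exact behavior of norm_text2 is an assumption):

import unicodedata

def normalize_line(line):
    # Compose decomposed Cyrillic (e.g. 'и' + U+0306 -> 'й', 'е' + U+0308 -> 'ё')
    # and drop zero-width/BOM characters left over from the HTML.
    line = unicodedata.normalize('NFC', line)
    line = line.replace('\u200b', '').replace('\ufeff', '')
    # Collapse runs of whitespace, mirroring the ' '.join(line.split()) idiom above.
    return ' '.join(line.split())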
Example #2
                text = None
            break
        if not res:
            if not SILENT:
                if not text:
                    print('no text')
                    #if nop:
                    #    exit()
                else:
                    print('text beyond limits:')
                    print(text)
            continue
        texts_total += 1
        with open(text_fn, 'wt', encoding='utf-8') as f:
            print(link, file=f)
            f.write(text)
        print('\r{} (of {})'.format(texts_total, utils.TEXTS_FOR_SOURCE),
              end='')
        need_enter = True
        #exit()
    if need_enter:
        print()
'''===========================================================================
Chunks creation
==========================================================================='''
_utils.make_chunks(utils.TEXTS_FOR_SOURCE)
'''===========================================================================
Tokenization
==========================================================================='''
utils.tokenize(utils.TEXTS_FOR_SOURCE, isdialog=False)
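
The running counter is printed with a carriage return and no trailing newline so it overwrites itself on the same console line; need_enter remembers that a final newline is still owed. A minimal sketch of that idiom:

import time

total = 0
for _ in range(5):
    time.sleep(0.1)   # stand-in for downloading and saving one text
    total += 1
    # '\r' moves the cursor back to the start of the line and end='' suppresses
    # the newline, so each iteration overwrites the previous counter in place.
    print('\r{} (of {})'.format(total, 5), end='', flush=True)
print()               # the final newline that the need_enter flag guards above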
Example #3
            #text = unescape(text).replace('\u200b', '') \
            #                     .replace('\ufeff', '') \
            #                     .replace('й', 'й').replace('ё', 'ё') \
            #                     .replace('\n\n', '\n').strip()
            text = utils.norm_text2(text).replace('\n\n', '\n')
        if text:
            texts_total += 1
            with open(page_fn, 'wt', encoding='utf-8') as f:
                print(link, file=f)
                f.write(page)
            with open(text_fn, 'wt', encoding='utf-8') as f:
                print(link, file=f)
                f.write(text)
            print('\r{} (of {})'.format(
                texts_total, min(utils.TEXTS_FOR_SOURCE, num_page_links)),
                  end='')
            need_enter = True
        #exit()
    if driver:
        driver.quit()
    if need_enter:
        print()
'''===========================================================================
Chunks creation
==========================================================================='''
_utils.make_chunks(num_page_links)
'''===========================================================================
Tokenization
==========================================================================='''
utils.tokenize(num_page_links, isdialog=False)
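
The driver.quit() call implies the pages here are fetched through a browser driver such as Selenium WebDriver, although its creation lies outside this fragment. A minimal sketch of that setup (headless Chrome and the URL are assumptions):

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument('--headless')          # assumption: run without a window
driver = webdriver.Chrome(options=options)
try:
    driver.get('https://example.com/page')  # placeholder URL
    page = driver.page_source               # raw HTML, as written to page_fn above
finally:
    driver.quit()                           # matches the cleanup in the snippet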
Example #4
                sent = speaker + '\t' + ' '.join(sent.split())
                lines.append(sent)
                issent = False
                if speaker:
                    prev_speaker, prev_strong = speaker, strong
                curr_speaker = None
    if key_lines >= _utils.MIN_TEXT_LINES:
        texts_total += 1
        if link_no > start_link_idx:
            with open(page_fn, 'wt', encoding='utf-8') as f:
                print(link, file=f)
                f.write(page)
        with open(text_fn, 'wt', encoding='utf-8') as f:
            print(link, file=f)
            f.write('\n'.join(lines))
        print('\r{} (of {})'.format(texts_total,
                                    min(utils.TEXTS_FOR_SOURCE, num_links)),
              end='')
        need_enter = True
    #exit()
if need_enter:
    print()
'''===========================================================================
Chunks creation
==========================================================================='''
_utils.make_chunks(num_links)  #, moderator=SPEAKER_A)
'''===========================================================================
Tokenization
==========================================================================='''
utils.tokenize(num_links)
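
Each dialog line is written as the speaker name, a tab, and the whitespace-normalized sentence, with the source link on the first line of the file. A small sketch of reading that format back (the function and anything beyond the layout the snippet shows are assumptions):

def read_dialog(path):
    # Yield (speaker, sentence) pairs from a file whose first line stores the
    # source link and whose remaining lines look like 'speaker<TAB>sentence'.
    with open(path, 'rt', encoding='utf-8') as f:
        next(f)  # skip the link line
        for line in f:
            speaker, _, sentence = line.rstrip('\n').partition('\t')
            yield speaker, sentence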
Example #5
                            print('{}\t{}'.format(author_id, author), file=f)
                        authors_ignore[author_id] = author
                        texts_total += 1
                        need_enter = True
                        break
                if texts_total > utils.TEXTS_FOR_SOURCE:
                    raise OverflowError()
    except OverflowError:
        pass
if need_enter:
    print()

if os.path.isfile(utils.get_data_path(utils.CHUNKS_DIR, MAX_FILES, 1)):
    print('WARNING: Chunks already exist. '
          'Delete them if you want to recreate them')
    exit()

page_fns = utils.get_file_list(utils.PAGES_DIR, MAX_FILES)
text_fns = utils.get_file_list(utils.TEXTS_DIR, MAX_FILES)
assert len(page_fns) == len(text_fns)
#new_order = utils.shuffle_file_list(page_fns)
utils.shuffle_file_list(text_fns, new_order=None)
'''===========================================================================
Chunks creation
==========================================================================='''
_utils.make_chunks(MAX_FILES)
'''===========================================================================
Tokenization
==========================================================================='''
utils.tokenize(MAX_FILES, isdialog=False)
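
The commented-out line suggests shuffle_file_list can return the permutation it applied so a second, parallel list can be shuffled identically, keeping page and text files aligned. A self-contained sketch of that idea (shuffle_parallel is a hypothetical stand-in, not the project's helper):

import random

def shuffle_parallel(files, new_order=None):
    # Shuffle file names; pass the returned order back in to repeat
    # exactly the same permutation on a second, parallel list.
    if new_order is None:
        new_order = list(range(len(files)))
        random.shuffle(new_order)
    return [files[i] for i in new_order], new_order

# pages, order = shuffle_parallel(page_fns)        # shuffle pages, remember the order
# texts, _ = shuffle_parallel(text_fns, order)     # keep texts aligned with pages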
Example #6
                if not text:
                    print('no text')
                    #if nop:
                    #    exit()
                else:
                    print('text beyond limits:')
                    print(text)
            continue
        texts_total += 1
        if link_no > start_link_idx:
            with open(page_fn, 'wt', encoding='utf-8') as f:
                print(link, file=f)
                f.write(page)
        with open(text_fn, 'wt', encoding='utf-8') as f:
            print(link, file=f)
            f.write(text)
        print('\r{} (of {})'.format(texts_total, utils.TEXTS_FOR_SOURCE),
              end='')
        need_enter = True
        #exit()
    if need_enter:
        print()
'''===========================================================================
Chunks creation
==========================================================================='''
_utils.make_chunks(MAX_PAGE)
'''===========================================================================
Tokenization
==========================================================================='''
utils.tokenize(MAX_PAGE, isdialog=False)
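
Pages and extracted texts go to separate files, and the link_no > start_link_idx guard appears to keep pages stored by an earlier run untouched while the text is rewritten. A hedged sketch of that save step (save_result and the skip-if-present check are assumptions, not the project's API):

import os

def save_result(page_fn, text_fn, link, page, text):
    # Assumption: a page file already on disk comes from an earlier run and is
    # kept as-is; only the extracted text is (re)written.
    if not os.path.isfile(page_fn):
        with open(page_fn, 'wt', encoding='utf-8') as f:
            print(link, file=f)
            f.write(page)
    with open(text_fn, 'wt', encoding='utf-8') as f:
        print(link, file=f)
        f.write(text)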
Example #7
                text = None
            break
        if not res:
            if not SILENT:
                if not text:
                    print('no text')
                    #if nop:
                    #    exit()
                else:
                    print('text beyond limits:')
                    print(text)
            continue
        texts_total += 1
        with open(text_fn, 'wt', encoding='utf-8') as f:
            print(link, file=f)
            f.write(text)
        print('\r{} (of {})'.format(texts_total, utils.TEXTS_FOR_SOURCE),
              end='')
        need_enter = True
        #exit()
    if need_enter:
        print()
'''===========================================================================
Chunks creation
==========================================================================='''
_utils.make_chunks(MAX_LINKS)
'''===========================================================================
Tokenization
==========================================================================='''
utils.tokenize(MAX_LINKS, isdialog=False, norm_punct=False)
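
The tokenize call here passes norm_punct=False, presumably leaving typographic punctuation as found in the source. A rough, self-contained sketch of what such a switch could control (the mapping below is an assumption, not the project's implementation):

import re

def simple_tokenize(text, norm_punct=True):
    # Assumption: punctuation normalization maps typographic characters
    # («», —, …) to plain ASCII equivalents before splitting into tokens.
    if norm_punct:
        text = (text.replace('«', '"').replace('»', '"')
                    .replace('—', '-').replace('…', '...'))
    return re.findall(r'\w+|[^\w\s]', text)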
Example #8
                lines.append(line)
    if key_lines >= MIN_TEXT_LINES:
        texts_total += 1
        if link_no > start_link_idx:
            with open(page_fn, 'wt', encoding='utf-8') as f:
                print(link, file=f)
                f.write(page)
        if lines[-1][0] == '\t':
            lines = lines[:-1]
        with open(text_fn, 'wt', encoding='utf-8') as f:
            print(link, file=f)
            f.write('\n'.join(lines))
        print('\r{} (of {})'.format(texts_total,
                                    min(utils.TEXTS_FOR_SOURCE, num_links)),
              end='')
        need_enter = True
    #exit()
if need_enter:
    print()
'''===========================================================================
Chunks creation
==========================================================================='''
_utils.make_chunks(num_links,
                   trim_ending=False,
                   moderator=SPEAKER_A,
                   min_chunk_lines=MIN_CHUNK_LINES)
'''===========================================================================
Tokenization
==========================================================================='''
utils.tokenize(num_links)
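
The snippet drops a final line whose speaker field is empty (such lines start with the tab separator, since they are written as speaker + '\t' + sentence) so the saved dialog does not end on an unattributed reply. A tiny sketch of that check with made-up sample lines:

lines = ['A\tHello.', 'B\tHi there.', '\tA trailing reply with no speaker.']
# A dangling last line with an empty speaker field is removed before saving.
if lines and lines[-1].startswith('\t'):
    lines = lines[:-1]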