Example No. 1
import down  # project helper module: download() fetches a page, del_html() cleans it
from bs4 import BeautifulSoup


def haripuisi_html(url):
    # Fetch the page, strip unwanted markup, and keep a local copy.
    page = down.download(url)
    page = down.del_html(page)
    with open('page.html', 'w', encoding='utf-8') as f:
        f.write(page)
    # Parse the cleaned page and return the BeautifulSoup tree.
    soup = BeautifulSoup(page, 'lxml')
    return soup
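All of these examples lean on the same project-local helper module, down, whose source is not shown. The sketch below is only a guess at its contract, inferred from how the examples call it: download(url) returns the page text or None on failure (Examples No. 2 and 4 test for None), and del_html(page) returns a cleaned string that BeautifulSoup can still parse.

# down.py - hypothetical sketch of the helper module the examples assume
import requests

def download(url):
    # Fetch a page; the examples expect None on any failure.
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        return response.text
    except requests.RequestException:
        return None

def del_html(page):
    # The examples still parse the result with BeautifulSoup, so this cannot
    # strip all tags; it presumably removes scripts and boilerplate. As a
    # placeholder, return the page unchanged.
    return page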
Example No. 2
import down  # project helper module (a sketch follows Example No. 1)


def take_word_page(word):
    # Look the word up in the online KBBI dictionary.
    html = down.download('https://kbbi.web.id/' + word.lower())
    if html is None:
        print("Sorry, the word wasn't found! How sad :(")
        return None
    # Drop the syllable-separator dots and clean the markup only once.
    cleaned = down.del_html(html.replace('·', ''))
    with open('html_changing.txt', 'w', encoding='utf-8') as f:
        f.write(cleaned)
    return cleaned
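A minimal usage sketch; 'rumah' is just an illustrative query word, not from the original code:

entry = take_word_page('rumah')  # illustrative Indonesian word
if entry is not None:
    print(entry[:200])  # preview the cleaned dictionary entry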
Example No. 3
import down  # project helper module (a sketch follows Example No. 1)


def abdul_hadi_html():
    # Fetch the page with Abdul Hadi W.M.'s poems and cache the cleaned copy.
    page = down.download(
        'http://www.jendelasastra.com/dapur-sastra/dapur-jendela-sastra/lain-lain/puisi-puisi-abdul-hadi-wm'
    )
    page = down.del_html(page)
    with open('abdul_h.html', 'w', encoding='utf-8') as f:
        f.write(page)
    return page
Example No. 4
import down  # project helper module (a sketch follows Example No. 1)


def take_word_page(word):
    # Same KBBI lookup as Example No. 2, with a different error message.
    html = down.download('https://kbbi.web.id/' + word.lower())
    if html is None:
        print('Sorry, Nastyusha, but your internet is down again :(')
        return None
    cleaned = down.del_html(html.replace('·', ''))
    with open('html_changing.txt', 'w', encoding='utf-8') as f:
        f.write(cleaned)
    return cleaned
Example No. 5
import os
import re

import down  # project helper module (a sketch follows Example No. 1)
from bs4 import BeautifulSoup


def haripuisi_insert_url(url):
    # Re-download the poem page to recover its title-based filename.
    page_poem = down.del_html(down.download(url))
    soup = BeautifulSoup(page_poem, 'lxml')
    # Drop characters that are illegal in Windows filenames.
    filename = re.sub(r'[~#&%*{}:?"|+/<>]', '', str(soup.title.string)).strip() + '.txt'
    # Append the source URL to that poem's text file.
    with open(os.path.join('haripuisi', filename), 'a', encoding='utf-8') as f:
        f.write('\n' + url)
    print(filename + ' - done')
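The character class in the re.sub call covers the filename characters Windows rejects, plus a few extras. A quick illustration with a made-up title:

import re

title = 'Puisi: "Hujan" 1/2'  # hypothetical page title
print(re.sub(r'[~#&%*{}:?"|+/<>]', '', title).strip() + '.txt')
# prints: Puisi Hujan 12.txt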
Example No. 6
import os
import re

import down  # project helper module (a sketch follows Example No. 1)
from bs4 import BeautifulSoup


def haripuisi_download_poems(url):
    # Download and clean the poem page.
    page_poem = down.del_html(down.download(url))
    soup = BeautifulSoup(page_poem, 'lxml')
    # Build the filename from the page title, dropping characters Windows forbids.
    filename = re.sub(r'[~#&%*{}:?"|+/<>]', '', str(soup.title.string)).strip() + '.txt'
    # The poem text lives in the 'entry-content' div; remove nested divs
    # and turn every remaining tag into a line break.
    get_poem = soup.find('div', {'class': 'entry-content'}).contents
    div_tag = re.compile('<div.*?>.*?</div>', re.DOTALL)
    poem = []
    for node in get_poem:
        without_divs = div_tag.sub('', str(node))
        poem.append(re.sub('<.*?>', '\n', without_divs))
    with open(os.path.join('haripuisi', filename), 'w', encoding='utf-8') as f:
        f.write('\n'.join(poem))
    print(filename + ' - done')
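The two haripuisi functions derive the same title-based filename, so they chain naturally: download a poem first, then append its source URL to the same file. The URL below is purely illustrative, and the haripuisi directory must already exist, since open() will not create it:

poem_url = 'https://haripuisi.com/some-poem/'  # illustrative, not a real permalink
haripuisi_download_poems(poem_url)  # writes haripuisi/<title>.txt
haripuisi_insert_url(poem_url)      # appends the source URL to the same file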
Example No. 7
import os
import re

import down  # project helper module (a sketch follows Example No. 1)
from bs4 import BeautifulSoup


def gusmus_crawler():
    # Walk listing pages 1-11 and save each poem to its own file.
    poems = []
    for n in range(1, 12):
        page = down.del_html(down.download('http://gusmus.net/puisi/?N={}'.format(n)))
        soup = BeautifulSoup(page, 'lxml')
        for block in soup.find_all('div', {'class': 'col-xs-12 col-sm-10 blog-content'}):
            # Turn tags into newlines, collapse blank runs, fix non-breaking spaces.
            text = re.sub('<.*?>', '\n', str(block), flags=re.DOTALL)
            text = re.sub('\n{2,}', '\n', text).replace('\xa0', ' ').strip()
            poems.append(text)
            # The first line of the block is the poem's title.
            file_name = os.path.join('Mustofa Bisri',
                                     'A. Mustofa Bisri - ' + text.split('\n')[0].capitalize() + '.txt')
            print(file_name)
            with open(file_name, 'w', encoding='utf-8') as f:
                f.write('http://gusmus.net/puisi/?N={}\n'.format(n))  # record the source page
                f.write(text)
    print(poems)
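Running the crawler takes no arguments; as with the haripuisi examples, the output directory ('Mustofa Bisri' here) must exist before the first file is written:

gusmus_crawler()  # fetches listing pages 1-11 and writes one file per poem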