# Create_epub_from_novelfull.com.py
# (original listing: 103 lines, 85 loc, 3.58 KB)
from bs4 import BeautifulSoup
import requests
import os
from ebooklib import epub
import re
import pathlib
# Inclusive range of chapter numbers to download and pack into the EPUB.
# Edit these two values to choose which chapters are parsed.
start_chapter = 1
end_chapter = 1498 # 1498 is the final chapter of "Release that Witch"
def get_html(url, timeout=30):
    """Download a page and return its body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded response body.

    Raises:
        requests.RequestException: On connection problems, a timeout, or an
            HTTP error status.
    """
    # Without a timeout requests waits indefinitely on a stalled connection.
    response = requests.get(url, timeout=timeout)
    # Fail here with a clear HTTP error instead of handing an error page to
    # the parser.
    response.raise_for_status()
    return response.text
def delete_old_files():
    """Remove chapter files left in the working folder by a previous run.

    Every file under ``files_for_epub`` (searched recursively) whose name
    starts with ``Chapter`` is deleted. A missing folder is not an error.
    """
    work_dir = os.path.join('.', 'files_for_epub')
    for dirpath, _dirnames, filenames in os.walk(work_dir):
        for filename in filenames:
            if filename.startswith('Chapter'):
                # Join with the directory os.walk reported: gluing the name
                # onto os.getcwd() produced "<cwd>.\files_for_epub\..." and
                # pointed at the wrong place for files in sub-folders.
                os.remove(os.path.join(dirpath, filename))
def get_page_data(html):
    """Extract one chapter from a page and save it as an XHTML file.

    The chapter body (``div#chapter-content``) is written to
    ``files_for_epub/Chapter <n>.xhtml``. As a side effect the module-level
    ``novel_name`` is set from the ``title`` of the ``truyen-title`` link.

    Args:
        html: Markup of a chapter page.

    Raises:
        ValueError: If the page lacks the chapter body, the novel title link
            or the chapter title link, or if the chapter title contains no
            ``Chapter <n>`` text.
    """
    global novel_name
    soup = BeautifulSoup(html, 'lxml')

    chapter = soup.find('div', id='chapter-content')
    novel_link = soup.find('a', class_='truyen-title')
    chapter_link = soup.find('a', class_='chapter-title')
    # Report an unexpected page explicitly rather than failing later with an
    # AttributeError on None (or writing the text "None" to disk).
    if chapter is None or novel_link is None or chapter_link is None:
        raise ValueError('page does not look like a chapter page')

    chapter_name = chapter_link.get('title') or ''
    match = re.search(r'Chapter \d+', chapter_name)
    if match is None:
        raise ValueError(f'no chapter number in chapter title {chapter_name!r}')

    novel_name = novel_link.get('title')

    # pathlib uses the separator of the running OS; a hard-coded backslash
    # path only works on Windows.
    target_dir = pathlib.Path('files_for_epub')
    target_dir.mkdir(parents=True, exist_ok=True)
    target = target_dir / f'{match.group(0)}.xhtml'
    target.write_text(str(chapter), encoding='utf-8')
def _strip_ads(text):
    """Return *text* without script/ins elements and the site's error notice."""
    # re.DOTALL lets "." cross line breaks, so elements spread over several
    # lines are removed too. re.MULTILINE only changes "^" and "$", which is
    # why a separate pattern for blank-line-padded scripts used to be needed.
    text = re.sub(r'<script.+?</script>', '', text, flags=re.DOTALL)
    text = re.sub(r'<ins.+?</ins>', '', text, flags=re.DOTALL)
    return re.sub(r'If you find any errors.+?as possible.', '', text)


def edit_files():
    """Strip ad blocks and the closing site notice from every saved file.

    Each file under ``files_for_epub`` (searched recursively) is read,
    cleaned and rewritten in place as UTF-8. A missing folder is not an
    error.
    """
    work_dir = os.path.join('.', 'files_for_epub')
    for dirpath, _dirnames, filenames in os.walk(work_dir):
        for filename in filenames:
            # Use the directory os.walk reported instead of concatenating
            # onto os.getcwd(), which produced "<cwd>.\files_for_epub\...".
            path = os.path.join(dirpath, filename)
            with open(path, encoding='utf-8') as f:
                text = f.read()
            with open(path, 'w', encoding='utf-8') as f:
                f.write(_strip_ads(text))
def _chapter_sort_key(path):
    """Order chapter files by the first number in their name, then by name."""
    match = re.search(r'\d+', path.stem)
    # Files without a number sort after all numbered chapters.
    number = int(match.group()) if match else float('inf')
    return number, path.name


def create_epub():
    """Bundle the saved chapter files into one EPUB in the current directory.

    Reads the module-level ``novel_name``, ``start_chapter`` and
    ``end_chapter`` for the book identifier, title and output file name.
    Every ``*.xhtml`` file in ``files_for_epub`` becomes one chapter, added
    to the table of contents and the spine in ascending chapter-number order.
    """
    book = epub.EpubBook()
    book.set_identifier(f'{start_chapter}-{end_chapter}')
    book.set_title(f'{novel_name}. Chapters {start_chapter}-{end_chapter}')
    book.set_language('en')
    book.spine = ['nav']

    # Directory listings are lexicographic at best ("Chapter 10" before
    # "Chapter 2"); sort numerically so the book reads in the right order.
    chapter_files = sorted(
        pathlib.Path('files_for_epub').glob('*.xhtml'),
        key=_chapter_sort_key,
    )
    for chapter_file in chapter_files:
        chapter = epub.EpubHtml(
            title=chapter_file.stem,
            file_name=chapter_file.name,
            lang='en',
        )
        chapter.content = chapter_file.read_text(encoding='utf-8')
        book.add_item(chapter)
        book.toc.append(chapter)
        book.spine.append(chapter)

    book.add_item(epub.EpubNcx())
    book.add_item(epub.EpubNav())
    epub.write_epub(f'{novel_name}. Chapters {start_chapter}-{end_chapter}.epub', book, {})
def delete_temporary_folder():
    """Delete the ``files_for_epub`` working folder and everything inside it.

    Raises:
        FileNotFoundError: If the folder does not exist.
    """
    # Imported locally because the module's top-level imports do not
    # include shutil.
    import shutil

    # rmtree also removes non-empty sub-folders, which Path.rmdir() refuses.
    shutil.rmtree(pathlib.Path('files_for_epub'))
def main():
    """Download the configured chapters, build the EPUB and clean up.

    Chapters ``start_chapter`` through ``end_chapter`` (inclusive) are
    fetched one by one, saved into ``files_for_epub``, cleaned of ad markup,
    packed into an EPUB, and the working folder is removed afterwards.
    """
    # Chapter pages look like:
    # http://novelfull.com/release-that-witch/chapter-1200.html
    base_url = "http://novelfull.com/release-that-witch/chapter-"
    ending = ".html"

    # A pathlib path uses the separator of the running OS; the literal
    # '.\\files_for_epub' creates a folder with that exact name on POSIX.
    pathlib.Path('files_for_epub').mkdir(parents=True, exist_ok=True)
    delete_old_files()

    for chapter_number in range(start_chapter, end_chapter + 1):
        page_url = f'{base_url}{chapter_number}{ending}'
        get_page_data(get_html(page_url))

    edit_files()
    create_epub()
    delete_temporary_folder()


if __name__ == '__main__':
    main()