# novels.py
import requests
from lxml import html
import re
import pdfkit
import string
import os
# Base URL prepended to the relative "next chapter" link found in a page.
_SITE_ROOT = "https://www.wuxiaworld.com"

# Line that immediately precedes the chapter text in the serialized page.
_CONTENT_MARKER = '<div class="fr-view">'

# Image whose line directly follows the "next chapter" anchor.
_NEXT_ARROW = '/images/arrow-right.png'

# Typographic characters swapped for plain ASCII before the page is split.
_ASCII_PUNCTUATION = str.maketrans({
    "“": '"',
    "”": '"',
    "’": "'",
    "…": "...",
    "–": "-",
})

# Applied in order: paragraph boundaries become blank lines first, then the
# remaining paragraph/bold tags are dropped.
_TAG_REPLACEMENTS = (
    ("</p><p>", "\n\n"),
    ("<p>", ""),
    ("</p>", ""),
    ("<strong>", ""),
    ("</strong>", ""),
)

_CHAPTER_STYLE = r'<style>p { font-family: Palatino, "Palatino Linotype", "Palatino LT STD", "Book Antiqua", Georgia, serif; font-size: 20px; font-style: normal; font-variant: normal; font-weight: 400; line-height: 25px; }</style>'

_PDF_OPTIONS = {
    'page-size': 'Executive',
    'margin-top': '0.75in',
    'margin-right': '0.75in',
    'margin-bottom': '0.75in',
    'margin-left': '0.75in',
    # The temporary HTML file is written as UTF-8, so say so explicitly.
    'encoding': 'UTF-8',
}

# Preferred wkhtmltopdf location; when it is absent pdfkit searches PATH.
_WKHTMLTOPDF_PATH = r'C:\Program Files\wkhtmltopdf\bin\wkhtmltopdf.exe'


def _fetch_page_lines(link):
    """Download *link* and return its serialized HTML as a list of lines."""
    response = requests.get(link, timeout=30)
    # Fail on HTTP errors instead of trying to parse an error page.
    response.raise_for_status()
    page = html.fromstring(response.text)
    markup = html.tostring(page).decode('utf-8')
    return markup.translate(_ASCII_PUNCTUATION).splitlines()


def _find_next_chapter_link(lines):
    """Return the absolute URL of the next chapter, or None if not found.

    The link is read from the line just before the first line that mentions
    the right-arrow image.
    """
    for index, line in enumerate(lines):
        if _NEXT_ARROW not in line:
            continue
        if index == 0:
            # No preceding line to read the anchor from.
            return None
        match = re.search('"(.*)" class', lines[index - 1])
        if match is None:
            return None
        return _SITE_ROOT + match.group(1)
    return None


def _extract_chapter(lines, chapter_number):
    """Return ``(title, body)`` for the chapter contained in *lines*.

    Raises:
        LookupError: if no text follows the content marker line.
    """
    raw = ""
    # Pair every line with its successor so the marker can never index past
    # the end of the page.
    for current, following in zip(lines, lines[1:]):
        if current == _CONTENT_MARKER:
            raw = following
            break

    for old, new in _TAG_REPLACEMENTS:
        raw = raw.replace(old, new)

    text_lines = raw.splitlines()
    if not text_lines:
        raise LookupError(
            "chapter text not found for chapter " + str(chapter_number)
        )

    title = text_lines[0]
    # Fall back to a generic title when the first line does not look like a
    # chapter heading.
    if "Chapter" not in title or "Previous" in title or len(title) > 45:
        title = "Chapter " + str(chapter_number)
    # The title doubles as the file name, so strip punctuation from it.
    title = title.translate(str.maketrans('', '', string.punctuation))

    # Drop the title line, then the last three lines of what remains. The
    # re-split is kept deliberately: it also discards a trailing empty line.
    body = '\n'.join(text_lines[1:])
    body = '\n'.join(body.splitlines()[:-3])
    return title, body


def _write_chapter_pdf(folder, title, body, config):
    """Render one chapter to ``<folder>/<title>.pdf`` via a temporary HTML file."""
    html_path = os.path.join(folder, title + ".html")
    pdf_path = os.path.join(folder, title + ".pdf")
    with open(html_path, "w", encoding="utf-8") as handle:
        handle.write(_CHAPTER_STYLE)
        handle.write("<h2><strong><center>" + title + "</center></strong></h2>" + "<br>")
        handle.write("<p>" + body.replace("\n\n", "</p><p>") + "</p>")
    try:
        pdfkit.from_file(html_path, pdf_path, options=_PDF_OPTIONS, configuration=config)
    finally:
        # The HTML file is only an intermediate; remove it even on failure.
        os.remove(html_path)


def reader(link, start, last, folder):
    """Save consecutive chapters as PDF files in *folder*.

    Starting from the page at *link*, each chapter is downloaded, converted
    to a PDF named after its title, and the "next chapter" link is followed
    until the chapter counter equals *last*.

    Args:
        link: URL of the first chapter page to download.
        start: Number of the first chapter; used for fallback titles and as
            the running counter.
        last: Chapter number at which to stop (inclusive). The stop test is
            an equality check, so a *start* greater than *last* keeps
            following links until none is found.
        folder: Existing directory that receives the PDF files.

    Returns:
        Always ``False``, once the chapter numbered *last* has been written.

    Raises:
        LookupError: if a page has no chapter text, or if more chapters were
            requested but no next-chapter link could be found.
        requests.RequestException: if a page cannot be downloaded.
    """
    # Resolve the converter once, up front, so a missing wkhtmltopdf is
    # reported before any page is downloaded.
    if os.path.isfile(_WKHTMLTOPDF_PATH):
        config = pdfkit.configuration(wkhtmltopdf=_WKHTMLTOPDF_PATH)
    else:
        config = pdfkit.configuration()

    # Iterate rather than recurse so long runs cannot hit the recursion limit.
    while True:
        lines = _fetch_page_lines(link)
        next_link = _find_next_chapter_link(lines)
        title, body = _extract_chapter(lines, start)
        _write_chapter_pdf(folder, title, body, config)

        if start == last:
            return False
        if next_link is None:
            raise LookupError(
                "no next-chapter link found after chapter " + str(start)
            )
        print(next_link)
        link = next_link
        start += 1
# Read the run parameters from standard input, one per line, in this order:
# output folder, URL of the first chapter, first chapter number, last
# chapter number.
folder, link = input(), input()
start, last = int(input()), int(input())

# Download the requested chapters and echo the function's return value.
print(reader(link, start, last, folder))