def get_sin(self, l):
    """Download one chapter page and save it as ``z<chapter>.txt``.

    The page at ``l`` is fetched (and re-fetched while the response contains
    the "502 Bad Gateway" heading), the chapter text and the author's note
    are extracted with XPath, optionally converted between Simplified and
    Traditional Chinese according to ``self.state``, and written to a text
    file in the current working directory.

    When the page references an obfuscation web font, a glyph table
    ``<self.path>/Fonts/<font>.txt`` is used to map the obfuscated
    characters back to real ones; the table is first built from
    ``http://jjwxc.yooooo.us/<font>.json`` when it does not exist yet.

    Args:
        l: Chapter URL. It must be an element of ``self.href_list`` and its
            third ``=``-separated field is used as the chapter number.

    Side effects:
        Appends to ``self.fontlist`` (font seen without a glyph table) and
        ``self.failInfo`` (chapter without body text), prints the volume
        title when the chapter starts a volume, and increments
        ``self.percent`` once the file has been written.
    """
    titleOrigin = l.split('=')
    i = self.href_list.index(l)

    # Keep requesting until the site stops answering with its 502 page.
    # NOTE: there is deliberately no retry limit, matching prior behaviour.
    while True:
        cont = requests.get(l, headers=self.headerss)
        dot = etree.HTML(
            cont.content.decode('gb18030', "ignore").encode("utf-8").decode('utf-8'))
        codetext = etree.tostring(dot, encoding="utf-8").decode()
        if not re.findall('<h1>502 Bad Gateway</h1>', codetext):
            break
        time.sleep(1)

    # --- Anti-scraping font -------------------------------------------------
    fontfamily = ''
    glyph_lines = []
    glyph_map = {}
    fontsrc = re.findall(
        r'//static.jjwxc.net/tmp/fonts/.*?woff2.h=my.jjwxc.net', codetext)
    if fontsrc:
        fontsrc = "http:" + fontsrc[0]
        fontname = re.sub('http://static.jjwxc.net/tmp/fonts/', '', fontsrc)
        fontname = re.sub('.h=my.jjwxc.net', '', fontname)
        fontfamily = re.sub('.woff2', '', fontname)
        table_path = self.path + "/Fonts/" + fontfamily + ".txt"

        if not os.path.exists(table_path):
            # Build the glyph table from the JSON mapping service.
            r = requests.get('http://jjwxc.yooooo.us/' + fontfamily + '.json')
            fonttxt = re.sub('{"status": 0, "data": ', '', r.text)
            fonttxt = re.sub('}}', '}', fonttxt)
            cdic = json.loads(fonttxt)
            # Render the table before opening the file so that a failure
            # here cannot leave an empty table behind.
            table_text = ''.join(
                '&#x' + s + ';-' + v + '\n' for s, v in cdic.items())
            with open(table_path, "w", encoding='utf-8') as f:
                f.write(table_text)
            # To also keep the TTF file, download
            # re.sub('woff2', 'ttf', fontsrc) into
            # self.path + "/Fonts/" + fontfamily + '.ttf'.

        try:
            with open(table_path, "r", encoding='utf-8') as f:
                glyph_lines = f.readlines()
        except (OSError, UnicodeDecodeError):
            # Best effort: without a readable table the text is left as is.
            glyph_lines = []
        for line in glyph_lines:
            # Each line is "<key>-<value>"; split on the first '-' only so a
            # value containing '-' (or a blank line) cannot break the table.
            key, sep, value = line.partition('-')
            if sep:
                glyph_map[key] = value

        if glyph_lines != []:
            fontfamily += '_c'
        elif fontfamily not in self.fontlist:
            self.fontlist.append(fontfamily)

    # tex: chapter body
    tex = dot.xpath('//*[@id="oneboolt"]/tr[2]/td[1]/div/text()')
    # tex1: author's note
    tex1 = dot.xpath("//div[@class='readsmall']/text()")
    # sign: used to tell whether the author's note precedes the body
    sign = dot.xpath("//*[@id='oneboolt']/tr[2]/td[1]/div/div[4]/@class")

    # Build the Simplified/Traditional converter once instead of per line.
    if self.state == 's':
        convert = OpenCC('t2s').convert
    elif self.state == 't':
        convert = OpenCC('s2t').convert
    else:
        def convert(text):
            return text

    def clean(raw):
        """Strip the watermark and spaces from one text node and convert it."""
        text = re.sub('@无限好文,尽在晋江文学城', '', str(raw))
        # NOTE(review): this pattern is a single ASCII space here, which
        # makes the following collapse a no-op; it may have been a different
        # whitespace character originally - confirm.
        text = re.sub(' ', '', text)
        text = re.sub(' +', ' ', text).strip()
        # NOTE(review): substitutions that replaced '&', '<' and '>' with
        # themselves used to run here (no effect). If entity decoding was
        # intended it still has to be added - confirm.
        return convert(text)

    def write_lines(fo, raw_lines, is_note):
        """Write every non-empty cleaned line, one per output line."""
        for raw in raw_lines:
            text = clean(raw)
            if is_note:
                text = re.sub('作者有话要说:', '作者有话要说:\n', text)
            if text != "":
                fo.write(text + "\n")

    title = ''
    # Zero-padded chapter number
    if self.titleInfo[0] == '1':
        title = str(titleOrigin[2]).zfill(self.fillNum) + "#"
    # Chapter name
    if self.titleInfo[1] == '1':
        title = title + " " + self.titleindex[i].strip()
    # Chapter summary
    if self.titleInfo[2] == '1':
        title = title + " " + self.Summary[i].strip()
    title = convert(title)

    starts_volume = l in self.rollSignPlace
    if starts_volume:
        v = convert(self.rollSign[self.rollSignPlace.index(l)])

    # Create the chapter file; the context manager closes it on any error.
    with open("z" + str(titleOrigin[2].zfill(4)) + ".txt", 'w',
              encoding='utf-8') as fo:
        if starts_volume:
            # Volume title followed by the chapter title
            fo.write("\r\n\r\n" + v.rstrip() + '\r\n')
            print("\r\n" + v + "\r\n")
            fo.write(title + '\r\n')
        else:
            fo.write("\r\n\r\n" + title + "\r\n")

        if len(tex) == 0:
            self.failInfo.append(titleOrigin[2].zfill(self.fillNum))
            fo.write('下载失败!')
        else:
            # Undo the font obfuscation. Keys look like "&#xe0a1;" and are
            # turned into the character they denote; the entry keyed
            # 'x"/;' is skipped. Decoding is done once, not once per line.
            replacements = [
                (key.replace('&#x', '\\u').replace(';', '')
                    .encode('utf-8').decode('unicode_escape'),
                 value.strip())
                for key, value in glyph_map.items()
                if key != 'x"/;'
            ]
            for idx, line in enumerate(tex):
                for glyph, real in replacements:
                    line = line.replace(glyph, real)
                tex[idx] = line

            if str(sign) == "['readsmall']":
                # Author's note comes before the body
                write_lines(fo, tex1, True)
                if len(tex1) != 0:
                    fo.write("\n*\r\n")
                write_lines(fo, tex, False)
            else:
                # Author's note comes after the body
                write_lines(fo, tex, False)
                if len(tex1) != 0:
                    fo.write("\n*\r\n")
                write_lines(fo, tex1, True)

    self.percent += 1
def get_sin(self, l):
    """Download one chapter page and save it as ``z<chapter>.xhtml``.

    The page at ``l`` is fetched, the chapter body, chapter heading and
    author's note are extracted with XPath, optionally converted between
    Simplified and Traditional Chinese according to ``self.state``, and
    written as an XHTML document in the current working directory.

    Args:
        l: Chapter URL. It must be an element of ``self.href_list`` and its
            third ``=``-separated field is used as the chapter number.

    Side effects:
        Appends the chapter number to ``self.failInfo`` when no chapter
        heading is found, prints the volume title when the chapter starts a
        volume, and increments ``self.percent`` once the file is written.
    """
    titleOrigin = l.split('=')
    i = self.href_list.index(l)
    cont = requests.get(l, headers=self.headerss).content
    dot = etree.HTML(
        cont.decode("GB18030", "ignore").encode("utf-8", "ignore").decode('utf-8'))

    noveltext = ("//html/body/table[@id='oneboolt']/tr[2]/td[1]"
                 "/div[@class='noveltext']")
    # tex: chapter body
    tex = dot.xpath(noveltext + "/text()")
    # he: chapter heading, only used to detect a failed/unavailable chapter
    he = dot.xpath(noveltext + "/div[2]/h2/text()")
    # tex1: author's note
    tex1 = dot.xpath(noveltext + "/div[@class='readsmall']/text()")
    # sign: used to tell whether the author's note precedes the body
    sign = dot.xpath("//*[@id='oneboolt']/tr[2]/td[1]/div/div[4]/@class")

    # Build the Simplified/Traditional converter once instead of per line.
    if self.state == 's':
        convert = OpenCC('t2s').convert
    elif self.state == 't':
        convert = OpenCC('s2t').convert
    else:
        def convert(text):
            return text

    def escape(text):
        """Escape the XML-reserved characters ``&``, ``>`` and ``<``."""
        return (text.replace('&', '&amp;')
                    .replace('>', '&gt;')
                    .replace('<', '&lt;'))

    def clean(raw, is_note):
        """Turn one scraped text node into XHTML-safe paragraph content."""
        text = re.sub('@无限好文,尽在晋江文学城', '', str(raw))
        text = re.sub(' +', ' ', text).strip()
        # Scraped text may contain '&', '<' or '>'; left raw they would make
        # the generated XHTML ill-formed.
        text = escape(text)
        if not is_note:
            # NOTE(review): this pattern is a single ASCII space here; it may
            # have been a different whitespace character originally - confirm.
            text = re.sub(' ', '', text)
        text = convert(text)
        if is_note:
            # Markup is inserted after escaping so it is kept verbatim.
            text = re.sub('作者有话要说:', '<b>作者有话要说</b>:</p><p>', text)
        return text

    def write_paragraphs(fo, raw_lines, is_note):
        """Write every non-empty cleaned line as a ``<p>`` element."""
        for raw in raw_lines:
            text = clean(raw, is_note)
            if text != "":
                fo.write("<p>" + text + "</p>")

    title = ''
    # Zero-padded chapter number
    if self.titleInfo[0] == '1':
        title = str(titleOrigin[2]).zfill(self.fillNum)
    # Chapter name
    if self.titleInfo[1] == '1':
        title = title + " " + self.titleindex[i].strip()
    # Chapter summary
    if self.titleInfo[2] == '1':
        title = title + " " + self.Summary[i].strip()
    title = convert(title)

    starts_volume = l in self.rollSignPlace
    if starts_volume:
        v = convert(self.rollSign[self.rollSignPlace.index(l)])

    # The URL carries query parameters; a raw '&' is not allowed inside an
    # XML attribute value.
    href = escape(l)

    # Create the chapter file; the context manager closes it on any error.
    with open("z" + str(titleOrigin[2].zfill(4)) + ".xhtml", 'w',
              encoding='utf-8') as fo:
        fo.write(
            '<?xml version="1.0" encoding="utf-8"?> '
            '<!DOCTYPE html> '
            '<html xmlns="http://www.w3.org/1999/xhtml" '
            'xmlns:epub="http://www.idpf.org/2007/ops"> '
            '<head><title>' + title + '</title> '
            '<meta charset="utf-8"/> '
            '<link href="sgc-nav.css" rel="stylesheet" type="text/css"/> '
            '</head><body>')

        if starts_volume:
            # Volume title followed by the chapter title
            fo.write("<h1>" + v.rstrip() + "</h1>")
            print("\r\n" + v + "\r\n")
            fo.write("<h2 id='v'><a href='" + href + "'>" + title + "</a></h2>")
        else:
            fo.write('<h2><a href="' + href + '">' + title + "</a></h2>")

        if len(he) == 0:
            # Chapter not purchased or failed to load
            self.failInfo.append(titleOrigin[2].zfill(self.fillNum))
        elif str(sign) == "['readsmall']":
            # Author's note comes before the body
            fo.write('<blockquote>')
            write_paragraphs(fo, tex1, True)
            fo.write("</blockquote>")
            if len(tex1) != 0:
                fo.write("<hr/>")
            write_paragraphs(fo, tex, False)
        else:
            # Author's note comes after the body
            write_paragraphs(fo, tex, False)
            if len(tex1) != 0:
                fo.write("<hr/>")
                fo.write('<blockquote>')
            write_paragraphs(fo, tex1, True)
            if len(tex1) != 0:
                fo.write("</blockquote>")

        fo.write("</body></html>")

    self.percent += 1
def get_sin(self, l):
    """Download one chapter page and save it as ``z<chapter>.xhtml``.

    The page at ``l`` is fetched (and re-fetched while the response contains
    the "502 Bad Gateway" heading), the chapter text and the author's note
    are extracted with XPath, optionally converted between Simplified and
    Traditional Chinese according to ``self.state``, and written as an XHTML
    document in the current working directory.

    When the page references an obfuscation web font, a glyph table
    ``<self.path>/Fonts/<font>.txt`` is used to map the obfuscated
    characters back to real ones; the table is first built from
    ``http://jjwxc.yooooo.us/<font>.json`` when it does not exist yet. If no
    table lines could be read, an ``@font-face`` rule for the font is
    appended to ``self.fontcss`` instead. The ``<body>`` element gets the
    font family name as its class (with ``_c`` appended when a table was
    read).

    Args:
        l: Chapter URL. It must be an element of ``self.href_list`` and its
            third ``=``-separated field is used as the chapter number.

    Side effects:
        May append to ``self.fontlist``, ``self.fontcss`` and
        ``self.failInfo``, prints the volume title when the chapter starts a
        volume, and increments ``self.percent`` once the file is written.
    """
    titleOrigin = l.split('=')
    i = self.href_list.index(l)

    # Keep requesting until the site stops answering with its 502 page.
    # NOTE: there is deliberately no retry limit, matching prior behaviour.
    while True:
        cont = requests.get(l, headers=self.headerss)
        dot = etree.HTML(
            cont.content.decode('gb18030', "ignore").encode("utf-8").decode('utf-8'))
        codetext = etree.tostring(dot, encoding="utf-8").decode()
        if not re.findall('<h1>502 Bad Gateway</h1>', codetext):
            break
        time.sleep(1)

    # --- Anti-scraping font -------------------------------------------------
    fontfamily = ''
    glyph_lines = []
    glyph_map = {}
    fontsrc = re.findall(
        r'//static.jjwxc.net/tmp/fonts/.*?woff2.h=my.jjwxc.net', codetext)
    if fontsrc:
        fontsrc = "http:" + fontsrc[0]
        fontname = re.sub('http://static.jjwxc.net/tmp/fonts/', '', fontsrc)
        fontname = re.sub('.h=my.jjwxc.net', '', fontname)
        fontfamily = re.sub('.woff2', '', fontname)
        table_path = self.path + "/Fonts/" + fontfamily + ".txt"

        if not os.path.exists(table_path):
            # Build the glyph table from the JSON mapping service.
            r = requests.get('http://jjwxc.yooooo.us/' + fontfamily + '.json')
            fonttxt = re.sub('{"status": 0, "data": ', '', r.text)
            fonttxt = re.sub('}}', '}', fonttxt)
            cdic = json.loads(fonttxt)
            # Render the table before opening the file so that a failure
            # here cannot leave an empty table behind.
            table_text = ''.join(
                '&#x' + s + ';-' + v + '\n' for s, v in cdic.items())
            with open(table_path, "w", encoding='utf-8') as f:
                f.write(table_text)
            # To also keep the TTF file, download
            # re.sub('woff2', 'ttf', fontsrc) into
            # self.path + "/Fonts/" + fontfamily + '.ttf'.

        try:
            with open(table_path, "r", encoding='utf-8') as f:
                glyph_lines = f.readlines()
        except (OSError, UnicodeDecodeError):
            # Best effort: without a readable table the text is left as is.
            glyph_lines = []
        for line in glyph_lines:
            # Each line is "<key>-<value>"; split on the first '-' only so a
            # value containing '-' (or a blank line) cannot break the table.
            key, sep, value = line.partition('-')
            if sep:
                glyph_map[key] = value

        if glyph_lines != []:
            fontfamily += '_c'
        elif fontfamily not in self.fontlist:
            # No table: register the font and emit CSS that loads it.
            self.fontlist.append(fontfamily)
            font_css = (
                '@font-face{font-family: "%s"; '
                'src:url("%s") format(\'woff2\'), '
                'url("../font/%s") format(\'woff2\'), '
                'url("../font/%s.ttf") format("truetype");} '
                '.%s{font-family:"%s",serif;} '
            )
            self.fontcss += font_css % (fontfamily, fontsrc, fontname,
                                        fontfamily, fontfamily, fontfamily)

    # tex: chapter body
    tex = dot.xpath('//*[@id="oneboolt"]/tr[2]/td[1]/div/text()')
    # tex1: author's note
    tex1 = dot.xpath("//div[@class='readsmall']/text()")
    # sign: used to tell whether the author's note precedes the body
    sign = dot.xpath("//*[@id='oneboolt']/tr[2]/td[1]/div/div[4]/@class")

    # Build the Simplified/Traditional converter once instead of per line.
    if self.state == 's':
        convert = OpenCC('t2s').convert
    elif self.state == 't':
        convert = OpenCC('s2t').convert
    else:
        def convert(text):
            return text

    def clean(raw, is_note):
        """Turn one scraped text node into XHTML-safe paragraph content."""
        text = re.sub('@无限好文,尽在晋江文学城', '', str(raw))
        # NOTE(review): this pattern is a single ASCII space here, which
        # makes the following collapse a no-op; it may have been a different
        # whitespace character originally - confirm.
        text = re.sub(' ', '', text)
        text = re.sub(' +', ' ', text).strip()
        # Scraped text may contain '&', '<' or '>'; left raw they would make
        # the generated XHTML ill-formed.
        text = (text.replace('&', '&amp;')
                    .replace('>', '&gt;')
                    .replace('<', '&lt;'))
        text = convert(text)
        if is_note:
            # Markup is inserted after escaping so it is kept verbatim.
            text = re.sub('作者有话要说:', '<b>作者有话要说</b>:</p><p>', text)
        return text

    def write_paragraphs(fo, raw_lines, is_note):
        """Write every non-empty cleaned line as a ``<p>`` element."""
        for raw in raw_lines:
            text = clean(raw, is_note)
            if text != "":
                fo.write("<p>" + text + "</p>")

    title = ''
    # Zero-padded chapter number
    if self.titleInfo[0] == '1':
        title = str(titleOrigin[2]).zfill(self.fillNum)
    # Chapter name
    if self.titleInfo[1] == '1':
        title = title + " " + self.titleindex[i].strip()
    # Chapter summary
    if self.titleInfo[2] == '1':
        title = title + " " + self.Summary[i].strip()
    title = convert(title.strip())

    starts_volume = l in self.rollSignPlace
    if starts_volume:
        v = convert(self.rollSign[self.rollSignPlace.index(l)])

    # Create the chapter file; the context manager closes it on any error.
    with open("z" + str(titleOrigin[2].zfill(4)) + ".xhtml", 'w',
              encoding='utf-8') as fo:
        fo.write(
            '<?xml version="1.0" encoding="UTF-8" standalone="no" ?> '
            '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" '
            '"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd"> '
            '<html xmlns="http://www.w3.org/1999/xhtml"> '
            '<head> <title>' + title + '</title> '
            '<meta charset="utf-8"/> '
            '<link href="sgc-nav.css" rel="stylesheet" type="text/css"/> '
            '</head><body class="' + fontfamily + '">')

        if starts_volume:
            # Volume title followed by the chapter title
            fo.write("<h1>" + v.rstrip() + "</h1>")
            print("\r\n" + v + "\r\n")
            fo.write("<h2 id='v'>" + title + "</h2>")
        else:
            fo.write('<h2>' + title + "</h2>")

        if len(tex) == 0:
            # Chapter not purchased or failed to load
            self.failInfo.append(titleOrigin[2].zfill(self.fillNum))
        else:
            # Undo the font obfuscation. Keys look like "&#xe0a1;" and are
            # turned into the character they denote; the entry keyed
            # 'x"/;' is skipped. Decoding is done once, not once per line.
            replacements = [
                (key.replace('&#x', '\\u').replace(';', '')
                    .encode('utf-8').decode('unicode_escape'),
                 value.strip())
                for key, value in glyph_map.items()
                if key != 'x"/;'
            ]
            for idx, line in enumerate(tex):
                for glyph, real in replacements:
                    line = line.replace(glyph, real)
                tex[idx] = line

            if str(sign) == "['readsmall']":
                # Author's note comes before the body
                fo.write('<blockquote>')
                write_paragraphs(fo, tex1, True)
                fo.write("</blockquote>")
                if len(tex1) != 0:
                    fo.write("<hr/>")
                write_paragraphs(fo, tex, False)
            else:
                # Author's note comes after the body
                write_paragraphs(fo, tex, False)
                if len(tex1) != 0:
                    fo.write("<hr/>")
                    fo.write('<blockquote>')
                write_paragraphs(fo, tex1, True)
                if len(tex1) != 0:
                    fo.write("</blockquote>")

        fo.write("</body></html>")

    self.percent += 1
def get_sin(self, l):
    """Download one chapter page and save it as ``z<chapter>.txt``.

    The page at ``l`` is fetched, the chapter text and the author's note are
    extracted with XPath, optionally converted between Simplified and
    Traditional Chinese according to ``self.state``, and written to a text
    file in the current working directory.

    When the page references an obfuscation web font, the glyph table
    ``<self.path>/Fonts/<font>.txt`` (which must already exist; it is not
    downloaded here) is used to map the obfuscated characters back to real
    ones, and the font file itself is saved under ``<self.path>/Fonts/`` if
    it is not there yet.

    Args:
        l: Chapter URL. It must be an element of ``self.href_list`` and its
            third ``=``-separated field is used as the chapter number.

    Side effects:
        Appends to ``self.fontlist`` (font seen without a glyph table) and
        ``self.failInfo`` (chapter without body text), prints the volume
        title when the chapter starts a volume, and increments
        ``self.percent`` once the file has been written.
    """
    titleOrigin = l.split('=')
    i = self.href_list.index(l)
    cont = requests.get(l, headers=self.headerss).content
    dot = etree.HTML(
        cont.decode("GB18030", "ignore").encode("utf-8", "ignore").decode('utf-8'))

    # --- Anti-scraping font -------------------------------------------------
    fontfamily = ''
    glyph_lines = []
    glyph_map = {}
    codetext = etree.tostring(dot, encoding="utf-8").decode()
    fontsrc = re.findall(
        r'//static.jjwxc.net/tmp/fonts/.*?woff2.h=my.jjwxc.net', codetext)
    if fontsrc:
        fontsrc = "http:" + fontsrc[0]
        fontname = re.sub('http://static.jjwxc.net/tmp/fonts/', '', fontsrc)
        fontname = re.sub('.h=my.jjwxc.net', '', fontname)
        fontfamily = re.sub('.woff2', '', fontname)

        try:
            with open(self.path + "/Fonts/" + fontfamily + ".txt", "r",
                      encoding='utf-8') as f:
                glyph_lines = f.readlines()
        except (OSError, UnicodeDecodeError):
            # Best effort: without a readable table the text is left as is.
            glyph_lines = []
        for line in glyph_lines:
            # Each line is "<key>-<value>"; split on the first '-' only so a
            # value containing '-' (or a blank line) cannot break the table.
            key, sep, value = line.partition('-')
            if sep:
                glyph_map[key] = value

        # Keep a local copy of the font file.
        font_path = self.path + "/Fonts/" + fontname
        if not os.path.exists(font_path):
            fontwb = requests.get(fontsrc).content
            with open(font_path, 'wb') as fontf:
                fontf.write(fontwb)

        if glyph_lines != []:
            fontfamily = ''
        elif fontfamily not in self.fontlist:
            self.fontlist.append(fontfamily)

    # tex: chapter body
    tex = dot.xpath('//*[@id="oneboolt"]/tr[2]/td[1]/div/text()')
    # tex1: author's note
    tex1 = dot.xpath("//div[@class='readsmall']/text()")
    # sign: used to tell whether the author's note precedes the body
    sign = dot.xpath("//*[@id='oneboolt']/tr[2]/td[1]/div/div[4]/@class")

    # Build the Simplified/Traditional converter once instead of per line.
    if self.state == 's':
        convert = OpenCC('t2s').convert
    elif self.state == 't':
        convert = OpenCC('s2t').convert
    else:
        def convert(text):
            return text

    def clean(raw):
        """Strip the watermark and spaces from one text node and convert it."""
        text = re.sub('@无限好文,尽在晋江文学城', '', str(raw))
        text = re.sub(' +', ' ', text).strip()
        # NOTE(review): this pattern is a single ASCII space here; it may
        # have been a different whitespace character originally - confirm.
        text = re.sub(' ', '', text)
        return convert(text)

    def write_lines(fo, raw_lines, is_note):
        """Write every non-empty cleaned line, one per output line."""
        for raw in raw_lines:
            text = clean(raw)
            if is_note:
                text = re.sub('作者有话要说:', '作者有话要说:\n', text)
            if text != "":
                fo.write(text + "\n")

    title = ''
    # Zero-padded chapter number
    if self.titleInfo[0] == '1':
        title = str(titleOrigin[2]).zfill(self.fillNum) + "#"
    # Chapter name
    if self.titleInfo[1] == '1':
        title = title + " " + self.titleindex[i].strip()
    # Chapter summary
    if self.titleInfo[2] == '1':
        title = title + " " + self.Summary[i].strip()
    title = convert(title)

    starts_volume = l in self.rollSignPlace
    if starts_volume:
        v = convert(self.rollSign[self.rollSignPlace.index(l)])

    # Create the chapter file; the context manager closes it on any error.
    with open("z" + str(titleOrigin[2].zfill(4)) + ".txt", 'w',
              encoding='utf-8') as fo:
        if starts_volume:
            # Volume title followed by the chapter title
            fo.write("\r\n\r\n" + v.rstrip() + '\r\n')
            print("\r\n" + v + "\r\n")
            fo.write(title + '\r\n')
        else:
            fo.write("\r\n\r\n" + title + "\r\n")

        if len(tex) == 0:
            self.failInfo.append(titleOrigin[2].zfill(self.fillNum))
            fo.write('下载失败!')
        else:
            # Undo the font obfuscation. Keys look like "&#xe0a1;" and are
            # turned into the character they denote. Decoding is done once,
            # not once per line.
            replacements = [
                (key.replace('&#x', '\\u').replace(';', '')
                    .encode('utf-8').decode('unicode_escape'),
                 value.strip())
                for key, value in glyph_map.items()
            ]
            for idx, line in enumerate(tex):
                for glyph, real in replacements:
                    line = line.replace(glyph, real)
                tex[idx] = line

            if str(sign) == "['readsmall']":
                # Author's note comes before the body
                write_lines(fo, tex1, True)
                if len(tex1) != 0:
                    fo.write("\n*\r\n")
                write_lines(fo, tex, False)
            else:
                # Author's note comes after the body
                write_lines(fo, tex, False)
                if len(tex1) != 0:
                    fo.write("\n*\r\n")
                write_lines(fo, tex1, True)

    self.percent += 1