def fetchNovel(link, rLS=None):
    """Download every chapter linked from an index page into one text file.

    The book name is the part of the index page's <title> before the first
    '_'.  Output goes to '<book name>.txt' in the current directory; an
    existing file of that name is truncated first.  All text is written
    GBK-encoded, with characters that GBK cannot represent dropped.

    link -- URL of the book's index page.
    rLS  -- value passed to extractLinks as requireLinkStart; defaults to
            ``link`` when omitted.
    """
    if rLS is None:
        rLS = link
    # NOTE(review): extractLinks is defined elsewhere; its results are used
    # below via x['href'] and x.contents, so presumably they are
    # BeautifulSoup anchor tags -- confirm against the helper.
    chapters = extractLinks(link=link, requireLinkStart=rLS,
                            avoidKeys=['img', 'alt', 'src'],
                            requireLinkEnd='.html')

    soup = BeautifulSoup(getWebpage(link))
    book_name = ''.join(soup.find('title').contents).split('_')[0]
    # Single-argument parenthesised form: same output as the old
    # "print 'collecting: ', book_name" statement (hence the two spaces).
    print('collecting:  %s' % book_name)

    out_path = book_name + '.txt'
    # Start from an empty file; chapters are appended one at a time below.
    with open(out_path, 'w'):
        pass

    for x in chapters:
        chapter = 'http://data.book.163.com' + x['href']
        page = BeautifulSoup(getWebpage(chapter))
        body = page.find('div', {'class': 'bk-article-body',
                                 'id': 'bk-article-body'})

        # Best effort: a link whose contents cannot be joined into a string
        # (e.g. it contains nested tags) simply gets no heading.
        try:
            title = ''.join(x.contents).encode('GBK', 'ignore')
        except Exception:
            title = ''

        # Re-open in append mode per chapter so finished chapters are on
        # disk even if a later download fails; 'with' guarantees the handle
        # is closed on every exit path.
        with open(out_path, 'a') as f:
            # Titles ending in ')' are skipped unless they end in '(1)' --
            # this appears intended to emit one heading per multi-part
            # chapter (only for its first part).
            if title != '' and (title[-1] != ')' or title[-3:] == '(1)'):
                f.write('\n\n' + title + '\n')

            # The article container may be absent on some pages; skip the
            # body instead of failing with AttributeError on None.
            if body is None:
                continue

            for y in body.findAll(text=True):
                line = y.encode('GBK', 'ignore')
                if line.strip() == '':
                    continue
                f.write(line + '\n')
def fetchNovel(link):
    """Download a book's intro and chapter pages into one text file.

    The book name is the 'title' attribute of the first <a> inside the
    div with id 'book-cover' on the index page.  Output goes to
    '<book name>.txt' in the current directory (overwritten if present).
    All text is written GBK-encoded, with characters that GBK cannot
    represent dropped.

    link -- URL prefix of the book; chapter pages are fetched as
            link + '<number>.html'.
    """
    # NOTE(review): extractLinks is defined elsewhere; its results are used
    # below via x['href'] and x['title'], so presumably they are
    # BeautifulSoup anchor tags -- confirm against the helper.
    chapters = extractLinks(link=link, requireLinkStart=link,
                            avoidKeys=['img', 'alt', 'src'],
                            requireLinkEnd='.html')

    soup = BeautifulSoup(getWebpage(link))
    paras = soup.findAll('div', {'class': 'paragraph'})
    intro = soup.find('div', {'class': 'bookintro'})
    book_name = soup.find('div', {'id': 'book-cover'}).find('a')['title']
    # Single-argument parenthesised form: same output as the old
    # "print 'collecting: ', book_name" statement (hence the two spaces).
    print('collecting:  %s' % book_name)

    out_path = book_name + '.txt'
    with open(out_path, 'w') as f:
        f.write('intro: ')
        # The intro block may be missing; keep the label and move on rather
        # than failing with AttributeError on None.
        if intro is not None:
            for y in intro.findAll(text=True):
                line = y.encode('GBK', 'ignore')
                if line.strip() == '':
                    continue
                f.write(line + '\n')
        # Unlike the intro, 'paragraph' text nodes are written verbatim,
        # including whitespace-only ones.
        for x in paras:
            for y in x.findAll(text=True):
                f.write(y.encode('GBK', 'ignore') + '\n')

    # Without any chapter links there is no page range to walk.
    if not chapters:
        return

    def page_number(href):
        # Strip the leading ``link`` prefix and the trailing '.html'.
        return int(href[len(link):-5])

    start = page_number(chapters[0]['href'])
    # Keep probing 20 page numbers past the last listed chapter --
    # presumably to pick up trailing pages the index does not link to.
    end = page_number(chapters[-1]['href']) + 20

    # Page number -> chapter title, for pages that appear in the index.
    chapterD = {}
    for x in chapters:
        chapterD[page_number(x['href'])] = x['title']

    for i in range(start, end):
        page = BeautifulSoup(getWebpage(link + str(i) + '.html'))
        body = page.find('div', {'id': 'zoom'})

        # Append per page so finished pages are on disk even if a later
        # download fails.  'with' closes the handle on every path; the
        # previous explicit close() was skipped by the 'continue' below,
        # leaking one handle per missing page.
        with open(out_path, 'a') as f:
            if i in chapterD:
                f.write('\n\n' + chapterD[i].encode('GBK', 'ignore') + '\n')
            if body is None:
                continue
            for y in body.findAll(text=True):
                line = y.encode('GBK', 'ignore')
                if line.strip() == '':
                    continue
                f.write(line + '\n')
def fetchNovel(link):
    """Save a book's introduction and numbered chapter pages as text.

    The output file is '<book name>.txt' in the current directory, where
    the book name is read from the 'title' attribute of the first <a>
    inside the div with id 'book-cover'.  An existing file is overwritten.
    Text is encoded as GBK; characters GBK cannot represent are dropped.

    link -- URL prefix of the book; each chapter page is requested as
            link + '<number>.html'.
    """
    # NOTE(review): extractLinks lives outside this block; the code below
    # indexes its results with ['href'] and ['title'], so they are
    # presumably BeautifulSoup anchor tags -- confirm against the helper.
    chapters = extractLinks(link=link, requireLinkStart=link,
                            avoidKeys=['img', 'alt', 'src'],
                            requireLinkEnd='.html')

    index_soup = BeautifulSoup(getWebpage(link))
    paras = index_soup.findAll('div', {'class': 'paragraph'})
    intro = index_soup.find('div', {'class': 'bookintro'})
    cover = index_soup.find('div', {'id': 'book-cover'})
    book_name = cover.find('a')['title']
    # One parenthesised argument keeps the output identical to the old
    # "print 'collecting: ', book_name" statement (note the two spaces).
    print('collecting:  %s' % book_name)

    out_path = book_name + '.txt'
    with open(out_path, 'w') as out:
        out.write('intro: ')
        # Tolerate a missing intro block instead of raising AttributeError.
        if intro is not None:
            for node in intro.findAll(text=True):
                encoded = node.encode('GBK', 'ignore')
                if encoded.strip() != '':
                    out.write(encoded + '\n')
        # 'paragraph' divs are dumped without the blank-line filter.
        for para in paras:
            for node in para.findAll(text=True):
                out.write(node.encode('GBK', 'ignore') + '\n')

    # No chapter links means there is no page range to iterate over.
    if not chapters:
        return

    prefix_len = len(link)
    # href[prefix_len:-5] is the page number: the href minus the ``link``
    # prefix and the '.html' suffix.
    chapterD = {}
    for anchor in chapters:
        chapterD[int(anchor['href'][prefix_len:-5])] = anchor['title']

    start = int(chapters[0]['href'][prefix_len:-5])
    # Probe 20 page numbers beyond the last listed chapter -- presumably
    # to catch trailing pages that the index does not link to.
    end = int(chapters[-1]['href'][prefix_len:-5]) + 20

    for i in range(start, end):
        page_soup = BeautifulSoup(getWebpage(link + str(i) + '.html'))
        body = page_soup.find('div', {'id': 'zoom'})

        # The file is re-opened in append mode for each page so completed
        # pages survive a later failure.  Using 'with' fixes a handle leak:
        # the old code hit 'continue' before reaching f.close() whenever
        # the 'zoom' div was missing.
        with open(out_path, 'a') as out:
            if i in chapterD:
                out.write('\n\n' + chapterD[i].encode('GBK', 'ignore') + '\n')
            if body is None:
                continue
            for node in body.findAll(text=True):
                encoded = node.encode('GBK', 'ignore')
                if encoded.strip() != '':
                    out.write(encoded + '\n')
def fetchNovel(link, rLS=None):
    """Fetch all chapters listed on an index page and save them as text.

    The output file is '<book name>.txt' in the current directory, where
    the book name is the index page's <title> text up to the first '_'.
    The file is emptied first, then each chapter is appended in turn.
    Text is encoded as GBK; characters GBK cannot represent are dropped.

    link -- URL of the book's index page.
    rLS  -- forwarded to extractLinks as requireLinkStart; when omitted,
            ``link`` itself is used.
    """
    if rLS is None:
        rLS = link
    # NOTE(review): extractLinks lives outside this block; the code below
    # reads ['href'] and .contents from its results, so they are
    # presumably BeautifulSoup anchor tags -- confirm against the helper.
    chapters = extractLinks(link=link, requireLinkStart=rLS,
                            avoidKeys=['img', 'alt', 'src'],
                            requireLinkEnd='.html')

    index_soup = BeautifulSoup(getWebpage(link))
    book_name = ''.join(index_soup.find('title').contents).split('_')[0]
    # One parenthesised argument keeps the output identical to the old
    # "print 'collecting: ', book_name" statement (note the two spaces).
    print('collecting:  %s' % book_name)

    out_path = book_name + '.txt'
    # Truncate any earlier download before appending chapters.
    with open(out_path, 'w'):
        pass

    body_attrs = {'class': 'bk-article-body', 'id': 'bk-article-body'}
    for anchor in chapters:
        chapter_url = 'http://data.book.163.com' + anchor['href']
        page_soup = BeautifulSoup(getWebpage(chapter_url))
        body = page_soup.find('div', body_attrs)

        # Deliberately best effort: if the link contents cannot be joined
        # into a string (e.g. nested tags), the chapter gets no heading.
        try:
            title = ''.join(anchor.contents).encode('GBK', 'ignore')
        except Exception:
            title = ''

        # A heading is written unless the title ends in ')' without ending
        # in '(1)' -- this looks like it suppresses repeated headings for
        # parts 2..n of a multi-part chapter.
        wants_heading = title != '' and (
            title[-1] != ')' or title[-3:] == '(1)')

        # Append per chapter so completed chapters survive a later failure;
        # 'with' closes the handle even if a write raises.
        with open(out_path, 'a') as out:
            if wants_heading:
                out.write('\n\n' + title + '\n')

            # Pages without the article container used to crash with
            # AttributeError on None; skip their body instead.
            if body is None:
                continue

            for node in body.findAll(text=True):
                encoded = node.encode('GBK', 'ignore')
                if encoded.strip() != '':
                    out.write(encoded + '\n')