# Module-level imports assumed by the functions below (Python 2).  SearchURL,
# get_search_result, unescape and NewDThread are defined elsewhere in the
# project and are not reproduced here.
import re
import time
import threading
import Queue
import urllib2
import urlparse
from datetime import datetime

import wx
from lxml import html


def htm2txt(inf):
    """Extract the text content of a chapter page."""
    doc = html.document_fromstring(inf)
    # The element that holds the chapter body; use Chrome's "inspect element"
    # to find the XPath for a given site.
    content = doc.xpath('//*[@id="contents"]')
    htmls = html.tostring(content[0], False)
    htmls = htmls.replace('<br>', '\n')
    htmls = htmls.replace('<p>', '\n')
    htmls = unescape(htmls)
    # Collapse runs of two or more newlines into a single newline.
    p = re.compile('\n{2,}')
    htmls = p.sub('\n', htmls)
    newdoc = html.document_fromstring(htmls)
    return newdoc.text_content()
def htm2txt(inf):
    """Extract the text content of a chapter page."""
    doc = html.document_fromstring(inf)
    content = doc.xpath('//*[@id="bgdiv"]/table[2]/tbody/tr[1]/td/table/tbody/tr')
    htmls = html.tostring(content[0], False)
    htmls = htmls.replace('<br>', '\n')
    htmls = htmls.replace('<p>', '\n')
    htmls = htmls.replace('&nbsp;', ' ')  # turn non-breaking-space entities into plain spaces
    # Collapse runs of two or more newlines into a single newline.
    p = re.compile('\n{2,}')
    htmls = p.sub('\n', htmls)
    newdoc = html.document_fromstring(htmls)
    return newdoc.text_content()
def htm2txt(inf):
    """Extract the text content of a chapter page."""
    doc = html.document_fromstring(inf)
    #content = doc.xpath('//*[@id="bgdiv"]/table[2]/tbody/tr[1]/td/table/tbody/tr')
    content = doc.xpath('//*[@id="content"]')
    htmls = html.tostring(content[0], False)
    htmls = htmls.replace('<br>', '\n')
    htmls = htmls.replace('<p>', '\n')
    htmls = htmls.replace('&nbsp;', ' ')  # turn non-breaking-space entities into plain spaces
    # Collapse runs of two or more newlines into a single newline.
    p = re.compile('\n{2,}')
    htmls = p.sub('\n', htmls)
    newdoc = html.document_fromstring(htmls)
    return newdoc.text_content()
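
# A minimal usage sketch, not part of the original module: the three htm2txt
# variants above are identical except for the XPath of the element that holds
# the chapter body, so a per-site scraper only has to adjust that expression.
# The file name below is a hypothetical locally saved chapter page.
def _htm2txt_demo():
    fp = open('chapter.html', 'rb')  # placeholder file, assumed to contain the raw page bytes
    raw = fp.read()
    fp.close()
    text = htm2txt(raw)              # lxml picks up the page's declared charset from the bytes
    if not isinstance(text, unicode):
        text = text.decode('GBK', 'ignore')
    return text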
def GetSearchResults(key, useproxy=False, proxyserver='', proxyport=0,
                     proxyuser='', proxypass=''):
    global SearchURL
    if key.strip() == '':
        return []
    rlist = []
    page = get_search_result(SearchURL, key, useproxy, proxyserver, proxyport,
                             proxyuser, proxypass)
    if page is None:
        return None
    doc = html.document_fromstring(page)
    # Get the main results table; use Chrome's "inspect element" to find the XPath.
    rtable = doc.xpath('//*[@id="searchhight"]/table')
    if len(rtable) != 0:
        row_list = rtable[0].findall('tr')  # get the row list
        row_list = row_list[1:]             # drop the first row (the caption)
        for row in row_list:
            r = {}
            col_list = row.getchildren()    # get the columns of this row
            r['bookname'] = col_list[0].xpath('a')[0].text
            r['book_index_url'] = col_list[1].xpath('a')[0].get('href')
            r['authorname'] = col_list[2].xpath('a')[0].text
            r['booksize'] = col_list[3].text
            r['lastupdatetime'] = col_list[4].text
            r['bookstatus'] = col_list[5].xpath('font')[0].text
            for k in r.keys():
                if r[k] is None:
                    r[k] = ''
            rlist.append(r)
        return rlist
    else:
        # The search was a direct hit: the result page is the book's portal page.
        #rtable=doc.xpath('//*[@id="content"]/div[2]/div[2]/table')
        r = {}
        try:
            r['bookname'] = doc.xpath('//*[@id="content"]/div[2]/div[2]/table/tr/td/table/tbody/tr[1]/td/table/tr[1]/td/h1')[0].text
            r['bookstatus'] = doc.xpath('//*[@id="content"]/div[2]/div[2]/table/tr/td/table/tbody/tr[1]/td/table/tr[2]/td[2]/table/tr[1]/td[4]')[0].text
            r['lastupdatetime'] = doc.xpath('//*[@id="content"]/div[2]/div[2]/table/tr/td/table/tbody/tr[1]/td/table/tr[2]/td[2]/table/tr[1]/td[6]')[0].text
            r['authorname'] = doc.xpath('//*[@id="content"]/div[2]/div[2]/table/tr/td/table/tbody/tr[1]/td/table/tr[2]/td[2]/table/tr[2]/td[6]/a/b')[0].text
            r['book_index_url'] = doc.xpath('//*[@id="content"]/div[2]/div[2]/table/tr/td/table/tbody/tr[1]/td/table/tr[2]/td[2]/table/tr[4]/td/div/b/a[1]')[0].get('href')
            r['booksize'] = ''
        except:
            return []
        ## for k,v in r.items():
        ##     print k,v
        for k in r.keys():
            if r[k] is None:
                r[k] = ''
        return [r]
    return []
def GetSearchResults(key, useproxy=False, proxyserver='', proxyport=0,
                     proxyuser='', proxypass=''):
    global SearchURL
    if key.strip() == '':
        return []
    rlist = []
    page = get_search_result(SearchURL, key, useproxy, proxyserver, proxyport,
                             proxyuser, proxypass)
    if page is None:
        return None
    doc = html.document_fromstring(page)
    # Get the main results table; use Chrome's "inspect element" to find the XPath.
    # Note: for tables, end the expression with /table rather than /table/tbody,
    # because Chrome sometimes inserts a tbody that is not in the real markup.
    # The "XPath Helper" Chrome extension can give a more reliable XPath when
    # "inspect element" returns a wrong one.
    rtable = doc.xpath('//*[@id="content"]/table')
    if len(rtable) != 0:
        row_list = rtable[0].findall('tr')  # get the row list
        row_list = row_list[1:]             # drop the first row (the caption)
        for row in row_list:
            r = {}
            col_list = row.getchildren()    # get the columns of this row
            r['bookname'] = col_list[0].xpath('a')[0].text
            r['book_index_url'] = col_list[1].xpath('a')[0].get('href')
            r['book_index_url'] = urlparse.urljoin(SearchURL, r['book_index_url'])
            r['authorname'] = col_list[2].text
            r['booksize'] = col_list[3].text
            r['lastupdatetime'] = col_list[4].text
            r['bookstatus'] = col_list[5].text
            rlist.append(r)
        return rlist
    else:
        # The search was a direct hit: the result page is the book's portal page.
        r = {}
        try:
            r['bookname'] = doc.xpath('//*[@id="content"]/dd[1]/h1')[0].text
            r['bookstatus'] = doc.xpath('//*[@id="at"]/tr[1]/td[3]')[0].text  # tbody removed here
            r['lastupdatetime'] = doc.xpath('//*[@id="at"]/tr[2]/td[3]')[0].text
            r['authorname'] = doc.xpath('//*[@id="at"]/tr[1]/td[2]')[0].text
            r['book_index_url'] = doc.xpath("//*[@id='content']/dd[2]/div[@class='fl'][2]/p[@class='btnlinks']/a[@class='read']")[0].get('href')
            r['book_index_url'] = urlparse.urljoin(SearchURL, r['book_index_url'])
            r['booksize'] = ''
        except:
            return []
        return [r]
    return []
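
# Illustrative sketch (the URLs are placeholders, not real site addresses):
# the urljoin calls above turn the relative hrefs found in the search results
# into absolute URLs that GetBook can open directly, while leaving already
# absolute hrefs untouched.
def _urljoin_demo():
    base = 'http://www.example.com/modules/article/search.php?searchkey=x'
    assert urlparse.urljoin(base, '/book/123/') == 'http://www.example.com/book/123/'
    assert urlparse.urljoin(base, 'http://other.example.com/book/9/') == 'http://other.example.com/book/9/'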
def GetBook(url, bkname='', win=None, evt=None, useproxy=False, proxyserver='',
            proxyport=0, proxyuser='', proxypass='', concurrent=10, mode='new',
            last_chapter_count=0, dmode='down', sevt=None, control=[]):
    """
    mode is either 'new' or 'update'; the default is 'new'.  'update' retrieves
    only the newly added chapters.
    dmode is either 'down' or 'stream'.
    sevt is the event used for streaming.
    If control != [] the download stops (a list is used because it is mutable,
    unlike a boolean).
    """
    bb = ''
    cv = threading.Lock()
    if useproxy:
        proxy_info = {
            'user': proxyuser,
            'pass': proxypass,
            'host': proxyserver,
            'port': proxyport
        }
        proxy_support = urllib2.ProxyHandler(
            {"http": "http://%(user)s:%(pass)s@%(host)s:%(port)d" % proxy_info})
        opener = urllib2.build_opener(proxy_support, urllib2.HTTPHandler)
        urllib2.install_opener(opener)
    try:
        up = urllib2.urlopen(url)
    except:
        return None, {'index_url': url}
    fs = up.read()
    up.close()
    doc = html.document_fromstring(fs)
    # Get the chapter-index table; use Chrome's "inspect element" to find the XPath.
    r = doc.xpath('//*[@id="defaulthtml4"]/table')
    row_list = r[0].findall('tr')  # get the row list
    clist = []
    for r in row_list:
        for col in r.getchildren():       # the columns of this row
            for a in col.xpath('div/a'):  # relative XPath to locate each <a>
                chapt_name = a.text
                chapt_url = urlparse.urljoin(url, a.get('href'))
                clist.append({'cname': chapt_name, 'curl': chapt_url})
    ccount = len(clist)
    if mode == 'update':
        if ccount <= last_chapter_count:
            return '', {'index_url': url}
        else:
            clist = clist[last_chapter_count:]
            ccount = len(clist)
    i = 0
    Q = Queue.Queue()
    tr = []  # one result slot per chapter, filled in by the download threads
    for c in clist:
        Q.put({'url': c['curl'], 'index': i})
        tr.append(-1)
        i += 1
    tlist = []
    for x in range(concurrent):
        tlist.append(NewDThread(Q, useproxy, proxyserver, proxyport, proxyuser,
                                proxypass, tr, cv))
    i = 0
    while True:
        if control != []:
            return None, {'index_url': url}
        qlen = Q.qsize()
        if Q.empty():
            Q.join()
            break
        percent = int((float(ccount - qlen) / float(ccount)) * 100)
        evt.Value = str(percent) + '%'
        wx.PostEvent(win, evt)
        if dmode == 'stream':
            if tr[i] != -1:
                sevt.value = clist[i]['cname'] + tr[i]
                wx.PostEvent(win, sevt)
                i += 1
        time.sleep(1)
    i = 0
    bb = u''
    for c in clist:
        bb += c['cname']
        bb += tr[i]
        i += 1
    if not isinstance(bb, unicode):
        bb = bb.decode('GBK', 'ignore')
    evt.Value = u'下载完毕!'  # "Download finished!"
    evt.status = 'ok'
    bookstate = {}
    bookstate['bookname'] = bkname
    bookstate['index_url'] = url
    bookstate['last_chapter_name'] = clist[-1]['cname']
    bookstate['last_update'] = datetime.today().strftime('%y-%m-%d %H:%M')
    bookstate['chapter_count'] = ccount
    wx.PostEvent(win, evt)
    return bb, bookstate
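
# NewDThread is defined elsewhere in the project; the class below is only a
# sketch of the worker contract that GetBook appears to rely on, inferred from
# the code above: each worker pops {'url', 'index'} jobs from the queue,
# downloads the chapter, extracts its text, and stores the result in tr[index]
# so the main loop can report progress and finally join the pieces in order.
# Proxy handling is omitted here for brevity.
class _ChapterWorkerSketch(threading.Thread):
    def __init__(self, q, tr, lock):
        threading.Thread.__init__(self)
        self.q = q          # Queue of {'url': ..., 'index': ...} jobs
        self.tr = tr        # shared result list, one slot per chapter
        self.lock = lock    # guards writes to tr
        self.setDaemon(True)
        self.start()

    def run(self):
        while True:
            try:
                job = self.q.get_nowait()
            except Queue.Empty:
                break
            try:
                up = urllib2.urlopen(job['url'])
                page = up.read()
                up.close()
                text = htm2txt(page)
            except Exception:
                text = u''  # leave an empty chapter rather than crash the pool
            self.lock.acquire()
            self.tr[job['index']] = text
            self.lock.release()
            self.q.task_done()  # required so that Q.join() in GetBook returns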