Example #1
import re

from HTMLParser import HTMLParser #Python 2; assumed source of unescape
from lxml import html

unescape=HTMLParser().unescape #decodes HTML entities such as &#160;

def htm2txt(inf):
    """Extract the text content."""
    doc=html.document_fromstring(inf)
    content=doc.xpath('//*[@id="contents"]')
    htmls=html.tostring(content[0],pretty_print=False)
    htmls=htmls.replace('<br>','\n')
    htmls=htmls.replace('<p>','\n')
    htmls=unescape(htmls)
    p=re.compile('\n{2,}') #collapse two or more consecutive newlines into one
    htmls=p.sub('\n',htmls)
    newdoc=html.document_fromstring(htmls)
    return newdoc.text_content()
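A minimal usage sketch (the file name is hypothetical); htm2txt expects the raw HTML of a page containing an element with id="contents":

with open('page.html') as f:
    print htm2txt(f.read())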
Example #2
import re

from lxml import html

def htm2txt(inf):
    """Extract the text content."""
    doc=html.document_fromstring(inf)
    content=doc.xpath('//*[@id="bgdiv"]/table[2]/tbody/tr[1]/td/table/tbody/tr')
    htmls=html.tostring(content[0],pretty_print=False)
    htmls=htmls.replace('<br>','\n')
    htmls=htmls.replace('<p>','\n')
    htmls=htmls.replace('&#160;',' ')
    p=re.compile('\n{2,}') #collapse two or more consecutive newlines into one
    htmls=p.sub('\n',htmls)
    newdoc=html.document_fromstring(htmls)
    return newdoc.text_content()
Example #3
import re

from lxml import html

def htm2txt(inf):
    """Extract the text content."""
    doc = html.document_fromstring(inf)
    #content = doc.xpath('//*[@id="bgdiv"]/table[2]/tbody/tr[1]/td/table/tbody/tr')
    content = doc.xpath('//*[@id="content"]')
    htmls = html.tostring(content[0], pretty_print=False)
    htmls = htmls.replace('<br>', '\n')
    htmls = htmls.replace('<p>', '\n')
    htmls = htmls.replace('&#160;', ' ')
    p = re.compile('\n{2,}')  #collapse two or more consecutive newlines into one
    htmls = p.sub('\n', htmls)
    newdoc = html.document_fromstring(htmls)
    return newdoc.text_content()
Example #4
from lxml import html

def GetSearchResults(key,useproxy=False,proxyserver='',proxyport=0,proxyuser='',proxypass=''):
    global SearchURL #SearchURL and get_search_result are defined elsewhere in the module
    if key.strip()=='':return []
    rlist=[]
    page=get_search_result(SearchURL,key,useproxy,proxyserver,proxyport,proxyuser,proxypass)
    if page is None:
        return None
    doc=html.document_fromstring(page)
    rtable=doc.xpath('//*[@id="searchhight"]/table') #get the main table; you can use Chrome's Inspect Element to find the XPath
    if len(rtable)!=0:
        row_list=rtable[0].findall('tr') #get the row list
        row_list=row_list[1:] #skip the header (caption) row
        for row in row_list:
            r={}
            col_list = row.getchildren() #get col list in each row
            r['bookname']=col_list[0].xpath('a')[0].text
            r['book_index_url']=col_list[1].xpath('a')[0].get('href')
            r['authorname']=col_list[2].xpath('a')[0].text
            r['booksize']=col_list[3].text
            r['lastupdatetime']=col_list[4].text
            r['bookstatus']=col_list[5].xpath('font')[0].text
            for k in r.keys():
                if r[k] is None:r[k]=''
            rlist.append(r)
        return rlist
    else:#the search result is a direct hit: the result page is the book's portal page
        #rtable=doc.xpath('//*[@id="content"]/div[2]/div[2]/table')
        r={}
        try:
            r['bookname']=doc.xpath('//*[@id="content"]/div[2]/div[2]/table/tr/td/table/tbody/tr[1]/td/table/tr[1]/td/h1')[0].text
            r['bookstatus']=doc.xpath('//*[@id="content"]/div[2]/div[2]/table/tr/td/table/tbody/tr[1]/td/table/tr[2]/td[2]/table/tr[1]/td[4]')[0].text
            r['lastupdatetime']=doc.xpath('//*[@id="content"]/div[2]/div[2]/table/tr/td/table/tbody/tr[1]/td/table/tr[2]/td[2]/table/tr[1]/td[6]')[0].text
            r['authorname']=doc.xpath('//*[@id="content"]/div[2]/div[2]/table/tr/td/table/tbody/tr[1]/td/table/tr[2]/td[2]/table/tr[2]/td[6]/a/b')[0].text
            r['book_index_url']=doc.xpath('//*[@id="content"]/div[2]/div[2]/table/tr/td/table/tbody/tr[1]/td/table/tr[2]/td[2]/table/tr[4]/td/div/b/a[1]')[0].get('href')
            r['booksize']=''
        except Exception:
            return []
##        for k,v in r.items():
##            print k,v
        for k in r.keys():
            if r[k] is None:r[k]=''
        return [r]
    return []
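A hypothetical call, assuming SearchURL and get_search_result are set up at module level as the function expects; each result dict carries bookname, book_index_url, authorname, booksize, lastupdatetime and bookstatus:

results = GetSearchResults(u'some book title')
if results:  #None means the fetch failed, [] means nothing matched
    for r in results:
        print r['bookname'], r['book_index_url']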
Example #5
import urlparse

from lxml import html

def GetSearchResults(key,useproxy=False,proxyserver='',proxyport=0,proxyuser='',proxypass=''):
    global SearchURL
    if key.strip()=='':return []
    rlist=[]
    page=get_search_result(SearchURL,key,useproxy,proxyserver,proxyport,proxyuser,proxypass)
    if page is None:
        return None
    doc=html.document_fromstring(page)
    rtable=doc.xpath('//*[@id="content"]/table')
    #get the main table; you can use Chrome's Inspect Element to find the XPath. Note: for tables, end the
    #expression with /table, not table/tbody, because Chrome sometimes inserts a tbody that is not in the raw HTML.
    #The XPath Helper Chrome extension can give a more reliable XPath when Inspect Element returns a wrong one.
    if len(rtable)!=0:
        row_list=rtable[0].findall('tr') #get the row list
        row_list=row_list[1:] #skip the header (caption) row
        for row in row_list:
            r={}
            col_list = row.getchildren() #get col list in each row
            r['bookname']=col_list[0].xpath('a')[0].text
            r['book_index_url']=col_list[1].xpath('a')[0].get('href')
            r['book_index_url']=urlparse.urljoin(SearchURL,r['book_index_url'])
            r['authorname']=col_list[2].text
            r['booksize']=col_list[3].text
            r['lastupdatetime']=col_list[4].text
            r['bookstatus']=col_list[5].text
            rlist.append(r)
        return rlist
    else:#the search result is a direct hit: the result page is the book's portal page
        r={}
        try:
            r['bookname']=doc.xpath('//*[@id="content"]/dd[1]/h1')[0].text
            r['bookstatus']=doc.xpath('//*[@id="at"]/tr[1]/td[3]')[0].text #tbody removed from the Chrome-copied XPath (see note above)
            r['lastupdatetime']=doc.xpath('//*[@id="at"]/tr[2]/td[3]')[0].text
            r['authorname']=doc.xpath('//*[@id="at"]/tr[1]/td[2]')[0].text
            r['book_index_url']=doc.xpath("//*[@id='content']/dd[2]/div[@class='fl'][2]/p[@class='btnlinks']/a[@class='read']")[0].get('href')
            r['book_index_url']=urlparse.urljoin(SearchURL,r['book_index_url'])
            r['booksize']=''
        except Exception:
            return []
        return [r]
    return []
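The tbody note above is worth a demonstration. In this sketch lxml parses the served HTML, which has no tbody, so an XPath copied from Chrome's DOM (where the browser adds one) matches nothing:

from lxml import html

page = html.document_fromstring('<table id="t"><tr><td>x</td></tr></table>')
print page.xpath('//*[@id="t"]/tbody/tr')  #[] - the raw HTML has no tbody
print page.xpath('//*[@id="t"]/tr')        #one <tr> element - match the served markup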
Example #6
import threading
import time
import urlparse
import urllib2
import Queue

from datetime import datetime
from lxml import html

import wx

def GetBook(url,
            bkname='',
            win=None,
            evt=None,
            useproxy=False,
            proxyserver='',
            proxyport=0,
            proxyuser='',
            proxypass='',
            concurrent=10,
            mode='new',
            last_chapter_count=0,
            dmode='down',
            sevt=None,
            control=[]):
    """
    mode is either 'new' or 'update', default is 'new', update is used to
    retrie the updated part
    dmode is either 'down' or 'stream'
    sevt is the event for stream
    if control != [] then download will stop (because list is mutable type, boolean is not)
    """
    bb = ''
    cv = threading.Lock()
    if useproxy:
        proxy_info = {
            'user': proxyuser,
            'pass': proxypass,
            'host': proxyserver,
            'port': proxyport
        }
        proxy_support = urllib2.ProxyHandler({"http" : \
        "http://%(user)s:%(pass)s@%(host)s:%(port)d" % proxy_info})
        opener = urllib2.build_opener(proxy_support, urllib2.HTTPHandler)
        urllib2.install_opener(opener)
    try:
        up = urllib2.urlopen(url)
    except Exception:
        return None, {'index_url': url}
    fs = up.read()

    up.close()
    doc = html.document_fromstring(fs)
    r = doc.xpath(
        '//*[@id="defaulthtml4"]/table'
    )  #get the main table; you can use Chrome's Inspect Element to find the XPath
    row_list = r[0].findall('tr')  #get the row list
    clist = []
    for row in row_list:
        for col in row.getchildren():  #get the columns in each row
            for a in col.xpath('div/a'):  #use a relative XPath to locate <a>
                chapt_name = a.text  #no trailing comma here (a stray comma would create a 1-tuple)
                chapt_url = urlparse.urljoin(url, a.get('href'))
                clist.append({'cname': chapt_name, 'curl': chapt_url})
    ccount = len(clist)
    if mode == 'update':
        if ccount <= last_chapter_count:
            return '', {'index_url': url}
        else:
            clist = clist[last_chapter_count:]
            ccount = len(clist)
    i = 0
    Q = Queue.Queue()
    tr = []
    for c in clist:
        Q.put({'url': c['curl'], 'index': i})
        tr.append(-1)
        i += 1
    tlist = []
    for x in range(concurrent):
        tlist.append(
            NewDThread(Q, useproxy, proxyserver, proxyport, proxyuser,
                       proxypass, tr, cv))
    i = 0
    while True:
        if control != []:
            return None, {'index_url': url}
        qlen = Q.qsize()
        if Q.empty():
            Q.join()
            break
        percent = int((float(ccount - qlen) / float(ccount)) * 100)
        evt.Value = str(percent) + '%'
        wx.PostEvent(win, evt)
        if dmode == 'stream':
            if tr[i] != -1:  #chapter i has been downloaded; push it to the stream
                sevt.value = clist[i]['cname'] + tr[i]
                wx.PostEvent(win, sevt)
                i += 1
        time.sleep(1)
    i = 0
    bb = u''
    for c in clist:
        bb += c['cname']
        bb += tr[i]
        i += 1

    if not isinstance(bb, unicode):
        bb = bb.decode('GBK', 'ignore')
    evt.Value = u'下载完毕!'
    evt.status = 'ok'
    bookstate = {}
    bookstate['bookname'] = bkname
    bookstate['index_url'] = url
    bookstate['last_chapter_name'] = clist[-1]['cname']
    bookstate['last_update'] = datetime.today().strftime('%y-%m-%d %H:%M')
    bookstate['chapter_count'] = ccount
    wx.PostEvent(win, evt)
    return bb, bookstate
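NewDThread is not shown in these examples. The sketch below is a reconstruction inferred from how GetBook uses it: each worker drains {'url', 'index'} items from the queue, stores the chapter text into tr[index], and calls task_done() so Q.join() can return. The download and extraction details (urllib2, reusing htm2txt) are assumptions; the proxy arguments are accepted but unused because GetBook already installs a global opener:

class NewDThread(threading.Thread):
    def __init__(self, Q, useproxy, proxyserver, proxyport, proxyuser,
                 proxypass, tr, cv):
        threading.Thread.__init__(self)
        self.Q = Q
        self.tr = tr
        self.cv = cv  #shared lock guarding writes to tr
        self.daemon = True
        self.start()

    def run(self):
        while True:
            try:
                task = self.Q.get_nowait()
            except Queue.Empty:
                break
            try:
                page = urllib2.urlopen(task['url']).read()
                text = htm2txt(page)  #assumed: reuse the extractor from the earlier examples
            except Exception:
                text = u''
            with self.cv:
                self.tr[task['index']] = text
            self.Q.task_done()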
Example #7
from lxml import html

def GetSearchResults(key,
                     useproxy=False,
                     proxyserver='',
                     proxyport=0,
                     proxyuser='',
                     proxypass=''):
    global SearchURL
    if key.strip() == '': return []
    rlist = []
    page = get_search_result(SearchURL, key, useproxy, proxyserver, proxyport,
                             proxyuser, proxypass)
    if page is None:
        return None
    doc = html.document_fromstring(page)
    rtable = doc.xpath(
        '//*[@id="searchhight"]/table'
    )  #get the main table; you can use Chrome's Inspect Element to find the XPath
    if len(rtable) != 0:
        row_list = rtable[0].findall('tr')  #get the row list
        row_list = row_list[1:]  #skip the header (caption) row
        for row in row_list:
            r = {}
            col_list = row.getchildren()  #get col list in each row
            r['bookname'] = col_list[0].xpath('a')[0].text
            r['book_index_url'] = col_list[1].xpath('a')[0].get('href')
            r['authorname'] = col_list[2].xpath('a')[0].text
            r['booksize'] = col_list[3].text
            r['lastupdatetime'] = col_list[4].text
            r['bookstatus'] = col_list[5].xpath('font')[0].text
            for k in r.keys():
                if r[k] is None: r[k] = ''
            rlist.append(r)
        return rlist
    else:  #the search result is a direct hit: the result page is the book's portal page
        #rtable=doc.xpath('//*[@id="content"]/div[2]/div[2]/table')
        r = {}
        try:
            r['bookname'] = doc.xpath(
                '//*[@id="content"]/div[2]/div[2]/table/tr/td/table/tbody/tr[1]/td/table/tr[1]/td/h1'
            )[0].text
            r['bookstatus'] = doc.xpath(
                '//*[@id="content"]/div[2]/div[2]/table/tr/td/table/tbody/tr[1]/td/table/tr[2]/td[2]/table/tr[1]/td[4]'
            )[0].text
            r['lastupdatetime'] = doc.xpath(
                '//*[@id="content"]/div[2]/div[2]/table/tr/td/table/tbody/tr[1]/td/table/tr[2]/td[2]/table/tr[1]/td[6]'
            )[0].text
            r['authorname'] = doc.xpath(
                '//*[@id="content"]/div[2]/div[2]/table/tr/td/table/tbody/tr[1]/td/table/tr[2]/td[2]/table/tr[2]/td[6]/a/b'
            )[0].text
            r['book_index_url'] = doc.xpath(
                '//*[@id="content"]/div[2]/div[2]/table/tr/td/table/tbody/tr[1]/td/table/tr[2]/td[2]/table/tr[4]/td/div/b/a[1]'
            )[0].get('href')
            r['booksize'] = ''
        except Exception:
            return []


##        for k,v in r.items():
##            print k,v
        for k in r.keys():
            if r[k] is None: r[k] = ''
        return [r]
    return []
Example #8
import threading
import time
import urlparse
import urllib2
import Queue

from datetime import datetime
from lxml import html

import wx

def GetBook(url,bkname='',win=None,evt=None,useproxy=False,proxyserver='',
            proxyport=0,proxyuser='',proxypass='',concurrent=10,
            mode='new',last_chapter_count=0,dmode='down',sevt=None,control=[]):
    """
    mode is either 'new' or 'update', default is 'new', update is used to
    retrie the updated part
    dmode is either 'down' or 'stream'
    sevt is the event for stream
    if control != [] then download will stop (because list is mutable type, boolean is not)
    """
    bb=''
    cv=threading.Lock()
    if useproxy:
        proxy_info = {
            'user' : proxyuser,
            'pass' : proxypass,
            'host' : proxyserver,
            'port' : proxyport
            }
        proxy_support = urllib2.ProxyHandler({"http" : \
        "http://%(user)s:%(pass)s@%(host)s:%(port)d" % proxy_info})
        opener = urllib2.build_opener(proxy_support, urllib2.HTTPHandler)
        urllib2.install_opener(opener)
    try:
        up=urllib2.urlopen(url)
    except Exception:
        return None,{'index_url':url}
    fs=up.read()

    up.close()
    doc=html.document_fromstring(fs)
    r=doc.xpath('//*[@id="defaulthtml4"]/table') #get the main table; you can use Chrome's Inspect Element to find the XPath
    row_list=r[0].findall('tr') #get the row list
    clist=[]
    for row in row_list:
        for col in row.getchildren(): #get the columns in each row
            for a in col.xpath('div/a'): #use a relative XPath to locate <a>
                chapt_name=a.text  #no trailing comma here (a stray comma would create a 1-tuple)
                chapt_url=urlparse.urljoin(url,a.get('href'))
                clist.append({'cname':chapt_name,'curl':chapt_url})
    ccount=len(clist)
    if mode=='update':
        if ccount<=last_chapter_count:
            return '',{'index_url':url}
        else:
            clist=clist[last_chapter_count:]
            ccount=len(clist)
    i=0
    Q=Queue.Queue()
    tr=[]
    for c in clist:
        Q.put({'url':c['curl'],'index':i})
        tr.append(-1)
        i+=1
    tlist=[]
    for x in range(concurrent):
        tlist.append(NewDThread(Q,useproxy,proxyserver,proxyport,proxyuser,
                                proxypass,tr,cv))
    i=0
    while True:
        if control!=[]:
            return None, {'index_url':url}
        qlen=Q.qsize()
        if Q.empty():
            Q.join()
            break
        percent=int((float(ccount-qlen)/float(ccount))*100)
        evt.Value=str(percent)+'%'
        wx.PostEvent(win,evt)
        if dmode=='stream':
            if tr[i] != -1: #chapter i has been downloaded; push it to the stream
                sevt.value=clist[i]['cname']+tr[i]
                wx.PostEvent(win,sevt)
                i+=1
        time.sleep(1)
    i=0
    bb=u''
    for c in clist:
        bb+=c['cname']
        bb+=tr[i]
        i+=1

    if not isinstance(bb,unicode):
        bb=bb.decode('GBK','ignore')
    evt.Value=u'下载完毕!'
    evt.status='ok'
    bookstate={}
    bookstate['bookname']=bkname
    bookstate['index_url']=url
    bookstate['last_chapter_name']=clist[-1]['cname']
    bookstate['last_update']=datetime.today().strftime('%y-%m-%d %H:%M')
    bookstate['chapter_count']=ccount
    wx.PostEvent(win,evt)
    return bb,bookstate
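A small standalone sketch of the control trick described in the docstring: the caller holds a reference to the list and appends to it (e.g. from a GUI cancel button) to make control != [] inside the running download. The names here are illustrative:

import threading
import time

def worker(control=[]):
    while control == []:  #an empty list means "keep going"
        time.sleep(0.1)
    print 'worker stopped'

stop_flag = []
t = threading.Thread(target=worker, args=(stop_flag,))
t.start()
stop_flag.append(True)  #any append makes control != [] and ends the loop
t.join()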