# Module-level imports assumed by the functions below (Python 2).  SearchURL,
# get_search_result, unescape and NewDThread are defined elsewhere in the
# project and are not reproduced here.
import re
import time
import threading
import Queue
import urllib2
import urlparse
from datetime import datetime

import wx
from lxml import html


def htm2txt(inf):
    """Extract the text content of a chapter page."""
    doc = html.document_fromstring(inf)
    # The element that holds the chapter body; use Chrome's "inspect element"
    # to find the XPath for a given site.
    content = doc.xpath('//*[@id="contents"]')
    htmls = html.tostring(content[0], False)
    htmls = htmls.replace('<br>', '\n')
    htmls = htmls.replace('<p>', '\n')
    htmls = unescape(htmls)
    # Collapse runs of two or more newlines into a single newline.
    p = re.compile('\n{2,}')
    htmls = p.sub('\n', htmls)
    newdoc = html.document_fromstring(htmls)
    return newdoc.text_content()
def htm2txt(inf):
    """Extract the text content of a chapter page."""
    doc = html.document_fromstring(inf)
    content = doc.xpath('//*[@id="bgdiv"]/table[2]/tbody/tr[1]/td/table/tbody/tr')
    htmls = html.tostring(content[0], False)
    htmls = htmls.replace('<br>', '\n')
    htmls = htmls.replace('<p>', '\n')
    htmls = htmls.replace('&nbsp;', ' ')  # turn non-breaking-space entities into plain spaces
    # Collapse runs of two or more newlines into a single newline.
    p = re.compile('\n{2,}')
    htmls = p.sub('\n', htmls)
    newdoc = html.document_fromstring(htmls)
    return newdoc.text_content()
def htm2txt(inf):
    """Extract the text content of a chapter page."""
    doc = html.document_fromstring(inf)
    #content = doc.xpath('//*[@id="bgdiv"]/table[2]/tbody/tr[1]/td/table/tbody/tr')
    content = doc.xpath('//*[@id="content"]')
    htmls = html.tostring(content[0], False)
    htmls = htmls.replace('<br>', '\n')
    htmls = htmls.replace('<p>', '\n')
    htmls = htmls.replace('&nbsp;', ' ')  # turn non-breaking-space entities into plain spaces
    # Collapse runs of two or more newlines into a single newline.
    p = re.compile('\n{2,}')
    htmls = p.sub('\n', htmls)
    newdoc = html.document_fromstring(htmls)
    return newdoc.text_content()
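
# A minimal usage sketch, not part of the original module: the three htm2txt
# variants above are identical except for the XPath of the element that holds
# the chapter body, so a per-site scraper only has to adjust that expression.
# The file name below is a hypothetical locally saved chapter page.
def _htm2txt_demo():
    fp = open('chapter.html', 'rb')  # placeholder file, assumed to contain the raw page bytes
    raw = fp.read()
    fp.close()
    text = htm2txt(raw)              # lxml picks up the page's declared charset from the bytes
    if not isinstance(text, unicode):
        text = text.decode('GBK', 'ignore')
    return text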
def GetSearchResults(key, useproxy=False, proxyserver='', proxyport=0,
                     proxyuser='', proxypass=''):
    global SearchURL
    if key.strip() == '':
        return []
    rlist = []
    page = get_search_result(SearchURL, key, useproxy, proxyserver, proxyport,
                             proxyuser, proxypass)
    if page is None:
        return None
    doc = html.document_fromstring(page)
    # Get the main results table; use Chrome's "inspect element" to find the XPath.
    rtable = doc.xpath('//*[@id="searchhight"]/table')
    if len(rtable) != 0:
        row_list = rtable[0].findall('tr')  # get the row list
        row_list = row_list[1:]             # drop the first row (the caption)
        for row in row_list:
            r = {}
            col_list = row.getchildren()    # get the columns of this row
            r['bookname'] = col_list[0].xpath('a')[0].text
            r['book_index_url'] = col_list[1].xpath('a')[0].get('href')
            r['authorname'] = col_list[2].xpath('a')[0].text
            r['booksize'] = col_list[3].text
            r['lastupdatetime'] = col_list[4].text
            r['bookstatus'] = col_list[5].xpath('font')[0].text
            for k in r.keys():
                if r[k] is None:
                    r[k] = ''
            rlist.append(r)
        return rlist
    else:
        # The search was a direct hit: the result page is the book's portal page.
        #rtable=doc.xpath('//*[@id="content"]/div[2]/div[2]/table')
        r = {}
        try:
            r['bookname'] = doc.xpath('//*[@id="content"]/div[2]/div[2]/table/tr/td/table/tbody/tr[1]/td/table/tr[1]/td/h1')[0].text
            r['bookstatus'] = doc.xpath('//*[@id="content"]/div[2]/div[2]/table/tr/td/table/tbody/tr[1]/td/table/tr[2]/td[2]/table/tr[1]/td[4]')[0].text
            r['lastupdatetime'] = doc.xpath('//*[@id="content"]/div[2]/div[2]/table/tr/td/table/tbody/tr[1]/td/table/tr[2]/td[2]/table/tr[1]/td[6]')[0].text
            r['authorname'] = doc.xpath('//*[@id="content"]/div[2]/div[2]/table/tr/td/table/tbody/tr[1]/td/table/tr[2]/td[2]/table/tr[2]/td[6]/a/b')[0].text
            r['book_index_url'] = doc.xpath('//*[@id="content"]/div[2]/div[2]/table/tr/td/table/tbody/tr[1]/td/table/tr[2]/td[2]/table/tr[4]/td/div/b/a[1]')[0].get('href')
            r['booksize'] = ''
        except:
            return []
        ## for k,v in r.items():
        ##     print k,v
        for k in r.keys():
            if r[k] is None:
                r[k] = ''
        return [r]
    return []
def GetSearchResults(key, useproxy=False, proxyserver='', proxyport=0,
                     proxyuser='', proxypass=''):
    global SearchURL
    if key.strip() == '':
        return []
    rlist = []
    page = get_search_result(SearchURL, key, useproxy, proxyserver, proxyport,
                             proxyuser, proxypass)
    if page is None:
        return None
    doc = html.document_fromstring(page)
    # Get the main results table; use Chrome's "inspect element" to find the XPath.
    # Note: for tables, end the expression with /table rather than /table/tbody,
    # because Chrome sometimes inserts a tbody that is not in the real markup.
    # The "XPath Helper" Chrome extension can give a more reliable XPath when
    # "inspect element" returns a wrong one.
    rtable = doc.xpath('//*[@id="content"]/table')
    if len(rtable) != 0:
        row_list = rtable[0].findall('tr')  # get the row list
        row_list = row_list[1:]             # drop the first row (the caption)
        for row in row_list:
            r = {}
            col_list = row.getchildren()    # get the columns of this row
            r['bookname'] = col_list[0].xpath('a')[0].text
            r['book_index_url'] = col_list[1].xpath('a')[0].get('href')
            r['book_index_url'] = urlparse.urljoin(SearchURL, r['book_index_url'])
            r['authorname'] = col_list[2].text
            r['booksize'] = col_list[3].text
            r['lastupdatetime'] = col_list[4].text
            r['bookstatus'] = col_list[5].text
            rlist.append(r)
        return rlist
    else:
        # The search was a direct hit: the result page is the book's portal page.
        r = {}
        try:
            r['bookname'] = doc.xpath('//*[@id="content"]/dd[1]/h1')[0].text
            r['bookstatus'] = doc.xpath('//*[@id="at"]/tr[1]/td[3]')[0].text  # tbody removed here
            r['lastupdatetime'] = doc.xpath('//*[@id="at"]/tr[2]/td[3]')[0].text
            r['authorname'] = doc.xpath('//*[@id="at"]/tr[1]/td[2]')[0].text
            r['book_index_url'] = doc.xpath("//*[@id='content']/dd[2]/div[@class='fl'][2]/p[@class='btnlinks']/a[@class='read']")[0].get('href')
            r['book_index_url'] = urlparse.urljoin(SearchURL, r['book_index_url'])
            r['booksize'] = ''
        except:
            return []
        return [r]
    return []
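
# Illustrative sketch (the URLs are placeholders, not real site addresses):
# the urljoin calls above turn the relative hrefs found in the search results
# into absolute URLs that GetBook can open directly, while leaving already
# absolute hrefs untouched.
def _urljoin_demo():
    base = 'http://www.example.com/modules/article/search.php?searchkey=x'
    assert urlparse.urljoin(base, '/book/123/') == 'http://www.example.com/book/123/'
    assert urlparse.urljoin(base, 'http://other.example.com/book/9/') == 'http://other.example.com/book/9/'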
def GetBook(url, bkname='', win=None, evt=None, useproxy=False, proxyserver='',
            proxyport=0, proxyuser='', proxypass='', concurrent=10, mode='new',
            last_chapter_count=0, dmode='down', sevt=None, control=[]):
    """
    mode is either 'new' or 'update'; the default is 'new'.  'update' retrieves
    only the newly added chapters.
    dmode is either 'down' or 'stream'.
    sevt is the event used for streaming.
    If control != [] the download stops (a list is used because it is mutable,
    unlike a boolean).
    """
    bb = ''
    cv = threading.Lock()
    if useproxy:
        proxy_info = {
            'user': proxyuser,
            'pass': proxypass,
            'host': proxyserver,
            'port': proxyport
        }
        proxy_support = urllib2.ProxyHandler(
            {"http": "http://%(user)s:%(pass)s@%(host)s:%(port)d" % proxy_info})
        opener = urllib2.build_opener(proxy_support, urllib2.HTTPHandler)
        urllib2.install_opener(opener)
    try:
        up = urllib2.urlopen(url)
    except:
        return None, {'index_url': url}
    fs = up.read()
    up.close()
    doc = html.document_fromstring(fs)
    # Get the chapter-index table; use Chrome's "inspect element" to find the XPath.
    r = doc.xpath('//*[@id="defaulthtml4"]/table')
    row_list = r[0].findall('tr')  # get the row list
    clist = []
    for r in row_list:
        for col in r.getchildren():       # the columns of this row
            for a in col.xpath('div/a'):  # relative XPath to locate each <a>
                chapt_name = a.text
                chapt_url = urlparse.urljoin(url, a.get('href'))
                clist.append({'cname': chapt_name, 'curl': chapt_url})
    ccount = len(clist)
    if mode == 'update':
        if ccount <= last_chapter_count:
            return '', {'index_url': url}
        else:
            clist = clist[last_chapter_count:]
            ccount = len(clist)
    i = 0
    Q = Queue.Queue()
    tr = []  # one result slot per chapter, filled in by the download threads
    for c in clist:
        Q.put({'url': c['curl'], 'index': i})
        tr.append(-1)
        i += 1
    tlist = []
    for x in range(concurrent):
        tlist.append(NewDThread(Q, useproxy, proxyserver, proxyport, proxyuser,
                                proxypass, tr, cv))
    i = 0
    while True:
        if control != []:
            return None, {'index_url': url}
        qlen = Q.qsize()
        if Q.empty():
            Q.join()
            break
        percent = int((float(ccount - qlen) / float(ccount)) * 100)
        evt.Value = str(percent) + '%'
        wx.PostEvent(win, evt)
        if dmode == 'stream':
            if tr[i] != -1:
                sevt.value = clist[i]['cname'] + tr[i]
                wx.PostEvent(win, sevt)
                i += 1
        time.sleep(1)
    i = 0
    bb = u''
    for c in clist:
        bb += c['cname']
        bb += tr[i]
        i += 1
    if not isinstance(bb, unicode):
        bb = bb.decode('GBK', 'ignore')
    evt.Value = u'下载完毕!'  # "Download finished!"
    evt.status = 'ok'
    bookstate = {}
    bookstate['bookname'] = bkname
    bookstate['index_url'] = url
    bookstate['last_chapter_name'] = clist[-1]['cname']
    bookstate['last_update'] = datetime.today().strftime('%y-%m-%d %H:%M')
    bookstate['chapter_count'] = ccount
    wx.PostEvent(win, evt)
    return bb, bookstate
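
# NewDThread is defined elsewhere in the project; the class below is only a
# sketch of the worker contract that GetBook appears to rely on, inferred from
# the code above: each worker pops {'url', 'index'} jobs from the queue,
# downloads the chapter, extracts its text, and stores the result in tr[index]
# so the main loop can report progress and finally join the pieces in order.
# Proxy handling is omitted here for brevity.
class _ChapterWorkerSketch(threading.Thread):
    def __init__(self, q, tr, lock):
        threading.Thread.__init__(self)
        self.q = q          # Queue of {'url': ..., 'index': ...} jobs
        self.tr = tr        # shared result list, one slot per chapter
        self.lock = lock    # guards writes to tr
        self.setDaemon(True)
        self.start()

    def run(self):
        while True:
            try:
                job = self.q.get_nowait()
            except Queue.Empty:
                break
            try:
                up = urllib2.urlopen(job['url'])
                page = up.read()
                up.close()
                text = htm2txt(page)
            except Exception:
                text = u''  # leave an empty chapter rather than crash the pool
            self.lock.acquire()
            self.tr[job['index']] = text
            self.lock.release()
            self.q.task_done()  # required so that Q.join() in GetBook returns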