Example #1
0
def getFirstPageInfo():
    url=r'http://video.sina.com.cn/news/'
#     tDir=r'e:\tmp'
#     fileName=r'sina.html'
#     filePath=os.path.join(tDir,fileName)    
    content=getHtml(url)
#     if content:    
#         fileKit.writeFileBinary(filePath, content)
#     content=fileKit.readFileBinary(filePath)
    
    vInfoList=[]
    if content:
        soup = BeautifulSoup(content, from_encoding='gbk')
        videoList=soup.find_all('div',{'suda-uatrack-key':"news_video"})
        for item in videoList:
            vInfo={}
            vInfo['vid']=item.find('div',{'class':"news-item-count"}).get('data-vid-count')
            vInfo['title']=item.get('data-title') 
            vInfo['url']=item.get('data-url')
            vInfo['thumb']=item.find('img').get('src')
            vInfo['summary']=item.find('p',{'class':"desc"}).string
            vInfo['keywords']=item.get('data-key')
            vInfo['newsid']=item.get('data-newsid')        
            vInfo['duration']=''
            vInfo['web']=ctable
#             hm=r1('(\d{2}:\d{2})',item.find('div',{'class':"news-item-time"}).string)
#             ymd=r1(r'.*?/(\d{4}-\d{2}-\d{2}).*?',vInfo['url'])
#             vInfo['loadtime']=timeformat.getTimeStamp((long)(time.mktime(time.strptime(ymd+' '+hm, '%Y-%m-%d %H:%M'))))
            try:
                subContent=getHtml(vInfo['url'])
                subSoup=BeautifulSoup(subContent, from_encoding='utf-8')
                tblock=subSoup.find('p',{'class':"channel"})
                vInfo['vtype']= tblock.find('a').string
                fblock=subSoup.find('p',{'class':"from"})
                vInfo['source']= fblock.find_all('span')[1].string.replace(u'来源:','')
#                 block1=subSoup.find('div',{'class':"relatedVido favVideo"})
#                 reList=block1.find_all('li')
#                 strList=''
#                 for i in range(len(reList)-1):
#                     strList+=reList[i].get('video-id')+','
#                 strList+=reList[len(reList)-1].get('video-id')
#                 vInfo['related']=strList
                vInfo['related']='' # related news is no needed
                block2=subSoup.find('p',{'class':"from"})
                timeStr=block2.find('em').string
                vInfo['loadtime']= timeformat.extractTimeStamp(timeStr)
                vInfoList.append(vInfo) 
                print vInfo['loadtime'],vInfo['url'] 
            except:
                print 'Error: ',vInfo['url']
#                 logging.error('Error: '+vInfo['url'])
    return vInfoList
Example #2
0
def getPageInfo(page):
#     page is a num    
# http://so.tv.sohu.com/list_p1122_p20_p3_p40_p5_p6_p73_p8_p90_p101_p110.html
    url=r'http://so.tv.sohu.com/list_p1122_p20_p3_p40_p5_p6_p73_p8_p90_p10'+str(page)+r'_p110.html'
#     tDir=r'e:\tmp'
#     fileName=r'sohu.html'
#     filePath=os.path.join(tDir,fileName)   
     
    content=getHtml(url)
     
#     if content:    
#         fileKit.writeFileBinary(filePath, content)
#     content=fileKit.readFileBinary(filePath)
    vInfoList=[] 
    if content:
        soup = BeautifulSoup(content, from_encoding='utf-8')
        soup_content=soup.find('ul', {'class':"st-list short cfix"})
        videoList=soup_content.find_all('li')        
        for item in videoList:
            vInfo={}
            st_pic_a=item.find('div',{'class':"st-pic"}).find('a')                
            vInfo['vid']=st_pic_a.get('_s_v')            
            vInfo['url']=st_pic_a.get('href')
            vInfo['title']=str(item.find('strong').find('a').string)
            vInfo['newsid']=r1(r'/n(\d+)\.',vInfo['url'])
            vInfo['thumb']=st_pic_a.find('img').get('src')
            dustr=str(st_pic_a.find('span',{'class':"maskTx"}).string)
            m = re.search(r'(\d{1,2}).*?(\d{1,2})', dustr)
            if m:
                minute=m.group(1)
                second=m.group(2)
                vInfo['duration']='{:02d}:{:02d}'.format(int(minute),int(second))
            else:
                vInfo['duration']=''
            vInfo['web']=ctable
            try:
                subContent=getHtml(vInfo['url'])  
                subSoup = BeautifulSoup(subContent,from_encoding='gbk')
                vInfo['keywords']=subSoup.find('meta',{'name':"keywords"}).get('content')                   
#                 print vInfo['keywords']
                info_con=subSoup.find('div',{'class':"info info-con"})    
                sum_p=str(info_con.find('p',{'class':"intro"}))   
                vInfo['summary']=r1(r'<p class="intro">(.*?)<a class',sum_p).replace('简介:','')   
                timeStr=''
                vInfo['vtype']=''
                vInfo['source']=''
                block1=info_con.find('ul',{'class':"u cfix"})
                if block1:
                    timeStr=str(block1.find('li').string)
                    tblock=block1.find_all('li',{'class':"h"})
                    vInfo['source']= tblock[0].string.replace(u'来源:','').strip()
                    vInfo['vtype']= str(tblock[2].find('a').string)
                else:
                    block1=subSoup.find('div',{'class':"vInfo clear"})
                    if block1:
                        timeStr=str(block1.find('div',{'class':"wdA l"}).string)            
                vInfo['loadtime']= timeformat.extractTimeStamp(timeStr)            
    #             relUrl=r'http://pl.hd.sohu.com/videolist?playlistid=6969620&pagesize=999&order=1&callback=sohuHD.play.showPlayListBox&vid=1884339'
#                 playlistId=r1(r'var playlistId="(\d+)"',subContent)
#                 relUrl=r'http://pl.hd.sohu.com/videolist?playlistid='+playlistId+r'&pagesize=999&order=1&callback=sohuHD.play.showPlayListBox&vid='+vInfo['vid']
#                 vInfo['related']= getRelatedVideo(relUrl) 
                vInfo['related']=''
                vInfoList.append(vInfo) 
                print vInfo['loadtime'],vInfo['url']
            except:
                print 'Error: ',vInfo['url']
#                 logging.error('Error: '+vInfo['url'])
    return vInfoList