Example #1
 def tujiParse2(self, info, urlSign, page, url, domain):
     encoding, picUrl, title, text, publishTime, mypos, pageNum = tupian.parse(
         url, page, info)
     images = [picUrl]
     # Walk the detail-page pagination, accumulating text and images.
     if info.has_key('detailFenyePattern'):
         if pageNum > 1:
             for i in range(2, pageNum + 1):
                 urlNew = url.replace(info['detailFenyePattern'][0],
                                      info['detailFenyePattern'][1] % i)
                 page = getPage(urlNew)
                 if not page:
                     retStatus = self.changeStatus(urlSign, 1)
                     print 'down url:%s failed' % urlNew
                     continue
                 # Parse against urlNew, not the first-page url, so the
                 # paginated page's relative links resolve correctly.
                 fenyeEncoding, fenyePicUrl, fenyeTitle, fenyeText, fenyePublishTime, fenyeMypos, fenyePageNum = tupian.parse(
                     urlNew, page, info)
                 text = text + fenyeText
                 images.append(fenyePicUrl)
     if not publishTime:
         # No publish time found on the page; store a sentinel value.
         publishTime = '0000-00-00 00:00:00'
     text = text.replace('\r', '').replace('\n', ' ').replace('\t', '')
     picInfo = []
     # images always holds the first-page picUrl, which may be empty.
     if not images or not images[0]:
         retStatus = self.changeStatus(urlSign, 1)
         print 'url:%s has no pic' % url
         return False
     for i in range(len(images)):
         picInfo.append({'picUrl': images[i], 'picDesc': '', 'pic_seq': i})
     imgJson = json.dumps(picInfo, ensure_ascii=False)
     sql  = "update tbl_content_2 set title = '%s', publishTime = '%s', status = 2, text = '%s', mypos = '%s', images = '%s' where urlSign = '%s'"\
     %(MySQLdb.escape_string(title.encode('utf-8')), publishTime.encode('utf-8'), MySQLdb.escape_string(text.encode('utf-8')), MySQLdb.escape_string(mypos.encode('utf-8')), imgJson.encode('utf-8'), urlSign.encode('utf-8'))
     print sql
     doDB(sql, 'chuilei')
     return True
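
Building the UPDATE by hand requires escaping every field; parameter binding lets the driver do the quoting instead. A minimal sketch, assuming an open MySQLdb connection named conn (conn is not part of the original code):

     # Sketch: the same UPDATE via MySQLdb parameter binding; the driver
     # escapes each %s value, so no manual escape_string calls are needed.
     cur = conn.cursor()
     cur.execute(
         "update tbl_content_2 set title = %s, publishTime = %s, status = 2, "
         "text = %s, mypos = %s, images = %s where urlSign = %s",
         (title.encode('utf-8'), publishTime, text.encode('utf-8'),
          mypos.encode('utf-8'), imgJson.encode('utf-8'), urlSign))
     conn.commit()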
Example #2
 def run(self, domain, info):
     sql = "select urlSign, url, domain, isAlbum from tbl_content_2 where domain = '%s' \
     and status = 0 and category='%s';" % (domain, info['category'])
     res = getDB(sql, 'chuilei')
     for item in res:
         urlSign = item[0]
         url = item[1]
         print url
         domain = item[2]
         isAlbum = item[3]
         page = getPage(url)
         #page = page.decode('utf-8','ignore')
         if not page:
             retStatus = self.changeStatus(urlSign, 1)
             print 'down url:%s failed' % url
             continue
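         # Dispatch on isAlbum: 0 -> tupianParse (single picture page),
         # 1 -> tujiParse (album), 2 -> tujiParse2 (paginated album).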
         if isAlbum == 0:
             ret = self.tupianParse(info, urlSign, page, url, domain)
             if not ret:
                 retStatus = self.changeStatus(urlSign, 1)
                 print 'parse url:%s failed' % url
                 continue
         elif isAlbum == 1:
             ret = self.tujiParse(info, urlSign, page, url, domain)
             if not ret:
                 retStatus = self.changeStatus(urlSign, 1)
                 print 'parse url:%s failed' % url
                 continue
         elif isAlbum == 2:
             ret = self.tujiParse2(info, urlSign, page, url, domain)
             if not ret:
                 retStatus = self.changeStatus(urlSign, 1)
                 print 'parse url:%s failed' % url
                 continue
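
The three isAlbum branches differ only in which parser they call; a dispatch table would keep the loop flat. A sketch of the same loop body using the methods above:

         # Sketch: map isAlbum codes to parser methods and dispatch once.
         parsers = {0: self.tupianParse, 1: self.tujiParse, 2: self.tujiParse2}
         parser = parsers.get(isAlbum)
         if parser and not parser(info, urlSign, page, url, domain):
             retStatus = self.changeStatus(urlSign, 1)
             print 'parse url:%s failed' % url
             continue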
Example #3
 def getDetailUrl(self, fenyeUrl, info):
     # Download the listing page and extract detail-page URLs from it.
     page = getPage(fenyeUrl)
     if not page:
         print 'down url:%s failed' % fenyeUrl
         return False
     detailUrlList = self.parse(fenyeUrl, page, info['urlPattern'])
     if not detailUrlList:
         print 'url:%s can not get detail page url' % fenyeUrl
         return False
     self.process(fenyeUrl, detailUrlList, info)
     return True
Example #4
 def getSourceFy(self, sourceUrl, info):
     fyList = []
     page = getPage(sourceUrl)
     if not page:
         return fyList
     pageNum = pe.pageNumExtract(page, info['domain'])
     fyList.append(sourceUrl)
     if pageNum > 1:
         for i in range(2, pageNum + 1):
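             # An empty search pattern means the pagination suffix is
             # appended to the source URL instead of substituted into it.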
             if info['sourceFenyePattern'][0] == "":
                 urlNew = sourceUrl + info['sourceFenyePattern'][1] % i
             else:
                 urlNew = sourceUrl.replace(
                     info['sourceFenyePattern'][0],
                     info['sourceFenyePattern'][1] % i)
             fyList.append(urlNew)
     return fyList
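
For illustration, with a hypothetical pattern ['index.html', 'index_%d.html'] and a three-page listing, the returned list covers every pagination URL:

     # Hypothetical values, for illustration only:
     #   info['sourceFenyePattern'] = ['index.html', 'index_%d.html']
     #   getSourceFy('http://example.com/pics/index.html', info)
     #   -> ['http://example.com/pics/index.html',
     #       'http://example.com/pics/index_2.html',
     #       'http://example.com/pics/index_3.html']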
Example #5
#encoding: utf-8
import sys
import content_extract as ce
sys.path.append("../../lib")
import download

if __name__ == "__main__":
    html = download.getPage(sys.argv[1])
    if not html:
        print 'down url:%s failed' % sys.argv[1]
        sys.exit(1)
    enc, time, title, text = ce.parse(sys.argv[1], html)
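    # The output labels are Chinese: 标题 = title, 时间 = time, 内容 = text.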
    print "标题:" + title.encode('utf-8', 'ignore')
    print "时间:" + time.encode('utf-8', 'ignore')
    print '=' * 10
    print "内容:" + text.encode('utf-8', 'ignore')
Example #6
        time = strtotime(time, '')

    imgList = []
    if webInfo.has_key('imgReplace'):
        # Apply each (old, new) substitution pair to every image URL.
        patternList = webInfo['imgReplace']
        for picUrl in images:
            for pattern in patternList:
                picUrl = picUrl.replace(pattern[0], pattern[1])
            imgList.append(picUrl)
    else:
        imgList = images
    #print time.encode('utf-8')
    #print text.encode('utf-8')
    return encoding, title, text, time, imgList, mypos


if __name__ == "__main__":
    from original_url_sucai import webInfo
    #html = open('page.html').read()
    #enc, title, text, time, images, mypos = parse('http://www.guandongphoto.com/thread-1035924-1-11.html',html, webInfo['guandongphoto.com'])
    url = sys.argv[1]
    domain = sys.argv[2]
    html = getPage(url)
    enc, title, text, time, images, mypos = parse(url, html, webInfo[domain])
    print "标题:" + title.encode('utf-8', 'ignore')
    print "mypos:" + mypos.encode('utf-8', 'ignore')
    print "时间:" + time.encode('utf-8', 'ignore')
    print "内容:\n" + text.encode('utf-8', 'ignore')
    print '=' * 10
    print '图片:'
    print images
Example #7
import sys
import re
import datetime
sys.path.append("../lib")
from download import getPage
from clientSource import *

webInfo = {
    'category': '美图',
    'sourceName': '中关村在线',
    'domain': 'zol.com',
    'sourceUrl': 'http://sj.zol.com.cn/bizhi/',
    'urlPattern': r'http.*/bizhi/.*\.html',
}

client = Client()
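# Walk the paginated listing and hand each page to the client parser.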
for i in range(2, 430):
    url = "http://www.5857.com/list-11-0-0-0-0-0-%d.html" % i
    print url
    page = getPage(url)
    if not page:
        print 'down url:%s failed' % url
        continue
    client.parse(page, webInfo['sourceUrl'], webInfo['category'],
                 webInfo['sourceName'], webInfo['domain'],
                 webInfo['urlPattern'])

#page = open('./page').read()
#client = Client()
#client.parse(page, webInfo['sourceUrl'], webInfo['category'],  webInfo['sourceName'], webInfo['domain'], webInfo['urlPattern'])