Example No. 1
def recrawl():
    csor, conn = getDushuConn()

    gid = 300000  # upper bound of the first id window; scan downwards in steps of 1000

    while gid > 0:

        sql = "select id,rawUrl,content from cn_dushu_acticle where id > %d and id < %s and bookId != 49316 ORDER by id desc" % (
            gid - 1000, gid)

        print sql

        csor.execute(sql)

        gid = gid - 1000  # move the id window down for the next iteration

        results = csor.fetchall()

        for row in results:
            content = row[2]
            # content = row[4].replace('mi', 'mo')
            id = row[0]
            url = row[1]

            dealUrl()  # per-url handler defined elsewhere in the module; presumably consumes the url read above

    csor.close()
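
Example No. 1 pages through cn_dushu_acticle in descending windows of 1000 ids, interpolating the window bounds directly into the SQL string. Below is a minimal sketch of the same windowed scan using driver-side %s placeholders instead (assuming a DB-API cursor such as the one getDushuConn returns, and keeping the original dealUrl call); recrawl_windowed and its parameters are illustrative names, not part of the project.

def recrawl_windowed(csor, start_id=300000, window=1000):
    # same descending id windows as Example No. 1, but with %s placeholders
    # so the driver handles quoting of the bounds
    gid = start_id
    while gid > 0:
        csor.execute(
            "select id, rawUrl, content from cn_dushu_acticle "
            "where id > %s and id < %s and bookId != 49316 order by id desc",
            (gid - window, gid))
        for row in csor.fetchall():
            dealUrl()          # same per-row helper the original calls
        gid -= window          # move the window down for the next pass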
Example No. 2
def recrawl():
    global id  # the starting id is defined at module level (see Example No. 3)
    csor, conn = getDushuConn()

    while id > 0:

        sql = "select id,rawUrl,content from cn_dushu_acticle where id > %d and id < %s and bookId != 49316 ORDER by id desc" % (
            id - 1000, id)

        print 'begin'
        csor.execute(sql)

        time.sleep(2)

        id = id - 1000

        results = csor.fetchall()

        for row in results:
            content = row[2]
            # content = row[4].replace('mi', 'mo')
            id = row[0]
            url = row[1]
            # url = 'http://b.easou.com/w/read/85356/14075857/6.html'

            # # hosts to skip
            # continue1 = False
            # for ig in ignores['hosts']:
            #     if ig in url:
            #         continue1 = True
            #         continue
            # if continue1:
            #     continue
            #
            # try:
            #     newContent,redUrl = getContentAndRedictedUrl(url)
            #
            # except Exception as e:
            #     print 'new content1',e
            #     try:
            #         newContent, redUrl = getContentAndRedictedUrl(url)
            #
            #     except Exception as e:
            #         print 'new content1', e
            #         continue
            #     except requests.exceptions.ConnectionError as er:
            #         print 'new content2', er
            #         continue
            # except requests.exceptions.ConnectionError as er:
            #     print 'new content2',er
            #     try:
            #         newContent, redUrl = getContentAndRedictedUrl(url)
            #
            #     except Exception as e:
            #         print 'new content1', e
            #         continue
            #     except requests.exceptions.ConnectionError as er:
            #         print 'new content2', er
            #         continue
            #
            # if not redUrl:
            #     continue
            #
            # # filter the redirected url against the ignore list once more
            # continue2 = False
            # for ig in ignores['hosts']:
            #     if ig in redUrl:
            #         continue2 = True
            #         continue
            # if continue2:
            #     continue
            #
            # urlHost = urlparse(redUrl).hostname
            #
            # new2 = newContent.encode('utf-8')
            # soup = getSoupByStr(new2, "utf-8")
            #
            #
            #
            # # uniformly clean common noise first
            #
            # for rm in rules['common']['rm']:
            #     removeNodesFromSoup(rm, soup)  # remove stop nodes
            #
            #
            # if rules.has_key(urlHost):
            #     contentRule = rules[urlHost]['content']
            #     if contentRule: # a content rule is configured for this host
            #         specContent = soup.select(contentRule)# extract the main content per the rule
            #         if specContent and len(specContent) > 0:
            #             del specContent[0].attrs
            #             soup = specContent[0]
            #     # whether or not a content rule is configured, the rm nodes should still be removed
            #     if rules[urlHost]['rm'] and len(rules[urlHost]['rm']) > 0:
            #         for rm in rules[urlHost]['rm']:
            #             removeNodesFromSoup(rm, soup)# remove stop nodes
            #
            #     # unwrap common useless tags
            #     for a in soup.select('a'):
            #         a.unwrap()
            #     for a in soup.select('b'):
            #         a.unwrap()
            #     for a in soup.select('font'):
            #         a.unwrap()
            #     for a in soup.select('span'):
            #         a.unwrap()
            #
            #     content = unicode(soup).replace(u'<body>', '').replace(u'</body>', '') \
            #         .replace(u'</div>', '').replace(u'<div>', '')
            #
            # else:  # no rule configured for this host, auto-extract the content
            #     print urlHost,' : ',id
            #     continue
            #     # doc = Document(unicode(soup))
            #     # content = doc.summary(html_partial=True)
            #     # urlContents[url] = content.encode('utf-8')
            #
            #
            #
            # newContent2 = cleanTailHead(urlHost, content)
            # if newContent2 != content:
            #     content = newContent2
            #
            # if content and  len(content) < 10:
            #     continue
            #
            # # newSoup = getSoupByStr(content)
            # # newSoup.select('div')[0].unwrap()
            #
            # # content = unicode(newSoup).replace(u'<body>','').replace(u'</body>','')
            # # content = content.replace(r'<p>\d+、.*</b></p>', '')
            #
            # # content = re.sub(u'<p>\d+、((?:.|\n)*?)</p>', "", content, 1)
            # content = content.replace(u'�', u'')
            # content = content.replace(u'\'', r'\'')

            content, urlHost = getAndParse(url)

            if content and urlHost:
                update(csor, conn, id, urlHost, unicode(content))
    csor.close()
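
The commented-out block above is the earlier, rule-driven cleanup that getAndParse now replaces: drop configured noise nodes, optionally narrow the soup to a per-host content selector, and unwrap a/b/font/span tags. Below is a minimal BeautifulSoup sketch of that cleanup step, assuming a rules dict shaped like the one the comments reference ({host: {'content': selector, 'rm': [selectors]}}); clean_article_html is an illustrative name, not one of the project's helpers, and the site-wide 'common' rules are omitted for brevity.

from bs4 import BeautifulSoup

def clean_article_html(html, host, rules):
    soup = BeautifulSoup(html, 'html.parser')

    site_rules = rules.get(host, {})

    # narrow to the configured content node, if a selector is configured
    selector = site_rules.get('content')
    if selector:
        matches = soup.select(selector)
        if matches:
            matches[0].attrs = {}
            soup = matches[0]

    # remove the configured noise ("stop") nodes
    for rm in site_rules.get('rm', []):
        for node in soup.select(rm):
            node.decompose()

    # unwrap common useless tags, keeping only their text
    for tag in soup.find_all(['a', 'b', 'font', 'span']):
        tag.unwrap()

    return unicode(soup).replace(u'<body>', u'').replace(u'</body>', u'')

Here decompose() removes a node together with its text, matching the removeNodesFromSoup calls, while unwrap() keeps a tag's children, matching the a/b/font/span handling in the commented block.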
Example No. 3
#!/usr/bin/python
# -*- coding: UTF-8 -*-

import time

import requests

from dbHelper import getDushuConn
from framework.htmlParser import getSoupByStr
from networkHelper import getContentAndRedictedUrl

csor, conn = getDushuConn()

id = 825650

lastTime = 0

sites = open(u'3dWebsite.txt', 'w')

urlContents = {}

# sites.readline()
#
# from pybloom import BloomFilter
# f = BloomFilter(capacity=1000, error_rate=0.001)
# [f.add(x) for x in range(10)]


def update(id, content):
    # note: content is interpolated straight into the SQL string, so any single
    # quotes in it must already be escaped by the caller
    sql = "update cn_dushu_acticle set content = '%s' where id = %s" % \
          (content, id)