Example #1
    def execute(self, newUrl, isFirst=False):
        i = 1
        self.urlManager.addOneUrl(newUrl)
        while(True):
            newUrlList = []
            newDataDict = {}
            try:
                if(self.urlManager.hasMoreUrls() > 0):
                    getOneUrl = self.urlManager.getOneUrl()
                    content = self.htmlLoader.htmlDown(getOneUrl)
                    if(content is None):
                        continue

                    if(i == 1):
                        # parse the start page (extract links only)
                        newUrlList = self.htmlParser.urlParse(content, getOneUrl)
                    else:
                        # parse each article page (links plus data)
                        newUrlList, newDataDict = self.htmlParser.perPageParse(content, getOneUrl)

                    # print(newUrlList)
                    if(len(newUrlList) > 0):
                        self.urlManager.addUrls(newUrlList)

                    if(len(newDataDict) > 0):
                        self.dataHandler.insert(newDataDict)
                else:
                    loggerSpider.log('has no more url')
                    break
            except Exception as e:
                loggerSpider.log(e)

            i += 1
Example #2
    def urlParse(self, content, newUrl):
        hrefList = []
        soup = BeautifulSoup(content, 'html.parser', from_encoding='utf-8')
        aList = soup.find_all('a', href=re.compile(r'http://[a-z-./]+/\d+/$', re.I))
        if(len(aList) == 0):
            loggerSpider.log('find_all is empty [%s]' % (newUrl))
        else:
            for a in aList:
                if((a.get('href', None) is not None) and (a.get('title', None) is not None)):
                    hrefList.append(a['href'])

        return hrefList
Example #3
    def urlParse(self, content, newUrl):
        hrefList = []
        soup = BeautifulSoup(content, 'html.parser', from_encoding='utf-8')
        aList = soup.find_all('a',
                              href=re.compile(r'http://[a-z-./]+/\d+/$', re.I))
        if (len(aList) == 0):
            loggerSpider.log('find_all is empty [%s]' % (newUrl))
        else:
            for a in aList:
                if ((a.get('href', None) is not None)
                        and (a.get('title', None) is not None)):
                    hrefList.append(a['href'])

        return hrefList
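The href filter in urlParse only keeps archive-style links that end in a numeric id followed by a slash. A quick standalone check of that pattern (the sample URLs below are illustrative, not taken from the original code):

import re

href_pattern = re.compile(r'http://[a-z-./]+/\d+/$', re.I)

# Matches an article URL ending in a numeric id.
print(bool(href_pattern.match('http://blog.jobbole.com/114466/')))     # True
# Does not match listing pages without a trailing numeric id.
print(bool(href_pattern.match('http://blog.jobbole.com/all-posts/')))  # False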
Example #4
    def __init__(self):
        try:
            self.conn = MySQLdb.Connect(
                host='localhost',
                user='******',
                passwd='123456',
                port=3306,
                db='spider',
                charset='utf8',
            )

            self.conn.autocommit(False)
        except MySQLdb.Error as e:
            loggerSpider.log("Mysql connect error %d: %s" % (e.args[0], e.args[1]))
Example #5
    def __init__(self):
        try:
            self.conn = MySQLdb.Connect(
                host='localhost',
                user='******',
                passwd='123456',
                port=3306,
                db='spider',
                charset='utf8',
            )

            self.conn.autocommit(False)
        except MySQLdb.Error as e:
            loggerSpider.log("Mysql connect error %d: %s" %
                             (e.args[0], e.args[1]))
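Example #1 also calls self.dataHandler.insert(newDataDict), which is not shown among these examples. A minimal sketch of what such a method could look like on top of the connection opened above; the table and column names (an article table with title, href and content columns, matching the keys produced by perPageParse) are assumptions, not part of the original code:

    def insert(self, newDataDict):
        # hypothetical sketch: table and column names are assumptions
        sql = 'INSERT INTO article (title, href, content) VALUES (%s, %s, %s)'
        try:
            cursor = self.conn.cursor()
            cursor.execute(sql, (newDataDict.get('title'),
                                 newDataDict.get('href'),
                                 newDataDict.get('content')))
            self.conn.commit()  # autocommit is disabled above, so commit explicitly
            cursor.close()
        except MySQLdb.Error as e:
            self.conn.rollback()
            loggerSpider.log("Mysql insert error %d: %s" % (e.args[0], e.args[1]))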
Example #6
    def htmlDown(self, newUrl):
        try:
            content = None
            headers = {
                'User-Agent':
                'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/51.0.2704.79 Chrome/51.0.2704.79 Safari/537.36',
                'Accept':
                'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                'Referer': 'http://blog.jobbole.com/',
                'Accept-Language': 'zh-CN,zh;q=0.8'
            }
            data = {}
            data = urllib.urlencode(data)

            # request
            request = urllib2.Request(newUrl, data=data, headers=headers)

            # proxy
            # proxy_handler = urllib2.ProxyHandler({"http" : 'http://192.168.0.101:3128'})
            proxy_handler = urllib2.ProxyHandler({})

            # build opener
            opener = urllib2.build_opener(proxy_handler)

            # install opener
            urllib2.install_opener(opener)

            # response
            response = opener.open(request)

            # get code
            code = response.getcode()

            if (code != 200):
                return content

            content = response.read()
            if (content is None):
                loggerSpider.log('content is None [%s]' % (newUrl))
                return content

        finally:
            return content
Example #7
    def htmlDown(self, newUrl):
        try:
            content = None
            headers = {
                'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/51.0.2704.79 Chrome/51.0.2704.79 Safari/537.36',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                'Referer': 'http://blog.jobbole.com/',
                'Accept-Language': 'zh-CN,zh;q=0.8'
            }
            data = {}
            data = urllib.urlencode(data)

            # request
            request = urllib2.Request(newUrl, data=data, headers=headers)

            # proxy
            # proxy_handler = urllib2.ProxyHandler({"http" : 'http://192.168.0.101:3128'})
            proxy_handler = urllib2.ProxyHandler({})

            # build opener
            opener = urllib2.build_opener(proxy_handler)

            # install opener
            urllib2.install_opener(opener)

            # response
            response = opener.open(request)

            # get code
            code = response.getcode()

            if(code != 200):
                return content

            content = response.read()
            if(content is None):
                loggerSpider.log('content is None [%s]' % (newUrl))
                return content

        finally:
            return content
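These downloaders use the Python 2 urllib/urllib2 modules. For reference, a rough Python 3 sketch of the same download step (an approximation of the behaviour above, not part of the original code):

from urllib import request

def html_down(new_url, headers):
    req = request.Request(new_url, headers=headers)           # plain GET request
    opener = request.build_opener(request.ProxyHandler({}))   # no proxy configured
    with opener.open(req) as response:
        if response.getcode() != 200:
            return None
        return response.read()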
Example #8
    def perPageParse(self, content, newUrl):
        newDataDict = {}
        soup = BeautifulSoup(content, 'html.parser', from_encoding='utf-8')
        div_node = soup.find('div', attrs={'class': 'entry-header'})
        if (div_node is None):
            # guard: skip the header lookup when the page has no entry-header block
            loggerSpider.log('div_node is None [%s]' % (newUrl))
        else:
            h1_node = div_node.find('h1')
            if (h1_node is None):
                loggerSpider.log('h1_node is None [%s]' % (newUrl))
            else:
                newDataDict['title'] = h1_node.get_text()
                newDataDict['href'] = newUrl

        entry_node = soup.find('div', attrs={'class': 'entry'})
        if (entry_node is None):
            loggerSpider.log('entry_node is None [%s]' % (newUrl))
        else:
            newDataDict['content'] = entry_node.get_text()

        newUrlList = self.urlParse(content, newUrl)
        return newUrlList, newDataDict
Example #9
    def perPageParse(self, content, newUrl):
        newDataDict = {}
        soup = BeautifulSoup(content, 'html.parser', from_encoding='utf-8')
        div_node = soup.find('div', attrs={'class': 'entry-header'})
        if(div_node is None):
            # guard: skip the header lookup when the page has no entry-header block
            loggerSpider.log('div_node is None [%s]' % (newUrl))
        else:
            h1_node = div_node.find('h1')
            if(h1_node is None):
                loggerSpider.log('h1_node is None [%s]' % (newUrl))
            else:
                newDataDict['title'] = h1_node.get_text()
                newDataDict['href'] = newUrl

        entry_node = soup.find('div', attrs={'class': 'entry'})
        if(entry_node is None):
            loggerSpider.log('entry_node is None [%s]' % (newUrl))
        else:
            newDataDict['content'] = entry_node.get_text()

        newUrlList = self.urlParse(content, newUrl)
        return newUrlList, newDataDict
Example #10
    def addOneUrl(self, newUrl):
        if(newUrl is None):
            loggerSpider.log('newUrl is None')
            return

        if((not self.conn.sismember(self.newUrlSetName, newUrl)) and (not self.conn.sismember(self.oldUrlSetName, newUrl))):
            if(self.conn.sadd(self.newUrlSetName, newUrl) != 1):
                loggerSpider.log('newUrl add redis fail [%s]' % (newUrl))
        else:
            loggerSpider.log('newUrl already exists [%s]' % (newUrl))
Example #11
    def addOneUrl(self, newUrl):
        if (newUrl is None):
            loggerSpider.log('newUrl is None')
            return

        if ((not self.conn.sismember(self.newUrlSetName, newUrl))
                and (not self.conn.sismember(self.oldUrlSetName, newUrl))):
            if (self.conn.sadd(self.newUrlSetName, newUrl) != 1):
                loggerSpider.log('newUrl add redis fail [%s]' % (newUrl))
        else:
            loggerSpider.log('newUrl already exists [%s]' % (newUrl))
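Example #1 also relies on urlManager.hasMoreUrls(), urlManager.getOneUrl() and urlManager.addUrls(), which are not reproduced here. Assuming the same redis-py connection and set names used by addOneUrl above, they could look roughly like this (a sketch, not the original implementation):

    def hasMoreUrls(self):
        # number of URLs still waiting to be crawled
        return self.conn.scard(self.newUrlSetName)

    def getOneUrl(self):
        # pop a pending URL and remember it in the processed set,
        # mirroring the duplicate check in addOneUrl
        newUrl = self.conn.spop(self.newUrlSetName)
        if newUrl is not None:
            self.conn.sadd(self.oldUrlSetName, newUrl)
        return newUrl

    def addUrls(self, newUrlList):
        # queue a batch of URLs through the single-URL path above
        for newUrl in newUrlList:
            self.addOneUrl(newUrl)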