Example #1
# Assumed imports for this listing; PageStore, Downloader, logger and the
# configuration constants (MONGODB_HOST, MONGODB_PORT, the *_RE_* rule files,
# WATCH_URL_FILE, UPDATE_URL_FILE) come from the project's own modules, which
# are not shown here. BeautifulSoup 3 and the pre-3.0 pymongo API are assumed.
import hashlib
import os
import re
import threading
import time
from urlparse import urljoin, urlparse

from BeautifulSoup import BeautifulSoup
from pymongo import Connection

class Spider(threading.Thread):
    def __init__(self, master=True):
        threading.Thread.__init__(self)
        self.pagestore = PageStore()
        
        self.downloader = Downloader()
        self.connection = Connection(MONGODB_HOST, MONGODB_PORT)
        db = self.connection.download
        if master:
            db.drop_collection('downurl')
            for f, tb in ((SAVE_URL_RE_BLACK, 'save_url_black'),
                (SAVE_URL_RE_WHITE, 'save_url_white'),
                (DOWN_URL_RE_BLACK, 'down_url_black'),
                (DOWN_URL_RE_WHITE, 'down_url_white')):
                if os.path.exists(f):
                    db.drop_collection(tb)
                    logger.info('load rule:%s...' % f)
                    with open(f) as rule_file:
                        # de-duplicate rule lines before storing them
                        for s in set(rule_file.readlines()):
                            s = s.strip()
                            if s:
                                db[tb].insert({'pattern': s})
                    logger.info('load rule:%s...OK' % f)
        self.downurl, self.allurl, self.watchurl, self.updateurl, self.secceedurl = db.downurl, db.allurl, db.watchurl, db.updateurl, db.secceedurl
        self.save_url_black = self.load_re(db.save_url_black)
        self.save_url_white = self.load_re(db.save_url_white)
        self.down_url_black = self.load_re(db.down_url_black)
        self.down_url_white = self.load_re(db.down_url_white)
        if master:
            self.load_watch_url()
            self.load_update_url()
            self.reload_allurl()
            logger.info('allurl:%d' % self.allurl.find().count())
            logger.info('secceedurl:%d' % self.secceedurl.find().count())
            logger.info('updateurl:%d' % self.updateurl.find().count())
            logger.info('watchurl:%d' % self.watchurl.find().count())
            logger.info('downurl:%d' % self.downurl.find().count())
        
    def load_re(self, tb):
        s = set([r['pattern'] for r in tb.find()])
        return [re.compile(r) for r in s]
    def get_safe_utf8(self, s):
        if isinstance(s, str):
            return s
        else:
            return s.encode('utf-8', 'ignore')
    def getmd5(self, s):
        m = hashlib.md5()
        m.update(self.get_safe_utf8(s))
        return m.hexdigest()

    def get_one_task(self, tb):
        row = tb.find_and_modify(remove=True)
        if not row:return None
        row = self.allurl.find_one(row)
        return row['url'] if row else None

    def add_one_task(self, url, tb):
        s = url.lower()
        if s.startswith('http://') or s.startswith('https://'):
            k = self.getmd5(s)
            self.allurl.insert({'url': url, '_id':k})
            tb.insert({'_id': k})

    def load_watch_url(self):
        if not os.path.exists(WATCH_URL_FILE):
            return
        logger.info('load watch urls...')
        with open(WATCH_URL_FILE) as f:
            for url in f:
                self.add_one_task(url.strip(), self.watchurl)
        logger.info('load watch urls...%d' % self.watchurl.count())
    
    def normal_url(self, url):
        u = urlparse(url)
        if u.fragment:
            return url[:-(len(u.fragment) + 1)]
        return url
    
    def load_update_url(self):
        if not os.path.exists(UPDATE_URL_FILE):
            return
        logger.info('load update urls...')
        with open(UPDATE_URL_FILE) as f:
            for url in f:
                self.add_one_task(url.strip(), self.updateurl)
        logger.info('load update urls...%d' % self.updateurl.count())
    
    # blacklist patterns are checked first and always reject a matching url; an
    # empty whitelist then accepts anything not blacklisted, while a non-empty
    # whitelist additionally requires at least one pattern to match
    def check_url(self, url, black, white):
        for p in black:
            if p.search(url):
                return False
        if not white:
            return True
        for p in white:
            if p.search(url):
                return True
        return False
    
    def check_add_new_task(self, url):
        s = url.lower()
        #error url
        if not s.startswith('http://') and not s.startswith('https://'):
            return False
        #don't save url
        if not self.check_url(url, self.save_url_black, self.save_url_white):
            return False
        k = self.getmd5(s)
        #already saved
        if self.allurl.find({'_id':k}).count():
            return False
        self.allurl.insert({'url': url, '_id':k})
        
        #don't download
        if not self.check_url(url, self.down_url_black, self.down_url_white):
            return False
        
        #already downloaded successfully
        if self.secceedurl.find({'_id':k}).count():
            return False
        self.downurl.insert({'_id': k})
        return True
    
    def reload_allurl(self):
        logger.info('reload all url...')
        for row in self.allurl.find():
            k, url = row['_id'], row['url']
            if not self.check_url(url, self.down_url_black, self.down_url_white):
                continue
            if self.secceedurl.find({'_id':k}).count():
                continue
            self.downurl.insert({'_id':k})
        logger.info('reload all url...%d ' % self.downurl.find().count())
    
    def detect_html(self, html):
        if not html:return None
        try:
            return html.decode('utf-8')
        except UnicodeError:
            return html.decode('gbk', 'ignore')
    
    def process_url(self, url):
        html, redirect, code = self.downloader.fetch(self.get_safe_utf8(url))
        if code == 200:
            html = self.detect_html(html)
            for href in self.link_parse(html, redirect):
                try:
                    self.check_add_new_task(href)
                except Exception as e:
                    logger.exception('%s,%s:%s' % (type(href), href, e))
            for k in set([self.getmd5(url.lower()), self.getmd5(redirect.lower())]):
                self.secceedurl.insert({'_id': k})
            if html:
                self.pagestore.succeed(url, html)
                return True
        return False
    
    def link_parse(self, html, base):
        urls = set()
        if not html or not base:return urls
        soup = BeautifulSoup(html)
        for a in soup.findAll('a'):
            href = a.get('href')
            if not href:continue
            if href in urls:continue
            href = self.normal_url(self.get_safe_utf8(urljoin(base, self.get_safe_utf8(href))))
            urls.add(href)
        return urls
    
    def get_url_block(self):
        while True:
            for tb in (self.watchurl, self.downurl, self.updateurl):
                url = self.get_one_task(tb)
                if url:return url
            logger.info('no any task')
            time.sleep(1)
        
    def process_one_url(self):
        url = self.get_url_block()
        logger.info('down:%s' % url)
        ret = False
        try:
            ret = self.process_url(url)
        except Exception as e:
            logger.exception('url:%s %s' % (url, e))
        if not ret:
            self.pagestore.failed(url)

    def run(self):
        while True:
            try:
                while True:
                    self.process_one_url()
            except Exception as e:
                logger.exception(e)
                time.sleep(1)
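
How this Spider is driven is not shown in the listing; a minimal usage sketch, assuming the module is run directly (the worker count of three is arbitrary), could look like this:

# hypothetical entry point, not part of the original listing
if __name__ == '__main__':
    master = Spider(master=True)                        # seeds the url queues in MongoDB
    workers = [Spider(master=False) for _ in range(3)]  # extra crawl threads sharing the same collections
    for s in [master] + workers:
        s.start()
    for s in [master] + workers:
        s.join()
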
Example #2
# Assumed imports for this listing; Downloader, Html2db and BASEURLS come from
# the project's own modules, which are not shown here. BeautifulSoup 3 is assumed.
import os
import re
import sys
import time
from HTMLParser import HTMLParser
from urlparse import urljoin, urlparse

from BeautifulSoup import BeautifulSoup

class Spider(object):
    def __init__(self):
        self.today = time.strftime("%Y-%m-%d",time.localtime(time.time()))
        self.urllogpath = "../data/url"
        os.system("mkdir -p %s" % self.urllogpath)
        self.urllog = "../data/url/" + "downloadedurl_" + self.today + ".txt"
        self.subpagepath = "../data/subpagepath"
        os.system("mkdir -p %s" % self.subpagepath)
        
        self.baseurl = BASEURLS 
        self.suburl = {}
        self.downloader = Downloader()
        self.html2db = Html2db()

    def get_safe_utf8(self,s):
        if isinstance(s,str):
            return s
        else:
            return s.encode('utf-8','ignore')
    
    def detect_html(self,html):
        if not html:return None
        try:
            return html.decode('utf-8')
        except UnicodeError:
            return html.decode('gbk','ignore')
    
    def normal_url(self,url):
        u = urlparse(url)
        if u.fragment:
            return url[:-(len(u.fragment) + 1)]
        return url
    
    def link_parse(self,html,base):
        if not html or not base: return False
        soup = BeautifulSoup(html)
        for li in soup.findAll('li'):
            try:
                li.contents[0].contents[0]
            except (IndexError, AttributeError):
                continue
            title = li.contents[0].contents[0]
            href = li.contents[0]["href"]
            pubtime = li.contents[1].strip()
            pubtime = pubtime.replace(u')', "")
            pubtime = pubtime.replace(u'(', "")
            if not href:continue
            if href in self.suburl:continue
            href = self.normal_url(self.get_safe_utf8(urljoin(base, self.get_safe_utf8(href))))
            # only keep sub-pages published today
            if pubtime == self.today:
                self.suburl[href] = (title, pubtime)
        return True

    def cleanHtmlAgain(self, value):
        # strip any tags that survived the HTMLParser pass
        regex1 = r"<[\s\S]*?>"
        value = re.subn(regex1, "", value, flags=re.M)
        return value[0]

    def cleanHtmlTag(self,html):
        html = html.strip()
        html = html.strip("\n")
        result = []
        parser = HTMLParser()
        parser.handle_data = result.append
        parser.feed(html)
        parser.close()
        res = ''.join(result)
        res = self.cleanHtmlAgain(res)
        return res

    def getSubUrl(self,baseurl):
        tmp = ""
        maxturnpage = 5
        regex = "\/[a-zA-Z0-9]+_[a-zA-Z0-9]+\.htm$"
        for i in range(1,maxturnpage):
            if(re.search(regex,baseurl)):
                regextmp = "\.htm$"
                tmp = re.sub(regextmp,"_" + str(i) + ".htm",baseurl)
            else:
                regexdel = "_\d?\.htm$"
                urltmp = re.sub(regexdel,"_" + str(i) + ".htm",baseurl)
                baseurl = urltmp
            html, redirect, code = self.downloader.fetch(self.get_safe_utf8(baseurl))
            if code == 200:
                html = self.detect_html(html)
                self.link_parse(html,redirect)
                print 'baseurl downloaded successfully: %s' % baseurl
            baseurl = tmp
        return True

    def deleteDownloadedUrl(self):
        # drop urls already recorded in today's url log, if that log exists
        print "There are %s urls to download!" % len(self.suburl)
        if os.path.isfile(self.urllog):
            with open(self.urllog) as logfile:
                for line in logfile:
                    line = line.strip()
                    if line in self.suburl:
                        del self.suburl[line]
        else:
            print "the logfile %s does not exist yet!" % self.urllog
        print "There are %s urls that REALLY need to be downloaded!" % len(self.suburl)
        
    def downloadPages(self,enChannel,chChannel):
        enChannelpath = self.subpagepath + "/" + enChannel
        os.system("mkdir -p %s" % enChannelpath)
        num = 0
        for suburl in self.suburl.keys():
            title = self.suburl[suburl][0]
            pubtime = self.suburl[suburl][1]
            html, redirect, code = self.downloader.fetch(self.get_safe_utf8(suburl))
            if code == 200:
                print "suburl downloaded successfully: %s" % suburl
                html = self.detect_html(html)
                subpagefile = enChannelpath + "/content_" + self.today +"_" + str(num) + ".html"
                num = num + 1
                try:
                    with open(subpagefile, "w") as fileout:
                        fileout.write(self.get_safe_utf8(html) + "\n")
                except IOError:
                    sys.stderr.write("could not open the subpagefile: %s\n" % subpagefile)
                soup = BeautifulSoup(html)
                content = ""
                # extract the text of the div with id="Zoom" (empty if no such div)
                for div in soup.findAll("div", id="Zoom"):
                    content = self.cleanHtmlTag(str(div))

                inserttime = time.strftime("%Y-%m-%d %H:%M:%S",time.localtime(time.time()))
                try:
                    title = self.get_safe_utf8(title)
                except Exception:
                    pass
                content = self.get_safe_utf8(content)
                html = self.get_safe_utf8(html)
                chChannel = self.get_safe_utf8(chChannel)
                suburl = self.get_safe_utf8(suburl)
                self.html2db.datainsert(title,content,html,chChannel,suburl,pubtime,inserttime)
                print title
                print suburl
                print pubtime
                #print content
                #print html
                print chChannel
                print inserttime
                print "################################################################################"
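
The listing does not show how this second Spider is driven either; a minimal driver sketch, assuming BASEURLS maps an English channel name to a (base url, Chinese channel name) pair, could be:

# hypothetical driver, not part of the original listing
if __name__ == '__main__':
    spider = Spider()
    # BASEURLS structure is an assumption: {enChannel: (baseurl, chChannel)}
    for enChannel, (baseurl, chChannel) in spider.baseurl.items():
        spider.getSubUrl(baseurl)        # collect today's sub-page urls
        spider.deleteDownloadedUrl()     # skip urls already logged for today
        spider.downloadPages(enChannel, chChannel)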