Exemplo n.º 1
0
 def photobucket_callback(self, response):
     """Collect (url, title) pairs from an RSS response.

     ``response.body`` is parsed with ``feedparser``; for every entry of the
     parsed feed a ``(normalized entry.guid, entry.title)`` tuple is appended
     to ``self.urls``. If parsing raises, the response is skipped and nothing
     is appended.
     """
     try:
         rss = feedparser.parse(response.body)
     except Exception:
         # Best-effort: an unparsable feed is ignored. Catching Exception
         # (not a bare except) lets KeyboardInterrupt/SystemExit propagate.
         return
     for entry in rss['entries']:
         self.urls.append((URL.normalize(entry.guid), entry.title))
Exemplo n.º 2
0
 def matched_feed(self, response):
     """Look up the stored record that corresponds to *response*.

     Walks ``self.mario.link_title_db.dic`` and, for the first key whose
     normalized form equals ``response.url`` or ``response.effective_url``,
     returns element ``[0][2]`` of that key's value. Returns None when no
     key matches.
     """
     link_table = self.mario.link_title_db.dic
     for raw_link, records in link_table.iteritems():
         if URL.normalize(raw_link) in (response.url, response.effective_url):
             return records[0][2]
     return None
Exemplo n.º 3
0
 def next_depth(self, response):
     """Handle a fetched page: queue its links and mark it as crawled.

     Every (link, title) pair found in the page is offered to
     ``self.inject_url``; accepted links are recorded in
     ``self.link_title_db`` together with the page they were found on.
     Afterwards ``self.callback`` (if callable) receives the response, the
     page is flagged in ``self.crawled`` and becomes the new referer.
     """
     #with_timeout(1, self.lightcloud.set, LightCloud.crawled_url_key(response.effective_url), response.url, timeout_value=None)
     for link, title in URL.link_title(response.body, response.effective_url):
         if self.inject_url(link, response.args):
             self.link_title_db.add(link, response.effective_url, title)
     if callable(self.callback):
         self.callback(response)
     # Flag the final URL and, when a redirect happened, the requested URL
     # too (the meaning of the value 2 is defined outside this method).
     self.crawled[response.effective_url] = 2
     if response.effective_url != response.url:
         self.crawled[response.url] = 2
     self.referer = response.effective_url
Exemplo n.º 4
0
 def __init__(self, url, page=None, debug=False):
     """Store the normalized URL, the page source and the BSP info.

     Args:
         url: address of the page; stored normalized as ``self.url``.
         page: page source. When empty or None it is downloaded with
             ``Mario``; if that download yields no body, ``self.page``
             keeps the value passed in.
         debug: stored as ``self.debug``.
     """
     self.url = URL.normalize(url)
     self.debug = debug
     self.page = page
     if not page:
         fetched = Mario().get(self.url)
         if fetched and fetched.body:
             self.page = fetched.body
     # BSP.normalize receives the URL exactly as the caller passed it.
     self.bsp_info = BSP().normalize(url)
Exemplo n.º 5
0
 def parser(self, html, sp, homepage):
     """Extract the blog homepages referenced by a blog page.

     Args:
         html: source of the page to scan.
         sp: provider key. 'baidu', 'sohu' and '163' have dedicated
             extraction rules; any other value scans every link in the page.
         homepage: homepage of the blog being parsed. It is the base URL
             for the generic link scan and is left out of the result (the
             '163' branch does not compare against it).

     Returns:
         A list of unique homepage values (element 1 of what
         ``BSP.normalize`` / ``BSP.valid163`` return), or None when *html*
         is empty or the 'baidu' lookup finds nothing.
     """
     if not html:
         return None
     links = []
     if sp == 'baidu':
         username = re.findall('nameEnc: "([^^].*?)"', html)
         if not username:
             return None
         response = Mario().get('http://frd.baidu.com/api/friend.getlist?un=%s' % username[0])
         if not response or not response.body:
             return None
         names = re.findall(
             r'\["([^^].*?)","[^^].*?","[^^].*?","[^^].*?",\d+,"[^^].*?",\d+,\d+\]',
             response.body)
         if not names:
             return None
         bsp = BSP()
         for n in names:
             u = bsp.normalize('http://hi.baidu.com/sys/checkuser/%s' % n)
             if u and u[1] != homepage and u[1] not in links:
                 # Store the homepage only, like every other branch; storing
                 # the whole tuple also defeated the duplicate check above.
                 links.append(u[1])
     elif sp == 'sohu':
         bsp = BSP()
         for url in re.findall('"link" : "([^^].*?)"', html, re.I):
             r = bsp.normalize(url)
             if r and r[1] != homepage and r[1] not in links:
                 links.append(r[1])
     elif sp == '163':
         # NOTE(review): the previous literal '.userName="******"' could not
         # be compiled (multiple repeat) and captured nothing. The capture
         # group below follows the idiom used by the other patterns here;
         # confirm against a real 163 page.
         usernames = re.findall('.userName="([^^].*?)"', html)
         bsp = BSP()
         for u in usernames:
             if not u:
                 continue
             link = bsp.valid163(u, 'http://%s.blog.163.com/' % u, '163')
             if link and link[1] and link[1] not in links:
                 links.append(link[1])
     else:
         bsp = BSP()
         for link, title in URL.link_title(html, homepage):
             if not link:
                 continue
             r = bsp.normalize(link)
             if r and r[1] != homepage and r[1] not in links:
                 links.append(r[1])
     return links
Exemplo n.º 6
0
 def flickr(self, flickr_api_key, depth=5):
     """Search Flickr for ``self.keyword`` and collect image URLs.

     Requests result pages (10 photos each) one after another, stopping
     after *depth* pages, after the last page reported by the API, or at
     the first request that fails. For every photo a
     ``(normalized image URL, title)`` tuple is appended to ``self.urls``.

     Args:
         flickr_api_key: key handed to ``flickrapi.FlickrAPI``.
         depth: maximum number of result pages to request.
     """
     url_form = 'http://%(farm_id)s.static.flickr.com/%(server_id)s/%(id)s_%(secret)s_b.jpg'
     flickr = flickrapi.FlickrAPI(flickr_api_key)
     total_pages = depth
     cur_page = 1
     while cur_page <= depth and cur_page <= total_pages:
         try:
             rsp = flickr.photos_search(text=self.keyword, media='photos', per_page='10', page=cur_page)
         except Exception:
             logger.error(Traceback())
             break
         # XML attribute values are strings; without int() the loop bound
         # compared an int with a str and never limited the page count.
         total_pages = int(rsp[0].attrib['pages'])
         for photo in rsp.find('photos'):
             image_url = url_form % {
                 'farm_id': photo.attrib['farm'],
                 'server_id': photo.attrib['server'],
                 'id': photo.attrib['id'],
                 'secret': photo.attrib['secret'],
             }
             self.urls.append((URL.normalize(image_url), photo.attrib['title']))
         cur_page += 1
Exemplo n.º 7
0
 def __init__(self, starturl, identifier=None, accept_url_patterns=[], reject_url_patterns=[], analysis=False, verbose=False):
     """Initialise the warehouse and make sure a Site record exists.

     Args:
         starturl: start address; normalized before use.
         identifier: site key; defaults to the MD5 hex digest of the
             normalized start URL.
         accept_url_patterns, reject_url_patterns, analysis, verbose:
             forwarded unchanged to the parent initialiser.

     If no ``Site`` with ``url_hash == identifier`` is found, a new one is
     created and saved with the URL, the hash and the current UTC time.
     """
     def as_unicode(value):
         # Byte strings are decoded as UTF-8; unicode passes through.
         if isinstance(value, unicode):
             return value
         return value.decode('utf-8')

     starturl = URL.normalize(starturl)
     self.analysis = analysis
     self.mixed = 0
     identifier = identifier or md5(starturl).hexdigest()
     super(Warehouse, self).__init__(
         starturl,
         identifier=identifier,
         accept_url_patterns=accept_url_patterns,
         reject_url_patterns=reject_url_patterns,
         analysis=analysis,
         verbose=verbose)
     # NOTE(review): the PAC value is looked up but not used further in
     # this method.
     bsp_pac = BSP().get_pac(starturl)
     pac = bsp_pac if bsp_pac else None
     if Site().one({"url_hash": identifier}):
         return
     site = New(Site())
     site.url = as_unicode(starturl)
     site.url_hash = as_unicode(identifier)
     site.inserted_at = datetime.utcnow()
     site.last_updated_at = datetime.utcnow()
     site.save()
Exemplo n.º 8
0
 def __init__(self, starturl, callback, callpre=None, callfail=None, concount=MAXCONCOUNT, depth=2, accept_url_patterns=None, reject_url_patterns=None):
     """Set up crawler state for a crawl rooted at *starturl*.

     Stores the callbacks, limits and URL patterns given by the caller,
     creates empty bookkeeping containers, and tries to load the site's
     ``/robots.txt``; a failure to read it is only logged.
     """
     # Settings supplied by the caller.
     self.concount = concount
     self.depth = depth
     self.callback = callback
     self.callpre = callpre
     self.callfail = callfail
     self.accept_url_patterns = accept_url_patterns
     self.reject_url_patterns = reject_url_patterns

     # Where the crawl starts.
     self.starturl = starturl
     self.referer = starturl
     self.baseurl = URL.baseurl(starturl)

     # Crawl bookkeeping.
     self.urls = []
     self.crawled = {}
     self.link_title_db = LinkTitleDB()

     self.robotstxt = robots = RobotFileParser()
     robots.set_url(urljoin(starturl, '/robots.txt'))
     try:
         robots.read()
     except:
         logger.debug(Traceback())
Exemplo n.º 9
0
 def normalize(self, url):
     """Identify the blog provider of *url* and run its validator.

     Each entry ``b`` of ``self.support_bsps`` is used as: ``b[0]`` a regex
     (matched case-insensitively against the normalized URL), ``b[1]`` a
     collection of names to skip, ``b[2]`` the provider key. For the first
     entry that matches and is not skipped, the provider's ``valid*``
     method is called and its result returned. Returns None when no entry
     applies.
     """
     url = URL.normalize(url)
     parts = url.split('?')

     # Validator method names grouped by the arguments they are called with.
     name_only = {
         'tianya': 'validTianya',
         'ycool': 'validYcool',
         'blogcn': 'validBlogcn',
         'cnblogs': 'validCnblogs',
         'blogbus': 'validBlogbus',
         'hexun': 'validHexun',
         'sohu': 'validSohu',
         'mop': 'validMop',
     }
     name_and_url = {
         '163': 'valid163',
         'live': 'validLive',
         'baidu': 'validBaidu',
     }

     for b in self.support_bsps:
         found = re.compile(b[0], re.I).findall(url)
         if not found:
             continue
         name = self.normalizeName(found[0])
         # Skip when the provider key and 'http' both occur in the query
         # string, or when the name is listed in b[1].
         in_query = len(parts) > 1 and b[2] in parts[1] and 'http' in parts[1]
         if in_query or name in b[1]:
             continue
         sp = b[2]
         if sp in name_only:
             return getattr(self, name_only[sp])(name, sp)
         if sp in name_and_url:
             return getattr(self, name_and_url[sp])(name, url, sp)
         if sp == 'sina':
             return self.validSina(url, sp)
     return None
Exemplo n.º 10
0
    def _handle_response_header(self, c):
        """Check a completed transfer and return its final URL.

        The transfer is accepted when curl reports no error string and the
        response code is in C{STATUS_OK}, or when the response code is 200.
        Otherwise C{self.callfail} (if callable) is called with the
        requested URL and an exception is raised.

        @type c: PycURL C{Curl}
        @param c: a completed connection
        @return: the effective URL of the transfer, normalized
        @raise HTTPException: if the transfer was not accepted
        """
        code = c.getinfo(c.HTTP_CODE)
        accepted = (c.errstr() == '' and c.getinfo(pycurl.RESPONSE_CODE) in STATUS_OK) or code == 200
        if not accepted:
            if callable(self.callfail):
                self.callfail(c.url)
            raise HTTPException(c.errstr(), code)
        return URL.normalize(c.getinfo(pycurl.EFFECTIVE_URL))
Exemplo n.º 11
0
 def __init__(self, starturl, identifier=None, verbose=False):
     """Initialise the RSS warehouse for *starturl*.

     The start URL is normalized, ``self.mixed`` is set to 1 and, when no
     identifier is given, the MD5 hex digest of the normalized URL is used.
     Everything is then handed to the parent initialiser.
     """
     starturl = URL.normalize(starturl)
     self.mixed = 1
     identifier = identifier or md5(starturl).hexdigest()
     super(WarehouseRss, self).__init__(starturl, identifier=identifier, verbose=verbose)
Exemplo n.º 12
0
 def get(url, html):
     """Return the avatar image URL of the blog at *url*, or None.

     ``BSP.normalize`` identifies the blog provider; each supported
     provider has its own scraping recipe below. None is returned when the
     URL is not recognised, the provider has no recipe, a page cannot be
     fetched, or the expected markup is not found.

     Args:
         url: address of the blog.
         html: source of the blog page. Only the 'sohu' and '163' recipes
             read it; the others download the homepage themselves.
     """
     url = URL.normalize(url)
     bsp_info = BSP().normalize(url)
     if not bsp_info:
         return None
     username, homepage, sp = bsp_info
     mario = Mario()

     def fetch(link):
         """Return the body downloaded from *link*, or None on failure."""
         response = mario.get(link)
         if not response or not response.body:
             return None
         return response.body

     def grab(pattern, text):
         """Return the first capture of *pattern* in *text*, or None."""
         if not text:
             return None
         found = re.findall(pattern, text)
         return found[0] if found else None

     # Providers whose avatar sits on the homepage itself: the patterns are
     # tried in order and the first hit wins.
     homepage_patterns = {
         'blogbus': ('<img class="avatar" src="([^^].*?)"',),
         'baidu': ('<div class="portrait">[^^]*?<img src="([^^].*?)"',),
         'mop': ('<div[^^]*?class="fava_box"[^^]*?<img[^^]*?src="([^^].*?)"',),
         'sina': ('<div id="userImage">[^^]*?<img[^^]*?src="([^^].*?)"',
                  '<div class="image">[^^]*?<img[^^]*?src="([^^].*?)"'),
     }
     if sp in homepage_patterns:
         body = fetch(homepage)
         for pattern in homepage_patterns[sp]:
             avatar = grab(pattern, body)
             if avatar:
                 return avatar
         return None

     if sp == 'sohu':
         ebi = grab("var _ebi = '([^^].*?)'", html)
         if not ebi:
             return None
         widget = fetch("http://blog.sohu.com/action/ebi_%s-m_view-type_profile/widget/" % ebi)
         return grab('<div id="profile_photo">[^^]*?<img src="([^^].*?)"', widget)

     if sp == '163':
         host_name = grab("hostName     : '([^^].*?)'", html)
         data_digest = grab("dataDigest\t  : '([^^].*?)'", html)
         if not host_name or not data_digest:
             return None
         preview = fetch('http://ud3.blog.163.com/%s/%s/modi=1208265646323&mid=0&tid=0&pdm=1/prev.js' % (host_name, data_digest))
         if not preview:
             return None
         avatar = grab('<img class=[^^]*?src=[^^]*?"([^^].*?)"', preview)
         if avatar:
             # The last captured character is dropped (presumably the
             # backslash of an escaped quote in the JS payload; TODO confirm).
             return avatar[:-1]
         profile = fetch('http://blog.163.com/%s/profile/' % host_name)
         return grab('<img class="bd01 g_img_00 g_c_hand" src="([^^].*?)"', profile)

     if sp == 'blogcn':
         # NOTE(review): the previous literal 'blogusername="******"' could
         # not be compiled (multiple repeat) and captured nothing. The
         # capture group follows the idiom of the other patterns here;
         # confirm against a real blogcn page.
         blog_user = grab('var[^^]*?blogusername="([^^].*?)"', fetch(homepage))
         if not blog_user:
             return None
         info = fetch('http://userinfo.blogcn.com/%s.shtml' % blog_user)
         return grab('<img class="top-5px" src="([^^].*?)"', info)

     if sp == 'ycool':
         # '?' must be escaped: unescaped it made the preceding 'p'
         # optional, so the literal 'space.php?uid=' could never match.
         uid = grab(r'<a href="http://www.ycool.com/space.php\?uid=([^^].*?)"', fetch(homepage))
         if not uid:
             return None
         return 'http://ug.ycstatic.com/avatar/%sx96.jpg' % uid

     if sp == 'hexun':
         script = grab('<div id="master_ptoto_1">[^^]*?<script src=\'([^^].*?)\'>', fetch(homepage))
         if not script:
             return None
         return grab("<img src='([^^].*?)'", fetch(script))

     if sp == 'live':
         tile = '<div class="cxp_ic_tile_clip"[^^]*?<img[^^]*?src="([^^].*?)"'
         # The homepage must show a tile, but the value returned is the
         # one found on the 'recent/' page.
         if not grab(tile, fetch(homepage)):
             return None
         return grab(tile, fetch(urljoin(homepage, 'recent/')))

     if sp == 'tianya':
         writer = grab(
             r'<BloggerMemsList>[^^]*?<a href="http://www.tianya.cn/browse/listwriter.asp\?vwriter=([^^].*?)&idWriter=0&Key=0"[^^]*?</a>',
             fetch(homepage))
         if not writer:
             return None
         listing = fetch('http://my.tianya.cn/mytianya/ListWriterNew.asp?vwriter=%s' % writer)
         return grab('<img onload="[^^]*?src="([^^].*?)"', listing)

     return None
Exemplo n.º 13
0
 def reject_url(self, url):
     """Tell whether *url* should be skipped by the crawler.

     A URL whose base equals ``self.baseurl`` is never rejected. Any other
     URL is rejected when no accept patterns are configured, when it
     matches none of the accept patterns, or when it matches one of the
     reject patterns. Patterns are regular expressions matched from the
     start of the URL.

     Returns:
         True if the URL must be rejected, False otherwise.
     """
     if self.baseurl == URL.baseurl(url):
         return False
     accept = self.accept_url_patterns
     reject = self.reject_url_patterns
     if not accept or not re.match('|'.join(accept), url):
         return True
     # Reject patterns only count when some are configured: joining None
     # raised TypeError and an empty list produced a match-everything regex.
     return bool(reject and re.match('|'.join(reject), url))
     
Exemplo n.º 14
0
 def get_rss_url(self, starturl, etag=None, last_modified=None, proxy=None):
     """Fetch *starturl* and return what ``URL.rss_link`` finds in its body.

     The page is downloaded with a ``Mario`` configured with the given
     etag, last-modified value and proxy, using *starturl* as referer.
     Returns None when the download yields no response.
     """
     fetcher = Mario(referer=starturl, etag=etag, last_modified=last_modified, proxy=proxy)
     response = fetcher.get(starturl)
     if response:
         return URL.rss_link(starturl, response.body)
     return None
Exemplo n.º 15
0
 def connect(self, url, body=None, headers=HEADERS, normalize=True, args=None):
     """Create and configure a ``pycurl.Curl`` handle for *url*.

     Only configuration happens here; the transfer is not performed.

     Args:
         url: target address, passed through ``URL.normalize(url, normalize)``.
         body: data to POST (url-encoded before sending). It is replaced by
             ``self.login``, else ``self.post_body``, when those are set.
             With no body at all the request is a GET.
         headers: mapping of request headers; it is copied, never modified.
             When empty or None no custom headers (including the
             conditional ones) are set.
         normalize: forwarded to ``URL.normalize``.
         args: opaque value stored on the handle as ``c.args``.

     Returns:
         The configured handle, or None if the URL could not be set on it.
     """
     url = URL.normalize(url, normalize)
     if callable(self.callpre):
         self.callpre(url)
     c = pycurl.Curl()
     if headers:
         # Work on a copy: setdefault() would otherwise write the
         # User-Agent into the caller's dict or the shared HEADERS default,
         # pinning the first agent chosen for every later request.
         headers = dict(headers)
         headers.setdefault('User-Agent', self.user_agent or self.random_user_agent())
         header_list = ['%s: %s' % item for item in headers.iteritems()]
         if self.last_modified:
             header_list.append('%s: %s' % ('If-Modified-Since', self.last_modified))
         if self.etag:
             # A stored entity tag is sent back in If-None-Match; ETag is
             # a response header and has no effect in a request.
             header_list.append('%s: %s' % ('If-None-Match', self.etag))
         if header_list:
             c.setopt(pycurl.HTTPHEADER, header_list)
     # Presence of a body indicates that we should do a POST
     if self.post_body:
         body = self.post_body
     if self.login:
         body = self.login
     if body is not None:
         logger.debug('post')
         body = urlencode(body)
         c.setopt(pycurl.POST, 1)
         c.setopt(pycurl.POSTFIELDS, body)
     else:
         c.setopt(pycurl.HTTPGET, 1)
     c.url = url
     c.args = args
     c.setopt(pycurl.ENCODING, 'gzip, deflate')
     c.setopt(pycurl.FOLLOWLOCATION, 1)
     c.setopt(pycurl.MAXREDIRS, 10)
     c.setopt(pycurl.CONNECTTIMEOUT, 30)
     c.setopt(pycurl.TIMEOUT, self.timeout)
     c.setopt(pycurl.NOSIGNAL, 1)
     # Body and headers of the response are collected in memory.
     c.response = StringIO()
     c.header_data = StringIO()
     c.setopt(pycurl.WRITEFUNCTION, c.response.write)
     c.setopt(pycurl.HEADERFUNCTION, c.header_data.write)
     try:
         c.setopt(pycurl.URL, URL.quote(url))
     except Exception:
         # A URL that cannot be quoted or set makes the request impossible.
         return None
     if self.cookies:
         cookies = self.cookies
     else:
         cookies = self.parse_cookies(c)
     if cookies:
         c.setopt(pycurl.COOKIELIST, '')
         chunks = []
         for key, value in cookies.iteritems():
             chunks.append('%s=%s;' % (quote_plus(key), quote_plus(value)))
         c.setopt(pycurl.COOKIE, ''.join(chunks))
     else:
         # NOTE(review): os.tempnam() is documented as insecure (the name
         # can be hijacked before the file is created); consider tempfile.
         cookie_file_name = os.tempnam()
         c.setopt(pycurl.COOKIEFILE, cookie_file_name)
         c.setopt(pycurl.COOKIEJAR, cookie_file_name)

     if self.referer:
         c.setopt(pycurl.REFERER, self.referer)

     if self.verbose:
         c.setopt(pycurl.VERBOSE, True)
         c.setopt(pycurl.DEBUGFUNCTION, self.verbose)

     if self.progress:
         c.setopt(pycurl.NOPROGRESS, False)
         c.setopt(pycurl.PROGRESSFUNCTION, self.progress)

     if self.proxies:
         self.proxy = random.choice(self.proxies)

     if self.proxy:
         if isinstance(self.proxy, (str, unicode)):
             proxy = self.proxy
         else:
             proxy = self.proxy['url']
             if 'userpwd' in self.proxy:
                 # The credentials used to be read from 'proxy_userpwd'
                 # only, raising KeyError when just 'userpwd' was given.
                 # That key still wins when both are present.
                 c.setopt(pycurl.PROXYUSERPWD, self.proxy.get('proxy_userpwd', self.proxy['userpwd']))
             if 'type' in self.proxy:
                 ptype = getattr(pycurl, 'PROXYTYPE_%s' % self.proxy['type'].upper())
                 c.setopt(pycurl.PROXYTYPE, ptype)
         c.setopt(pycurl.PROXY, proxy)

     if not self.secure:
         c.setopt(pycurl.SSL_VERIFYPEER, False)
         c.setopt(pycurl.SSL_VERIFYHOST, False)
     logger.debug('connected to %r'%url)
     return c
Exemplo n.º 16
0
                body = body.decode(encoding).encode('utf-8')
            elif charset and charset['encoding'] and charset['encoding'].lower()!='iso-8859-2':
                pattern = re.compile('<meta http-equiv="Content-Type" content="text/html; charset=([^^].*?)"', re.I|re.S)
                encoding = pattern.findall(body)
                if encoding:
                    encoding = encoding[0].lower()
                    if encoding in ALT_CODECS: encoding = ALT_CODECS[encoding]
                    if encoding.lower()!='iso-8859-2' and encoding.lower()!='utf-8':
                        body = body.decode(encoding).encode('utf-8')
        except UnicodeDecodeError, err:
            body = body.decode(encoding, "replace").encode('utf-8')
            #if callable(self.callfail): self.callfail(effective_url)
            #logger.error('Encoding error: %r'%c.url)
            #logger.error(err)
            #return None
        response = HTTPResponse(url=c.url, effective_url=URL.normalize(effective_url), size=size, code=code, body=body, etag = Etag, last_modified = Last_Modified, args=c.args)
        logger.debug(response)
        try:
            if callable(self.callback): self.callback(response)
            return response
        except:
            if callable(self.callfail): self.callfail(effective_url)
            logger.error('Error: %r'%Traceback())
            return None
    
    def _handle_response_header(self, c):
        """Handle the response.
        This method decodes the response to unicode and checks for any error
        condition.  It additionally adds a C{Statistics} item to the response
        which contains upload & download times.