Example #1
    def _make_img_urls(self, product_key, img_count):
        """
        the keyworld `RLLZ` in url  meaning large size(about 800*1000), `RLLD` meaning small size (about 400 *500)
        http://www.ruelala.com/images/product/131385/1313856984_RLLZ_1.jpg
        http://www.ruelala.com/images/product/131385/1313856984_RLLZ_2.jpg

        http://www.ruelala.com/images/product/131385/1313856984_RLLZ_1.jpg
        http://www.ruelala.com/images/product/131385/1313856984_RLLZ_2.jpg
        """
        urls = []
        prefix = 'http://www.ruelala.com/images/product/'
        for i in range(0, img_count):
            suffix = '%s/%s_RLLZ_%d.jpg' % (product_key[:6], product_key, i+1)
            url = urllib.basejoin(prefix, suffix)
            urls.append(url)

        # If num_image_urls() returned 0 (img_count == 0), the RLLZ and RLLA images do not work; probe RLLDE images instead.
        if img_count == 0:
            for j in xrange(0, 1000):
                sub = '%s/%s_RLLDE_%d.jpg' %(product_key[:6], product_key, j+1)
                url = urllib.basejoin(prefix, sub)
                status = self.net.fetch_image(url)
                if status != 404:
                    urls.append(url)
                else:
                    return urls
        return urls
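Note: `urllib.basejoin` in Python 2 is an alias for `urlparse.urljoin`; on Python 3 the equivalent call is `urllib.parse.urljoin`. The URL composition used in Example #1 can be checked in isolation with the minimal sketch below, where the product key is an illustrative value taken from the docstring.

# Minimal sketch of the join performed in Example #1 (illustrative values only).
try:
    from urllib import basejoin                    # Python 2: alias of urlparse.urljoin
except ImportError:
    from urllib.parse import urljoin as basejoin   # Python 3 equivalent

prefix = 'http://www.ruelala.com/images/product/'
product_key = '1313856984'                         # value shown in the docstring above
suffix = '%s/%s_RLLZ_%d.jpg' % (product_key[:6], product_key, 1)
print(basejoin(prefix, suffix))
# -> http://www.ruelala.com/images/product/131385/1313856984_RLLZ_1.jpg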
Example #2
    def __init__(self):
        baseurl = 'http://164.100.47.132/LssNew/psearch/'
        date2num = {\
            (datetime.date(1998, 03, 23),      \
             datetime.date(1999, 04, 24)): 12, \
            (datetime.date(1999, 10, 20),      \
             datetime.date(2004, 02, 05)): 13, \
            (datetime.date(2004, 06, 02),      \
             datetime.date(2009, 02, 26)): 14, \
            (datetime.date(2009, 06, 01),      \
             datetime.date(2014, 06, 01)): 15, \
          }
        num2webform = { 12: 'DebateAdvSearch12.aspx', \
                        13: 'DebateAdvSearch13.aspx', \
                        14: 'DebateAdvSearch14.aspx', \
                        15: 'DebateAdvSearch15.aspx', \
                      }

        num2dateqry = { 12: 'DebateAdvSearch12.aspx', \
                        13: 'DebateAdvSearch13.aspx', \
                        14: 'DebateAdvSearch14.aspx', \
                        15: 'DebateAdvSearch15.aspx', \
                      }
        self.webformUrls = {}
        for k in date2num.keys():
            self.webformUrls[k] = urllib.basejoin(baseurl, \
                                                  num2webform[date2num[k]]) 
        self.dateqryUrls = {}
        for k in date2num.keys():
            self.dateqryUrls[k] = urllib.basejoin(baseurl, \
                                                  num2dateqry[date2num[k]]) 
Example #3
File: SUMO.py Project: Mektub/hconfig
 def _get_magnet(self,url):
    i=len(url)-1
    while url[i]!='/':
       i-=1
    url=url[:i+1]+urllib.quote_plus(url[i+1:])
    c=httplib2.Http()
    resp,content=c.request(url)
    if "set-cookie" in resp:
       cookie=resp['set-cookie']
    else:
       cookie=None
    tree=libxml2.htmlParseDoc(content,"utf-8")
    form=htmltools.find_elements(tree.getRootElement(),"form",id="frmAdultDisclaimer")
    if form:
       form=form[0]
       inputs=htmltools.find_elements(form,"input")
       body={}
       for i in inputs:
          body[i.prop('name')]=i.prop('value')
       del body['btn_Decline']
       body=urllib.urlencode(body)
       headers={'Content-type':"application/x-www-form-urlencoded"}
       if cookie:
          headers['Cookie']=cookie
       url=urllib.basejoin(url,form.prop('action'))
       resp,content=c.request(url,"POST",body,headers)
       if "set-cookie" in resp:
          cookie=resp['set-cookie']
       if cookie:
          headers['Cookie']=cookie
       url=urllib.basejoin(url,resp["location"])
       resp,content=c.request(url,headers=headers)
       tree=libxml2.htmlParseDoc(content,"utf-8")
    return htmltools.find_elements(tree.getRootElement(),"a",**{'class':'dwld_links'})[0].prop('href')
Example #4
File: BTSCENE.py Project: Mektub/hconfig
   def _parseLinks(self,url):
      c=httplib2.Http()
      resp,content=c.request(url)
      tree=libxml2.htmlParseDoc(content,"utf-8")
      links=htmltools.find_elements(tree.getRootElement(),"a")
      reflink=""
      magnet=None
      for i in links:
         if i.getContent().lstrip().rstrip()=="Download torrent":
            reflink=urllib.basejoin(url,i.prop('href'))
         if i.getContent().lstrip().rstrip()=="magnet link":
            magnet=urllib.basejoin(url,i.prop('href'))
            if "&" in magnet:
               j=magnet.index("&")
               magnet=magnet[:j]
      return reflink,magnet
Example #5
    def search(self, pattern=''):
        pattern = urllib.quote(pattern)

        url = '?s=%(pattern)s' % {'pattern': pattern}

        search = []

        search_elem = self.get_html_tree(url)

        if not search_elem or search_elem.find('div', {'class': 'alert alert-warning'}):
            # Sorry, no results were found.
            return search

        div_elems = search_elem.findAll(
            'div', {'class': 'col-lg-3 col-xs-3 col-sm-3 item'})
        for div_elem in div_elems:
            a_elem = div_elem.findAll('a')[-1]
            img_elem = div_elem.find('img')

            tv_show = re.sub(urllib.basejoin(self.main_url, 'watch-'), '',
                             a_elem.get('href'))

            item = {'label': a_elem.getText(),
                    'tv_show': tv_show,
                    'thumbnail': urllib.basejoin(self.main_url, img_elem.get('src'))}

            search.append(item)

        return search
Example #6
    def __iter__(self):
        if self.target:
            basepath = xmlrpclib.ServerProxy(self.target).getPhysicalPath()
        for item in self.previous:
            if not self.target:
                yield item
                continue
            keys = item.keys()
            type_, path = item.get(self.typekey(*keys)[0]), item.get(self.pathkey(*keys)[0])
            
            if not (type_ and path):             # not enough info
                yield item; continue



            #fti = self.ttool.getTypeInfo(type_)
            #if fti is None:                           # not an existing type
            #    msg = "constructor: no type found %s:%s" % (type_,path)
            #    logger.log(logging.ERROR, msg)
            #    yield item; continue

            elems = path.strip('/').rsplit('/', 1)
            
            for attempt in range(0, 3):
                try:
                
                    url = urllib.basejoin(self.target, path)
                    proxy = xmlrpclib.ServerProxy(url)
                    container, id = (len(elems) == 1 and ('', elems[0]) or elems)
                    #if id == 'index.html':
                    try:
                        #test paths in case of acquition
                        rpath = proxy.getPhysicalPath()
                        #TODO: should check type to see if it's correct?
                        rpath = rpath[len(basepath):]
                        if path == '/'.join(rpath):
                            self.logger.debug("%s already exists. Not creating"% ('/'.join(rpath)) )
                            break
                    except xmlrpclib.Fault:
                        # Doesn't already exist
                        pass
                    purl = urllib.basejoin(self.target,container)
                    pproxy = xmlrpclib.ServerProxy(purl)
                    try:
                        pproxy.invokeFactory(type_, id)
                        self.logger.info("%s Created with type=%s"% (path, type_) )
                    except xmlrpclib.ProtocolError,e:
                        if e.errcode == 302:
                            pass
                        else:
                            raise
                    except xmlrpclib.Fault:
                        self.logger.warning("Failure while creating '%s' of type '%s'"% (path, type_) )
                        pass
                    break
                except xmlrpclib.ProtocolError,e:
                    if e.errcode == 503:
                        continue
                    else:
                        raise
Example #7
 def _parse_result(self, page_url, result_line):
    
    torrent_link, category, title, size, seeders, leechers, health = TorrentSearch.htmltools.find_elements(result_line, "td")
    torrent_url = urllib.basejoin(page_url, TorrentSearch.htmltools.find_elements(torrent_link, "a")[0].prop('href').replace('/torrent_download/','/download/'))
    if len(TorrentSearch.htmltools.find_elements(title, "a"))==2:
       details_link = TorrentSearch.htmltools.find_elements(title, "a")[0]
    else:
       details_link = TorrentSearch.htmltools.find_elements(title, "a")[1]
    title = details_link.getContent()
    details_link = urllib.basejoin(page_url, details_link.prop('href'))
    size=size.getContent()
    size=size[:-4]+" "+size[-2:]
    seeders=eval(seeders.getContent())
    leechers=eval(leechers.getContent())
    
    category=self._parse_category(TorrentSearch.htmltools.find_elements(category, "a")[0].prop('href').split('/')[-2])
    
    c=httplib2.Http()
    resp,content=self.http_queue_request(details_link)
    tree=libxml2.htmlParseDoc(content,"utf-8")
    lines=TorrentSearch.htmltools.find_elements(TorrentSearch.htmltools.find_elements(tree, "td", **{'class':'tabledata0'})[0].parent.parent,"tr")
    for i in lines:
       cells=TorrentSearch.htmltools.find_elements(i, "td")
       if cells[0].getContent()=="Info hash:":
          hashvalue=cells[1].getContent()
       elif cells[0].getContent()=="Torrent added:":
          date=cells[1].getContent().split(" ")[0]
          date=time.strptime(date,"%Y-%m-%d")
          date=datetime.date(date.tm_year, date.tm_mon, date.tm_mday)
    
    self.add_result(ExtraTorrentPluginResult(title, date, size, seeders, leechers, torrent_url, hashvalue, category))
Example #8
def paso_a_paso():
    url = "%s/tips/recetas" % BASE_URL
    html = urllib.urlopen(url).read()
    dom = lxml.html.document_fromstring(html)
    answer = []
    serie = models.Serie()
    serie.title = 'Paso a paso'
    serie.description = "por Martiniano Molina"
    serie.url = 'rss://%s/content/elgourmet/paso_a_paso' % settings.MY_BASE_URL
    serie.thumbnail = dom.cssselect("#cab_logo img")[0].get("src")
    serie.episodes = []
    serie.show_name = 'paso_a_paso'
    for a in dom.cssselect("#contenedor a"):
        try:
            url2 = a.get('href')
            if not url2.startswith('receta'): continue
            url2 = urllib.basejoin(BASE_URL, url2)

            episode = models.Episode()
            episode.title = a.cssselect("h2")[0].text_content()
            print "\t%s" % episode.title
            html2 = urllib.urlopen(url2).read()
            episode.url = url2
            episode.thumbnail = urllib.basejoin(BASE_URL, dom.cssselect("img")[0].get('src'))
            x = re.findall('"file": ?"(.*?)"', html2)
            episode.video_url = get_video_url(x[0], STREAMING_URL)
            serie.episodes.append(episode)
        except Exception,e:
            print "Error: %s" % e
Example #9
def findVideoFrameLink(page, data):
    
    minheight=300
    minwidth=300
    
    frames = findFrames(data)
    if not frames:
        return None
    
    iframes = re.findall("(frame[^>]* height=[\"']*(\d+)[\"']*[^>]*>)", data)

    if iframes:
        for iframe in iframes:

            height = int(iframe[1])
            if height > minheight:
                m = re.findall("[\"' ]width=[\"']*(\d+[%]*)[\"']*", iframe[0])
                if m:
                    if m[0] == '100%':
                        width = minwidth+1
                    else:
                        width = int(m[0])
                    if width > minwidth:
                        m = re.findall('[\'"\s]src=["\']*\s*([^"\' ]+)\s*["\']*', iframe[0])
                        if m:
                            link = m[0]
                            if not link.startswith('http://'):
                                up = urlparse.urlparse(urllib.unquote(page))
                                if link.startswith('/'):
                                    link = urllib.basejoin(up[0] + '://' + up[1],link)
                                else:
                                    link = urllib.basejoin(up[0] + '://' + up[1] + '/' + up[2],link)
                            return link.strip()

    # Alternative 1
    iframes = re.findall("(frame[^>]*[\"; ]height:\s*(\d+)[^>]*>)", data)
    if iframes:
        for iframe in iframes:
            height = int(iframe[1])
            if height > minheight:
                m = re.findall("[\"; ]width:\s*(\d+)", iframe[0])
                if m:
                    width = int(m[0])
                    if width > minwidth:
                        m = re.findall('[ ]src=["\']*\s*([^"\' ]+)\s*["\']*', iframe[0])
                        if m:
                            link = m[0]
                            if not link.startswith('http://'):
                                link = urllib.basejoin(page,link)
                            return link.strip()

    # Alternative 2 (Frameset)
    iframes = re.findall('<FRAMESET[^>]+100%[^>]+>\s*<FRAME[^>]+src="([^"]+)"', data)
    if iframes:
        link = iframes[0]
        if not link.startswith('http://'):
            link = urllib.basejoin(page,link)
        return link.strip()
        
    return None
Example #10
 def __init__(self, name, rawdir, metadir, statsdir, updateMeta = False):
     utils.BaseCourt.__init__(self, name, rawdir, metadir, statsdir, updateMeta)
     self.cookiefile  = tempfile.NamedTemporaryFile()
     self.baseurl = 'http://ldemo.mp.nic.in'
     self.cookieurl = urllib.basejoin(self.baseurl, \
                                      'causelist/ciskiosk/ordermain.php')
     self.dateurl = urllib.basejoin(self.baseurl, \
                             '/causelist/ciskiosk/order_action.php?as9=ok3')
Example #11
 def __init__(self, name, rawdir, metadir, statsdir, updateMeta = False):
     lobis.Lobis.__init__(self, name, rawdir, metadir, statsdir, updateMeta)
     self.baseurl   = 'http://lobis.nic.in/'
     self.courturl  = urllib.basejoin(self.baseurl, '/phhc/')
     self.cookieurl = urllib.basejoin(self.baseurl, \
                                      '/phhc/juddt.php?scode=28')
     self.dateurl   = urllib.basejoin(self.baseurl, \
                                      '/phhc/juddt1.php?dc=28&fflag=1')
Example #12
 def _run_search(self,pattern,href=None,page=0):
    if href==None:
       href="http://mononoke-bt.org/browse2.php?search="+urllib.quote_plus(pattern)
    resp,content=self.http_queue_request(href,headers={'Cookie':self._app.parse_cookie(self.login_cookie)})
    tree=libxml2.htmlParseDoc(content,"utf-8")
    pager=htmltools.find_elements(tree.getRootElement(),"div",**{'class':'animecoversfan'})[0].parent.next
    try:
       data=htmltools.find_elements(pager,"b")[-1].getContent()
       i=len(data)-1
       while data[i] in "0123456789":
          i-=1
       self.results_count=eval(data[i+1:])
    except:
       pass
    restable=pager.next.next
    lines=htmltools.find_elements(restable,"tr",1)[1:-2]
    for i in lines:
       try:
          cells=htmltools.find_elements(i,"td")
          team, show, stype, name, torrent_link, nbfiles, nbcmt, rate, date, size, views, dl, seeders, leechers, ratio=cells
          link=htmltools.find_elements(name,"a")[0]
          label=link.getContent()
          link=urllib.basejoin(href,link.prop('href'))
          torrent_link=urllib.basejoin(href,htmltools.find_elements(torrent_link,"a")[0].prop('href'))+"&r=1"
          date=htmltools.find_elements(date,"nobr")[0].children.getContent()
          date=time.strptime(date,"%Y-%m-%d")
          date=datetime.date(date.tm_year,date.tm_mon,date.tm_mday)
          strsize=""
          cell=size.children
          while cell:
             if cell.name=="text":
                if strsize:
                   strsize+=" "
                strsize+=cell.getContent().upper()
             cell=cell.next
          size=strsize.replace('O','B')
          seeders=eval(seeders.getContent())
          leechers=eval(leechers.getContent())
          resp,content=self.http_queue_request(link,headers={'Cookie':self._app.parse_cookie(self.login_cookie)})
          itemtree=libxml2.htmlParseDoc(content,"utf-8")
          tds=htmltools.find_elements(itemtree.getRootElement(),"td")
          hashvalue=None
          for j in tds:
             if j.getContent()=="Info hash":
                hashvalue=j.next.next.getContent()
          self.add_result(MononokeBTPluginResult(label,date,size,seeders,leechers,torrent_link,hashvalue))
       except:
          pass
       if self.stop_search:
          return
    if not self.stop_search:
       try:
          b=htmltools.find_elements(pager,"b")[-1]
          if b.parent.name=="a":
             url="http://mononoke-bt.org/browse2.php?search=%s&page=%d"%(urllib.quote_plus(pattern),page+1)
             self._run_search(pattern,url,page+1)
       except:
          pass
Example #13
    def download_oneday(self, relpath, dateobj):
        dateurl = urllib.basejoin(self.baseurl, '/hcjudge/date_output.php')
        postdata = [('d1', dateobj.day), ('m1', dateobj.month),  \
                    ('y1', dateobj.year), ('d2', dateobj.day),   \
                    ('m2', dateobj.month), ('y2', dateobj.year), \
                    ('button', 'Submit')]

        webpage = self.download_url(dateurl, postdata = postdata)

        if not webpage:
            self.logger.warning(u'No webpage for %s date: %s' % \
                                 (dateurl, dateobj))
            return []

        d = utils.parse_webpage(webpage)

        if not d:
            self.logger.error(u'HTML parsing failed for date: %s' %  dateobj)
            return []

        newdls = []

        for link in d.findAll('a'):
            href = link.get('href')
            title = utils.get_tag_contents(link)

            if (not href) or (not title):
                self.logger.warning(u'Could not process %s' % link)
                continue

            words = href.split('/')
            filename = words[-1]

            url = urllib.basejoin(dateurl, href)

            self.logger.info(u'link: %s title: %s' % (href, title))

            relurl = os.path.join (relpath, filename)
            filepath = os.path.join(self.rawdir, relurl)
            metapath = os.path.join(self.metadir, relurl)

            if not os.path.exists(filepath):
                webpage = self.download_url(url)

                if not webpage:
                    self.logger.warning(u'No webpage %s' % url)
                else:
                    utils.save_file(filepath, webpage)
                    self.logger.info(u'Saved %s' % url)
                    newdls.append(relurl)

            if os.path.exists(filepath) and \
                    (self.updateMeta or not os.path.exists(metapath)):
                metainfo = self.get_meta_info(title, dateobj)
                if metainfo:
                    utils.print_tag_file(metapath, metainfo)

        return newdls     
Example #14
    def __iter__(self):
    
        self.checkOptions()
                            
        for item in self.previous:
            if not self.target:
                yield item
                continue
            
            keys = item.keys()
            
            # Apply defaultMatcher() function to extract necessary data
            # 1) which item will be transitioned
            # 2) with which transition
            pathkey = self.pathkey(*keys)[0]
            transitionskey = self.transitionskey(*keys)[0]

            if not (pathkey and transitionskey): # not enough info
                yield item
                continue
            
            path, transitions = item[pathkey], item[transitionskey]
            if isinstance(transitions, basestring):
                transitions = (transitions,)
                            
            remote_url = urllib.basejoin(self.target, path)
            if not remote_url.endswith("/"):
                remote_url += "/"
                

            for transition in transitions:
    
                transition_trigger_url = urllib.basejoin(remote_url, "content_status_modify?workflow_action=" + transition)
                self.logger.info("%s performing transition '%s'" % (path, transition))
                
                from httplib import HTTPException
                
                try:
                
                    f= urllib.urlopen(transition_trigger_url)
                    data = f.read()
                    
                    # Use Plone not found page signature to detect bad URLs
                    if "Please double check the web address" in data:
                        import pdb ; pdb.set_trace()
                        raise RuntimeError("Bad remote URL:" + transition_trigger_url)

                except HTTPException, e:
                    # Other than HTTP 200 OK should end up here,
                    # unless URL is broken in which case Plone shows
                    # "Your content was not found page"
                    self.logger.error("fail")
                    msg = "Remote workflow transition failed %s->%s" %(path,transition)
                    self.logger.log(logging.ERROR, msg, exc_info=True)
            
            yield item
Example #15
 def _run_search(self,pattern,href=None):
    if href==None:
       href="http://www.torrent411.com/search/"+urllib.quote_plus(pattern)
    resp,content=self.http_queue_request(href)
    content=_codecs.utf_8_encode(_codecs.latin_1_decode(content)[0])[0]
    tree=libxml2.htmlParseDoc(content,"utf-8")
    pager=htmltools.find_elements(htmltools.find_elements(tree.getRootElement(),"table",**{'class':'NB-frame'})[1],"p")[0]
    try:
       b=htmltools.find_elements(pager,"b")[-1]
       data=b.getContent()
       i=len(data)-1
       while data[i] in "0123456789":
          i-=1
       self.results_count=eval(data[i+1:])
    except:
       pass
    restable=htmltools.find_elements(pager.next.next,"table")[0]
    restable=htmltools.find_elements(restable,"table")[1]
    body=htmltools.find_elements(restable,"tbody")[0]
    lines=htmltools.find_elements(body,"tr",1)
    for i in lines:
       try:
          cat,link,a,date,b,c,d,e,f,g,h,i,size,j,seeders,leechers=htmltools.find_elements(i,"td")
          date=date.getContent().replace(chr(194)+chr(160)+"at"+chr(194)+chr(160)," ")
          date=time.strptime(date,"%Y-%m-%d %H:%M:%S")
          date=datetime.date(date.tm_year,date.tm_mon,date.tm_mday)
          size=size.getContent().replace(chr(194)+chr(160)," ")
          seeders=eval(seeders.getContent())
          leechers=eval(leechers.getContent())
          link=htmltools.find_elements(link,"a")[0]
          label=link.prop('title')
          link=urllib.basejoin("http://www.torrent411.com",link.prop('href'))
          resp,content=self.http_queue_request(link)
          content=_codecs.utf_8_encode(_codecs.latin_1_decode(content)[0])[0]
          itemtree=libxml2.htmlParseDoc(content,"utf-8")
          table=htmltools.find_elements(itemtree.getRootElement(),"table",**{'cellpadding':'3'})[1]
          desc,name,torrent,cat,siz,hashvalue=htmltools.find_elements(table,"tr")[:6]
          torrent=htmltools.find_elements(torrent,"a")[0].prop('href')
          hashvalue=htmltools.find_elements(hashvalue,"td")[1].getContent()
          self.add_result(Torrent411PluginResult(label,date,size,seeders,leechers,torrent,hashvalue))
       except:
          pass
       if self.stop_search:
          return
    if not self.stop_search:
       try:
          links=htmltools.find_elements(pager,"a")
          next_link=None
          for i in links:
             if i.getContent()=="Next"+chr(194)+chr(160)+">>":
                next_link=i
          if next_link:
             link=urllib.basejoin("http://www.torrent411.com",next_link.prop('href'))
             self._run_search(pattern,link)
       except:
          pass
Example #16
    def __init__(self, name, rawdir, metadir, statsdir, updateMeta=False):
        utils.BaseCourt.__init__(self, name, rawdir, metadir, statsdir, updateMeta)
        self.baseurl = "http://patnahighcourt.bih.nic.in"
        self.hostname = "patnahighcourt.bih.nic.in"
        self.dateurl = urllib.basejoin(self.baseurl, "/judgment/judgDateWise.aspx")
        self.formaction = "judgDateWise.aspx"

        self.cookiefile = tempfile.NamedTemporaryFile()
        self.cookieurl = urllib.basejoin(self.baseurl, "/judgment/default.aspx")
        self.download_url(self.cookieurl, savecookies=self.cookiefile.name)
Example #17
File: cic.py Project: edudemy/judis-re
    def __init__(self, name, rawdir, metadir, statsdir, updateMeta = False):
        utils.BaseCourt.__init__(self, name, rawdir, metadir, statsdir, updateMeta)
        self.baseurl = 'http://rti.india.gov.in'

        self.dateurl = urllib.basejoin(self.baseurl, \
                                       '/decision_categorywise.php')
        self.posturl = self.dateurl
        self.resulturl = urllib.basejoin(self.dateurl, \
                                         '/result_decision_categorywise.php')
        self.cookiefile  = tempfile.NamedTemporaryFile()
Example #18
def get_all_mp3(url):
    '''get all mp3 from a url'''
    data = urllib2.urlopen(url).read()
    re_com = re.compile('http://.*?\.mp3')
    all = re_com.findall(data)
    re_com = re.compile('<a href=\"(.*?\.mp3)\"')
    ll = re_com.findall(data)
    for i in ll:
        if urllib.basejoin(url,i) not in all:
            all.append(urllib.basejoin(url,i))
    return list(set(all))  # remove duplicate songs
Example #19
 def scrap_serie(self, serie):
     url = serie.url
     html = urllib.urlopen(url).read()
     soup = BeautifulSoup(html, from_encoding='utf-8')
     videos = soup.find('article','videos-list fod')
     if videos is None:
         return
     videos = list(videos('li','video')) + list(videos('li','video last'))
     for li in videos:
         url = urllib.basejoin(self.BASE_URL, li.a.get('href'))
         thumbnail = urllib.basejoin(self.BASE_URL, li.img.get('src'))
         self.scrap_episode(serie, url, thumbnail)
Example #20
File: bakabt.py Project: Mektub/hconfig
 def _run_search(self,pattern, page_url=''):
    http=httplib2.Http()
    headers={'Cookie':self.login_cookie}
    if page_url=="":
       page_url="http://www.bakabt.com/browse.php?q="+urllib.quote(pattern)
    resp,content=http.request(page_url,headers=headers)
    tree=libxml2.htmlParseDoc(content,"utf-8")
    try:
       data=htmltools.find_elements(htmltools.find_elements(tree.getRootElement(), "div", **{'class':'pager'})[0], "a")[-2].getContent()
       i=len(data)-1
       while i>=0 and data[i] in "0123456789":
          i-=1
       self.results_count=eval(data[i+1:])
    except:
       pass
    results_table=htmltools.find_elements(tree.getRootElement(),"table",**{'class':'torrents'})[0]
    lines=htmltools.find_elements(results_table,"tr")[1:]
    is_alt=False
    for i in range(len(lines)):
       try:
          line=lines[i]
          if "torrent_alt" in line.prop('class') and not is_alt:
             is_alt=True
             continue
          if not "torrent_alt" in line.prop('class'):
             is_alt=False
          
          cells=htmltools.find_elements(line,"td")
          if len(cells)==6:
             category, details, comments, date, size, transfers = cells
          else:
             details, comments, date, size, transfers = cells
          day,month,year=date.getContent().replace("'","").split(" ")
          day=eval(day)
          year=eval("20"+year)
          month=['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'].index(month)+1
          date=datetime.date(year,month,day)
          seeders,leechers=htmltools.find_elements(transfers,"a")
          seeders=eval(seeders.getContent())
          leechers=eval(leechers.getContent())
          size=size.getContent()
          link=htmltools.find_elements(details,"a")[0]
          label=link.getContent()
          link=urllib.basejoin(page_url,link.prop('href'))
          self.add_result(BakaBTPluginResult(label,date,size,seeders,leechers,link))
       except:
          pass
       if self.stop_search:
          return
    if not self.stop_search:
       link=htmltools.find_elements(htmltools.find_elements(tree.getRootElement(), "div", **{'class':'pager'})[0], "a")[-1]
       if link.prop('class')!='selected':
          self._run_search(pattern, urllib.basejoin(page_url, link.prop('href')))
Example #21
 def _run_search(self,pattern,href=None):
    if href==None:
       href="http://linuxtracker.org/index.php?page=torrents&search="+urllib.quote_plus(pattern)
    resp,content=self.http_queue_request(href)
    tree=libxml2.htmlParseDoc(content,"utf-8")
    try:
       pager=htmltools.find_elements(tree.getRootElement(),"form",name="change_page")[0]
       options=htmltools.find_elements(pager,"option")
       self.results_count=50*len(options)
    except:
       pager=None
       self.results_count=50
    restable=htmltools.find_elements(tree.getRootElement(),"table",**{'class':'lista'})[1]
    lines=htmltools.find_elements(restable,"tr")[1:]
    for i in lines:
       try:
          cat,link,torrent_link,date,seeders,leechers,a,b=htmltools.find_elements(i,"td")
          label=link.getContent()
          link=urllib.basejoin(href,htmltools.find_elements(link,"a")[0].prop('href'))
          torrent_link=urllib.basejoin(href,htmltools.find_elements(torrent_link,"a")[0].prop('href'))
          date=time.strptime(date.getContent(),"%d/%m/%Y")
          date=datetime.date(date.tm_year,date.tm_mon,date.tm_mday)
          seeders=eval(seeders.getContent())
          leechers=eval(leechers.getContent())
          resp,content=self.http_queue_request(link)
          itemtree=libxml2.htmlParseDoc(content,"utf-8")
          table=htmltools.find_elements(itemtree.getRootElement(),"table",**{'class':'coltable'})[0]
          size=None
          hashvalue=None
          for td in htmltools.find_elements(table,"td"):
             if td.getContent()=="Size" and size==None:
                size=td.next.next.getContent()
             if td.getContent()=="Info Hash" and hashvalue==None:
                hashvalue=td.next.next.getContent()
          self.add_result(linuxTRACKERPluginResult(label,date,size,seeders,leechers,torrent_link,hashvalue))
       except:
          pass
       if self.stop_search:
          return
    if not self.stop_search:
       try:
          if pager:
             spans=htmltools.find_elements(pager,"span")
             i=0
             while i<len(spans) and spans[i].prop('class')!='pagercurrent':
                i+=1
             i+=1
             if i<len(spans):
                link=htmltools.find_elements(spans[i],"a")[0]
                link=urllib.basejoin(href,link.prop('href'))
                self._run_search(pattern,link)
       except:
          pass
Example #22
def urlMerge(params, src):
    paramArr = __parseParams(params)
    paramTrunk = paramArr[0].replace('%s', src).replace("\t","")
    paramFile= paramArr[1].replace('%s', src).replace("\t","")

    if not paramFile.startswith('http'):
        up = urlparse.urlparse(urllib.unquote(paramTrunk))
        if paramFile.startswith('/'):
            return urllib.basejoin(up[0] + '://' + up[1], paramFile)
        else:
            return urllib.basejoin(up[0] + '://' + up[1] + '/' + up[2],paramFile)
    return src
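Note: the two branches above rebuild a base URL from urlparse components before calling basejoin; urlparse.urljoin (the function behind urllib.basejoin in Python 2) performs essentially the same resolution directly against the source URL. A minimal sketch with made-up URLs:

# Illustrative sketch: urljoin resolves both absolute-path and relative references
# against the page URL, which is what the branches above do by hand.
import urlparse

page = 'http://example.com/videos/page.html'      # hypothetical page URL
print(urlparse.urljoin(page, '/player/embed'))    # -> http://example.com/player/embed
print(urlparse.urljoin(page, 'player/embed'))     # -> http://example.com/videos/player/embed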
Example #23
def get_item_playable(idItem):
    urlToLoad = urllib.basejoin(VVVVID_BASE_URL, idItem + '/info')
    data = getJsonDataFromUrl(urlToLoad)
    info = data['data']
    itemPlayable = ItemPlayableChannel()
    itemPlayable.title = info['title']
    itemPlayable.thumb = urllib.basejoin(VVVVID_STATIC_URL, info['thumbnail']) + '|' + HEADERS_ENCODED
    itemPlayable.id = info['id']
    itemPlayable.show_id = info['show_id']
    itemPlayable.ondemand_type = info['ondemand_type']
    itemPlayable.show_type = info['show_type']
    itemPlayable = get_seasons_for_item(itemPlayable)
    return itemPlayable
Example #24
    def __iter__(self):
        basepath = xmlrpclib.ServerProxy(self.target).getPhysicalPath()
        for item in self.previous:
            keys = item.keys()
            typekey = self.typekey(*keys)[0]
            pathkey = self.pathkey(*keys)[0]

            if not (typekey and pathkey):             # not enough info
                yield item; continue

            type_, path = item[typekey], item[pathkey]

            #fti = self.ttool.getTypeInfo(type_)
            #if fti is None:                           # not an existing type
            #    msg = "constructor: no type found %s:%s" % (type_,path)
            #    logger.log(logging.ERROR, msg)
            #    yield item; continue

            elems = path.strip('/').rsplit('/', 1)
            
            for attempt in range(0, 3):
                try:
                
                    url = urllib.basejoin(self.target, path)
                    proxy = xmlrpclib.ServerProxy(url)
                    container, id = (len(elems) == 1 and ('', elems[0]) or elems)
                    #if id == 'index.html':
                    try:
                        #test paths in case of acquition
                        rpath = proxy.getPhysicalPath()
                        rpath = rpath[len(basepath):]
                        if path == '/'.join(rpath):
                            break
                    except xmlrpclib.Fault:
                        pass
                    purl = urllib.basejoin(self.target,container)
                    pproxy = xmlrpclib.ServerProxy(purl)
                    try:
                        pproxy.invokeFactory(type_, id)
                    except xmlrpclib.ProtocolError,e:
                        if e.errcode == 302:
                            pass
                        else:
                            raise
                    break
                except xmlrpclib.ProtocolError,e:
                    if e.errcode == 503:
                        continue
                    else:
                        raise
Example #25
File: kplot.py Project: Cairnarvon/kplot
 def plot(self, filename=None, open=False):
     if isinstance(filename, str):
         if filename[-4:].lower() != '.png':
             filename += ".png"
         os.system('kst --png %s %s' % (filename,
                                        urllib.basejoin(self.db, 
                                                        str(self.dataset),
                                                        'kst')))
         if open:
             os.system('xdg-open %s &' % filename)
     else:
         os.system('kst %s &' % urllib.basejoin(self.db,
                                                str(self.dataset),
                                                'kst'))
Example #26
File: music.py Project: 20after4/Yaki
 def playset(self,request,response, resultset):
     try:
         streamerURL = urllib.basejoin(self.getWebApp().getURLprefix(), "streamer.sn")
         baseURL=self.getAppContext().streamURLbase or request.getBaseURL()
         reply = "#EXTM3U\n"
         for result in resultset:
             reply=reply + "#EXTINF:-1," + result[1] + '\n' + \
                    urllib.basejoin(baseURL,streamerURL+'?id='+`id(result[1])`)+'\n'
         # response.setContentType("audio/x-mpegurl")
         response.setContentType("audio/mpegurl")
         response.setContentLength(len(reply))
         response.getOutput().write(reply)
     except ValueError:
         response.sendError(501, "Invalid command args")
Example #27
 def add_css(self, css="", cache=True, vendor=False, **kwargs):
     if css.startswith('http'):
         path = css
     elif vendor:
         path = urllib.basejoin(options.vendor_css_root, css)
     else:
         path = urllib.basejoin(options.static_root, '%s/css/' % options.site_name)
         path = urllib.basejoin(path, css)
     cachestring = ('' if cache or not options.debug
                    else '?cacheid=%s' % CACHID)
     extra_params = ""
     for item in kwargs.iteritems():
         extra_params += '%s="%s" ' % item
     return """<link rel="stylesheet" href="%s%s" type="text/css" %s/>""" \
         % (path, cachestring, extra_params)
Example #28
    def __init__(self, name, rawdir, metadir, statsdir, updateMeta = False):
        utils.BaseCourt.__init__(self, name, rawdir, metadir, statsdir, updateMeta)
        self.hostname = 'gujarathc-casestatus.nic.in'
        self.baseurl = 'http://gujarathc-casestatus.nic.in/'
        self.pageurl = urllib.basejoin(self.baseurl, \
                                       '/gujarathc/SearchHCJudge')
        self.caseurl = urllib.basejoin(self.baseurl, \
                                       '/gujarathc/GetOrderDateNew')
        self.orderurl = urllib.basejoin(self.baseurl, \
                                        '/gujarathc/OrderHistoryViewDownload')

        self.cookiefile  = tempfile.NamedTemporaryFile()

        self.download_url(self.baseurl, \
                          savecookies = self.cookiefile.name)
Example #29
File: kplot.py Project: Cairnarvon/kplot
 def __getattribute__(self, name):
     """
     Overwritten to ensure data is always synchronised with the DB.
     """
     if name == 'data':
         # Check if remote data is unaltered
         url = urllib.basejoin(self.db, "/%d/diff" % self.dataset)
         updated = int(urllib2.urlopen(url).read())
         if self.updated != updated:
             # Fetch remote data
             url = urllib.basejoin(self.db, "/%d/py" % self.dataset)
             data, _ = self.__prep_data(urllib2.urlopen(url).read())
             object.__setattr__(self, 'data', data)
             self.updated = updated
     return object.__getattribute__(self, name)
Example #30
    def parse_result_page(self, posturl, webpage, dateobj):
        judgments = []
        d = utils.parse_webpage(webpage)
        if not d:
            self.logger.error(u'Could not parse result page %s' % dateobj)
            return judgments

        # get judgments
        trs = d.findAll('tr')
        for tr in trs:
            judgment = {}
            metainfo = { 'date': utils.date_to_xml(dateobj)}

            links = tr.findAll('a')
            for link in links:
                href = link.get('href')
                if href and re.search('WebShowJudgment.do', href):
                    t = utils.get_tag_contents(link)
                    colon = t.find(':')
                    if colon:
                        title = t[colon+1:]
                        title = title.strip()
                        metainfo['title'] = title
                        reobj = re.search(' vs\. ', title, re.IGNORECASE)
                        if reobj:
                            metainfo['petitioner'] = title[:reobj.start()]
                            metainfo['respondent'] = title[reobj.end():]
                if href and re.search('WebDownloadJudgmentDocument.do', href):
                    judgment['link'] = urllib.basejoin(posturl, href)
 
            if judgment:
                judgment['metainfo'] = metainfo
                judgments.append(judgment)
        
        # next link
        links = d.findAll('a')
        for link in links: 
            t = utils.get_tag_contents(link)          
            if re.search('Next', t):
                href = link.get('href')
             
                if href:
                    judgment = {'link': urllib.basejoin(posturl, href)}
                    judgment['next'] = True
                   
                judgments.append(judgment)
 
        return judgments
Example #31
def get(query, relative, outdir, listonly=False):
    page = 1
    while 1:
        params = dict(
            q = query,
            type = "Code",
            p = page
        )
        r = requests.get(SEARCH, params=params)
        if is_last_page(r.content):
            print("** No more results")
            break
        for u in extract(r.content):
            ru = raw_url(u)
            if relative:
                ru = urllib.basejoin(ru, relative)
            if listonly:
                print(ru)
            else:
                fn = make_fname(u)
                outpath = os.path.join(outdir, fn)
                if os.path.exists(outpath):
                    print("Skipping ", fn)
                else:
                    ret = requests.get(ru)
                    if ret.status_code == 200:
                        print("Fetching ", ru)
                        f = open(outpath, "w")
                        f.write(ret.content)
                        f.close()
                    else:
                        print("Error", fn, ret.status_code)
        page += 1
Example #32
    def __call__(self, url, baseURL=None):
        """Load the given multi-value url and call callbacks

        url -- vrml97-style url (multi-value string)
        baseURL -- optional base url from which items in url will
            be resolved.  protofunctions.root(node).baseURI will
            give you the baseURL normally used for the given node.

        raises IOError on failure
        returns (successfulURL, filename, open_file, headers) on success

        headers will be None for local files
        """
        log.info("Loading: %s, %s", url, baseURL)
        if isinstance(url, (str, unicode)):
            url = [url]
        file = None
        for u in url:
            # get the "absolute" url
            if baseURL:
                u = urllib.basejoin(baseURL, u)
            resolvedURL, file, filename, headers = self.get(u)
            if file is not None and filename is not None:
                break
        if not file or not filename:
            raise IOError("""Unable to download url %s""" % url)
        return (resolvedURL, os.path.abspath(filename), file, headers)
Example #33
    def _downloadDecisions(self, soup):
        re_descPattern = re.compile(
            'Beslutsdatum: (\d+-\d+-\d+) Diarienummer: (.*)')
        for result in soup.first('div', {'class': 'SearchResult'}):
            if result.a['href']:
                url = urllib.basejoin("http://www.jo.se/", result.a['href'])
                # Seems to be a bug in BeautifulSoup - properly
                # escaped & entities are not de-escaped
                url = url.replace('&amp;', '&')
                desc = result.contents[-1].string
                m = re_descPattern.match(desc)
                beslutsdatum = m.group(1)
                id = m.group(2)
                filename = id.replace('/', '-') + ".html"

                resource = LegalSource.DownloadedResource(id)
                resource.url = url
                resource.localFile = filename
                log.info(u'Storing %s as %s' % (url, filename))
                Robot.Store(url, None,
                            self.dir + "/" + id.replace('/', '-') + ".html")
                resource.fetched = time.localtime()
                if id in self.ids:
                    log.warn(u'replacing URL of id %s to %s (was %s)' %
                             (id, url, self.ids[id].url))
                self.ids[id] = resource
Example #34
def parse(url):
    try:
        req = urllib2.Request(url)
        req.add_header('User-agent', 'Mozilla 5.10')
        content = urllib2.urlopen(req, timeout=2).read()
        content = unicode(content, 'utf-8')
        content = clean(content)
    except urllib2.URLError:
        return []
    tree = etree.HTML(content)
    seg_list = extract(
        post_clean(''.join(
            tree.xpath('//p//text()|//strong//text()|'
                       '//span//text()|//a//text()|//li//text()'))))
    log('URL: %-60s|' % url[:60], seg_list,
        'Remains:' + str(download_queue.qsize()))

    n_urls = map(lambda x: basejoin(url, x), tree.xpath('//a/@href'))
    n_urls = filter(lambda x: x.startswith('http'), n_urls)
    n_urls = filter(lambda x: x.split('.', 1)[1].startswith(domain), n_urls)
    ret_urls = []
    for url in n_urls:
        if r.get(url) is not None:
            continue
        else:
            r.set(url, 1)
            ret_urls.append(url)
    return ret_urls
Example #35
def print_rep_table(repository):
    print "<table class=table cellpadding=3 cellspacing=0><tr class=table_header>"
    for title in ["Name", "Size", "Type", "Time", "Info"]:
        print "<td>%s</td>" % (title)
    print "</tr>"

    odd = True
    for x in os.listdir(repository.path):
        _, ext = os.path.splitext(x)
        if ext in EXT_TO_SHOW:
            fullname = join(realpath(repository.path), x)
            fullurl = quote(basejoin(repository.url + "/", x), ":/")

            if odd: print "<tr class=odd_row>"
            else: print "<tr class=even_row>"
            odd = not odd
            name, info = get_info(fullname)
            info = provisioning.crlf_to_cr(info).replace("\n", "<br>")
            print '<td><a href="%s">%s</a></td>' % (fullurl, name)
            print "<td class=size_column>%s</td>" % (str(
                os.path.getsize(fullname)))
            print "<td class=type_column><em>%s</em></td>" % (
                ext)  # (provisioning.get_mime_type(x))
            print "<td class=time_column>%s</td>" % (time.ctime(
                os.path.getctime(fullname)))
            print "<td class=info_column>%s</td></tr>" % (info)
Example #36
File: rss.py Project: fogueri/frappe
def get_context(context):
    """generate rss feed"""

    host = get_request_site_address()

    blog_list = frappe.db.sql("""\
		select page_name as name, published_on, modified, title, content from `tabBlog Post` 
		where ifnull(published,0)=1
		order by published_on desc limit 20""",
                              as_dict=1)

    for blog in blog_list:
        blog_page = cstr(urllib.quote(blog.name.encode("utf-8"))) + ".html"
        blog.link = urllib.basejoin(host, blog_page)
        blog.content = escape_html(blog.content or "")

    if blog_list:
        modified = max((blog['modified'] for blog in blog_list))
    else:
        modified = now()

    ws = frappe.doc('Website Settings', 'Website Settings')

    context = {
        'title': ws.title_prefix,
        'description': ws.description or ((ws.title_prefix or "") + ' Blog'),
        'modified': modified,
        'items': blog_list,
        'link': host + '/blog'
    }

    # print context
    return context
Example #37
def process_dir_xml(repository):
    try:
        print provisioning.XML_HEADER
        print '<?xml version="1.0" encoding="utf-8"?>'
        print """<serverContent
            xmlns="http://sun.com/2006/provisioning"
            xmlns:dd="urn:oma:xml:dl:dd:2.0"
            xmlns:xsd="http://www.w3.org/2001/XMLSchema-instance"
            xsd:schemaLocation="http://sun.com/2006/provisioning servercontent.xsd">
            """

        for x in os.listdir(repository.path):
            _, ext = os.path.splitext(x)
            if ext in EXT_TO_SHOW:
                print """<dd:media xmlns="urn:oma:xml:dl:dd:2.0" DDVersion="2.0">
                    <product><mediaObject>"""
                fullname = join(realpath(repository.path), x)
                fullurl = quote(basejoin(repository.url + "/", x), ":/")
                name, info = get_info(fullname)
                print "<meta><name>%s</name></meta>" % (name)
                print "<size>%d</size>" % (os.path.getsize(fullname))
                print "<type>%s</type>" % (provisioning.get_mime_type(x))
                print "<objectID>%s</objectID>" % (fullurl)
                print "<objectURI><server>%s</server></objectURI>" % fullurl  # (cgi.escape(objectURI))
                print "</mediaObject></product></dd:media>"

        print "</serverContent>"
    except Exception, e:
        sys.stderr.write("%s, %s" % (Exception, e))
Example #38
    def process_judgment_page(self, relpath, url, dateobj):
        webpage = self.download_url(url, loadcookies = self.cookiefile.name)
        if not webpage:
            self.logger.warning(u'Could not download %s' % url)
            return None

        d = utils.parse_webpage(webpage)
        if not d:
            self.logger.warning(u'Could not parse %s' % url)
            return None

        metainfo = self.get_meta_info(d, dateobj)

        for link in d.findAll('a'):
            href = link.get('href')
            title = utils.get_tag_contents(link)

            if (not href) or (not title):
                self.logger.warning(u'Could not process %s' % link)
                continue

            action = self.action_on_link(href, title)
            newurl = urllib.basejoin(url, href)
            if action == 'save':
                self.logger.info(u'Downloading %s' % newurl)
                return self.get_judgment(relpath, newurl, title, metainfo)

        return None
Example #39
    def appendURL(self, url, included=0):
        """Append packages from the database with the given URL.
        Only the first database should specify included=0, so the
        global information (maintainer, description) get stored."""

        if url in self._urllist:
            return
        self._urllist.append(url)
        fp = urllib2.urlopen(url).fp
        plistdata = plistlib.Plist.fromFile(fp)
        # Test here for Pimp version, etc
        if included:
            version = plistdata.get('Version')
            if version and version > self._version:
                sys.stderr.write(
                    "Warning: included database %s is for pimp version %s\n" %
                    (url, version))
        else:
            self._version = plistdata.get('Version')
            if not self._version:
                sys.stderr.write(
                    "Warning: database has no Version information\n")
            elif self._version > PIMP_VERSION:
                sys.stderr.write(
                    "Warning: database version %s newer than pimp version %s\n"
                    % (self._version, PIMP_VERSION))
            self._maintainer = plistdata.get('Maintainer', '')
            self._description = plistdata.get('Description', '').strip()
            self._url = url
        self._appendPackages(plistdata['Packages'], url)
        others = plistdata.get('Include', [])
        for o in others:
            o = urllib.basejoin(url, o)
            self.appendURL(o, included=1)
Example #40
 def delete(self,
            node='',
            data='',
            level='dataset',
            rmSubscriptions='y',
            comments='',
            format='json',
            instance='prod'):
     name = "delete"
     if not (node and data):
         self.logger.error(name, "Need to pass both node and data")
         return 1, "Error"
     values = {
         'node': node,
         'data': data,
         'level': level,
         'rm_subscriptions': rmSubscriptions,
         'comments': comments
     }
     deleteURL = urllib.basejoin(self.phedexBase,
                                 "%s/%s/delete" % (format, instance))
     check, response = self.phedexCall(deleteURL, values)
     if check:
         self.logger.error(name, "Delete call failed")
         return 1, "ERROR - self.phedexCall with response: " + response
     return 0, response
Example #41
 def scrape(self):
     dic = BeautifulSoupScraper.scrape(self)
     text = dic.get('title', '') + '\n' + dic.get('text', '')
     images = dic.get('images', [])
     if type(images) != list: images = [images]
     images = [urllib.basejoin(self.url, i) for i in images]
     return text, images
Example #42
def grep_items(config, url=None, page=1):
    items = list()
    if not url:
        url = config['main_url']
    try:
        internet_lock.acquire()  # Avito does not like simultaneous requests
        sleep(1)
        html = urllib.urlopen(url).read()
        internet_lock.release()
        if not html:
            raise IOError
    except IOError:
        print 'error while open url %s' % url
    else:
        bs = bs4.BeautifulSoup(html)
        if bs.find(
                'input',
            {'id': 'search'})['value'].lower() == config['search'].lower():
            # Avito simplifies the search query when nothing is found
            bs_items = bs.findAll('div', {'class': 'item'})
            for item in bs_items:
                items.append(Item(config, item))
            next_page = bs.find('a', {'class': 'pagination__page'},
                                text=u'\n Следующая страница →\n ')
            if next_page:
                items += grep_items(
                    config, urllib.basejoin(config['site'], next_page['href']),
                    page + 1)
    if page == 1:
        config['last_len'] = len(items)
        config['last_check'] = datetime.datetime.now()
        print_status()
    return items
Example #43
    def upload_scrawl_file(self, request):
        """
        Handle scrawl (doodle) image uploads.
        """
        try:
            action = request.GET.get("action", "")
            form_name = self.get_action_form_name(action)
            content=request.POST.get(form_name)
            upload_file = ContentFile(base64.decodestring(content))
            
            scrawl_default_name = "{}.png".format(str(int(time.time())))  # default naming rule for scrawl files
            store_path = self._get_upload_path(scrawl_default_name)
            _file_name, upload_file_suffix = os.path.splitext(store_path)
            upload_file.name = _file_name

            self.storage.save(store_path, upload_file)
            rst = {
                'state': 'SUCCESS',
                'url': urllib.basejoin(self.settings.TUEDITOR_MEDIA_URL, store_path),
                'original': upload_file.name,
                'type': upload_file_suffix.replace(".", ""),
                'size': upload_file.size,
            }
        except Exception,E:
            rst = {
                'state': "写入图片文件错误:%s" % E.message,
            }
Example #44
 def put(self, title, body, revision=None, comment='', format='json'):
     logger.info('[put] %s size: %d revision:%s comment:%s', title,
                 len(body), revision, comment)
     if revision is None:
         _resp, data = self.get(title)
         revision = data['revision']
     url = urllib.basejoin(self.baseurl, title)
     data = urllib.urlencode({
         'title': title,
         'body': body,
         'revision': revision,
         'comment': comment or self.DEFAULT_COMMENT
     })
     try:
         resp, content = self._request(url,
                                       format=format,
                                       method='PUT',
                                       body=data)
         # TODO: handle 406, 409
         try:
             content = json.loads(content)
         except Exception as e:
             logger.error('[put] json load error: %s', e)
         return resp, content
     except HTTPError as e:
         logger.error("[put] %d %s", e.code, e.msg)
         raise
Example #45
    def result_page(self, relpath, url, dateobj, linkdict):
        newdls = []
        webpage = self.download_url(url, loadcookies = self.cookiefile.name)

        d = utils.parse_webpage(webpage)

        if not d:
            self.logger.error(u'Could not parse html of the result page for date %s' % dateobj)
            return newdls

        for link in d.findAll('a'):
            href = link.get('href')
            title = utils.get_tag_contents(link)

            if (not href) or (not title) or linkdict.has_key(href):
                self.logger.warning(u'Could not process %s' % link)
                continue

            linkdict[href] = 1

            action = self.action_on_link(href, title)
            self.logger.info(u'Action %s on link %s title %s' %\
                                     (action, href, title))       
            newurl = urllib.basejoin(url, href)
            if action == 'judgmentlink':
                relurl = self.process_judgment_page(relpath, newurl, dateobj)
                if relurl:
                    newdls.append(relurl)
                else:
                    self.logger.warning(u'Judgment link not working %s' % newurl)
            elif action == 'recurse':
                newdls.extend(self.result_page(relpath, newurl, dateobj, 
                                               linkdict))
           
        return newdls
Example #46
    def language_url(self, language):
        """Get the dump location for given language

        :param language:    ISO 639 language code
        :type language:     string
        """
        return urllib.basejoin(self._host, self.language_dir(language))
Example #47
 def __init__(self, srcdir, rawdir, metadir, statsdir, updateMeta = False):
     utils.BaseCourt.__init__(self, srcdir, rawdir, metadir, statsdir, updateMeta)
     self.baseurl  = 'http://judgmenthck.kar.nic.in'
     self.hostname  = 'judgmenthck.kar.nic.in'
     self.courturl = urllib.basejoin(self.baseurl, '/judgments/')
     self.cookiefile = tempfile.NamedTemporaryFile()
     self.get_cookies()
示例#48
0
    def upload_file(self, request):
        """
        Handle a file upload.
        """
        action = request.GET.get("action", "")
        upload_form_name = self.get_action_form_name(action)
        upload_file = request.FILES.get(upload_form_name)
        upload_file_name, upload_file_suffix = os.path.splitext(upload_file.name) 

        if not self.is_size_allow(action, upload_file.size):
            return JsonResponse({"state": "File size does not meet requirements"})
        if not self.is_suffix_allow(action, upload_file_suffix):
            return JsonResponse({"state": "File format does not meet requirements"})

        store_path = self._get_upload_path(upload_file.name, action)
        self.storage.save(store_path, upload_file)

        rst = {
            'state': 'SUCCESS',
            'url': urllib.basejoin(self.settings.TUEDITOR_MEDIA_URL, store_path),
            'original': upload_file.name,
            'type': upload_file_suffix.replace(".", ""),
            'size': upload_file.size,
        }
        return JsonResponse(rst)
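A quick note on the URL built above: urllib.basejoin (an alias of urlparse.urljoin in Python 2) keeps only the directory part of the base, so the last path segment of TUEDITOR_MEDIA_URL is dropped unless the value ends with a slash. A minimal sketch, using a hypothetical media URL and store path:

import urllib

store_path = 'upload/2014/01/photo.png'  # hypothetical relative store path

# Without a trailing slash the last segment of the base is replaced ...
print urllib.basejoin('http://cdn.example.com/media', store_path)
# -> http://cdn.example.com/upload/2014/01/photo.png

# ... with a trailing slash the relative path is appended underneath it.
print urllib.basejoin('http://cdn.example.com/media/', store_path)
# -> http://cdn.example.com/media/upload/2014/01/photo.png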
示例#49
0
def get_context(context):
	"""generate the sitemap XML"""
	host = get_request_site_address()
	links = []
	for page in get_pages():
		if not page.no_sitemap:
			links.append({
				"loc": urllib.basejoin(host, urllib.quote(page.name.encode("utf-8"))),
				"lastmod": "2014-01-01"
			})

	def add_links(doctype, condition_field, order_by):
		meta = frappe.get_meta(doctype)
		page_name = "page_name"
		condition = ""

		if meta.get_field("parent_website_route"):
			page_name = """concat(ifnull(parent_website_route, ""),
				if(ifnull(parent_website_route, "")="", "", "/"), page_name)"""
		if condition_field:
			condition ="where ifnull({0}, 0)=1".format(condition_field)

		for route in frappe.db.sql("select {0}, modified from `tab{1}` {2}".format(page_name,
			doctype, condition)):
			if route[0]:
				links.append({
					"loc": urllib.basejoin(host, urllib.quote(route[0].encode("utf-8"))),
					"lastmod": get_datetime(route[1]).strftime("%Y-%m-%d")
				})

	process_generators(add_links)

	return {"links":links}
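The snippet above percent-encodes each route with urllib.quote before joining it to the host, which is what keeps non-ASCII page names valid in the sitemap loc entries. A minimal sketch with a hypothetical host and route name:

# -*- coding: utf-8 -*-
import urllib

host = 'http://example.com/'   # hypothetical site address
route = u'über-uns'            # hypothetical non-ASCII page name

loc = urllib.basejoin(host, urllib.quote(route.encode('utf-8')))
print loc
# -> http://example.com/%C3%BCber-uns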
示例#50
0
 def appendURL(self, url, included=0):
     if url in self._urllist:
         return
     self._urllist.append(url)
     fp = urllib2.urlopen(url).fp
     plistdata = plistlib.Plist.fromFile(fp)
     if included:
         version = plistdata.get('Version')
         if version and version > self._version:
             sys.stderr.write(
                 'Warning: included database %s is for pimp version %s\n' %
                 (url, version))
     else:
         self._version = plistdata.get('Version')
         if not self._version:
             sys.stderr.write(
                 'Warning: database has no Version information\n')
         elif self._version > PIMP_VERSION:
             sys.stderr.write(
                 'Warning: database version %s newer than pimp version %s\n'
                 % (self._version, PIMP_VERSION))
         self._maintainer = plistdata.get('Maintainer', '')
         self._description = plistdata.get('Description', '').strip()
         self._url = url
     self._appendPackages(plistdata['Packages'], url)
     others = plistdata.get('Include', [])
     for o in others:
         o = urllib.basejoin(url, o)
         self.appendURL(o, included=1)
示例#51
0
def get_imagelinks(url):
    """Given a URL, get all images linked to by the page at that URL."""
    # Check if BeautifulSoup is imported.
    if isinstance(BeautifulSoup, ImportError):
        raise BeautifulSoup

    links = []
    uo = URLopener()
    with uo.open(url) as f:
        soup = BeautifulSoup(f.read())

    # 'shown' and 'fileformats' are expected to be defined at module level (not shown in this snippet).
    if not shown:
        tagname = 'a'
    elif shown == 'just':
        tagname = 'img'
    else:
        tagname = ['a', 'img']

    for tag in soup.findAll(tagname):
        link = tag.get('src', tag.get('href', None))
        if link:
            ext = os.path.splitext(link)[1].lower().strip('.')
            if ext in fileformats:
                links.append(urllib.basejoin(url, link))
    return links
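Because every href/src is joined back onto the page URL, relative links of any form resolve to absolute image URLs. A small illustration with a hypothetical page URL:

import urllib

page = 'http://example.com/gallery/index.html'  # hypothetical page URL

print urllib.basejoin(page, 'thumbs/cat.jpg')           # relative to the page directory
# -> http://example.com/gallery/thumbs/cat.jpg
print urllib.basejoin(page, '../img/dog.png')           # parent-relative
# -> http://example.com/img/dog.png
print urllib.basejoin(page, '/static/logo.gif')         # root-relative
# -> http://example.com/static/logo.gif
print urllib.basejoin(page, '//cdn.example.com/x.jpg')  # protocol-relative
# -> http://cdn.example.com/x.jpg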
示例#52
0
 def redirect(self, URL, request, response):
     URL = urllib.basejoin(self.urlpattern, URL)  # make always absolute...
     if response.header_written or response.redirection_performed:
         del request, response
         raise RuntimeError(
             'can not redirect twice or when getOutput() has been called')
     request.server.redirect(URL, request, response)
示例#53
0
 def mkUrl(self, service):
     "Generate Safe Browsing API URL"
     url = urllib.basejoin(self.config['base_url'], service)
     query_params = '&'.join(
         ['%s=%s' % (k, v) for k, v in self.config['url_args'].items()])
     url = '%s?%s' % (url, query_params)
     return url
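The manual '&'.join above assumes every value in url_args is already URL-safe; urllib.urlencode would escape them as well. A sketch of that variant, with a purely hypothetical config:

import urllib

config = {
    'base_url': 'https://safebrowsing.example.com/api/',  # hypothetical base URL
    'url_args': {'client': 'api', 'key': 'abc/123', 'appver': '1.0'},
}

url = urllib.basejoin(config['base_url'], 'lookup')
print '%s?%s' % (url, urllib.urlencode(config['url_args']))
# e.g. https://safebrowsing.example.com/api/lookup?client=api&key=abc%2F123&appver=1.0
# (parameter order follows dict iteration; note the '/' in the key is escaped to %2F)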
示例#54
0
 def __init__(self, name, rawdir, metadir, statsdir, updateMeta=False):
     utils.BaseCourt.__init__(self, name, rawdir, metadir, statsdir,
                              updateMeta)
     self.baseurl = 'http://jhr.nic.in'
     self.hostname = 'jhr.nic.in'
     self.dateurl = urllib.basejoin(self.baseurl,
                                    '/hcjudge/date_output.php')
示例#55
0
    def delete(self,
               node='',
               data='',
               level='dataset',
               rmSubscriptions='y',
               comments='',
               format='json',
               instance='prod'):
        """
        _delete_

        Set up subscription call to PhEDEx API.
        """
        if not (node and data):
            return 1, " Error - need to pass both node and data"
        values = {
            'node': node,
            'data': data,
            'level': level,
            'rm_subscriptions': rmSubscriptions,
            'comments': comments
        }
        deleteURL = urllib.basejoin(self.phedexBase,
                                    "%s/%s/delete" % (format, instance))
        check, response = self.phedexCall(deleteURL, values)
        if check:
            return 1, " Error - self.phedexCall with response: " + response
        return 0, response
示例#56
0
def get_url(uri=None, full_address=False):
    """get app url from request"""
    host_name = frappe.local.conf.host_name

    if not host_name:
        if hasattr(frappe.local, "request"
                   ) and frappe.local.request and frappe.local.request.host:
            protocol = 'https' == frappe.get_request_header(
                'X-Forwarded-Proto', "") and 'https://' or 'http://'
            host_name = protocol + frappe.local.request.host
        elif frappe.local.site:
            host_name = "http://{}".format(frappe.local.site)
        else:
            host_name = frappe.db.get_value("Website Settings",
                                            "Website Settings", "subdomain")
            if host_name and "http" not in host_name:
                host_name = "http://" + host_name

            if not host_name:
                host_name = "http://localhost"

    if not uri and full_address:
        uri = frappe.get_request_header("REQUEST_URI", "")

    url = urllib.basejoin(host_name, uri) if uri else host_name

    return url
示例#57
0
    def parse_extraordinary_webpage(self, d, dateobj, ex_url):
        minfos = []

        result_table = self.find_result_table(d)
        if result_table == None:
            self.logger.warn('Could not find result table for date %s', dateobj)
            return minfos

        order = None
        for tr in result_table.find_all('tr'):
            if not order:
                order = self.find_result_order(tr)
                continue

            link = tr.find('a')
            if link == None:
                continue

            metainfo = self.process_row(tr, order, dateobj)
            if metainfo:
                href = link.get('href')
                if href:
                    gzurl = urllib.basejoin(ex_url, href)
                    metainfo.set_url(gzurl)
                    minfos.append(metainfo)

        return minfos
示例#58
0
    def action_sync(self):
        domain = self.last_sync and [("create_date", ">", self.last_sync)
                                     ] or []
        partner_domain = self._get_partner_domain(domain)

        last_sync = self.last_sync
        for values in self.env["res.partner"].search_read(
                partner_domain,
            ["surname", "firstname", "email", "create_date"],
                order="create_date asc"):
            email = values["email"]
            if not email:
                continue

            data = {
                "ne": email,
                "nn": values["surname"],
                "ns": values["firstname"]
            }

            url = urllib.basejoin(
                self.url, "wp-content/plugins/newsletter-api/add.php?nk=%s" %
                self.api_key)
            res = requests.post(url, data=data)
            if res.ok:
                _logger.info("Registered newsletter for %s" % email)
                last_sync = max(last_sync, values["create_date"])
            else:
                _logger.error("Unable to register newsletter for %s" % email)
                _logger.error(res.text)

        self.last_sync = last_sync
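In the sync above the API key is interpolated into the relative path before basejoin, and urljoin keeps that query string intact. A sketch (hypothetical base URL and key) of letting requests build the query string instead:

import urllib
import requests

base = 'https://blog.example.com/'   # hypothetical newsletter site
api_key = 'SECRET'                   # hypothetical API key

endpoint = urllib.basejoin(base, 'wp-content/plugins/newsletter-api/add.php')
res = requests.post(endpoint,
                    params={'nk': api_key},            # query string built and escaped by requests
                    data={'ne': 'user@example.com',    # same form fields as the snippet above
                          'nn': 'Doe',
                          'ns': 'John'})
print res.status_code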
示例#59
0
    def download_extraordinary(self, dls, relpath, dateobj):
        ex_url = urllib.basejoin(self.baseurl,
                                 self.extraordinary_url % dateobj.year)

        response = self.download_url(ex_url)
        if not response or not response.webpage:
            self.logger.warn(
                'Unable to download Extraordinary gazette for year %d',
                dateobj.year)
            return

        d = utils.parse_webpage(response.webpage, self.parser)
        if not d:
            self.logger.warn(
                'Unable to parse Extraordinary gazette list for year %d',
                dateobj.year)
            return

        if dateobj.year == 2010:
            minfos = self.parse_listing_webpage(ex_url, d, dateobj, None,
                                                'Extraordinary')
        else:
            minfos = self.parse_extraordinary_webpage(d, dateobj, ex_url)

        self.download_metainfos(minfos, dls, relpath)
示例#60
0
 def cleanup_attrs(self, tag, attrs):
     new_attrs = []
     tag = string.lower(tag)
     if self._new_window and tag == "a":
         new_attrs.append(('target', '_blank'))
     for name, value in attrs:
         name = string.lower(name)
         if name[:2] == "on": continue  ## skip any javascript events
         if string.lower(value)[:11] == "javascript:": continue
         if self._map_urls and name in [
                 "action", "href", "src", "lowsrc", "background"
         ] and value[:4] == 'cid:':
             try:
                 value = self._map_urls[value[4:]]
             except KeyError:
                 pass
         else:
             if self._base and name in [
                     "action", "href", "src", "lowsrc", "background"
             ]:
                 value = basejoin(self._base, value)
             if name in ["action", "href", "src", "lowsrc", "background"]:
                 value = 'http://www.google.com/url?sa=D&q=%s' % (
                     neo_cgi.urlEscape(value))
         if self._new_window and tag == "a" and name == "target": continue
         new_attrs.append((name, value))
     return new_attrs