def _make_img_urls(self, product_key, img_count):
    """
    The keyword `RLLZ` in the url means large size (about 800*1000),
    `RLLD` means small size (about 400*500).
    http://www.ruelala.com/images/product/131385/1313856984_RLLZ_1.jpg
    http://www.ruelala.com/images/product/131385/1313856984_RLLZ_2.jpg
    """
    urls = []
    prefix = 'http://www.ruelala.com/images/product/'
    for i in range(0, img_count):
        suffix = '%s/%s_RLLZ_%d.jpg' % (product_key[:6], product_key, i + 1)
        url = urllib.basejoin(prefix, suffix)
        urls.append(url)
    # If num_image_urls() returned 0, RLLZ and RLLA do not work; use RLLDE instead.
    if img_count == 0:
        for j in xrange(0, 1000):
            sub = '%s/%s_RLLDE_%d.jpg' % (product_key[:6], product_key, j + 1)
            url = urllib.basejoin(prefix, sub)
            status = self.net.fetch_image(url)
            if status != 404:
                urls.append(url)
            else:
                return urls
    return urls
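# Illustrative sketch (hypothetical arguments, not from the original source):
# _make_img_urls(self, '1313856984', 2) builds the two RLLZ URLs quoted in the
# docstring above, while _make_img_urls(self, '1313856984', 0) falls back to
# probing .../131385/1313856984_RLLDE_1.jpg, _RLLDE_2.jpg, ... and stops at the
# first URL for which fetch_image() reports a 404.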
def __init__(self):
    baseurl = 'http://164.100.47.132/LssNew/psearch/'
    date2num = {
        (datetime.date(1998, 3, 23), datetime.date(1999, 4, 24)): 12,
        (datetime.date(1999, 10, 20), datetime.date(2004, 2, 5)): 13,
        (datetime.date(2004, 6, 2), datetime.date(2009, 2, 26)): 14,
        (datetime.date(2009, 6, 1), datetime.date(2014, 6, 1)): 15,
    }
    num2webform = {
        12: 'DebateAdvSearch12.aspx',
        13: 'DebateAdvSearch13.aspx',
        14: 'DebateAdvSearch14.aspx',
        15: 'DebateAdvSearch15.aspx',
    }
    num2dateqry = {
        12: 'DebateAdvSearch12.aspx',
        13: 'DebateAdvSearch13.aspx',
        14: 'DebateAdvSearch14.aspx',
        15: 'DebateAdvSearch15.aspx',
    }
    self.webformUrls = {}
    for k in date2num.keys():
        self.webformUrls[k] = urllib.basejoin(baseurl, num2webform[date2num[k]])
    self.dateqryUrls = {}
    for k in date2num.keys():
        self.dateqryUrls[k] = urllib.basejoin(baseurl, num2dateqry[date2num[k]])
def _get_magnet(self, url):
    i = len(url) - 1
    while url[i] != '/':
        i -= 1
    url = url[:i + 1] + urllib.quote_plus(url[i + 1:])
    c = httplib2.Http()
    resp, content = c.request(url)
    if "set-cookie" in resp:
        cookie = resp['set-cookie']
    else:
        cookie = None
    tree = libxml2.htmlParseDoc(content, "utf-8")
    form = htmltools.find_elements(tree.getRootElement(), "form", id="frmAdultDisclaimer")
    if form:
        form = form[0]
        inputs = htmltools.find_elements(form, "input")
        body = {}
        for i in inputs:
            body[i.prop('name')] = i.prop('value')
        del body['btn_Decline']
        body = urllib.urlencode(body)
        headers = {'Content-type': "application/x-www-form-urlencoded"}
        if cookie:
            headers['Cookie'] = cookie
        url = urllib.basejoin(url, form.prop('action'))
        resp, content = c.request(url, "POST", body, headers)
        if "set-cookie" in resp:
            cookie = resp['set-cookie']
        if cookie:
            headers['Cookie'] = cookie
        url = urllib.basejoin(url, resp["location"])
        resp, content = c.request(url, headers=headers)
        tree = libxml2.htmlParseDoc(content, "utf-8")
    return htmltools.find_elements(tree.getRootElement(), "a", **{'class': 'dwld_links'})[0].prop('href')
def _parseLinks(self, url):
    c = httplib2.Http()
    resp, content = c.request(url)
    tree = libxml2.htmlParseDoc(content, "utf-8")
    links = htmltools.find_elements(tree.getRootElement(), "a")
    reflink = ""
    magnet = None
    for i in links:
        if i.getContent().lstrip().rstrip() == "Download torrent":
            reflink = urllib.basejoin(url, i.prop('href'))
        if i.getContent().lstrip().rstrip() == "magnet link":
            magnet = urllib.basejoin(url, i.prop('href'))
            if "&" in magnet:
                j = magnet.index("&")
                magnet = magnet[:j]
    return reflink, magnet
def search(self, pattern=''):
    pattern = urllib.quote(pattern)
    url = '?s=%(pattern)s' % {'pattern': pattern}
    search = []
    search_elem = self.get_html_tree(url)
    if not search_elem or search_elem.find('div', {'class': 'alert alert-warning'}):
        # Sorry, no results were found.
        return search
    div_elems = search_elem.findAll('div', {'class': 'col-lg-3 col-xs-3 col-sm-3 item'})
    for div_elem in div_elems:
        a_elem = div_elem.findAll('a')[-1]
        img_elem = div_elem.find('img')
        tv_show = re.sub(urllib.basejoin(self.main_url, 'watch-'), '', a_elem.get('href'))
        item = {'label': a_elem.getText(),
                'tv_show': tv_show,
                'thumbnail': urllib.basejoin(self.main_url, img_elem.get('src'))}
        search.append(item)
    return search
def __iter__(self):
    if self.target:
        basepath = xmlrpclib.ServerProxy(self.target).getPhysicalPath()
    for item in self.previous:
        if not self.target:
            yield item
            continue
        keys = item.keys()
        type_, path = item.get(self.typekey(*keys)[0]), item.get(self.pathkey(*keys)[0])
        if not (type_ and path):  # not enough info
            yield item
            continue
        #fti = self.ttool.getTypeInfo(type_)
        #if fti is None:  # not an existing type
        #    msg = "constructor: no type found %s:%s" % (type_, path)
        #    logger.log(logging.ERROR, msg)
        #    yield item; continue
        elems = path.strip('/').rsplit('/', 1)
        for attempt in range(0, 3):
            try:
                url = urllib.basejoin(self.target, path)
                proxy = xmlrpclib.ServerProxy(url)
                container, id = (len(elems) == 1 and ('', elems[0]) or elems)
                #if id == 'index.html':
                try:
                    # test paths in case of acquisition
                    rpath = proxy.getPhysicalPath()
                    # TODO: should check type to see if it's correct?
                    rpath = rpath[len(basepath):]
                    if path == '/'.join(rpath):
                        self.logger.debug("%s already exists. Not creating" % ('/'.join(rpath)))
                        break
                except xmlrpclib.Fault:
                    # Doesn't already exist
                    pass
                purl = urllib.basejoin(self.target, container)
                pproxy = xmlrpclib.ServerProxy(purl)
                try:
                    pproxy.invokeFactory(type_, id)
                    self.logger.info("%s Created with type=%s" % (path, type_))
                except xmlrpclib.ProtocolError, e:
                    if e.errcode == 302:
                        pass
                    else:
                        raise
                except xmlrpclib.Fault:
                    self.logger.warning("Failure while creating '%s' of type '%s'" % (path, type_))
                    pass
                break
            except xmlrpclib.ProtocolError, e:
                if e.errcode == 503:
                    continue
                else:
                    raise
def _parse_result(self, page_url, result_line):
    torrent_link, category, title, size, seeders, leechers, health = TorrentSearch.htmltools.find_elements(result_line, "td")
    torrent_url = urllib.basejoin(page_url, TorrentSearch.htmltools.find_elements(torrent_link, "a")[0].prop('href').replace('/torrent_download/', '/download/'))
    if len(TorrentSearch.htmltools.find_elements(title, "a")) == 2:
        details_link = TorrentSearch.htmltools.find_elements(title, "a")[0]
    else:
        details_link = TorrentSearch.htmltools.find_elements(title, "a")[1]
    title = details_link.getContent()
    details_link = urllib.basejoin(page_url, details_link.prop('href'))
    size = size.getContent()
    size = size[:-4] + " " + size[-2:]
    seeders = eval(seeders.getContent())
    leechers = eval(leechers.getContent())
    category = self._parse_category(TorrentSearch.htmltools.find_elements(category, "a")[0].prop('href').split('/')[-2])
    c = httplib2.Http()
    resp, content = self.http_queue_request(details_link)
    tree = libxml2.htmlParseDoc(content, "utf-8")
    lines = TorrentSearch.htmltools.find_elements(TorrentSearch.htmltools.find_elements(tree, "td", **{'class': 'tabledata0'})[0].parent.parent, "tr")
    for i in lines:
        cells = TorrentSearch.htmltools.find_elements(i, "td")
        if cells[0].getContent() == "Info hash:":
            hashvalue = cells[1].getContent()
        elif cells[0].getContent() == "Torrent added:":
            date = cells[1].getContent().split(" ")[0]
            date = time.strptime(date, "%Y-%m-%d")
            date = datetime.date(date.tm_year, date.tm_mon, date.tm_mday)
    self.add_result(ExtraTorrentPluginResult(title, date, size, seeders, leechers, torrent_url, hashvalue, category))
def paso_a_paso():
    url = "%s/tips/recetas" % BASE_URL
    html = urllib.urlopen(url).read()
    dom = lxml.html.document_fromstring(html)
    answer = []
    serie = models.Serie()
    serie.title = 'Paso a paso'
    serie.description = "por Martiniano Molina"
    serie.url = 'rss://%s/content/elgourmet/paso_a_paso' % settings.MY_BASE_URL
    serie.thumbnail = dom.cssselect("#cab_logo img")[0].get("src")
    serie.episodes = []
    serie.show_name = 'paso_a_paso'
    for a in dom.cssselect("#contenedor a"):
        try:
            url2 = a.get('href')
            if not url2.startswith('receta'):
                continue
            url2 = urllib.basejoin(BASE_URL, url2)
            episode = models.Episode()
            episode.title = a.cssselect("h2")[0].text_content()
            print "\t%s" % episode.title
            html2 = urllib.urlopen(url2).read()
            episode.url = url2
            episode.thumbnail = urllib.basejoin(BASE_URL, dom.cssselect("img")[0].get('src'))
            x = re.findall('"file": ?"(.*?)"', html2)
            episode.video_url = get_video_url(x[0], STREAMING_URL)
            serie.episodes.append(episode)
        except Exception, e:
            print "Error: %s" % e
def findVideoFrameLink(page, data):
    minheight = 300
    minwidth = 300
    frames = findFrames(data)
    if not frames:
        return None
    iframes = re.findall("(frame[^>]* height=[\"']*(\d+)[\"']*[^>]*>)", data)
    if iframes:
        for iframe in iframes:
            height = int(iframe[1])
            if height > minheight:
                m = re.findall("[\"' ]width=[\"']*(\d+[%]*)[\"']*", iframe[0])
                if m:
                    if m[0] == '100%':
                        width = minwidth + 1
                    else:
                        width = int(m[0])
                    if width > minwidth:
                        m = re.findall('[\'"\s]src=["\']*\s*([^"\' ]+)\s*["\']*', iframe[0])
                        if m:
                            link = m[0]
                            if not link.startswith('http://'):
                                up = urlparse.urlparse(urllib.unquote(page))
                                if link.startswith('/'):
                                    link = urllib.basejoin(up[0] + '://' + up[1], link)
                                else:
                                    link = urllib.basejoin(up[0] + '://' + up[1] + '/' + up[2], link)
                            return link.strip()
    # Alternative 1
    iframes = re.findall("(frame[^>]*[\"; ]height:\s*(\d+)[^>]*>)", data)
    if iframes:
        for iframe in iframes:
            height = int(iframe[1])
            if height > minheight:
                m = re.findall("[\"; ]width:\s*(\d+)", iframe[0])
                if m:
                    width = int(m[0])
                    if width > minwidth:
                        m = re.findall('[ ]src=["\']*\s*([^"\' ]+)\s*["\']*', iframe[0])
                        if m:
                            link = m[0]
                            if not link.startswith('http://'):
                                link = urllib.basejoin(page, link)
                            return link.strip()
    # Alternative 2 (Frameset)
    iframes = re.findall('<FRAMESET[^>]+100%[^>]+>\s*<FRAME[^>]+src="([^"]+)"', data)
    if iframes:
        link = iframes[0]
        if not link.startswith('http://'):
            link = urllib.basejoin(page, link)
        return link.strip()
    return None
def __init__(self, name, rawdir, metadir, statsdir, updateMeta=False):
    utils.BaseCourt.__init__(self, name, rawdir, metadir, statsdir, updateMeta)
    self.cookiefile = tempfile.NamedTemporaryFile()
    self.baseurl = 'http://ldemo.mp.nic.in'
    self.cookieurl = urllib.basejoin(self.baseurl, 'causelist/ciskiosk/ordermain.php')
    self.dateurl = urllib.basejoin(self.baseurl, '/causelist/ciskiosk/order_action.php?as9=ok3')
def __init__(self, name, rawdir, metadir, statsdir, updateMeta=False):
    lobis.Lobis.__init__(self, name, rawdir, metadir, statsdir, updateMeta)
    self.baseurl = 'http://lobis.nic.in/'
    self.courturl = urllib.basejoin(self.baseurl, '/phhc/')
    self.cookieurl = urllib.basejoin(self.baseurl, '/phhc/juddt.php?scode=28')
    self.dateurl = urllib.basejoin(self.baseurl, '/phhc/juddt1.php?dc=28&fflag=1')
def _run_search(self, pattern, href=None, page=0):
    if href == None:
        href = "http://mononoke-bt.org/browse2.php?search=" + urllib.quote_plus(pattern)
    resp, content = self.http_queue_request(href, headers={'Cookie': self._app.parse_cookie(self.login_cookie)})
    tree = libxml2.htmlParseDoc(content, "utf-8")
    pager = htmltools.find_elements(tree.getRootElement(), "div", **{'class': 'animecoversfan'})[0].parent.next
    try:
        data = htmltools.find_elements(pager, "b")[-1].getContent()
        i = len(data) - 1
        while data[i] in "0123456789":
            i -= 1
        self.results_count = eval(data[i + 1:])
    except:
        pass
    restable = pager.next.next
    lines = htmltools.find_elements(restable, "tr", 1)[1:-2]
    for i in lines:
        try:
            cells = htmltools.find_elements(i, "td")
            team, show, stype, name, torrent_link, nbfiles, nbcmt, rate, date, size, views, dl, seeders, leechers, ratio = cells
            link = htmltools.find_elements(name, "a")[0]
            label = link.getContent()
            link = urllib.basejoin(href, link.prop('href'))
            torrent_link = urllib.basejoin(href, htmltools.find_elements(torrent_link, "a")[0].prop('href')) + "&r=1"
            date = htmltools.find_elements(date, "nobr")[0].children.getContent()
            date = time.strptime(date, "%Y-%m-%d")
            date = datetime.date(date.tm_year, date.tm_mon, date.tm_mday)
            strsize = ""
            cell = size.children
            while cell:
                if cell.name == "text":
                    if strsize:
                        strsize += " "
                    strsize += cell.getContent().upper()
                cell = cell.next
            size = strsize.replace('O', 'B')
            seeders = eval(seeders.getContent())
            leechers = eval(leechers.getContent())
            resp, content = self.http_queue_request(link, headers={'Cookie': self._app.parse_cookie(self.login_cookie)})
            itemtree = libxml2.htmlParseDoc(content, "utf-8")
            tds = htmltools.find_elements(itemtree.getRootElement(), "td")
            hashvalue = None
            for j in tds:
                if j.getContent() == "Info hash":
                    hashvalue = j.next.next.getContent()
            self.add_result(MononokeBTPluginResult(label, date, size, seeders, leechers, torrent_link, hashvalue))
        except:
            pass
        if self.stop_search:
            return
    if not self.stop_search:
        try:
            b = htmltools.find_elements(pager, "b")[-1]
            if b.parent.name == "a":
                url = "http://mononoke-bt.org/browse2.php?search=%s&page=%d" % (urllib.quote_plus(pattern), page + 1)
                self._run_search(pattern, url, page + 1)
        except:
            pass
def download_oneday(self, relpath, dateobj):
    dateurl = urllib.basejoin(self.baseurl, '/hcjudge/date_output.php')
    postdata = [('d1', dateobj.day), ('m1', dateobj.month),
                ('y1', dateobj.year), ('d2', dateobj.day),
                ('m2', dateobj.month), ('y2', dateobj.year),
                ('button', 'Submit')]
    webpage = self.download_url(dateurl, postdata=postdata)
    if not webpage:
        self.logger.warning(u'No webpage for %s date: %s' % (dateurl, dateobj))
        return []
    d = utils.parse_webpage(webpage)
    if not d:
        self.logger.error(u'HTML parsing failed for date: %s' % dateobj)
        return []
    newdls = []
    for link in d.findAll('a'):
        href = link.get('href')
        title = utils.get_tag_contents(link)
        if (not href) or (not title):
            self.logger.warning(u'Could not process %s' % link)
            continue
        words = href.split('/')
        filename = words[-1]
        url = urllib.basejoin(dateurl, href)
        self.logger.info(u'link: %s title: %s' % (href, title))
        relurl = os.path.join(relpath, filename)
        filepath = os.path.join(self.rawdir, relurl)
        metapath = os.path.join(self.metadir, relurl)
        if not os.path.exists(filepath):
            webpage = self.download_url(url)
            if not webpage:
                self.logger.warning(u'No webpage %s' % url)
            else:
                utils.save_file(filepath, webpage)
                self.logger.info(u'Saved %s' % url)
                newdls.append(relurl)
        if os.path.exists(filepath) and \
                (self.updateMeta or not os.path.exists(metapath)):
            metainfo = self.get_meta_info(title, dateobj)
            if metainfo:
                utils.print_tag_file(metapath, metainfo)
    return newdls
def __iter__(self):
    self.checkOptions()
    for item in self.previous:
        if not self.target:
            yield item
            continue
        keys = item.keys()
        # Apply defaultMatcher() function to extract necessary data
        # 1) which item will be transitioned
        # 2) with which transition
        pathkey = self.pathkey(*keys)[0]
        transitionskey = self.transitionskey(*keys)[0]
        if not (pathkey and transitionskey):
            # not enough info
            yield item
            continue
        path, transitions = item[pathkey], item[transitionskey]
        if isinstance(transitions, basestring):
            transitions = (transitions,)
        remote_url = urllib.basejoin(self.target, path)
        if not remote_url.endswith("/"):
            remote_url += "/"
        for transition in transitions:
            transition_trigger_url = urllib.basejoin(remote_url, "content_status_modify?workflow_action=" + transition)
            self.logger.info("%s performing transition '%s'" % (path, transition))
            from httplib import HTTPException
            try:
                f = urllib.urlopen(transition_trigger_url)
                data = f.read()
                # Use Plone not found page signature to detect bad URLs
                if "Please double check the web address" in data:
                    raise RuntimeError("Bad remote URL:" + transition_trigger_url)
            except HTTPException, e:
                # Other than HTTP 200 OK should end up here,
                # unless URL is broken in which case Plone shows
                # "Your content was not found page"
                self.logger.error("fail")
                msg = "Remote workflow transition failed %s->%s" % (path, transition)
                self.logger.log(logging.ERROR, msg, exc_info=True)
        yield item
def _run_search(self, pattern, href=None):
    if href == None:
        href = "http://www.torrent411.com/search/" + urllib.quote_plus(pattern)
    resp, content = self.http_queue_request(href)
    content = _codecs.utf_8_encode(_codecs.latin_1_decode(content)[0])[0]
    tree = libxml2.htmlParseDoc(content, "utf-8")
    pager = htmltools.find_elements(htmltools.find_elements(tree.getRootElement(), "table", **{'class': 'NB-frame'})[1], "p")[0]
    try:
        b = htmltools.find_elements(pager, "b")[-1]
        data = b.getContent()
        i = len(data) - 1
        while data[i] in "0123456789":
            i -= 1
        self.results_count = eval(data[i + 1:])
    except:
        pass
    restable = htmltools.find_elements(pager.next.next, "table")[0]
    restable = htmltools.find_elements(restable, "table")[1]
    body = htmltools.find_elements(restable, "tbody")[0]
    lines = htmltools.find_elements(body, "tr", 1)
    for i in lines:
        try:
            cat, link, a, date, b, c, d, e, f, g, h, i, size, j, seeders, leechers = htmltools.find_elements(i, "td")
            date = date.getContent().replace(chr(194) + chr(160) + "at" + chr(194) + chr(160), " ")
            date = time.strptime(date, "%Y-%m-%d %H:%M:%S")
            date = datetime.date(date.tm_year, date.tm_mon, date.tm_mday)
            size = size.getContent().replace(chr(194) + chr(160), " ")
            seeders = eval(seeders.getContent())
            leechers = eval(leechers.getContent())
            link = htmltools.find_elements(link, "a")[0]
            label = link.prop('title')
            link = urllib.basejoin("http://www.torrent411.com", link.prop('href'))
            resp, content = self.http_queue_request(link)
            content = _codecs.utf_8_encode(_codecs.latin_1_decode(content)[0])[0]
            itemtree = libxml2.htmlParseDoc(content, "utf-8")
            table = htmltools.find_elements(itemtree.getRootElement(), "table", **{'cellpadding': '3'})[1]
            desc, name, torrent, cat, siz, hashvalue = htmltools.find_elements(table, "tr")[:6]
            torrent = htmltools.find_elements(torrent, "a")[0].prop('href')
            hashvalue = htmltools.find_elements(hashvalue, "td")[1].getContent()
            self.add_result(Torrent411PluginResult(label, date, size, seeders, leechers, torrent, hashvalue))
        except:
            pass
        if self.stop_search:
            return
    if not self.stop_search:
        try:
            links = htmltools.find_elements(pager, "a")
            next_link = None
            for i in links:
                if i.getContent() == "Next" + chr(194) + chr(160) + ">>":
                    next_link = i
            if next_link:
                link = urllib.basejoin("http://www.torrent411.com", next_link.prop('href'))
                self._run_search(pattern, link)
        except:
            pass
def __init__(self, name, rawdir, metadir, statsdir, updateMeta=False):
    utils.BaseCourt.__init__(self, name, rawdir, metadir, statsdir, updateMeta)
    self.baseurl = "http://patnahighcourt.bih.nic.in"
    self.hostname = "patnahighcourt.bih.nic.in"
    self.dateurl = urllib.basejoin(self.baseurl, "/judgment/judgDateWise.aspx")
    self.formaction = "judgDateWise.aspx"
    self.cookiefile = tempfile.NamedTemporaryFile()
    self.cookieurl = urllib.basejoin(self.baseurl, "/judgment/default.aspx")
    self.download_url(self.cookieurl, savecookies=self.cookiefile.name)
def __init__(self, name, rawdir, metadir, statsdir, updateMeta=False):
    utils.BaseCourt.__init__(self, name, rawdir, metadir, statsdir, updateMeta)
    self.baseurl = 'http://rti.india.gov.in'
    self.dateurl = urllib.basejoin(self.baseurl, '/decision_categorywise.php')
    self.posturl = self.dateurl
    self.resulturl = urllib.basejoin(self.dateurl, '/result_decision_categorywise.php')
    self.cookiefile = tempfile.NamedTemporaryFile()
def get_all_mp3(url):
    '''get all mp3 from a url'''
    data = urllib2.urlopen(url).read()
    re_com = re.compile('http://.*?\.mp3')
    all = re_com.findall(data)
    re_com = re.compile('<a href=\"(.*?\.mp3)\"')
    ll = re_com.findall(data)
    for i in ll:
        if urllib.basejoin(url, i) not in all:
            all.append(urllib.basejoin(url, i))
    return list(set(all))  # remove duplicate songs
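# Hedged usage sketch (the page URL below is hypothetical, not from the original
# source); every returned entry is an absolute .mp3 URL, relative hrefs having
# been resolved against the page via urllib.basejoin:
# for mp3 in get_all_mp3('http://music.example.com/album.html'):
#     print mp3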
def scrap_serie(self, serie):
    url = serie.url
    html = urllib.urlopen(url).read()
    soup = BeautifulSoup(html, from_encoding='utf-8')
    videos = soup.find('article', 'videos-list fod')
    if videos is None:
        return
    videos = list(videos('li', 'video')) + list(videos('li', 'video last'))
    for li in videos:
        url = urllib.basejoin(self.BASE_URL, li.a.get('href'))
        thumbnail = urllib.basejoin(self.BASE_URL, li.img.get('src'))
        self.scrap_episode(serie, url, thumbnail)
def _run_search(self, pattern, page_url=''):
    http = httplib2.Http()
    headers = {'Cookie': self.login_cookie}
    if page_url == "":
        page_url = "http://www.bakabt.com/browse.php?q=" + urllib.quote(pattern)
    resp, content = http.request(page_url, headers=headers)
    tree = libxml2.htmlParseDoc(content, "utf-8")
    try:
        data = htmltools.find_elements(htmltools.find_elements(tree.getRootElement(), "div", **{'class': 'pager'})[0], "a")[-2].getContent()
        i = len(data) - 1
        while i >= 0 and data[i] in "0123456789":
            i -= 1
        self.results_count = eval(data[i + 1:])
    except:
        pass
    results_table = htmltools.find_elements(tree.getRootElement(), "table", **{'class': 'torrents'})[0]
    lines = htmltools.find_elements(results_table, "tr")[1:]
    is_alt = False
    for i in range(len(lines)):
        try:
            line = lines[i]
            if "torrent_alt" in line.prop('class') and not is_alt:
                is_alt = True
                continue
            if not "torrent_alt" in line.prop('class'):
                is_alt = False
            cells = htmltools.find_elements(line, "td")
            if len(cells) == 6:
                category, details, comments, date, size, transfers = cells
            else:
                details, comments, date, size, transfers = cells
            day, month, year = date.getContent().replace("'", "").split(" ")
            day = eval(day)
            year = eval("20" + year)
            month = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'].index(month) + 1
            date = datetime.date(year, month, day)
            seeders, leechers = htmltools.find_elements(transfers, "a")
            seeders = eval(seeders.getContent())
            leechers = eval(leechers.getContent())
            size = size.getContent()
            link = htmltools.find_elements(details, "a")[0]
            label = link.getContent()
            link = urllib.basejoin(page_url, link.prop('href'))
            self.add_result(BakaBTPluginResult(label, date, size, seeders, leechers, link))
        except:
            pass
        if self.stop_search:
            return
    if not self.stop_search:
        link = htmltools.find_elements(htmltools.find_elements(tree.getRootElement(), "div", **{'class': 'pager'})[0], "a")[-1]
        if link.prop('class') != 'selected':
            self._run_search(pattern, urllib.basejoin(page_url, link.prop('href')))
def _run_search(self, pattern, href=None):
    if href == None:
        href = "http://linuxtracker.org/index.php?page=torrents&search=" + urllib.quote_plus(pattern)
    resp, content = self.http_queue_request(href)
    tree = libxml2.htmlParseDoc(content, "utf-8")
    try:
        pager = htmltools.find_elements(tree.getRootElement(), "form", name="change_page")[0]
        options = htmltools.find_elements(pager, "option")
        self.results_count = 50 * len(options)
    except:
        pager = None
        self.results_count = 50
    restable = htmltools.find_elements(tree.getRootElement(), "table", **{'class': 'lista'})[1]
    lines = htmltools.find_elements(restable, "tr")[1:]
    for i in lines:
        try:
            cat, link, torrent_link, date, seeders, leechers, a, b = htmltools.find_elements(i, "td")
            label = link.getContent()
            link = urllib.basejoin(href, htmltools.find_elements(link, "a")[0].prop('href'))
            torrent_link = urllib.basejoin(href, htmltools.find_elements(torrent_link, "a")[0].prop('href'))
            date = time.strptime(date.getContent(), "%d/%m/%Y")
            date = datetime.date(date.tm_year, date.tm_mon, date.tm_mday)
            seeders = eval(seeders.getContent())
            leechers = eval(leechers.getContent())
            resp, content = self.http_queue_request(link)
            itemtree = libxml2.htmlParseDoc(content, "utf-8")
            table = htmltools.find_elements(itemtree.getRootElement(), "table", **{'class': 'coltable'})[0]
            size = None
            hashvalue = None
            for td in htmltools.find_elements(table, "td"):
                if td.getContent() == "Size" and size == None:
                    size = td.next.next.getContent()
                if td.getContent() == "Info Hash" and hashvalue == None:
                    hashvalue = td.next.next.getContent()
            self.add_result(linuxTRACKERPluginResult(label, date, size, seeders, leechers, torrent_link, hashvalue))
        except:
            pass
        if self.stop_search:
            return
    if not self.stop_search:
        try:
            if pager:
                spans = htmltools.find_elements(pager, "span")
                i = 0
                while i < len(spans) and spans[i].prop('class') != 'pagercurrent':
                    i += 1
                i += 1
                if i < len(spans):
                    link = htmltools.find_elements(spans[i], "a")[0]
                    link = urllib.basejoin(href, link.prop('href'))
                    self._run_search(pattern, link)
        except:
            pass
def urlMerge(params, src):
    paramArr = __parseParams(params)
    paramTrunk = paramArr[0].replace('%s', src).replace("\t", "")
    paramFile = paramArr[1].replace('%s', src).replace("\t", "")
    if not paramFile.startswith('http'):
        up = urlparse.urlparse(urllib.unquote(paramTrunk))
        if paramFile.startswith('/'):
            return urllib.basejoin(up[0] + '://' + up[1], paramFile)
        else:
            return urllib.basejoin(up[0] + '://' + up[1] + '/' + up[2], paramFile)
    return src
def get_item_playable(idItem):
    urlToLoad = urllib.basejoin(VVVVID_BASE_URL, idItem + '/info')
    data = getJsonDataFromUrl(urlToLoad)
    info = data['data']
    itemPlayable = ItemPlayableChannel()
    itemPlayable.title = info['title']
    itemPlayable.thumb = urllib.basejoin(VVVVID_STATIC_URL, info['thumbnail']) + '|' + HEADERS_ENCODED
    itemPlayable.id = info['id']
    itemPlayable.show_id = info['show_id']
    itemPlayable.ondemand_type = info['ondemand_type']
    itemPlayable.show_type = info['show_type']
    itemPlayable = get_seasons_for_item(itemPlayable)
    return itemPlayable
def __iter__(self):
    basepath = xmlrpclib.ServerProxy(self.target).getPhysicalPath()
    for item in self.previous:
        keys = item.keys()
        typekey = self.typekey(*keys)[0]
        pathkey = self.pathkey(*keys)[0]
        if not (typekey and pathkey):
            # not enough info
            yield item
            continue
        type_, path = item[typekey], item[pathkey]
        #fti = self.ttool.getTypeInfo(type_)
        #if fti is None:  # not an existing type
        #    msg = "constructor: no type found %s:%s" % (type_, path)
        #    logger.log(logging.ERROR, msg)
        #    yield item; continue
        elems = path.strip('/').rsplit('/', 1)
        for attempt in range(0, 3):
            try:
                url = urllib.basejoin(self.target, path)
                proxy = xmlrpclib.ServerProxy(url)
                container, id = (len(elems) == 1 and ('', elems[0]) or elems)
                #if id == 'index.html':
                try:
                    # test paths in case of acquisition
                    rpath = proxy.getPhysicalPath()
                    rpath = rpath[len(basepath):]
                    if path == '/'.join(rpath):
                        break
                except xmlrpclib.Fault:
                    pass
                purl = urllib.basejoin(self.target, container)
                pproxy = xmlrpclib.ServerProxy(purl)
                try:
                    pproxy.invokeFactory(type_, id)
                except xmlrpclib.ProtocolError, e:
                    if e.errcode == 302:
                        pass
                    else:
                        raise
                break
            except xmlrpclib.ProtocolError, e:
                if e.errcode == 503:
                    continue
                else:
                    raise
def plot(self, filename=None, open=False):
    if isinstance(filename, str):
        if filename[-4:].lower() != '.png':
            filename += ".png"
        os.system('kst --png %s %s' % (filename, urllib.basejoin(self.db, str(self.dataset), 'kst')))
        if open:
            os.system('xdg-open %s &' % filename)
    else:
        os.system('kst %s &' % urllib.basejoin(self.db, str(self.dataset), 'kst'))
def playset(self, request, response, resultset):
    try:
        streamerURL = urllib.basejoin(self.getWebApp().getURLprefix(), "streamer.sn")
        baseURL = self.getAppContext().streamURLbase or request.getBaseURL()
        reply = "#EXTM3U\n"
        for result in resultset:
            reply = reply + "#EXTINF:-1," + result[1] + '\n' + \
                urllib.basejoin(baseURL, streamerURL + '?id=' + `id(result[1])`) + '\n'
        # response.setContentType("audio/x-mpegurl")
        response.setContentType("audio/mpegurl")
        response.setContentLength(len(reply))
        response.getOutput().write(reply)
    except ValueError:
        response.sendError(501, "Invalid command args")
def add_css(self, css="", cache=True, vendor=False, **kwargs): if css.startswith('http'): path = css elif vendor: path = urllib.basejoin(options.vendor_css_root, css) else: path = urllib.basejoin(options.static_root, '%s/css/' % options.site_name) path = urllib.basejoin(path, css) cachestring = ('' if cache or not options.debug else '?cacheid=%s' % CACHID) extra_params = "" for item in kwargs.iteritems(): extra_params += '%s="%s" ' % item return """<link rel="stylesheet" href="%s%s" type="text/css" %s/>""" \ % (path, cachestring, extra_params)
def __init__(self, name, rawdir, metadir, statsdir, updateMeta=False):
    utils.BaseCourt.__init__(self, name, rawdir, metadir, statsdir, updateMeta)
    self.hostname = 'gujarathc-casestatus.nic.in'
    self.baseurl = 'http://gujarathc-casestatus.nic.in/'
    self.pageurl = urllib.basejoin(self.baseurl, '/gujarathc/SearchHCJudge')
    self.caseurl = urllib.basejoin(self.baseurl, '/gujarathc/GetOrderDateNew')
    self.orderurl = urllib.basejoin(self.baseurl, '/gujarathc/OrderHistoryViewDownload')
    self.cookiefile = tempfile.NamedTemporaryFile()
    self.download_url(self.baseurl, savecookies=self.cookiefile.name)
def __getattribute__(self, name):
    """
    Overwritten to ensure data is always synchronised with the DB.
    """
    if name == 'data':
        # Check if remote data is unaltered
        url = urllib.basejoin(self.db, "/%d/diff" % self.dataset)
        updated = int(urllib2.urlopen(url).read())
        if self.updated != updated:
            # Fetch remote data
            url = urllib.basejoin(self.db, "/%d/py" % self.dataset)
            data, _ = self.__prep_data(urllib2.urlopen(url).read())
            object.__setattr__(self, 'data', data)
            self.updated = updated
    return object.__getattribute__(self, name)
def parse_result_page(self, posturl, webpage, dateobj):
    judgments = []
    d = utils.parse_webpage(webpage)
    if not d:
        self.logger.error(u'Could not parse result page %s' % dateobj)
        return judgments
    # get judgments
    trs = d.findAll('tr')
    for tr in trs:
        judgment = {}
        metainfo = {'date': utils.date_to_xml(dateobj)}
        links = tr.findAll('a')
        for link in links:
            href = link.get('href')
            if href and re.search('WebShowJudgment.do', href):
                t = utils.get_tag_contents(link)
                colon = t.find(':')
                if colon:
                    title = t[colon + 1:]
                    title = title.strip()
                    metainfo['title'] = title
                    reobj = re.search(' vs\. ', title, re.IGNORECASE)
                    if reobj:
                        metainfo['petitioner'] = title[:reobj.start()]
                        metainfo['respondent'] = title[reobj.end():]
            if href and re.search('WebDownloadJudgmentDocument.do', href):
                judgment['link'] = urllib.basejoin(posturl, href)
        if judgment:
            judgment['metainfo'] = metainfo
            judgments.append(judgment)
    # next link
    links = d.findAll('a')
    for link in links:
        t = utils.get_tag_contents(link)
        if re.search('Next', t):
            href = link.get('href')
            if href:
                judgment = {'link': urllib.basejoin(posturl, href)}
                judgment['next'] = True
                judgments.append(judgment)
    return judgments
def get(query, relative, outdir, listonly=False):
    page = 1
    while 1:
        params = dict(q=query, type="Code", p=page)
        r = requests.get(SEARCH, params=params)
        if is_last_page(r.content):
            print("** No more results")
            break
        for u in extract(r.content):
            ru = raw_url(u)
            if relative:
                ru = urllib.basejoin(ru, relative)
            if listonly:
                print(ru)
            else:
                fn = make_fname(u)
                outpath = os.path.join(outdir, fn)
                if os.path.exists(outpath):
                    print("Skipping ", fn)
                else:
                    ret = requests.get(ru)
                    if ret.status_code == 200:
                        print("Fetching ", ru)
                        f = open(outpath, "w")
                        f.write(ret.content)
                        f.close()
                    else:
                        print("Error", fn, ret.status_code)
        page += 1
def __call__(self, url, baseURL=None):
    """Load the given multi-value url and call callbacks

    url -- vrml97-style url (multi-value string)
    baseURL -- optional base url from which items in url will be resolved.
        protofunctions.root(node).baseURI will give you the baseURL
        normally used for the given node.

    raises IOError on failure
    returns (successfulURL, filename, open_file, headers) on success

    headers will be None for local files
    """
    log.info("Loading: %s, %s", url, baseURL)
    if isinstance(url, (str, unicode)):
        url = [url]
    file = None
    for u in url:
        # get the "absolute" url
        if baseURL:
            u = urllib.basejoin(baseURL, u)
        resolvedURL, file, filename, headers = self.get(u)
        if file is not None and filename is not None:
            break
    if not file or not filename:
        raise IOError("""Unable to download url %s""" % url)
    return (resolvedURL, os.path.abspath(filename), file, headers)
def _downloadDecisions(self, soup):
    re_descPattern = re.compile('Beslutsdatum: (\d+-\d+-\d+) Diarienummer: (.*)')
    for result in soup.first('div', {'class': 'SearchResult'}):
        if result.a['href']:
            url = urllib.basejoin("http://www.jo.se/", result.a['href'])
            # Seems to be a bug in BeautifulSoup - properly
            # escaped &amp; entities are not de-escaped
            url = url.replace('&amp;', '&')
            desc = result.contents[-1].string
            m = re_descPattern.match(desc)
            beslutsdatum = m.group(1)
            id = m.group(2)
            filename = id.replace('/', '-') + ".html"
            resource = LegalSource.DownloadedResource(id)
            resource.url = url
            resource.localFile = filename
            log.info(u'Storing %s as %s' % (url, filename))
            Robot.Store(url, None, self.dir + "/" + id.replace('/', '-') + ".html")
            resource.fetched = time.localtime()
            if id in self.ids:
                log.warn(u'replacing URL of id %s to %s (was %s)' % (id, url, self.ids[id].url))
            self.ids[id] = resource
def parse(url):
    try:
        req = urllib2.Request(url)
        req.add_header('User-agent', 'Mozilla 5.10')
        content = urllib2.urlopen(req, timeout=2).read()
        content = unicode(content, 'utf-8')
        content = clean(content)
    except urllib2.URLError:
        return []
    tree = etree.HTML(content)
    seg_list = extract(
        post_clean(''.join(
            tree.xpath('//p//text()|//strong//text()|'
                       '//span//text()|//a//text()|//li//text()'))))
    log('URL: %-60s|' % url[:60], seg_list, 'Remains:' + str(download_queue.qsize()))
    n_urls = map(lambda x: basejoin(url, x), tree.xpath('//a/@href'))
    n_urls = filter(lambda x: x.startswith('http'), n_urls)
    n_urls = filter(lambda x: x.split('.', 1)[1].startswith(domain), n_urls)
    ret_urls = []
    for url in n_urls:
        if r.get(url) is not None:
            continue
        else:
            r.set(url, 1)
            ret_urls.append(url)
    return ret_urls
def print_rep_table(repository):
    print "<table class=table cellpadding=3 cellspacing=0><tr class=table_header>"
    for title in ["Name", "Size", "Type", "Time", "Info"]:
        print "<td>%s</td>" % (title)
    print "</tr>"
    odd = True
    for x in os.listdir(repository.path):
        _, ext = os.path.splitext(x)
        if ext in EXT_TO_SHOW:
            fullname = join(realpath(repository.path), x)
            fullurl = quote(basejoin(repository.url + "/", x), ":/")
            if odd:
                print "<tr class=odd_row>"
            else:
                print "<tr class=even_row>"
            odd = not odd
            name, info = get_info(fullname)
            info = provisioning.crlf_to_cr(info).replace("\n", "<br>")
            print '<td><a href="%s">%s</a></td>' % (fullurl, name)
            print "<td class=size_column>%s</td>" % (str(os.path.getsize(fullname)))
            print "<td class=type_column><em>%s</em></td>" % (ext)  # (provisioning.get_mime_type(x))
            print "<td class=time_column>%s</td>" % (time.ctime(os.path.getctime(fullname)))
            print "<td class=info_column>%s</td></tr>" % (info)
def get_context(context): """generate rss feed""" host = get_request_site_address() blog_list = frappe.db.sql("""\ select page_name as name, published_on, modified, title, content from `tabBlog Post` where ifnull(published,0)=1 order by published_on desc limit 20""", as_dict=1) for blog in blog_list: blog_page = cstr(urllib.quote(blog.name.encode("utf-8"))) + ".html" blog.link = urllib.basejoin(host, blog_page) blog.content = escape_html(blog.content or "") if blog_list: modified = max((blog['modified'] for blog in blog_list)) else: modified = now() ws = frappe.doc('Website Settings', 'Website Settings') context = { 'title': ws.title_prefix, 'description': ws.description or ((ws.title_prefix or "") + ' Blog'), 'modified': modified, 'items': blog_list, 'link': host + '/blog' } # print context return context
def process_dir_xml(repository):
    try:
        print provisioning.XML_HEADER
        print '<?xml version="1.0" encoding="utf-8"?>'
        print """<serverContent xmlns="http://sun.com/2006/provisioning"
            xmlns:dd="urn:oma:xml:dl:dd:2.0"
            xmlns:xsd="http://www.w3.org/2001/XMLSchema-instance"
            xsd:schemaLocation="http://sun.com/2006/provisioning servercontent.xsd">
        """
        for x in os.listdir(repository.path):
            _, ext = os.path.splitext(x)
            if ext in EXT_TO_SHOW:
                print """<dd:media xmlns="urn:oma:xml:dl:dd:2.0" DDVersion="2.0">
                <product><mediaObject>"""
                fullname = join(realpath(repository.path), x)
                fullurl = quote(basejoin(repository.url + "/", x), ":/")
                name, info = get_info(fullname)
                print "<meta><name>%s</name></meta>" % (name)
                print "<size>%d</size>" % (os.path.getsize(fullname))
                print "<type>%s</type>" % (provisioning.get_mime_type(x))
                print "<objectID>%s</objectID>" % (fullurl)
                print "<objectURI><server>%s</server></objectURI>" % fullurl  # (cgi.escape(objectURI))
                print "</mediaObject></product></dd:media>"
        print "</serverContent>"
    except Exception, e:
        sys.stderr.write("%s, %s" % (Exception, e))
def process_judgment_page(self, relpath, url, dateobj):
    webpage = self.download_url(url, loadcookies=self.cookiefile.name)
    if not webpage:
        self.logger.warning(u'Could not download %s' % url)
        return None
    d = utils.parse_webpage(webpage)
    if not d:
        self.logger.warning(u'Could not parse %s' % url)
        return None
    metainfo = self.get_meta_info(d, dateobj)
    for link in d.findAll('a'):
        href = link.get('href')
        title = utils.get_tag_contents(link)
        if (not href) or (not title):
            self.logger.warning(u'Could not process %s' % link)
            continue
        action = self.action_on_link(href, title)
        newurl = urllib.basejoin(url, href)
        if action == 'save':
            self.logger.info(u'Downloading %s' % newurl)
            return self.get_judgment(relpath, newurl, title, metainfo)
    return None
def appendURL(self, url, included=0):
    """Append packages from the database with the given URL.
    Only the first database should specify included=0, so the
    global information (maintainer, description) get stored."""
    if url in self._urllist:
        return
    self._urllist.append(url)
    fp = urllib2.urlopen(url).fp
    plistdata = plistlib.Plist.fromFile(fp)
    # Test here for Pimp version, etc
    if included:
        version = plistdata.get('Version')
        if version and version > self._version:
            sys.stderr.write("Warning: included database %s is for pimp version %s\n" %
                             (url, version))
    else:
        self._version = plistdata.get('Version')
        if not self._version:
            sys.stderr.write("Warning: database has no Version information\n")
        elif self._version > PIMP_VERSION:
            sys.stderr.write("Warning: database version %s newer than pimp version %s\n" %
                             (self._version, PIMP_VERSION))
        self._maintainer = plistdata.get('Maintainer', '')
        self._description = plistdata.get('Description', '').strip()
        self._url = url
    self._appendPackages(plistdata['Packages'], url)
    others = plistdata.get('Include', [])
    for o in others:
        o = urllib.basejoin(url, o)
        self.appendURL(o, included=1)
def delete(self, node='', data='', level='dataset', rmSubscriptions='y',
           comments='', format='json', instance='prod'):
    name = "delete"
    if not (node and data):
        self.logger.error(name, "Need to pass both node and data")
        return 1, "Error"
    values = {'node': node, 'data': data, 'level': level,
              'rm_subscriptions': rmSubscriptions, 'comments': comments}
    deleteURL = urllib.basejoin(self.phedexBase, "%s/%s/delete" % (format, instance))
    check, response = self.phedexCall(deleteURL, values)
    if check:
        self.logger.error(name, "Delete call failed")
        return 1, "ERROR - self.phedexCall with response: " + response
    return 0, response
def scrape(self):
    dic = BeautifulSoupScraper.scrape(self)
    text = dic.get('title', '') + '\n' + dic.get('text', '')
    images = dic.get('images', [])
    if type(images) != list:
        images = [images]
    images = [urllib.basejoin(self.url, i) for i in images]
    return text, images
def grep_items(config, url=None, page=1):
    items = list()
    if not url:
        url = config['main_url']
    try:
        internet_lock.acquire()
        # Avito does not like concurrent requests
        sleep(1)
        html = urllib.urlopen(url).read()
        internet_lock.release()
        if not html:
            raise IOError
    except IOError:
        print 'error while open url %s' % url
    else:
        bs = bs4.BeautifulSoup(html)
        if bs.find('input', {'id': 'search'})['value'].lower() == config['search'].lower():
            # Avito simplifies the search query when nothing is found
            bs_items = bs.findAll('div', {'class': 'item'})
            for item in bs_items:
                items.append(Item(config, item))
            next_page = bs.find('a', {'class': 'pagination__page'},
                                text=u'\n Следующая страница →\n ')
            if next_page:
                items += grep_items(config,
                                    urllib.basejoin(config['site'], next_page['href']),
                                    page + 1)
    if page == 1:
        config['last_len'] = len(items)
        config['last_check'] = datetime.datetime.now()
        print_status()
    return items
def upload_scrawl_file(self, request):
    """ Handle scrawl (doodle) file uploads. """
    try:
        action = request.GET.get("action", "")
        form_name = self.get_action_form_name(action)
        content = request.POST.get(form_name)
        upload_file = ContentFile(base64.decodestring(content))
        # Default naming rule for scrawl files
        scrawl_default_name = "{}.png".format(str(int(time.time())))
        store_path = self._get_upload_path(scrawl_default_name)
        _file_name, upload_file_suffix = os.path.splitext(store_path)
        upload_file.name = _file_name
        self.storage.save(store_path, upload_file)
        rst = {
            'state': 'SUCCESS',
            'url': urllib.basejoin(self.settings.TUEDITOR_MEDIA_URL, store_path),
            'original': upload_file.name,
            'type': upload_file_suffix.replace(".", ""),
            'size': upload_file.size,
        }
    except Exception, E:
        rst = {
            'state': "Error writing image file: %s" % E.message,
        }
def put(self, title, body, revision=None, comment='', format='json'):
    logger.info('[put] %s size: %d revision:%s comment:%s',
                title, len(body), revision, comment)
    if revision is None:
        _resp, data = self.get(title)
        revision = data['revision']
    url = urllib.basejoin(self.baseurl, title)
    data = urllib.urlencode({
        'title': title,
        'body': body,
        'revision': revision,
        'comment': comment or self.DEFAULT_COMMENT
    })
    try:
        resp, content = self._request(url, format=format, method='PUT', body=data)
        # TODO: handle 406, 409
        try:
            content = json.loads(content)
        except Exception as e:
            logger.error('[put] json load error: %s', e)
        return resp, content
    except HTTPError as e:
        logger.error("[put] %d %s", e.code, e.msg)
        raise
def result_page(self, relpath, url, dateobj, linkdict):
    newdls = []
    webpage = self.download_url(url, loadcookies=self.cookiefile.name)
    d = utils.parse_webpage(webpage)
    if not d:
        self.logger.error(u'Could not parse html of the result page for date %s' % dateobj)
        return newdls
    for link in d.findAll('a'):
        href = link.get('href')
        title = utils.get_tag_contents(link)
        if (not href) or (not title) or linkdict.has_key(href):
            self.logger.warning(u'Could not process %s' % link)
            continue
        linkdict[href] = 1
        action = self.action_on_link(href, title)
        self.logger.info(u'Action %s on link %s title %s' % (action, href, title))
        newurl = urllib.basejoin(url, href)
        if action == 'judgmentlink':
            relurl = self.process_judgment_page(relpath, newurl, dateobj)
            if relurl:
                newdls.append(relurl)
            else:
                self.logger.warning(u'Judgment link not working %s' % newurl)
        elif action == 'recurse':
            newdls.extend(self.result_page(relpath, newurl, dateobj, linkdict))
    return newdls
def language_url(self, language):
    """Get the dump location for given language

    :param language: ISO 639 language code
    :type language: string
    """
    return urllib.basejoin(self._host, self.language_dir(language))
def __init__(self, srcdir, rawdir, metadir, statsdir, updateMeta=False):
    utils.BaseCourt.__init__(self, srcdir, rawdir, metadir, statsdir, updateMeta)
    self.baseurl = 'http://judgmenthck.kar.nic.in'
    self.hostname = 'judgmenthck.kar.nic.in'
    self.courturl = urllib.basejoin(self.baseurl, '/judgments/')
    self.cookiefile = tempfile.NamedTemporaryFile()
    self.get_cookies()
def upload_file(self, request):
    """ Handle file uploads. """
    action = request.GET.get("action", "")
    upload_form_name = self.get_action_form_name(action)
    upload_file = request.FILES.get(upload_form_name)
    upload_file_name, upload_file_suffix = os.path.splitext(upload_file.name)
    if not self.is_size_allow(action, upload_file.size):
        return JsonResponse({"state": "File size is not allowed"})
    if not self.is_suffix_allow(action, upload_file_suffix):
        return JsonResponse({"state": "File format is not allowed"})
    store_path = self._get_upload_path(upload_file.name, action)
    self.storage.save(store_path, upload_file)
    rst = {
        'state': 'SUCCESS',
        'url': urllib.basejoin(self.settings.TUEDITOR_MEDIA_URL, store_path),
        'original': upload_file.name,
        'type': upload_file_suffix.replace(".", ""),
        'size': upload_file.size,
    }
    return JsonResponse(rst)
def get_context(context): """generate the sitemap XML""" host = get_request_site_address() links = [] for page in get_pages(): if not page.no_sitemap: links.append({ "loc": urllib.basejoin(host, urllib.quote(page.name.encode("utf-8"))), "lastmod": "2014-01-01" }) def add_links(doctype, condition_field, order_by): meta = frappe.get_meta(doctype) page_name = "page_name" condition = "" if meta.get_field("parent_website_route"): page_name = """concat(ifnull(parent_website_route, ""), if(ifnull(parent_website_route, "")="", "", "/"), page_name)""" if condition_field: condition ="where ifnull({0}, 0)=1".format(condition_field) for route in frappe.db.sql("select {0}, modified from `tab{1}` {2}".format(page_name, doctype, condition)): if route[0]: links.append({ "loc": urllib.basejoin(host, urllib.quote(route[0].encode("utf-8"))), "lastmod": get_datetime(route[1]).strftime("%Y-%m-%d") }) process_generators(add_links) return {"links":links}
def appendURL(self, url, included=0):
    if url in self._urllist:
        return
    self._urllist.append(url)
    fp = urllib2.urlopen(url).fp
    plistdata = plistlib.Plist.fromFile(fp)
    if included:
        version = plistdata.get('Version')
        if version and version > self._version:
            sys.stderr.write('Warning: included database %s is for pimp version %s\n' %
                             (url, version))
    else:
        self._version = plistdata.get('Version')
        if not self._version:
            sys.stderr.write('Warning: database has no Version information\n')
        elif self._version > PIMP_VERSION:
            sys.stderr.write('Warning: database version %s newer than pimp version %s\n' %
                             (self._version, PIMP_VERSION))
        self._maintainer = plistdata.get('Maintainer', '')
        self._description = plistdata.get('Description', '').strip()
        self._url = url
    self._appendPackages(plistdata['Packages'], url)
    others = plistdata.get('Include', [])
    for o in others:
        o = urllib.basejoin(url, o)
        self.appendURL(o, included=1)
def get_imagelinks(url):
    """Given a URL, get all images linked to by the page at that URL."""
    # Check if BeautifulSoup is imported.
    if isinstance(BeautifulSoup, ImportError):
        raise BeautifulSoup
    links = []
    uo = URLopener()
    with uo.open(url) as f:
        soup = BeautifulSoup(f.read())
    if not shown:
        tagname = 'a'
    elif shown == 'just':
        tagname = 'img'
    else:
        tagname = ['a', 'img']
    for tag in soup.findAll(tagname):
        link = tag.get('src', tag.get('href', None))
        if link:
            ext = os.path.splitext(link)[1].lower().strip('.')
            if ext in fileformats:
                links.append(urllib.basejoin(url, link))
    return links
def redirect(self, URL, request, response):
    URL = urllib.basejoin(self.urlpattern, URL)  # make always absolute...
    if response.header_written or response.redirection_performed:
        del request, response
        raise RuntimeError('can not redirect twice or when getOutput() has been called')
    request.server.redirect(URL, request, response)
def mkUrl(self, service):
    "Generate Safe Browsing API URL"
    url = urllib.basejoin(self.config['base_url'], service)
    query_params = '&'.join(['%s=%s' % (k, v) for k, v in self.config['url_args'].items()])
    url = '%s?%s' % (url, query_params)
    return url
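# Hedged example (assumed config values, not from the original source): with
#   self.config = {'base_url': 'https://sb.example.com/api/',
#                  'url_args': {'client': 'demo', 'key': 'APIKEY'}}
# mkUrl('lookup') would return
#   'https://sb.example.com/api/lookup?client=demo&key=APIKEY'
# (the dict iteration order decides how the query parameters are ordered).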
def __init__(self, name, rawdir, metadir, statsdir, updateMeta=False):
    utils.BaseCourt.__init__(self, name, rawdir, metadir, statsdir, updateMeta)
    self.baseurl = 'http://jhr.nic.in'
    self.hostname = 'jhr.nic.in'
    self.dateurl = urllib.basejoin(self.baseurl, '/hcjudge/date_output.php')
def delete(self, node='', data='', level='dataset', rmSubscriptions='y',
           comments='', format='json', instance='prod'):
    """
    _delete_

    Set up subscription call to PhEDEx API.
    """
    if not (node and data):
        return 1, " Error - need to pass both node and data"
    values = {'node': node, 'data': data, 'level': level,
              'rm_subscriptions': rmSubscriptions, 'comments': comments}
    deleteURL = urllib.basejoin(self.phedexBase, "%s/%s/delete" % (format, instance))
    check, response = self.phedexCall(deleteURL, values)
    if check:
        return 1, " Error - self.phedexCall with response: " + response
    return 0, response
def get_url(uri=None, full_address=False):
    """get app url from request"""
    host_name = frappe.local.conf.host_name
    if not host_name:
        if hasattr(frappe.local, "request") and frappe.local.request and frappe.local.request.host:
            protocol = 'https' == frappe.get_request_header('X-Forwarded-Proto', "") and 'https://' or 'http://'
            host_name = protocol + frappe.local.request.host
        elif frappe.local.site:
            host_name = "http://{}".format(frappe.local.site)
        else:
            host_name = frappe.db.get_value("Website Settings", "Website Settings", "subdomain")
            if host_name and "http" not in host_name:
                host_name = "http://" + host_name
            if not host_name:
                host_name = "http://localhost"
    if not uri and full_address:
        uri = frappe.get_request_header("REQUEST_URI", "")
    url = urllib.basejoin(host_name, uri) if uri else host_name
    return url
def parse_extraordinary_webpage(self, d, dateobj, ex_url):
    minfos = []
    result_table = self.find_result_table(d)
    if result_table == None:
        self.logger.warn('Could not find result table for date %s', dateobj)
        return minfos
    order = None
    for tr in result_table.find_all('tr'):
        if not order:
            order = self.find_result_order(tr)
            continue
        link = tr.find('a')
        if link == None:
            continue
        metainfo = self.process_row(tr, order, dateobj)
        if metainfo:
            href = link.get('href')
            if href:
                gzurl = urllib.basejoin(ex_url, href)
                metainfo.set_url(gzurl)
                minfos.append(metainfo)
    return minfos
def action_sync(self):
    domain = self.last_sync and [("create_date", ">", self.last_sync)] or []
    partner_domain = self._get_partner_domain(domain)
    last_sync = self.last_sync
    for values in self.env["res.partner"].search_read(
            partner_domain,
            ["surname", "firstname", "email", "create_date"],
            order="create_date asc"):
        email = values["email"]
        if not email:
            continue
        data = {
            "ne": email,
            "nn": values["surname"],
            "ns": values["firstname"]
        }
        url = urllib.basejoin(
            self.url,
            "wp-content/plugins/newsletter-api/add.php?nk=%s" % self.api_key)
        res = requests.post(url, data=data)
        if res.ok:
            _logger.info("Registered newsletter for %s" % email)
            last_sync = max(last_sync, values["create_date"])
        else:
            _logger.error("Unable to register newsletter for %s" % email)
            _logger.error(res.text)
    self.last_sync = last_sync
def download_extraordinary(self, dls, relpath, dateobj):
    ex_url = urllib.basejoin(self.baseurl, self.extraordinary_url % dateobj.year)
    response = self.download_url(ex_url)
    if not response or not response.webpage:
        self.logger.warn('Unable to download Extraordinary gazette for year %d', dateobj.year)
        return
    d = utils.parse_webpage(response.webpage, self.parser)
    if not d:
        self.logger.warn('Unable to parse Extraordinary gazette list for year %d', dateobj.year)
        return
    if dateobj.year == 2010:
        minfos = self.parse_listing_webpage(ex_url, d, dateobj, None, 'Extraordinary')
    else:
        minfos = self.parse_extraordinary_webpage(d, dateobj, ex_url)
    self.download_metainfos(minfos, dls, relpath)
def cleanup_attrs(self, tag, attrs):
    new_attrs = []
    tag = string.lower(tag)
    if self._new_window and tag == "a":
        new_attrs.append(('target', '_blank'))
    for name, value in attrs:
        name = string.lower(name)
        if name[:2] == "on":
            continue  # skip any javascript events
        if string.lower(value)[:11] == "javascript:":
            continue
        if self._map_urls and name in ["action", "href", "src", "lowsrc", "background"] and value[:4] == 'cid:':
            try:
                value = self._map_urls[value[4:]]
            except KeyError:
                pass
        else:
            if self._base and name in ["action", "href", "src", "lowsrc", "background"]:
                value = basejoin(self._base, value)
            if name in ["action", "href", "src", "lowsrc", "background"]:
                value = 'http://www.google.com/url?sa=D&q=%s' % (neo_cgi.urlEscape(value))
        if self._new_window and tag == "a" and name == "target":
            continue
        new_attrs.append((name, value))
    return new_attrs