def resolve(self,item,captcha_cb=None,wait_cb=None):
    """Resolve a file page to the URL the final download request ends up at.

    Works on a copy of ``item``; the caller's dict is left untouched.
    Steps: open the item page, follow the ``stahnoutSoubor`` link, pull the
    inline base64 captcha image and its ``_uid_captcha`` id out of the page,
    ask ``captcha_cb`` for the code, submit the free-download form, hand the
    countdown value to ``wait_cb`` and open the ``stahnoutSoubor2`` link.

    :param item: dict with at least a ``'url'`` key.
    :param captcha_cb: callable taking ``{'id': <captcha id>, 'img': <path>}``
        and returning the typed code; a falsy result aborts. Without a
        callback the captcha cannot be solved and ``None`` is returned.
    :param wait_cb: optional callable taking the integer parsed from the
        page's countdown script (presumably seconds -- confirm with caller).
    :return: the copied item with ``'surl'`` (page URL) and ``'url'`` (URL
        reported by the opened response) set, or ``None`` if any step fails.
    """
    item = item.copy()
    url = self._url(item['url'])
    item['surl'] = url
    data = util.request(url)

    link = re.search(r'<a class="stahnoutSoubor.+?href="([^"]+)',data)
    if not link:
        return None
    url = self._url(link.group(1))
    data = util.request(url)

    m = re.search(r'<img src="(?P<img>[^"]+)" alt="Captcha"',data)
    cap_id = re.search(r'<input type="hidden" name="_uid_captcha.+?value="(?P<cid>[^"]+)',data)
    if not (m and cap_id):
        return None
    if captcha_cb is None:
        # Previously this path crashed with TypeError ('NoneType' is not callable).
        return None

    cid = cap_id.group('cid')
    img_src = m.group('img')
    # The captcha is embedded as a data URI; keep only what follows 'base64,'.
    img_data = img_src[img_src.find('base64,')+7:]
    if not os.path.exists(self.tmp_dir):
        os.makedirs(self.tmp_dir)
    tmp_image = os.path.join(self.tmp_dir,'captcha.png')
    util.save_data_to_file(base64.b64decode(img_data),tmp_image)

    code = captcha_cb({'id':cid,'img': tmp_image})
    if not code:
        return None

    data = util.post(url+'?do=stahnoutFreeForm-submit',{'_uid_captcha':cid,'captcha':code,'stahnoutSoubor':'Stáhnout'})
    countdown = re.search(r'shortly\.getSeconds\(\) \+ (\d+)',data)
    last_url = re.search(r'<a class="stahnoutSoubor2.+?href="([^"]+)',data)
    if not (countdown and last_url):
        return None

    wait = int(countdown.group(1))
    url = self._url(last_url.group(1))
    if wait_cb is not None:
        wait_cb(wait)

    req = urllib2.Request(url)
    req.add_header('User-Agent',util.UA)
    resp = urllib2.urlopen(req)
    try:
        item['url'] = resp.geturl()
    finally:
        # Only the resolved URL is needed; never leave the connection open.
        resp.close()
    return item
def resolve(self,item,captcha_cb=None,wait_cb=None):
    """Resolve a file page to the URL the final download request ends up at.

    Works on a copy of ``item``. The page is opened, the ``stahnoutSoubor``
    link followed, the inline base64 captcha and its ``_uid_captcha`` id
    extracted, the code obtained from ``captcha_cb``, the free-download form
    posted, the countdown handed to ``wait_cb`` and the ``stahnoutSoubor2``
    link opened.

    :param item: dict with at least a ``'url'`` key.
    :param captcha_cb: callable taking ``{'id': <captcha id>, 'img': <path>}``
        and returning the typed code; a falsy result, or no callback at all,
        makes the method return ``None``.
    :param wait_cb: optional callable taking the integer parsed from the
        page's countdown script (presumably seconds -- confirm with caller).
    :return: the copied item with ``'surl'`` and ``'url'`` set, or ``None``
        when a required element is missing or the captcha is not answered.
    """
    item = item.copy()
    url = self._url(item['url'])
    item['surl'] = url
    data = util.request(url)

    link = re.search(r'<a class="stahnoutSoubor.+?href="([^"]+)',data)
    if not link:
        return None
    url = self._url(link.group(1))
    data = util.request(url)

    m = re.search(r'<img src="(?P<img>[^"]+)" alt="Captcha"',data)
    cap_id = re.search(r'<input type="hidden" name="_uid_captcha.+?value="(?P<cid>[^"]+)',data)
    if not (m and cap_id):
        return None
    if captcha_cb is None:
        # The default argument used to be called blindly and raised TypeError.
        return None

    cid = cap_id.group('cid')
    img_src = m.group('img')
    # Strip the data-URI prefix up to and including 'base64,'.
    img_data = img_src[img_src.find('base64,')+7:]
    if not os.path.exists(self.tmp_dir):
        os.makedirs(self.tmp_dir)
    tmp_image = os.path.join(self.tmp_dir,'captcha.png')
    util.save_data_to_file(base64.b64decode(img_data),tmp_image)

    code = captcha_cb({'id':cid,'img': tmp_image})
    if not code:
        return None

    data = util.post(url+'?do=stahnoutFreeForm-submit',{'_uid_captcha':cid,'captcha':code,'stahnoutSoubor':'Stáhnout'})
    countdown = re.search(r'shortly\.getSeconds\(\) \+ (\d+)',data)
    last_url = re.search(r'<a class="stahnoutSoubor2.+?href="([^"]+)',data)
    if not (countdown and last_url):
        return None

    wait = int(countdown.group(1))
    url = self._url(last_url.group(1))
    if wait_cb is not None:
        wait_cb(wait)

    req = urllib2.Request(url)
    req.add_header('User-Agent',util.UA)
    resp = urllib2.urlopen(req)
    try:
        item['url'] = resp.geturl()
    finally:
        # Close even if reading the URL fails, so the socket is not leaked.
        resp.close()
    return item
def _get_plot(self,data,local):
    """Extract the description block from ``data`` and save it to ``local``.

    The part of the page between ``<div id="tale_description"`` and
    ``<div class="cleaner`` is taken, its HTML tags are dropped or turned
    into bracket markup (``[B]``, ``[I]``, ``[CR]``), and the result is
    HTML-decoded, UTF-8 encoded and written with ``util.save_data_to_file``.

    :param data: page source to search.
    :param local: destination passed to ``util.save_data_to_file``.
    """
    # Applied strictly in this order: later rules clean up what earlier ones
    # produce (e.g. doubled or empty [B] pairs).
    rules = (
        (r'<div[^>]+>', ''),
        (r'<table.*', ''),
        (r'</span>|<br[^>]*>|<ul>|</ul>|<hr[^>]*>', ''),
        (r'<span[^>]*>|<p[^>]*>|<li[^>]*>', ''),
        (r'<strong>|<a[^>]*>|<h[\d]+>', '[B]'),
        (r'</strong>|</a>|</h[\d]+>', '[/B]'),
        (r'</p>|</li>', '[CR]'),
        (r'<em>', '[I]'),
        (r'</em>', '[/I]'),
        (r'<img[^>]+>', ''),
        (r'\[B\]Edituj popis\[\/B\]', ''),
        (r'\[B\]\[B\]', '[B]'),
        (r'\[/B\]\[/B\]', '[/B]'),
        (r'\[B\][ ]*\[/B\]', ''),
    )
    p = util.substr(data,'<div id="tale_description"','<div class="cleaner')
    for pattern, replacement in rules:
        p = re.sub(pattern,replacement,p)
    util.save_data_to_file(util.decode_html(p).encode('utf-8'),local)
def download(remote, local):
    """Fetch ``remote`` with ``util.request`` and store the result at ``local``."""
    payload = util.request(remote)
    util.save_data_to_file(payload, local)
def _get_image(self,data,local):
    """Download the ``tale_picture`` image referenced in ``data`` to ``local``.

    Does nothing when the page contains no ``<img id="tale_picture" ...>``.

    :param data: page source to search.
    :param local: destination passed to ``util.save_data_to_file``.
    """
    m = re.search(r'<img id="tale_picture" src="(?P<img>[^"]+)', data, re.IGNORECASE | re.DOTALL)
    if m is None:
        return
    img = self._url(m.group('img'))
    util.save_data_to_file(util.request(img),local)
def _get_image(self,data,local):
    """Store the ``src`` of the first image in the ``entry-photo`` div.

    The search is limited to the text between ``<div class="entry-photo"``
    and the following ``</div>``. Does nothing when no ``<img ... src=``
    is found there.

    :param data: page source to search.
    :param local: destination passed to ``util.save_data_to_file``.
    """
    data = util.substr(data,'<div class="entry-photo"','</div>')
    m = re.search(r'<img(.+?)src="(?P<img>[^"]+)', data, re.IGNORECASE | re.DOTALL)
    if m is None:
        return
    # NOTE(review): this writes the matched src text itself, not the bytes
    # fetched from it -- confirm that callers expect the URL in the file.
    util.save_data_to_file(m.group('img'),local)
def _get_plot(self,data,local):
    """Save the first bold text of the ``entry-content`` div to ``local``.

    Only the text between ``<div class="entry-content"`` and the next
    ``</p>`` is searched. The content of the first ``<strong>`` or ``<b>``
    tag (up to the next ``<``) is HTML-decoded, UTF-8 encoded and written.
    Does nothing when no such tag is found.

    :param data: page source to search.
    :param local: destination passed to ``util.save_data_to_file``.
    """
    data = util.substr(data,'<div class="entry-content"','</p>')
    m = re.search('<(strong|b)>(?P<plot>(.+?))<', data, re.IGNORECASE | re.DOTALL)
    if m is None:
        return
    plot = util.decode_html(m.group('plot'))
    util.save_data_to_file(plot.encode('utf-8'),local)
def resolve(self, item, captcha_cb=None, select_cb=None):
    """Resolve a file page to a direct download URL by solving its captcha.

    Works on a copy of ``item``. The page is fetched, the download form's
    action and captcha image are located, the captcha image is downloaded
    with the page's PHPSESSID cookie, ``captcha_cb`` supplies the code and
    the form is posted. A wrong captcha triggers another attempt through a
    recursive call.

    :param item: dict with at least a ``'url'`` key.
    :param captcha_cb: callable taking ``{'id': '0', 'img': <path>}`` and
        returning the typed code; without it (or on a falsy answer) the
        method gives up and returns ``None``.
    :param select_cb: not used here other than being passed on to the retry.
    :return: the copied item with ``'url'`` set to the file URL, or ``None``
        on HTTP errors, a missing form/session cookie or a missing captcha.
    :raises ResolveException: when the server answers with the form action
        URL again, i.e. it refused to hand out the file.
    """
    item = item.copy()
    util.init_urllib()
    url = self._url(item['url'])
    try:
        # Install an opener with only the plain HTTP handlers; presumably so
        # that a 302 answer is returned as-is and can be inspected below.
        opener = OpenerDirector()
        opener.add_handler(HTTPHandler())
        opener.add_handler(UnknownHandler())
        install_opener(opener)
        request = Request(url)
        request.add_header('User-Agent', util.UA)
        response = urlopen(request)
        try:
            page = response.read()
        finally:
            response.close()
    except HTTPError:
        traceback.print_exc()
        return None

    data = util.substr(page, '<form method=post target="iframe_dwn"', '</form>')
    action = re.search('action=(?P<url>[^>]+)', data, re.IGNORECASE | re.DOTALL)
    img = re.search(r'<img src="(?P<url>[^"]+)', data, re.IGNORECASE | re.DOTALL)
    if not (img and action):
        return None

    # The header may be absent altogether; treat that like "no session".
    set_cookie = response.headers.get('Set-Cookie') or ''
    sessid = re.findall(r'(PHPSESSID=[^\;]+)', set_cookie, re.IGNORECASE | re.DOTALL)
    if not sessid:
        self.error('No PHPSESSID cookie received, cannot request captcha')
        return None
    session_cookie = sessid[-1]

    # we have to download image ourselves
    image = util.request(self._url(img.group('url')), headers={
        'Referer': url,
        'Cookie': session_cookie
    })
    img_file = os.path.join(self.tmp_dir, 'captcha.png')
    util.save_data_to_file(image, img_file)

    code = None
    if captcha_cb:
        code = captcha_cb({'id': '0', 'img': img_file})
    if not code:
        self.info('No captcha received, exit')
        return None

    action_url = action.group('url')
    req = Request(self._url(action_url), urllib.urlencode({'code': code}))
    req.add_header('User-Agent', util.UA)
    req.add_header('Referer', url)
    req.add_header('Cookie', session_cookie)
    try:
        resp = urlopen(req)
        try:
            if resp.code == 302:
                # Fall back to the request URL if the redirect has no target.
                file_url = resp.headers.get('location') or resp.geturl()
            else:
                file_url = resp.geturl()
            rejected = file_url.find(action_url) > 0
            # The body is only needed to extract the server's error message.
            msg = resp.read() if rejected else None
        finally:
            resp.close()

        if rejected:
            js_msg = re.search(r"alert\('(?P<msg>[^']+)", msg, re.IGNORECASE | re.DOTALL)
            if js_msg:
                raise ResolveException(js_msg.group('msg'))
            self.error(msg)
            raise ResolveException(
                'Nelze ziskat soubor, zkuste to znovu')
        if file_url.find('data') >= 0 or file_url.find('download_free') > 0:
            item['url'] = file_url
            return item
        self.error('wrong captcha, retrying')
        return self.resolve(item, captcha_cb, select_cb)
    except HTTPError:
        traceback.print_exc()
        return None
class FastshareContentProvider(ContentProvider):
    """Content provider for fastshare.cz supporting search and resolve."""

    def __init__(self,username=None,password=None,filter=None,tmp_dir='.'):
        """Register the provider and install a cookie-aware global opener.

        All arguments are forwarded unchanged to ``ContentProvider.__init__``.
        """
        ContentProvider.__init__(self,'fastshare.cz','http://www.fastshare.cz/',username,password,filter,tmp_dir)
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookielib.LWPCookieJar()))
        urllib2.install_opener(opener)

    def capabilities(self):
        """Return the list of operations this provider implements."""
        return ['search','resolve']

    def search(self,keyword):
        """List the results of searching for ``keyword`` (URL-quoted)."""
        return self.list('?term='+urllib.quote(keyword))

    def list(self,url):
        """Parse a search result page into items.

        :param url: page address, passed through ``self._url``.
        :return: list of video items accepted by ``self._filter`` plus, when
            the page has a link whose text starts with ``dal``, one
            directory item of type ``'next'``.
        """
        result = []
        flags = re.IGNORECASE | re.DOTALL
        page = util.request(self._url(url))
        data = util.substr(page,'<div class="search','<footer')
        for m in re.finditer('<div class="search-result-box(.+?)</a>',data,flags):
            it = m.group(1)
            link = re.search('<a href=([^ ]+)',it,flags)
            name = re.search(r'title="([^"]+)',it,flags)
            img = re.search(r'<img src="([^"]+)',it,flags)
            size = re.search('<div class="fs">([^<]+)',it,flags)
            length = re.search('<div class="vd">([^<]+)',it,flags)
            if not (name and link):
                continue
            item = self.video_item()
            item['title'] = name.group(1)
            if size:
                item['size'] = size.group(1).strip()
            if length:
                item['length'] = length.group(1).strip()
            item['url'] = self._url(link.group(1))
            if img:
                # A result box without a thumbnail used to raise AttributeError.
                item['img'] = self._url(img.group(1))
            self._filter(result,item)
        next_page = re.search(r'<a href="(?P<url>[^"]+)[^>]+>dal',data,flags)
        if next_page:
            item = self.dir_item()
            item['type'] = 'next'
            item['url'] = next_page.group('url')
            result.append(item)
        return result

    def resolve(self,item,captcha_cb=None,select_cb=None):
        """Resolve a file page to a direct download URL by solving its captcha.

        Works on a copy of ``item``. The page is fetched, the download form's
        action and captcha image are located, the captcha image is downloaded
        with the page's PHPSESSID cookie, ``captcha_cb`` supplies the code and
        the form is posted. A wrong captcha triggers another attempt through
        a recursive call.

        :param item: dict with at least a ``'url'`` key.
        :param captcha_cb: callable taking ``{'id': '0', 'img': <path>}`` and
            returning the typed code; without it (or on a falsy answer) the
            method gives up and returns ``None``.
        :param select_cb: not used here other than being passed to the retry.
        :return: the copied item with ``'url'`` set to the file URL, or
            ``None`` on HTTP errors, a missing form/session cookie or a
            missing captcha.
        :raises ResolveException: when the server answers with the form
            action URL again, i.e. it refused to hand out the file.
        """
        item = item.copy()
        util.init_urllib()
        url = self._url(item['url'])
        try:
            # Replaces the global opener with one holding only the plain HTTP
            # handlers; presumably so a 302 answer can be inspected below.
            opener = urllib2.OpenerDirector()
            opener.add_handler(urllib2.HTTPHandler())
            opener.add_handler(urllib2.UnknownHandler())
            urllib2.install_opener(opener)
            request = urllib2.Request(url)
            request.add_header('User-Agent',util.UA)
            response = urllib2.urlopen(request)
            try:
                page = response.read()
            finally:
                response.close()
        except urllib2.HTTPError:
            traceback.print_exc()
            return None

        flags = re.IGNORECASE | re.DOTALL
        data = util.substr(page,'<form method=post target="iframe_dwn"','</form>')
        action = re.search('action=(?P<url>[^>]+)',data,flags)
        img = re.search(r'<img src="(?P<url>[^"]+)',data,flags)
        if not (img and action):
            return None

        # The header may be absent altogether; treat that like "no session".
        set_cookie = response.headers.get('Set-Cookie') or ''
        sessid = re.findall(r'(PHPSESSID=[^\;]+)',set_cookie,flags)
        if not sessid:
            self.error('No PHPSESSID cookie received, cannot request captcha')
            return None
        session_cookie = sessid[-1]

        # we have to download image ourselves
        image = util.request(self._url(img.group('url')),headers={'Referer':url,'Cookie':session_cookie})
        img_file = os.path.join(self.tmp_dir,'captcha.png')
        util.save_data_to_file(image,img_file)

        code = None
        if captcha_cb:
            code = captcha_cb({'id':'0','img':img_file})
        if not code:
            self.info('No captcha received, exit')
            return None

        action_url = action.group('url')
        req = urllib2.Request(self._url(action_url),urllib.urlencode({'code':code}))
        req.add_header('User-Agent',util.UA)
        req.add_header('Referer',url)
        req.add_header('Cookie',session_cookie)
        try:
            resp = urllib2.urlopen(req)
            try:
                if resp.code == 302:
                    # Fall back to the request URL if the redirect has no target.
                    file_url = resp.headers.get('location') or resp.geturl()
                else:
                    file_url = resp.geturl()
                rejected = file_url.find(action_url) > 0
                # The body is only needed to extract the server's error message.
                msg = resp.read() if rejected else None
            finally:
                resp.close()

            if rejected:
                js_msg = re.search(r"alert\('(?P<msg>[^']+)",msg,flags)
                if js_msg:
                    raise ResolveException(js_msg.group('msg'))
                self.error(msg)
                raise ResolveException('Nelze ziskat soubor, zkuste to znovu')
            if file_url.find('data') >=0 or file_url.find('download_free') > 0:
                item['url'] = file_url
                return item
            self.error('wrong captcha, retrying')
            return self.resolve(item,captcha_cb,select_cb)
        except urllib2.HTTPError:
            traceback.print_exc()
            return None
from bs4 import BeautifulSoup
import requests

from util import save_data_to_file, extract_data

# Scrape the Chicken Inn menu page and dump the extracted items to a file.
url = "http://www.dialadeliverykenya.co.ke/chicken-inn-menu"
json_file = "chicken_inn.json"

# Without a timeout requests waits indefinitely on an unresponsive server.
page = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of parsing the error page and
# overwriting the output file with an empty result.
page.raise_for_status()

soup = BeautifulSoup(page.content, 'html.parser')
items = soup.find_all('div', class_='tab-inner chicken-padder')
structured_items = [extract_data(item, url) for item in items]
save_data_to_file(structured_items, json_file)