def post(self, url, post_params=None):
    # Avoid a shared mutable default argument
    if post_params is None:
        post_params = {}
    request = Request(url, urllib.urlencode(post_params),
                      timeout=browser_url_open_timeout)
    request.add_header('User-agent', self.useragent)
    return self.__open__(request)
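# Hedged usage sketch for post() above. `DacBrowser` is a hypothetical name
# for the class this method belongs to; it must define `useragent` and
# `__open__`, which are not shown in this snippet:
#
#     b = DacBrowser()
#     resp = b.post('http://example.com/login', {'user': 'alice'})
#     print resp.read()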
def get_urls(br, tokens):
    from urllib import quote_plus
    from mechanize import Request
    from lxml import html
    escaped = [quote_plus(x.encode('utf-8')) for x in tokens if x and x.strip()]
    q = '+'.join(escaped)
    url = 'http://bigbooksearch.com/books/' + q
    # Visit the search page first so the session carries the expected cookies
    br.open(url).read()
    # The result images are loaded via an AJAX endpoint, so mimic that call
    req = Request('http://bigbooksearch.com/query.php?SearchIndex=books&Keywords=%s&ItemPage=1' % q)
    req.add_header('X-Requested-With', 'XMLHttpRequest')
    req.add_header('Referer', url)
    raw = br.open(req).read()
    root = html.fromstring(raw.decode('utf-8'))
    urls = [i.get('src') for i in root.xpath('//img[@src]')]
    return urls
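# A minimal, hedged way to exercise get_urls() above; mechanize.Browser
# stands in for the browser object normally passed in, and the tokens are
# illustrative:
if __name__ == '__main__':
    import mechanize
    browser = mechanize.Browser()
    browser.set_handle_robots(False)
    for src in get_urls(browser, [u'dune', u'frank herbert']):
        print src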
def _checkStoredInjections(self):
    for r in self.results:
        # At this stage injections in the Result object are not compacted
        # yet, so only the first injected parameter is checked
        url, data = r.target.getPayloadedUrl(r.first_param, "")

        # In case of proxy
        if self.engine.getOption('http-proxy') is not None:
            proxy = ProxyHandler({'http': self.engine.getOption('http-proxy')})
            opener = build_opener(proxy)
            install_opener(opener)

        # Some headers
        if self.engine.getOption('ua') is not None:
            if self.engine.getOption('ua') == "RANDOM":
                headers = {'User-Agent': random.choice(USER_AGENTS)}
            else:
                headers = {'User-Agent': self.engine.getOption('ua')}
        else:
            headers = {}
        if self.engine.getOption("cookie") is not None:
            headers["Cookie"] = self.engine.getOption("cookie")

        # Build the request
        req = Request(url, data, headers)
        try:
            # Allow a longer timeout when going through a proxy
            to = 10 if self.engine.getOption('http-proxy') is None else 20
            response = urlopen(req, timeout=to)
        except HTTPError as e:
            self._addError(e.code, r.target.getAbsoluteUrl())
            continue
        except URLError as e:
            self._addError(e.reason, r.target.getAbsoluteUrl())
            continue
def _performInjections(self, target):
    # Check every parameter
    for k, v in target.params.iteritems():
        pl = Payload(taint=True)
        url, data = target.getPayloadedUrl(k, pl.payload)

        # In case of proxy
        if self.engine.getOption('http-proxy') is not None:
            proxy = ProxyHandler({'http': self.engine.getOption('http-proxy')})
            opener = build_opener(proxy)
            install_opener(opener)

        # Some headers
        if self.engine.getOption('ua') is not None:
            if self.engine.getOption('ua') == "RANDOM":
                headers = {'User-Agent': random.choice(USER_AGENTS)}
            else:
                headers = {'User-Agent': self.engine.getOption('ua')}
        else:
            headers = {}
        if self.engine.getOption("cookie") is not None:
            headers["Cookie"] = self.engine.getOption("cookie")

        # Build the request
        req = Request(url, data, headers)
        try:
            # Allow a longer timeout when going through a proxy
            to = 10 if self.engine.getOption('http-proxy') is None else 20
            response = urlopen(req, timeout=to)
        except HTTPError as e:
            self._addError(e.code, target.getAbsoluteUrl())
            return
        except URLError as e:
            self._addError(e.reason, target.getAbsoluteUrl())
            return
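# The two methods above build their requests the same way. A hedged,
# standalone sketch of that shared pattern with placeholder values (the
# proxy address, cookie and URL are illustrative; the names assume the
# ProxyHandler/build_opener/Request/urlopen imports this module already uses):
#
#     proxy = ProxyHandler({'http': 'http://127.0.0.1:8080'})
#     install_opener(build_opener(proxy))
#     headers = {'User-Agent': random.choice(USER_AGENTS),
#                'Cookie': 'session=abc123'}
#     req = Request('http://example.com/page', 'param=value', headers)
#     response = urlopen(req, timeout=20)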
def request(self, method: str, path: str, data=None, headers=None):
    """Creates a request against the reMarkable Cloud API.

    This function automatically fills in the base URL and
    authentication headers.

    Args:
        method: The HTTP request method.
        path: A complete URL, or a path to append to the base URL.
        data: Raw data for the request body (e.g. for PUT/POST).
        headers: A dict of additional headers to add to the request.
    Returns:
        A Response instance wrapping the server's response.
    """
    if headers is None:
        headers = {}
    if not path.startswith("http"):
        if not path.startswith('/'):
            path = '/' + path
        url = f"{BASE_URL}{path}"
    else:
        url = path

    _headers = {
        "user-agent": USER_AGENT,
    }
    if self.token_set["usertoken"]:
        token = self.token_set["usertoken"]
        _headers["Authorization"] = f"Bearer {token}"
    # Caller-supplied headers override the defaults
    _headers.update(headers)
    log.debug("%s %s", url, _headers)

    req = Request(url, method=method, data=data, headers=_headers)
    resp = self.browser.open(req)
    return Response(resp)
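# Hedged usage sketch; assumes an API client instance `client` exposing the
# method above, with self.browser (a mechanize.Browser) and self.token_set
# populated elsewhere (the paths and payload are illustrative):
#
#     resp = client.request("GET", "/document-storage/json/2/docs")
#     resp = client.request("PUT", "/upload", data=payload,
#                           headers={"Content-Type": "application/json"})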
def markEpisode(self, episode):
    values = {"watched": "adding", "shid": episode}
    data = urllib.urlencode(values)
    # Pass the POST body directly instead of a placeholder string
    req = Request(self._urlWacthed, data)
    req.add_header(
        "User-Agent",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.2.7) Gecko/20100713 Firefox/3.6.7"
    )
    req.add_header("Referer", self._urlBase)
    self._cookieJar.add_cookie_header(req)
    res = urlopen(req)
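# Hedged usage sketch; `tracker` stands for an instance of the class above,
# with _urlWacthed, _urlBase and _cookieJar initialised during login
# (the episode id is illustrative):
#
#     tracker.markEpisode("12345")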
def _open_with_mechanize(self, url, data=None, referer=False):
    """Opens an internal request with the mechanize library.
    Since the request is internally dispatched, no open server
    port is required.

    :param url: A fully qualified URL.
    :type url: string
    :param data: A dict with data which is posted using a `POST` request.
    :type data: dict
    :param referer: Sets the referer when set to ``True``.
    :type referer: Boolean (Default ``False``)
    """
    args = locals().copy()
    del args['self']
    preserved_request = getRequest()

    self.previous_request = ('_open_with_mechanize', args)
    self.previous_url = self.url

    if isinstance(url, Request):
        request = url
    else:
        data = self._prepare_post_data(data)
        request = Request(url, data)

    referer_url = ' '
    if referer:
        if referer is True and self.url:
            referer_url = self.url
        elif isinstance(referer, (str, unicode)):
            referer_url = referer
    request.add_header('REFERER', referer_url)
    request.add_header('HTTP_REFERER', referer_url)

    try:
        self.response = self.get_mechbrowser().open(request)
    except Exception:
        # Reset the response before re-raising so a failed request
        # does not leave a stale response behind
        self.response = None
        raise

    self.parse(self.response)
    self.previous_request_library = LIB_MECHANIZE
    setRequest(preserved_request)
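# Hedged usage sketch, e.g. from a functional test; the URLs and form
# fields are illustrative:
#
#     browser._open_with_mechanize('http://nohost/plone/folder')
#     browser._open_with_mechanize('http://nohost/plone/login_form',
#                                  data={'__ac_name': 'admin',
#                                        '__ac_password': 'secret'},
#                                  referer=True)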
def search(query, max_results=10, timeout=60):
    url = ('http://woblink.com/publication/ajax?mode=none&query=' +
           urllib.quote_plus(query.encode('utf-8')))
    if max_results > 10:
        if max_results > 20:
            url += '&limit=30'
        else:
            url += '&limit=20'

    br = browser(user_agent='CalibreCrawler/1.0')
    br.set_handle_gzip(True)
    rq = Request(url, headers={
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'X-Requested-With': 'XMLHttpRequest',
        'Referer': 'http://woblink.com/ebooki-kategorie',
        'Cache-Control': 'max-age=0',
    }, data=urllib.urlencode({
        'nw_filtry_filtr_zakrescen_formularz[min]': '0',
        'nw_filtry_filtr_zakrescen_formularz[max]': '350',
    }))
    r = br.open(rq)
    raw = r.read()
    doc = html.fromstring('<html><body>' + raw.decode('utf-8') + '</body></html>')
    counter = max_results

    for data in doc.xpath('//div[@class="nw_katalog_lista_ksiazka ebook " or @class="nw_katalog_lista_ksiazka ebook promocja"]'):
        if counter <= 0:
            break

        id = ''.join(data.xpath('.//div[@class="nw_katalog_lista_ksiazka_okladka nw_okladka"]/a[1]/@href'))
        if not id:
            continue
        cover_url = ''.join(data.xpath('.//div[@class="nw_katalog_lista_ksiazka_okladka nw_okladka"]/a[1]/img/@src'))
        title = ''.join(data.xpath('.//h3[@class="nw_katalog_lista_ksiazka_detale_tytul"]/a[1]/text()'))
        author = ', '.join(data.xpath('.//p[@class="nw_katalog_lista_ksiazka_detale_autor"]/a/text()'))
        price = ''.join(data.xpath('.//div[@class="nw_opcjezakupu_cena"]/span[2]/text()'))
        formats = ', '.join(data.xpath('.//p[@class="nw_katalog_lista_ksiazka_detale_format"]/span/text()'))

        s = SearchResult()
        s.cover_url = cover_url
        s.title = title.strip()
        s.author = author.strip()
        s.price = price + ' zł'
        s.detail_item = id.strip()
        s.formats = formats

        counter -= 1
        s.drm = SearchResult.DRM_LOCKED if 'DRM' in formats else SearchResult.DRM_UNLOCKED
        yield s
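# search() is a generator, so results stream out as each row is parsed.
# A hedged usage sketch (the query is illustrative):
#
#     for result in search(u'Lem', max_results=5):
#         print result.title, result.price, result.formats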
def remote_run(self, name, m, *args):
    from mechanize import HTTPError, Request
    from calibre.utils.serialize import msgpack_loads, msgpack_dumps
    url = self.url + '/cdb/cmd/{}/{}'.format(name, getattr(m, 'version', 0))
    if self.library_id:
        url += '?' + urlencode({'library_id': self.library_id})
    rq = Request(url, data=msgpack_dumps(args),
                 headers={'Accept': MSGPACK_MIME, 'Content-Type': MSGPACK_MIME})
    try:
        res = self.br.open_novisit(rq, timeout=self.timeout)
        ans = msgpack_loads(res.read())
    except HTTPError as err:
        self.interpret_http_error(err)
        raise
    if 'err' in ans:
        if ans['tb']:
            prints(ans['tb'])
        raise SystemExit(ans['err'])
    return ans['result']
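# Hedged usage sketch; `client` stands for the surrounding object, which
# must provide self.url, self.br (a mechanize browser), self.library_id and
# self.timeout. The command module and arguments are illustrative, not a
# documented invocation:
#
#     result = client.remote_run(cmd_name, cmd_module, *cmd_args)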
def download(url, serialize=Constants.picture_serialization):
    """ Download the given url and serialize depending on the parameter """
    try:
        if url is None or not isinstance(url, six.string_types):
            print("No url was given to download")
            return (False, "No url was given to download", url)
        target = Constants.get_output_for_url(url)
        if verbose:
            print(u"[Process: {}] - Downl. url {} - {}"
                  .format(multiprocessing.current_process(), url, target))
        req = Request(url)
        web_file = urlopen(req)
        if serialize:
            with open(target, "wb") as handle:
                handle.write(web_file.read())
        else:
            # For performance measurements, read without writing to disk
            web_file.read()
        return (True, None, url)
    except Exception as e:
        print(e)
        return (False, e, url)
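# download() returns an (ok, error, url) tuple, which makes it easy to fan
# out over a process pool. A hedged sketch (image_urls is illustrative):
#
#     from multiprocessing import Pool
#     pool = Pool(4)
#     for ok, err, url in pool.map(download, image_urls):
#         if not ok:
#             print("failed: {0} ({1})".format(url, err))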
def get_basic_data(browser, log, *skus):
    from calibre.utils.date import parse_only_date
    from mechanize import Request
    try:
        from urllib.parse import urlencode  # Python 3
    except ImportError:
        from urllib import urlencode  # Python 2
    zeroes = ','.join('0' for sku in skus)
    data = {
        'skus': ','.join(skus),
        'drc': zeroes,
        'startPosition': '0',
        'sequence': '1',
        'selected': zeroes,
        'itemID': '0',
        'orderID': '0',
        'mailingID': '',
        'tContentWidth': '926',
        'originalOrder': ','.join(str(i) for i in range(len(skus))),
        'selectedOrderID': '0',
        'selectedSortColumn': '0',
        'listType': '1',
        'resultType': '32',
        'blockView': '1',
    }
    items_data_url = 'https://www.edelweiss.plus/GetTreelineControl.aspx?controlName=/uc/listviews/ListView_Title_Multi.ascx'
    # The request body must be a urlencoded string, not a dict
    req = Request(items_data_url, urlencode(data))
    response = browser.open_novisit(req)
    raw = response.read()
    root = parse_html(raw)
    for item in root.xpath('//div[@data-priority]'):
        row = item.getparent().getparent()
        sku = item.get('id').split('-')[-1]
        isbns = [x.strip() for x in row.xpath(
            'descendant::*[contains(@class, "pev_sku")]/text()')[0].split(',')
            if check_isbn(x.strip())]
        # Prefer the longest (13-digit) ISBNs
        isbns.sort(key=len, reverse=True)
        try:
            tags = [x.strip() for x in astext(row.xpath(
                'descendant::*[contains(@class, "pev_categories")]')[0]).split('/')]
        except IndexError:
            tags = []
        rating = 0
        for bar in row.xpath(
                'descendant::*[contains(@class, "bgdColorCommunity")]/@style'):
            m = re.search(r'width: (\d+)px;.*max-width: (\d+)px', bar)
            if m is not None:
                rating = float(m.group(1)) / float(m.group(2))
                break
        try:
            pubdate = parse_only_date(astext(row.xpath(
                'descendant::*[contains(@class, "pev_shipDate")]')[0]
            ).split(':')[-1].split(u'\xa0')[-1].strip(), assume_utc=True)
        except Exception:
            log.exception('Error parsing published date')
            pubdate = None
        authors = []
        for x in [x.strip() for x in row.xpath(
                'descendant::*[contains(@class, "pev_contributor")]/@title')]:
            authors.extend(a.strip() for a in x.split(','))
        entry = {
            'sku': sku,
            'cover': row.xpath('descendant::img/@src')[0].split('?')[0],
            'publisher': astext(row.xpath(
                'descendant::*[contains(@class, "headerPublisher")]')[0]),
            'title': astext(row.xpath(
                'descendant::*[@id="title_{}"]'.format(sku))[0]),
            'authors': authors,
            'isbns': isbns,
            'tags': tags,
            'pubdate': pubdate,
            'format': ' '.join(row.xpath(
                'descendant::*[contains(@class, "pev_format")]/text()')).strip(),
            'rating': rating,
        }
        if entry['cover'].startswith('/'):
            entry['cover'] = None
        yield entry
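# Hedged usage sketch; assumes a calibre browser object and logger as the
# metadata source would pass in (the SKU is illustrative):
#
#     for entry in get_basic_data(browser, log, '12345678'):
#         print(entry['title'], entry['authors'], entry['isbns'])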
import sys

from mechanize import ParseResponse, urlopen, Request, Browser

if len(sys.argv) == 1:
    uri = "http://imdb.com"
else:
    uri = sys.argv[1]

req = Request(uri)
# Route this single request through an HTTP proxy
req.set_proxy("208.232.182.74:80", "http")
response = urlopen(req)

# Parse the forms on the page and fill in the search box
forms = ParseResponse(response, backwards_compat=False)
form = forms[0]
form["q"] = "fatih akin"

br = Browser()
br.open(form.click())
print br.title()
for link in br.links():
    if str(link.text).count('Fatih') > 0:
        print link.absolute_url + ": " + link.text
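# Invocation sketch: run the script directly, optionally passing a start URL
# (the filename is illustrative):
#
#     $ python imdb_form_search.py "http://www.imdb.com"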