def test_cookies(self):
    import urllib2
    # this test page depends on cookies, and an http-equiv refresh
    # cj = CreateBSDDBCookieJar("/home/john/db.db")
    cj = CookieJar()
    handlers = [
        HTTPCookieProcessor(cj),
        HTTPRefreshProcessor(max_time=None, honor_time=False),
        HTTPEquivProcessor(),
        HTTPRedirectHandler(),  # needed for Refresh handling in 2.4.0
        # HTTPHandler(True),
        # HTTPRedirectDebugProcessor(),
        # HTTPResponseDebugProcessor(),
    ]
    o = build_opener(*handlers)
    try:
        install_opener(o)
        try:
            r = urlopen(urljoin(self.uri, "/cgi-bin/cookietest.cgi"))
        except urllib2.URLError, e:
            # print e.read()
            raise
        data = r.read()
        # print data
        self.assert_(data.find("Your browser supports cookies!") >= 0)
        self.assert_(len(cj) == 1)
        # test response.seek() (added by HTTPEquivProcessor)
        r.seek(0)
        samedata = r.read()
        r.close()
        self.assert_(samedata == data)
    finally:
        o.close()
        install_opener(None)
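A minimal sketch of the pattern this test exercises (mechanize's public API; the target URL is a placeholder): install a cookie-aware opener globally, after which the module-level urlopen() persists cookies in the jar.

import mechanize

jar = mechanize.CookieJar()
mechanize.install_opener(
    mechanize.build_opener(mechanize.HTTPCookieProcessor(jar)))
response = mechanize.urlopen("http://example.com/")  # cookies collect in jar
mechanize.install_opener(None)  # restore the default opener, as the teardown does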
def _retrieve_product(cls, url):
    cookies = mechanize.CookieJar()
    opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cookies))
    opener.addheaders = [('User-agent', 'Mozilla/5.0 (MyProgram/0.1)'),
                         ('From', '*****@*****.**')]
    mechanize.install_opener(opener)
    browser = mechanize.Browser()
    product_data = browser.open(url).get_data()
    soup = BeautifulSoup(product_data)

    product_name = soup.find('h1').string.encode('ascii', 'ignore')
    product_price = soup.find('span', {'id': 'product_price'})
    product_price = Decimal(clean_price_string(product_price.string))

    payment_methods = ['cash', 'deposit', 'wire_transfer']
    additional_data = soup.find('td', 'descr').findAll('h3')
    if not additional_data:
        payment_methods.extend(['debit_card', 'credit_card'])
    elif additional_data[0].string and \
            'Contado' not in additional_data[0].string:
        payment_methods.extend(['debit_card', 'credit_card'])

    prices = {}
    for p in payment_methods:
        prices[p] = product_price

    return [product_name, prices]
def slurp_with_login_and_pwd():
    import sys
    import mechanize
    # sys.path.append('ClientCookie-1.0.3')
    # from mechanize import ClientCookie
    # sys.path.append('ClientForm-0.1.17')
    # import ClientForm

    # Create special URL opener (for User-Agent) and cookieJar
    cookieJar = mechanize.CookieJar()
    opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cookieJar))
    opener.addheaders = [("User-agent", "Mozilla/5.0 (compatible)")]
    mechanize.install_opener(opener)

    fp = mechanize.urlopen("http://login.yahoo.com")
    forms = mechanize.ParseResponse(fp)
    fp.close()

    # print forms on this page
    for form in forms:
        print "***************************"
        print form

    form = forms[0]
    form["login"] = "******"      # use your userid
    form["passwd"] = "password"   # use your password
    fp = mechanize.urlopen(form.click())
    fp.close()

    fp = mechanize.urlopen(
        "https://class.coursera.org/ml-003/lecture/download.mp4?lecture_id=1"
    )  # use your group
    fp.readlines()
    fp.close()
def _checkStoredInjections(self):
    for r in self.results:
        # At this state injections in Result obj are not
        # compacted yet so it will only be 1st injected param
        url, data = r.target.getPayloadedUrl(r.first_param, "")

        # In case of proxy
        if self.engine.getOption('http-proxy') is not None:
            proxy = ProxyHandler({'http': self.engine.getOption('http-proxy')})
            opener = build_opener(proxy)
            install_opener(opener)

        # Some headers (note: string comparison needs ==, not identity)
        if self.engine.getOption('ua') is not None:
            if self.engine.getOption('ua') == "RANDOM":
                headers = {'User-Agent': random.choice(USER_AGENTS)}
            else:
                headers = {'User-Agent': self.engine.getOption('ua')}
        else:
            headers = {}
        if self.engine.getOption("cookie") is not None:
            headers["Cookie"] = self.engine.getOption("cookie")

        # Build the request
        req = Request(url, data, headers)
        try:
            to = 10 if self.engine.getOption('http-proxy') is None else 20
            response = urlopen(req, timeout=to)
        except HTTPError, e:
            self._addError(e.code, r.target.getAbsoluteUrl())
            continue
        except URLError, e:
            self._addError(e.reason, r.target.getAbsoluteUrl())
            continue
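A side note on the snippet above, as a sketch (the proxy address is hypothetical; the bare names are assumed to be the same urllib2-style imports the scanner uses): install_opener() is process-global, so installing a ProxyHandler inside the loop affects every later urlopen() call in the process. Scoping the proxy to a local opener avoids that.

# Hypothetical: keep the proxy local to one opener instead of installing it.
proxy = ProxyHandler({'http': '127.0.0.1:8080'})  # assumed proxy address
local_opener = build_opener(proxy)
response = local_opener.open(Request(url, data, headers), timeout=10)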
def init(self):
    br = mechanize.Browser()
    br.set_handle_robots(False)
    self.cj = mechanize.LWPCookieJar()
    br.set_cookiejar(self.cj)
    br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
    br.open("https://www.tumblr.com/login")
    br.select_form(nr=0)
    br['user[email]'] = ""
    br['user[password]'] = ""
    url, data, hdrs = br.form.click_request_data()
    br.open("https://www.tumblr.com/login", data)
    self.nf = 0
    opener = mechanize.build_opener(
        mechanize.HTTPCookieProcessor(self.cj))
    mechanize.install_opener(opener)
    self._fetch()
def _performInjections(self, target):
    # Check every parameter
    for k, v in target.params.iteritems():
        pl = Payload(taint=True)
        url, data = target.getPayloadedUrl(k, pl.payload)

        # In case of proxy
        if self.engine.getOption('http-proxy') is not None:
            proxy = ProxyHandler({'http': self.engine.getOption('http-proxy')})
            opener = build_opener(proxy)
            install_opener(opener)

        # Some headers (note: string comparison needs ==, not identity)
        if self.engine.getOption('ua') is not None:
            if self.engine.getOption('ua') == "RANDOM":
                headers = {'User-Agent': random.choice(USER_AGENTS)}
            else:
                headers = {'User-Agent': self.engine.getOption('ua')}
        else:
            headers = {}
        if self.engine.getOption("cookie") is not None:
            headers["Cookie"] = self.engine.getOption("cookie")

        # Build the request
        req = Request(url, data, headers)
        try:
            to = 10 if self.engine.getOption('http-proxy') is None else 20
            response = urlopen(req, timeout=to)
        except HTTPError, e:
            self._addError(e.code, target.getAbsoluteUrl())
            return
        except URLError, e:
            self._addError(e.reason, target.getAbsoluteUrl())
            return
def readUrl(inUrl):
    tryCount = 0
    while tryCount < 5:
        cookies = mechanize.CookieJar()
        opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cookies))
        opener.addheaders = [
            ("User-agent", "Mozilla/5.0 (compatible; MyProgram/0.1)"),
            ("From", "*****@*****.**")]
        mechanize.install_opener(opener)
        try:
            response = mechanize.urlopen(inUrl)
            tryCount = 99  # success: force the retry loop to end
        except:
            tryCount += 1
            print "******** Error on urlopen ***********"
            print "URL: ", inUrl
            print "Trying Again....", tryCount

    # html = urllib.urlopen(inUrl).read()
    # Note: if all five attempts failed, response is unbound and the
    # next line raises NameError.
    html = response.read()
    root = lxml.html.fromstring(html)
    return root
def customizeUserAgent():
    import mechanize
    cookies = mechanize.CookieJar()
    opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cookies))
    # Pretend to be Chrome to avoid getting the mobile site.
    opener.addheaders = [("User-agent", "Chrome/16.0.912.63")]
    mechanize.install_opener(opener)
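Hypothetical usage of the helper above (the URL is a placeholder): after the one-time call, every module-level open sends the spoofed User-Agent.

customizeUserAgent()
html = mechanize.urlopen("http://www.example.com/").read()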
def retrieve_product_data(self, product_link):
    cookies = mechanize.CookieJar()
    opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cookies))
    opener.addheaders = [('User-agent', 'Mozilla/5.0 (MyProgram/0.1)'),
                         ('From', '*****@*****.**')]
    mechanize.install_opener(opener)
    browser = mechanize.Browser()
    product_data = browser.open(product_link).get_data()
    soup = BeautifulSoup(product_data)

    product_name = soup.find('title').string.encode('ascii', 'ignore')
    product_prices = soup.find('div', 'price').contents

    try:
        cash_price = int(clean_price_string(product_prices[4]))

        product_data = ProductData()
        product_data.custom_name = product_name
        product_data.price = cash_price
        product_data.url = product_link
        product_data.comparison_field = product_link

        return product_data
    except IndexError:
        return None
def retrieve_product_links(self):
    cookies = mechanize.CookieJar()
    opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cookies))
    opener.addheaders = [('User-agent', 'Mozilla/5.0 (MyProgram/0.1)'),
                         ('From', '*****@*****.**')]
    mechanize.install_opener(opener)

    url_base = 'http://www.globalmac.cl/'
    browser = mechanize.Browser()

    url_extensions = [
        ['Distribuidor-Apple-Chile/MacBook-Air', 'Notebook'],
        ['Distribuidor-Apple-Chile/MacBook-Pro', 'Notebook'],
        ['Hardware-Mac-PC/Discos-Duros-Notebook-SATA-2.5', 'StorageDrive'],
        ['Hardware-Mac-PC/Discos-Duros-SATA-3.5', 'StorageDrive'],
        ['Hardware-Mac-PC/Discos-Duros-SSD-SATA-2.5', 'StorageDrive'],
    ]

    product_links = []
    for url_extension, ptype in url_extensions:
        url = url_base + url_extension
        base_data = browser.open(url).get_data()
        soup = BeautifulSoup(base_data)
        for item in soup.findAll('div', 'name'):
            product_links.append([item.find('a')['href'], ptype])
    return product_links
def __init__(self, username="******", password="******"):
    self.username = "******" + username
    self.password = password
    self.password_manager = urllib2.HTTPPasswordMgrWithDefaultRealm()
    ntlm_auth = HTTPNtlmAuthHandler.HTTPNtlmAuthHandler(self.password_manager)
    opener = mechanize.build_opener(ntlm_auth)
    mechanize.install_opener(opener)
def GetHtml(url):
    opener = mechanize.build_opener()
    opener.addheaders = [("User-Agent",
                          "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0) "
                          "Gecko/20100101 Firefox/4.0")]
    mechanize.install_opener(opener)
    request = mechanize.urlopen(url)
    html = request.read()
    request.close()
    return html
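Example call (the URL is a placeholder): GetHtml returns the page body as a string.

page = GetHtml("http://www.example.com/")
print len(page)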
def themain():
    # browser = mechanize.Browser()
    # browser.open('http://www.baidu.com')
    cj = mechanize.LWPCookieJar()
    opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cj))
    mechanize.install_opener(opener)
    r = mechanize.urlopen('http://www.baidu.com')
    cj.save('cookie.txt', ignore_discard=True, ignore_expires=True)
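A companion sketch (reusing the cookie.txt filename from above, otherwise assumed): reload the saved cookies into a fresh jar so a later run reuses the session.

cj = mechanize.LWPCookieJar()
cj.load('cookie.txt', ignore_discard=True, ignore_expires=True)
mechanize.install_opener(
    mechanize.build_opener(mechanize.HTTPCookieProcessor(cj)))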
def setup_mechanize():
    """
    Set up user agent for all mechanize calls.
    """
    cookies = mechanize.CookieJar()
    opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cookies))
    homepage = "http://github.com/aszlig/picfetcher"
    opener.addheaders = [("User-agent", "PicFetcher/0.1.0 (+%s)" % homepage)]
    mechanize.install_opener(opener)
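Hypothetical usage: run the setup once at startup, then fetch normally; the custom User-agent rides along on every module-level request.

setup_mechanize()
data = mechanize.urlopen("http://github.com/aszlig/picfetcher").read()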
def openUrl(url, cookie=None, login=False):
    """
    Opens a given url through mechanize.

    If no cookie (string path) is passed in, or if a cookie path is passed
    in but the login parameter is False (meaning the url should be opened
    with the cookies saved at that path), the html from the opened url is
    returned as a string. If a cookie path is passed in and the login
    parameter is True, the mechanize.Browser object is returned so a
    yogaglo login can be performed through a form submission.
    """
    browser = mechanize.Browser()
    browser.addheaders = [
        ('User-Agent',
         'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:24.0) '
         'Gecko/20100101 Firefox/24.0'),
        ('Accept',
         'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'),
        ('Accept-Language', 'en-gb,en;q=0.5'),
        ('Accept-Charset', 'ISO-8859-1,utf-8;q=0.7,*;q=0.7'),
        ('Keep-Alive', '115'),
        ('Connection', 'keep-alive'),
        ('Cache-Control', 'max-age=0'),
    ]
    # Experimental?
    # browser.set_handle_gzip(True)
    browser.set_handle_redirect(True)
    browser.set_handle_referer(True)
    browser.set_handle_robots(False)
    browser.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(),
                               max_time=1)

    if cookie is not None:
        cj = cookielib.LWPCookieJar()
        browser.set_cookiejar(cj)
        opener = mechanize.build_opener(HTTPCookieProcessor(cj))
        mechanize.install_opener(opener)

        # trying to login, no cookie, must return browser so it can follow
        # the login url
        if login is True:
            browser.open(url)
            return browser

        # can't set to expire, can't read when this particular cookie expires
        cj.load(cookie, ignore_discard=True)

    return browser.open(url).read()
def initialize_browser():
    """Settings to work around cookies, robots.txt and other details so
    as to pass for a normal browser."""
    cookiejar = cookielib.LWPCookieJar()
    opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cookiejar))
    mechanize.install_opener(opener)
    browser = mechanize.Browser()
    browser.set_handle_robots(False)
    browser.set_handle_redirect(True)
    browser.set_cookiejar(cookiejar)
    browser.set_handle_equiv(True)
    browser.set_handle_referer(True)
    browser.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(),
                               max_time=2)
    browser.addheaders = [('User-agent', 'Google Chrome')]
    return browser, cookiejar
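Hypothetical usage (the URL is a placeholder): the returned browser already carries the cookie jar, so repeated opens share one session.

browser, cookiejar = initialize_browser()
html = browser.open("http://www.example.com/").read()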
def __init__(self):
    self.cj = mechanize.LWPCookieJar()
    opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(self.cj))
    mechanize.install_opener(opener)
    self.br = mechanize.Browser()
    self.br.set_cookiejar(self.cj)
    self.sessionkey = 'None'
    self.br.set_header(
        'User-Agent',
        value='Mozilla/5.0 (X11; Linux x86_64; rv:73.0) '
              'Gecko/20100101 Firefox/73.0')
    # self.br.set_debug_http(True)
    self.br.set_debug_redirects(True)
def fillform(self, form, choice, questionid, sessionid, charturl, user,
             password):
    if choice != "Random":
        for i in range(1, 5):
            form[questionid + str(i)] = [choice]
    else:
        for i in range(1, 5):
            form[questionid + str(i)] = [str(random.randint(1, 5))]
    data = form.click().get_data()
    charturl += sessionid + "&questionid=" + questionid + "&qtype=" + "LS"
    opener = self.addAuthentication(charturl, user, password)
    mechanize.install_opener(opener)
    req = mechanize.Request(charturl, data)
    req.add_header('User-Agent',
                   'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; '
                   'rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6')
    req.add_header('Accept',
                   'text/html,application/xhtml+xml,application/xml;'
                   'q=0.9,*/*;q=0.8')
    req.add_header('Accept-Encoding', 'gzip,deflate')
def setUp(self):
    mechanize._testcase.TestCase.setUp(self)
    self.test_uri = urljoin(self.uri, "test_fixtures")
    self.server = self.get_cached_fixture("server")
    if self.no_proxies:
        old_opener_m = mechanize._opener._opener
        mechanize.install_opener(
            mechanize.build_opener(mechanize.ProxyHandler(proxies={})))
        install_opener(build_opener(ProxyHandler(proxies={})))

        def revert_install():
            mechanize.install_opener(old_opener_m)
            install_opener(None)
        self.add_teardown(revert_install)
def setUp(self):
    mechanize._testcase.TestCase.setUp(self)
    self.test_uri = urljoin(self.uri, "test_fixtures")
    self.server = self.get_cached_fixture("server")
    if self.no_proxies:
        old_opener_m = mechanize._opener._opener
        old_opener_u = urllib2._opener
        mechanize.install_opener(mechanize.build_opener(
            mechanize.ProxyHandler(proxies={})))
        urllib2.install_opener(urllib2.build_opener(
            urllib2.ProxyHandler(proxies={})))

        def revert_install():
            mechanize.install_opener(old_opener_m)
            urllib2.install_opener(old_opener_u)
        self.add_teardown(revert_install)
def __init__(self, username, password):
    mechanize.Browser.__init__(self)
    cj = mechanize.LWPCookieJar()
    self.set_cookiejar(cj)
    self.set_handle_equiv(True)
    self.set_handle_redirect(True)
    self.set_handle_referer(True)
    self.set_handle_robots(False)
    self.addheaders = [
        ('User-agent',
         'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) '
         'Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
    self.open(self.base_url)
    self.username = username
    self.password = password
    self.login()
    opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cj))
    mechanize.install_opener(opener)
def get_trash_zone(address, zip):
    # Make cookie jar. See wwwsearch.sourceforge.net/mechanize/hints.html
    cj = mechanize.LWPCookieJar()
    opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cj))
    mechanize.install_opener(opener)

    # Save cookies
    cj.save(
        "/usr/local/django/recyclocity/recyclocity_static/cookies/cookie_jar",
        ignore_discard=True, ignore_expires=True)

    # Create a browser
    browser = mechanize.Browser()

    # Fill in form
    browser.open('http://lmt-web.lowermerion.org/cgi-bin/refuse2.plx')
    browser.form = list(browser.forms())[0]
    browser.form['askrecycle'] = address
    browser.form['postcode'] = zip

    # Submit form
    browser.submit()

    # Extract content
    content = browser.response().read()

    # Use pattern match to extract fields
    m = re.search('<b>(Monday|Tuesday|Wednesday|Thursday|Friday)</b>', content)
    if m:
        day, = m.groups()
        # Convert day to number
        day_number = schedule_helpers.get_day_number(day)
    else:
        # Failed
        return

    m = re.search('<b>Zone ([1-4])</b>', content)
    if m:
        zone, = m.groups()
    else:
        # Failed
        return

    # Match for both day and zone
    return day_number, zone
def get_trash_zone(address, zip):
    # Make cookie jar. See wwwsearch.sourceforge.net/mechanize/hints.html
    cj = mechanize.LWPCookieJar()
    opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cj))
    mechanize.install_opener(opener)

    # Create a browser
    browser = mechanize.Browser()

    # User-Agent (this is cheating, ok?)
    browser.addheaders = [(
        'User-agent',
        'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) '
        'Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]

    # Save cookies
    cj.save(
        "/usr/local/django/recyclocity/recyclocity_static/cookies/cookie_jar",
        ignore_discard=True, ignore_expires=True)

    # Fill in form
    # browser.open('http://citymaps.phila.gov/portal/')
    # browser.select_form(name="form1")
    # browser.form['txtSearchAddress'] = address

    # Fill in form
    # browser.open('https://alpha.phila.gov/property/')
    # browser.open('http://www.lowermerion.org/cgi-bin/recycle2.plx/')
    browser.open(
        'http://www.lowermerion.org/services/public-works-department/'
        'refuse-and-recycling/how-to-determine-your-recycling-collection-day')
    # browser.form = list(browser.forms())[0]
    # browser.form['askrecycle'] = address
    # browser.form['postcode'] = zip

    # Submit form
    # browser.submit()

    # Extract content
    content = browser.response().read()
    return content
def acm(query_str):
    acm_url = u"http://dl.acm.org/"
    cookieJar = mechanize.CookieJar()
    opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cookieJar))
    opener.addheaders = [("User-agent", "Mozilla/5.0 (compatible)")]
    mechanize.install_opener(opener)
    fp = mechanize.urlopen(acm_url)
    forms = mechanize.ParseResponse(fp, backwards_compat=False)
    fp.close()
    # doc = fetch(acm_url)
    form = forms[0]
    form['query'] = query_str
    fp = mechanize.urlopen(form.click())
    doc = fp.read()
    with open("acm.html", 'wb') as fo:
        fo.write(doc)
    fp.close()
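Example call: this writes the first results page for the query to acm.html in the working directory.

acm(u"web scraping")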
def _product_urls_and_types(cls, product_types):
    cookies = mechanize.CookieJar()
    opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cookies))
    opener.addheaders = [('User-agent', 'Mozilla/5.0 (MyProgram/0.1)'),
                         ('From', '*****@*****.**')]
    mechanize.install_opener(opener)

    url_base = 'http://www.globalmac.cl/'
    browser = mechanize.Browser()

    url_extensions = [
        ['MacBook/', 'Notebook'],
        ['MacBook-Pro/', 'Notebook'],
        ['Monitores-LCD/', 'Monitor'],
        ['Cinema-Display/', 'Monitor'],
        ['Disco-Duro-SATA-2.5/', 'StorageDrive'],
        ['Discos-Duros-SATA/', 'StorageDrive'],
    ]

    if 'Ram' in product_types:
        memory_catalog_url = url_base + 'Memorias/'
        base_data = browser.open(memory_catalog_url).get_data()
        soup = BeautifulSoup(base_data)
        subcats = soup.findAll('span', 'subcategories')
        for subcat in subcats:
            link = subcat.find('a')['href'].replace(url_base, '')
            url_extensions.append([link, 'Ram'])

    product_links = []
    for url_extension, ptype in url_extensions:
        if ptype not in product_types:
            continue
        base_data = browser.open(url_base + url_extension).get_data()
        soup = BeautifulSoup(base_data)
        titles = soup.findAll('a', 'product-title')
        for title in titles:
            product_links.append([title['href'], ptype])
    return product_links
def go():
    '''
    Main procedure of the scraper. Creates a browser, loads the list of
    tasks and executes them.
    '''
    try:
        # Prepare the browser
        cookies = mechanize.CookieJar()
        opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cookies))
        mechanize.install_opener(opener)
        br = mechanize.Browser()
        br.set_handle_robots(False)
        br.set_handle_refresh(False)
        br.set_handle_referer(False)
        br.open("http://www.infogreffe.fr/infogreffe/process.do")

        # Get the list of tasks
        tasks = load_task_queue()
        if len(tasks) == 0:
            # If there is no task to execute, init/reset the table
            init_task_queue()
            tasks = load_task_queue()

        for task in tasks:
            try:
                # Execute the task
                results = get_companies(br, task['name'], task['dept'])
                # If we hit the soft limit, add more refined searches
                # to the queue
                if results == 100:
                    print "Limit reached for %s in %s, adding new tasks" % (
                        task['name'], task['dept'])
                    expand_task_queue(task['name'], task['dept'])
                # Mark the task as done
                mark_task_done(task['name'], task['dept'], results)
            except Exception as detail:
                # We may get an exception for using too much CPU time.
                print "Exception raised", detail
    except Exception as detail:
        # If we can't open the browser, just skip running the scraper
        print "Failed starting browser ", detail
def __init__(self, login=login.facebook):
    super(KaggLoader, self).__init__()
    self.login = login
    self.set_handle_equiv(True)
    self.set_handle_robots(False)

    if not os.path.exists(self.BASE_DIR):
        os.makedirs(self.BASE_DIR)
    if not os.path.exists(self.COOKIE_PATH):
        with open(self.COOKIE_PATH, 'w') as f:
            f.write('#LWP-Cookies-2.0')

    self.cj = mechanize.LWPCookieJar()
    self.cj.load(self.COOKIE_PATH, ignore_discard=False,
                 ignore_expires=False)
    opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(self.cj))
    mechanize.install_opener(opener)
    self.set_cookiejar(self.cj)
def pay_me_now(username, password):
    if DEBUG:
        import sys, logging
        logger = logging.getLogger("mechanize")
        logger.addHandler(logging.StreamHandler(sys.stdout))
        logger.setLevel(logging.DEBUG)

    cookies = mechanize.CookieJar()
    opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cookies))
    opener.addheaders = [("User-agent",
                          "Mozilla/5.0 (compatible; BTCGPayOut/0.1)")]
    mechanize.install_opener(opener)

    br = mechanize.Browser()
    if DEBUG:
        br.set_debug_http(True)
        br.set_debug_responses(True)
        br.set_debug_redirects(True)
    br.set_handle_robots(False)

    # login
    try:
        br.open(URL_LOGIN)
        br.select_form(predicate=select_login_form)
        br['username'] = username
        br['password'] = password
        br.submit()
        br.select_form(predicate=select_pay_me_now_form)
    except:
        print "Failed to login"
        return

    # logged in
    try:
        br.select_form(predicate=select_pay_me_now_form)
    except:
        print "Failed to find withdraw form"
        return
    br.submit()
def logIn(self):
    """
    Logs in to private archives using the supplied email and password.
    Stores the cookie so we can continue to get subsequent pages.
    """
    cookieJar = mechanize.CookieJar()
    opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cookieJar))
    opener.addheaders = [("User-agent", "Mozilla/5.0 (compatible)")]
    mechanize.install_opener(opener)

    self.message('Logging in to ' + self.list_url)
    fp = mechanize.urlopen(self.list_url)
    forms = ClientForm.ParseResponse(fp, backwards_compat=False)
    fp.close()

    form = forms[0]
    form['username'] = self.username
    form['password'] = self.password
    fp = mechanize.urlopen(form.click())
    fp.close()
def run(self):
    self.tsession = Session()
    try:
        while True:
            twitter_user_id = self.queue.get()
            twitter_user = self.tsession.query(TwitterUser).filter(
                TwitterUser.id == twitter_user_id).first()

            # Mechanize setup
            cookies = cookielib.CookieJar()
            opener = mechanize.build_opener(
                mechanize.HTTPCookieProcessor(cookies))
            opener.addheaders = [
                ("User-agent",
                 "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6; en-us) "
                 "AppleWebKit/533.4 (KHTML, like Gecko) Version/4.1 "
                 "Safari/533.4"),
            ]
            mechanize.install_opener(opener)

            # Load the twitter page
            page = None
            try:
                tries = 5
                while tries > 0:
                    try:
                        page = mechanize.urlopen(
                            "https://mobile.twitter.com/" +
                            twitter_user.twitter_screen_name)
                        # break assumed on success; without it the loop
                        # would never terminate once the page loads
                        break
                    except mechanize.HTTPError, e:
                        if e.code != 404:
                            tries = tries - 1
                        else:
                            raise
            except Exception, e:
                raise

            if tries == 0:
                print "Error 403 para %s" % twitter_user.twitter_screen_name
                sys.stdout.write("Error 403 para " +
                                 twitter_user.twitter_screen_name + "\n")
                sys.stdout.flush()
                self.queue.task_done()
                continue
def getformpage(self, user, password, choice):
    projectgroupurl = "http://leo.rp.edu.sg//workspace/studentModule.asp?site="
    qnnurl = "http://leo3.rp.edu.sg//projectweb/group_evaluation.asp?"
    quizurl = "http://leo3.rp.edu.sg//projectweb/qnn_take.asp?"
    sessionurl = "http://leo3.rp.edu.sg//projectweb/qnn_preset.asp?"
    charturl = "http://leo3.rp.edu.sg//projectweb/response_chart.asp?"
    saveurl = "http://leo3.rp.edu.sg//projectweb/qnn_save_responses.asp"
    urllist = []
    for i in range(1, 4):
        urllist.append(projectgroupurl + str(i))

    # retrieve the result
    currentModule = "projectid"
    currentProblem = "groupid"
    try:
        for url in urllist:
            opener = self.addAuthentication(url, user, password)
            mechanize.install_opener(opener)
            response = mechanize.Request(url)
            page = urlopen(response).read()
            if ("Wrong Password" in page or "Wrong ID" in page):
                self.vNP.set("Sorry, USERNAME or PASSWORD wrong!")
            elif ('''ToggleDisplay''' in page):
                currentModule = self.getidlist("projectid", page)[-1]
                currentProblem = self.getidlist("groupid", page)[-1]

        if (currentModule != "projectid" and currentProblem != "groupid"):
            getqnnurl = (qnnurl + currentModule + "&" + currentProblem +
                         "&lang=ISO-8859-1")
            opener = self.addAuthentication(getqnnurl, user, password)
            mechanize.install_opener(opener)
            response = mechanize.Request(getqnnurl)
            getqnnpage = urlopen(response)
            forms = ParseResponse(getqnnpage, backwards_compat=False)
            form = forms[0]
            qnnid = form["qnnid"]
            evalid = form["evalid"]

            opener = self.addAuthentication(getqnnurl, user, password)
            mechanize.install_opener(opener)
            response = mechanize.Request(getqnnurl)
            getqnnpageread = urlopen(response).read()
            author_evaluatorlist = re.findall(r"'\d{5}', '.{38}'",
                                              getqnnpageread)
            # for i in range(len(author_evaluatorlist)):
            authorid = author_evaluatorlist[0][1:6]
            evaluatorid = author_evaluatorlist[0][10:-1]

            getsessionurl = (sessionurl + "&qnnid=" + qnnid + "&" +
                             currentModule + "&" + currentProblem +
                             "&evalid=" + evalid + "&evaltype=P" +
                             "&authorid=" + authorid +
                             "&evaluatorid=" + evaluatorid +
                             "&lang=ISO-8859-1")
            opener = self.addAuthentication(getsessionurl, user, password)
            mechanize.install_opener(opener)
            response = mechanize.Request(getsessionurl)
            getqnnpage = urlopen(response)
            forms = ParseResponse(getqnnpage, backwards_compat=False)
            form = forms[0]
            form.set_all_readonly(False)
            form["qnnid"] = qnnid
            form["authorid"] = authorid
            form["evaluatorid"] = evaluatorid
            form["evaltype"] = "P"
            form["lang"] = "ISO-8859-1"
            form["newflag"] = "0"
            form["evalid"] = evalid
            form["groupid"] = currentProblem[8:]
            form["projectid"] = currentModule[10:]
            submit = form.click()
            data = submit.get_data()

            opener = self.addAuthentication(quizurl, user, password)
            mechanize.install_opener(opener)
            response = mechanize.Request(quizurl, data)
            sessionid = self.getidlist("sessionid",
                                       urlopen(response).read())[0]
            answerurl = re.search(
                "(\<FRAME NAME=\"main\" SRC=\")(.+)(\"\>)",
                urlopen(response).read()).group(2)
            answerurl = "http://leo3.rp.edu.sg//projectweb/" + answerurl

            opener = self.addAuthentication(answerurl, user, password)
            mechanize.install_opener(opener)
            rs = mechanize.Request(answerurl, data)
            quiz = urlopen(rs)
            quizpage = urlopen(rs).read()
            questionid = re.search(r"\{.+\}num", quizpage).group()[0:-3]
            forms = ParseResponse(quiz, backwards_compat=False)
            form = forms[0]
            self.fillform(form, choice, questionid, sessionid, charturl,
                          user, password)
            form.set_all_readonly(False)
            form["finish"] = "MANUAL"
            print form
            '''
            data = form.click().get_data()
            opener = self.addAuthentication(saveurl, user, password)
            mechanize.install_opener(opener)
            req = mechanize.Request(saveurl, data)
            req.add_header('User-Agent',
                           'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; '
                           'rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6')
            req.add_header('Accept',
                           'text/html,application/xhtml+xml,application/xml;'
                           'q=0.9,*/*;q=0.8')
            req.add_header('Accept-Encoding', 'gzip,deflate')
            print urlopen(req).read()
            '''
        else:
            self.vNP.set("Sorry, TODAY NO MODULE!")
    except mechanize.HTTPError, e:
        self.vNP.set(
            "Error:",
            BaseHTTPServer.BaseHTTPRequestHandler.responses[e.code])
import mechanize

cookies = mechanize.CookieJar()
cookie_opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cookies))
mechanize.install_opener(cookie_opener)

url = "http://www.webscantest.com/crosstraining/aboutyou.php"
res = mechanize.urlopen(url)
content = res.read()
print len(content), content[0:100]
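To undo the global install afterwards (as the test teardowns elsewhere in this collection do), clear the opener; mechanize then falls back to a default opener on the next urlopen() call.

mechanize.install_opener(None)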
def __init__(self, ip):
    self.ip = ip
    self.neighbours = {}
    self.laser_ports = {}
    self.new_adm = False
    # print(self.ip, type(self.ip))
    self.baseurl = 'http://%s:20080/' % (self.ip)
    try:
        br = mechanize.Browser()  # Create mechanize browser object
        # Added false headers
        try:
            cookies = mechanize.CookieJar()
            opener = mechanize.build_opener(
                mechanize.HTTPCookieProcessor(cookies))
            opener.addheaders = [(
                "User-agent",
                "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) "
                "like Gecko")]
            mechanize.install_opener(opener)
        except Exception as e:
            print(str(e))
        try:
            if 'TJ1400' in br.open(self.baseurl, timeout=5.0).read():
                self.new_adm = True
                br.form = list(br.forms())[0]
                controls = list(br.form.controls)
                controls[0].value = 'tejas'
                controls[1].value = 'j72e#05t'
                page = br.submit()
                self.new_adm = True
                time.sleep(5)
                page = br.open(self.baseurl, timeout=5.0).read()
        except Exception as e:
            # print("{}-{}".format(str(e), self.ip))
            br = mechanize.Browser()
            # Get user id and password from command line arguments
            br.add_password(self.baseurl, username, passw)
            # Check if NE is accessible
            page = br.open(self.baseurl, timeout=5.0).read()
            self.new_adm = False
        if 'alarmBanner' in page:
            print "Logged in to %s" % (self.baseurl)
            # Read laser data of STM ports
            loggedIn = self.get_laser_data(br)
            # Read alarms (MS DCC Fail only)
            failTime = threading.Thread(target=self.get_fail_time,
                                        args=(br, ))
            failTime.start()
            # self.get_fail_time(br)
            # Add neighbours
            addNeighbours = threading.Thread(target=self.add_neighbours,
                                             args=(br, ))
            addNeighbours.start()
            # self.add_neighbours(br)
            if loggedIn:
                self.backup(br)  # Backup cross-connect info
            failTime.join()
            addNeighbours.join()
            # print(self.neighbours)
            if self.alarams_dict:
                for stm in self.alarams_dict.keys():
                    if stm in self.neighbours.keys():
                        fail_node_times = [[
                            self.ip, self.neighbours[stm][0],
                            self.alarams_dict[stm]
                        ]]
                        fail_times.extend(fail_node_times)
    except Exception as e:
        print("\nError reading {} \n-+--+- {} -+--+-".format(ip, str(e)))
        br.close()
    return (None)
def test_robots(self):
    plain_opener = mechanize.build_opener(
        mechanize.HTTPRobotRulesProcessor)
    browser = mechanize.Browser()
    for opener in plain_opener, browser:
        r = opener.open(urljoin(self.uri, "robots"))
        self.assertEqual(r.code, 200)
        self.assertRaises(
            mechanize.RobotExclusionError,
            opener.open, urljoin(self.uri, "norobots"))

def test_urlretrieve(self):
    url = urljoin(self.uri, "/mechanize/")
    test_filename = "python.html"
def login(self, className):
    """
    Automatically generate a cookie file for the coursera site.
    """
    # TODO: use proxy here
    hn, fn = tempfile.mkstemp()
    cookies = cookielib.LWPCookieJar()
    handlers = [
        urllib2.HTTPHandler(),
        urllib2.HTTPSHandler(),
        urllib2.HTTPCookieProcessor(cookies)
    ]
    opener = urllib2.build_opener(*handlers)

    url = self.lecture_url_from_name(className)
    req = urllib2.Request(url)
    try:
        res = opener.open(req)
    except urllib2.HTTPError as e:
        if e.code == 404:
            raise Exception("Unknown class %s" % className)

    # get the csrf token
    csrfcookie = [c for c in cookies if c.name == "csrf_token"]
    if not csrfcookie:
        raise Exception("Failed to find csrf cookie")
    csrftoken = csrfcookie[0].value
    opener.close()

    # call the authenticator url:
    cj = cookielib.MozillaCookieJar(fn)
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj),
                                  urllib2.HTTPHandler(),
                                  urllib2.HTTPSHandler())
    opener.addheaders.append(('Cookie', 'csrftoken=%s' % csrftoken))
    opener.addheaders.append(('Referer', 'https://www.coursera.org'))
    opener.addheaders.append(('X-CSRFToken', csrftoken))
    req = urllib2.Request(self.LOGIN_URL)
    data = urllib.urlencode({'email_address': self.username,
                             'password': self.password})
    req.add_data(data)
    try:
        opener.open(req)
    except urllib2.HTTPError as e:
        if e.code == 401:
            raise Exception("Invalid username or password")

    # check if we managed to login
    sessionid = [c.name for c in cj if c.name == "sessionid"]
    if not sessionid:
        raise Exception("Failed to authenticate as %s" % self.username)

    # all should be ok now, mechanize can handle the rest if we give it
    # the cookies
    br = mechanize.Browser()
    # br.set_debug_http(True)
    # br.set_debug_responses(False)
    # br.set_debug_redirects(True)
    br.set_handle_robots(False)
    br.set_cookiejar(cj)
    if self.proxy:
        br.set_proxies({"http": self.proxy})
    self.browser = br

    # also use this cookiejar for other mechanize operations (e.g., urlopen)
    opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cj))
    mechanize.install_opener(opener)
def getNewToken(self):
    import mechanize  # @UnresolvedImport
    br = mechanize.Browser()
    __addon__ = xbmcaddon.Addon(id='script.facebook.media')
    cookiesPath = os.path.join(
        xbmc.translatePath(__addon__.getAddonInfo('profile')),
        'cache', 'cookies')
    LOG('Cookies will be saved to: ' + cookiesPath)
    cookies = mechanize.LWPCookieJar(cookiesPath)
    if os.path.exists(cookiesPath):
        cookies.load()
    self.cookieJar = cookies
    opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cookies))
    mechanize.install_opener(opener)
    br.set_cookiejar(self.cookieJar)
    br._ua_handlers["_cookies"].cookiejar.clear()
    br.set_handle_robots(False)
    agent = 'XBMC/{0} Facebook-Media/{1}'.format(
        xbmc.getInfoLabel('System.BuildVersion'), self.version)
    LOG('Setting User Agent: {0}'.format(agent))
    br.addheaders = [('User-agent', agent)]
    scope = ''
    if self.scope:
        scope = '&scope=' + self.scope
    url = 'https://www.facebook.com/dialog/oauth?client_id=' + self.client_id + \
          '&redirect_uri=' + self.redirect + \
          '&type=user_agent&display=popup' + scope
    LOG(url)
    try:
        res = br.open(url)
        html = res.read()
    except:
        LOG("ERROR: TOKEN PAGE INITIAL READ")
        raise
    script = False
    try:
        # check for login form
        br.select_form(nr=0)
        LOG("HTML")
    except:
        self.genericError()
        script = True
        LOG("SCRIPT")
    if script:
        # no form, maybe we're logged in and the token is in javascript
        # on the page
        url = res.geturl()
        token = self.extractTokenFromURL(url)
        if not token:
            token = self.parseTokenFromScript(html)
    else:
        try:
            # fill out the form and submit
            br['email'] = self.login_email
            br['pass'] = self.login_pass
            res = br.submit()
            url = res.geturl()
            LOG("FORM")
        except:
            LOG("FORM ERROR")
            raise
        script = False
        token = self.extractTokenFromURL(url)
        html = self.browserRead(res, '-noscript')
        if not token:
            # if 'class="checkpoint"' in html:
            token = self.handleLoginNotificationCrap(br)
            if not token:
                script = True
        if script:
            LOG("SCRIPT TOKEN")
            # no token in the url, let's try to parse it from javascript
            # on the page
            try:
                __addon__ = xbmcaddon.Addon(id='script.facebook.media')
                htmlFile = os.path.join(
                    xbmc.translatePath(__addon__.getAddonInfo('profile')),
                    'cache', 'DEBUG_HTML.html')
                open(htmlFile, 'w').write(html)
                LOG('html output written to: ' + htmlFile)
            except:
                pass
            token = self.parseTokenFromScript(html)
    token = urllib.unquote(token.decode('unicode-escape'))
    if not self.tokenIsValid(token):
        # if script: LOG("HTML:" + html)
        return False
    LOG("\n|--------------------\n|TOKEN: %s\n|--------------------" % token)
    self.saveToken(token)
    if self.cookieJar is not None:
        self.cookieJar.save()
    return token
def _product_urls_and_types(cls, product_types):
    cookies = mechanize.CookieJar()
    opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cookies))
    opener.addheaders = [('User-agent', 'Mozilla/5.0 (MyProgram/0.1)'),
                         ('From', '*****@*****.**')]
    mechanize.install_opener(opener)

    url_buscar_productos = '/cl/'
    product_links = []
    url_base = 'http://www.dell.com'

    # Start home
    if 'Notebook' in product_types:
        url_extensions = [
            'p/laptops?cat=laptops',
        ]
        for url_extension in url_extensions:
            url_webpage = url_base + url_buscar_productos + url_extension
            r = mechanize.urlopen(url_webpage)
            soup = BeautifulSoup(r.read())
            notebook_lines_container = soup.find('div', 'tabschegoryGroups')
            notebook_lines = \
                notebook_lines_container.findAll('div', recursive=False)
            notebook_urls = []
            for line in notebook_lines:
                for container in line.findAll('div', 'prodImg'):
                    link = container.find('a')['href'].replace('pd', 'fs')
                    notebook_urls.append(url_base + link)
            for url in notebook_urls:
                for url in cls.retrieve_line_links(url):
                    product_links.append([url, 'Notebook'])

        # Start business
        url_extensions = [
            'empresas/p/laptops',
        ]
        for url_extension in url_extensions:
            url_webpage = url_base + url_buscar_productos + url_extension
            r = mechanize.urlopen(url_webpage)
            soup = BeautifulSoup(r.read())
            line_links = soup.find('div', 'content').findAll('a')
            for link in line_links:
                url = url_base + link['href']
                for url in cls.retrieve_enteprise_links(url):
                    product_links.append([url, 'Notebook'])

    # Start Monitor
    if 'Monitor' in product_types:
        url_extensions = [
            '/content/products/compare.aspx/19_22widescreen'
            '?c=cl&cs=cldhs1&l=es&s=dhs',
            '/content/products/compare.aspx/23_30widescreen'
            '?c=cl&cs=cldhs1&l=es&s=dhs',
            '/cl/es/empresas/Monitores/19_15widescreen/cp.aspx'
            '?refid=19_15widescreen&s=bsd&cs=clbsdt1',
            '/cl/es/empresas/Monitores/22_20widescreen/cp.aspx'
            '?refid=22_20widescreen&s=bsd&cs=clbsdt1',
            '/cl/es/empresas/Monitores/30_24widescreen/cp.aspx'
            '?refid=30_24widescreen&s=bsd&cs=clbsdt1',
            '/cl/es/empresas/Monitores/20_19flatpanel/cp.aspx'
            '?refid=20_19flatpanel&s=bsd&cs=clbsdt1',
        ]
        for url_extension in url_extensions:
            url_webpage = url_base + url_extension
            r = mechanize.urlopen(url_webpage)
            soup = BeautifulSoup(r.read())
            links = soup.findAll('a', 'lnk')
            for link in links:
                if 'configure' in link['href']:
                    product_links.append([link['href'], 'Monitor'])

    return product_links
def login(self, className):
    """
    Login into coursera and obtain the necessary session cookies.
    """
    hn, fn = tempfile.mkstemp()
    cookies = cookielib.LWPCookieJar()
    handlers = [
        urllib2.HTTPHandler(),
        urllib2.HTTPSHandler(),
        urllib2.HTTPCookieProcessor(cookies)
    ]
    # prepend a proxy handler if defined
    if self.proxy:
        proxy = urllib2.ProxyHandler({'http': self.proxy})
        handlers = [proxy] + handlers
    opener = urllib2.build_opener(*handlers)

    url = self.lecture_url_from_name(className)
    req = urllib2.Request(url)
    try:
        res = opener.open(req)
    except urllib2.HTTPError as e:
        if e.code == 404:
            raise Exception("Unknown class %s" % className)

    # get the csrf token
    csrfcookie = [c for c in cookies if c.name == "csrf_token"]
    if not csrfcookie:
        raise Exception("Failed to find csrf cookie")
    csrftoken = csrfcookie[0].value
    opener.close()

    # call the authenticator url:
    cj = cookielib.MozillaCookieJar(fn)
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj),
                                  urllib2.HTTPHandler(),
                                  urllib2.HTTPSHandler())
    opener.addheaders.append(('Cookie', 'csrftoken=%s' % csrftoken))
    opener.addheaders.append(('Referer',
                              'https://accounts.coursera.org/signin'))
    opener.addheaders.append(('X-CSRFToken', csrftoken))
    req = urllib2.Request(self.LOGIN_URL)
    data = urllib.urlencode({'email': self.username,
                             'password': self.password})
    req.add_data(data)
    try:
        opener.open(req)
    except urllib2.HTTPError as e:
        if e.code == 401:
            raise Exception("Invalid username or password")

    # check if we managed to login
    sessionid = [c.name for c in cj if c.name == "CAUTH"]
    if not sessionid:
        raise Exception("Failed to authenticate as %s" % self.username)

    # all should be ok now, mechanize can handle the rest if we give it
    # the cookies
    br = mechanize.Browser()
    # br.set_debug_http(True)
    # br.set_debug_responses(False)
    # br.set_debug_redirects(True)
    br.set_handle_robots(False)
    br.set_cookiejar(cj)
    if self.proxy:
        br.set_proxies({"http": self.proxy})
    self.browser = br

    # also use this cookiejar for other mechanize operations (e.g., urlopen)
    opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cj))
    mechanize.install_opener(opener)
def retrieve_product_links(self):
    cookies = mechanize.CookieJar()
    opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cookies))
    opener.addheaders = [('User-agent', 'Mozilla/5.0 (MyProgram/0.1)'),
                         ('From', '*****@*****.**')]
    mechanize.install_opener(opener)

    url_buscar_productos = '/cl/'
    product_links = []
    url_base = 'http://www.dell.com'

    # Start home
    url_extensions = [
        'p/laptops?cat=laptops',
    ]
    for url_extension in url_extensions:
        url_webpage = url_base + url_buscar_productos + url_extension
        r = mechanize.urlopen(url_webpage)
        soup = BeautifulSoup(r.read())
        notebook_lines_container = soup.find('div', 'tabschegoryGroups')
        notebook_lines = notebook_lines_container.findAll('div',
                                                          recursive=False)
        notebook_urls = []
        for line in notebook_lines:
            for container in line.findAll('div', 'prodImg'):
                link = container.find('a')['href'].replace('pd', 'fs')
                notebook_urls.append(url_base + link)
        for url in notebook_urls:
            for url in self.retrieve_line_links(url):
                product_links.append([url, 'Notebook'])

    # Start business
    url_extensions = [
        'empresas/p/laptops',
    ]
    for url_extension in url_extensions:
        url_webpage = url_base + url_buscar_productos + url_extension
        r = mechanize.urlopen(url_webpage)
        soup = BeautifulSoup(r.read())
        product_containers = soup.findAll('div', 'carouselProduct')
        for container in product_containers:
            url = url_base + container.find('a')['href']
            for url in self.retrieve_enteprise_links(url):
                product_links.append([url, 'Notebook'])

    # Start Monitor
    url_extensions = [
        '/content/products/compare.aspx/19_22widescreen'
        '?c=cl&cs=cldhs1&l=es&s=dhs',
        '/content/products/compare.aspx/23_30widescreen'
        '?c=cl&cs=cldhs1&l=es&s=dhs',
        '/cl/es/empresas/Monitores/19_15widescreen/cp.aspx'
        '?refid=19_15widescreen&s=bsd&cs=clbsdt1',
        '/cl/es/empresas/Monitores/22_20widescreen/cp.aspx'
        '?refid=22_20widescreen&s=bsd&cs=clbsdt1',
        '/cl/es/empresas/Monitores/30_24widescreen/cp.aspx'
        '?refid=30_24widescreen&s=bsd&cs=clbsdt1',
        '/cl/es/empresas/Monitores/20_19flatpanel/cp.aspx'
        '?refid=20_19flatpanel&s=bsd&cs=clbsdt1',
    ]
    for url_extension in url_extensions:
        url_webpage = url_base + url_extension
        r = mechanize.urlopen(url_webpage)
        soup = BeautifulSoup(r.read())
        links = soup.findAll('a', {'class': 'lnk'})
        for link in links:
            if 'configure' in link['href']:
                product_links.append([link['href'], 'Screen'])

    return product_links