def rtnHTMLformat(tmpddGenrcgenPresent, sppPrefx, pthwcod, ouPthwpng):
    inpx = '\n'.join(tmpddGenrcgenPresent)  # inpx = "ALDH2 color \nALDH3A1 color"
    request = mechanize.Request(
        "http://www.genome.jp/kegg/tool/map_pathway2.html")
    response = mechanize.urlopen(request)
    forms = mechanize.ParseResponse(response, backwards_compat=False)
    form = forms[0]
    form["unclassified"] = inpx
    form["org"] = sppPrefx
    request2 = form.click()
    response2 = mechanize.urlopen(request2)
    a = str(response2.read()).split('href="/kegg-bin/show_pathway?')[1]
    code = a.split('/')[0]  # response2.read()
    request = mechanize.Request(
        "http://www.genome.jp/kegg-bin/show_pathway?%s/%s.args" % (code, pthwcod))
    # request = mechanize.Request("http://www.genome.jp/kegg-bin/show_pathway?%s/%s.args" % ('13171478854246', 'hsa00410'))
    response = mechanize.urlopen(request)
    forms = mechanize.ParseResponse(response, backwards_compat=False)
    form = forms[1]
    status = ' NOT '
    try:
        imgf = str(forms[1]).split('/mark_pathway')[1].split('/')[0]
        os.system("wget --quiet http://www.genome.jp/tmp/mark_pathway%s/%s.png -O %s"
                  % (imgf, pthwcod, ouPthwpng))
        status = ' '
    except:
        pass
    return 'A pathway image was%ssuccessfully produced...' % status
def getScheduleByCSV(self, userid, startday, period):
    _res = self.br.open("http://" + self.HOSTNAME +
                        "/cgi-def/dnet/dnet.cgi?page=schpsetexport")
    _forms = mechanize.ParseResponse(_res)
    _form = _forms[0]
    _form["uid"] = [userid]
    forms = mechanize.ParseResponse(
        self.br.open(_form.click(name='s_add', nr=0)))
    form = forms[0]
    ## Set the start date
    sdate = datetime.datetime.strptime(startday, '%Y-%m-%d')
    form["syear"] = [sdate.strftime("%Y")]
    form["smonth"] = [sdate.strftime("%m")]
    form["sday"] = [sdate.strftime("%d")]
    ## Set the end date
    edate = sdate + timedelta(days=period)
    form["eyear"] = [edate.strftime("%Y")]
    form["emonth"] = [edate.strftime("%m")]
    form["eday"] = [edate.strftime("%d")]
    response = self.br.open(form.click(name="s_ok", nr=0))
    return response.read()
def login_to_deviantart(self, credentials):
    print("Logging in")
    retry_count = 0
    while 1:
        try:
            response = self.agent.open(self.HOME_URL)
            lda = lambda i: (hasattr(i, "attrs") and "id" in i.attrs and
                             i.attrs["id"] == "login")
            for f in filter(lda, mechanize.ParseResponse(response)):
                f["username"] = credentials["deviantart.com"][0]
                f["password"] = credentials["deviantart.com"][2]
                f.click()
            # dunno how to translate this to py mechanize:
            """
            if len(self.agent.cookie_jar) < 3:
                print ("Log on unsuccessful (maybe wrong login/pass combination?)")
                print ("You might not be able to fetch the age restricted resources")
            else:
                print ("Log on successful")
                self.agent.pluggable_parser.default = mechanize.Download
            """
            break  # leave the retry loop once the login attempt has been made
        except Exception:  # i.e. let KeyboardInterrupt through.
            traceback.print_exc()
            if retry_count < 3:
                retry_count += 1
                print("Will retry after 1 second")
                time.sleep(1)
                continue
            else:
                print("Login failed after 3 retries")
                print("You might not be able to fetch the age restricted resources")
                break
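# The block quoted above is left untranslated in login_to_deviantart; below is a
# minimal sketch of what the cookie-jar check might look like with Python
# mechanize.  It assumes the caller keeps a reference to the CookieJar it hands
# to the Browser (the names cookie_jar/agent/report_login_result are
# illustrative), and it only covers the cookie count -- the Ruby
# pluggable_parser line has no direct Python equivalent here.
import mechanize

cookie_jar = mechanize.CookieJar()
agent = mechanize.Browser()
agent.set_cookiejar(cookie_jar)

def report_login_result(jar):
    # A sparse jar after the login round-trip suggests the login did not stick.
    if len(jar) < 3:
        print("Log on unsuccessful (maybe wrong login/pass combination?)")
        print("You might not be able to fetch the age restricted resources")
    else:
        print("Log on successful")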
def getPropertyPins(streetName):
    url = r'https://taxcommissioner.dekalbcountyga.gov/TaxCommissioner/TCSearch.asp'
    request = mechanize.Request(url)
    response = mechanize.urlopen(request)
    forms = mechanize.ParseResponse(response, backwards_compat=False)
    response.close()
    form = forms[0]
    form['StreetName'] = streetName  # use the argument rather than sys.argv[1]
    propertyList = mechanize.urlopen(form.click()).read()
    tree = html.fromstring(propertyList)
    pins = tree.xpath('//tr/td[1]/a/@href')
    addresses = tree.xpath('//tr/td[1]/a/text()')
    pinList = []
    i = 0
    for pin in pins:
        # print pin
        newpin = pin.split('=')
        pinList.append([newpin[3], addresses[i]])
        print newpin[3] + '\t' + addresses[i]
        i = i + 1
    return pinList
def get_vorlage(session_id, url):
    try:
        response = mechanize.urlopen(mechanize.Request(url))
        pprint.pprint(response)
    except URLError:
        return
    forms = mechanize.ParseResponse(response, backwards_compat=False)
    for form in forms:
        # All forms are iterated. Might not all be attachment-related.
        for control in form.controls:
            if control.name == 'DT':
                print control.name, control.value
        request2 = form.click()
        try:
            response2 = mechanize.urlopen(request2)
            form_url = response2.geturl()
            if "getfile.asp" in form_url:
                # print "SUCCESS:", response2.info()
                pdf = response2.read()
                md5 = hashlib.md5(pdf).hexdigest()
                scraperwiki.sqlite.save(
                    unique_keys=['session_id', 'dt', 'md5', 'size'],
                    data={
                        'session_id': session_id,
                        'dt': control.value,
                        'md5': md5,
                        'size': len(pdf)
                    })
                continue
        except mechanize.HTTPError, response2:
            print "HTTP ERROR :("
        except URLError:
            pass
def _get_results(form, dbg=False):
    # click the form
    clicked_form = form.click()
    # then get the results page
    result = mechanize.urlopen(clicked_form)

    #### EXPORTING RESULTS FILE
    # so what I do is that I fetch the first results page,
    # click the form/link to get all hits as a colon separated
    # ascii table file

    # get the form
    resultform = mechanize.ParseResponse(result, backwards_compat=False)
    result.close()
    resultform = resultform[0]
    # set colon as delimiter of the table (could use anything I guess)
    #~ resultform.find_control('export_delimiter').items[1].selected = True
    resultform.find_control('export_delimiter').toggle('colon')
    resultform_clicked = resultform.click()
    result_table = mechanize.urlopen(resultform_clicked)
    data = result_table.read()
    result_table.close()
    if dbg:
        return resultform, result_table, data
    else:
        return data
def slurp_with_login_and_pwd():
    import sys
    import mechanize
    # sys.path.append('ClientCookie-1.0.3')
    # from mechanize import ClientCookie
    # sys.path.append('ClientForm-0.1.17')
    # import ClientForm

    # Create special URL opener (for User-Agent) and cookieJar
    cookieJar = mechanize.CookieJar()

    opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cookieJar))
    opener.addheaders = [("User-agent", "Mozilla/5.0 (compatible)")]
    mechanize.install_opener(opener)
    fp = mechanize.urlopen("http://login.yahoo.com")
    forms = mechanize.ParseResponse(fp)
    fp.close()

    # print forms on this page
    for form in forms:
        print "***************************"
        print form

    form = forms[0]
    form["login"] = "******"     # use your userid
    form["passwd"] = "password"  # use your password
    fp = mechanize.urlopen(form.click())
    fp.close()

    fp = mechanize.urlopen(
        "https://class.coursera.org/ml-003/lecture/download.mp4?lecture_id=1"
    )  # use your group
    fp.readlines()
    fp.close()
def uploadFileToAquaforum(uploadFilename, requestedFileName):
    ''' returns response page '''
    # build opener. Can be extended to handle cookies/proxies
    opener = mechanize.build_opener()
    # goto upload page
    request3 = mechanize.Request(FORUM_UPLOAD_URL)
    response3 = opener.open(request3)
    # parse form on upload page and add file
    forms = mechanize.ParseResponse(response3, backwards_compat=False)
    form = forms[0]
    filectr = form.find_control("imgfile")
    # filectr.add_file(open('/home/jasper/avatar.jpg'), "image/jpeg", "avatar.jpg")
    theFile = file(uploadFilename, 'rb')
    filectr.add_file(theFile, "image/jpeg",
                     os.path.split(requestedFileName)[-1])
    # obtain form data
    request4 = form.click()  # urllib2.Request object
    theFile.close()
    request4.add_header('Referer', response3.geturl())
    response4 = opener.open(request4)
    return response4.read()
def open_form(self, form_url):
    self.password_manager.add_password(None, form_url, self.username,
                                       self.password)
    page = mechanize.urlopen(mechanize.Request(form_url))
    forms = mechanize.ParseResponse(page, backwards_compat=False)
    # Works in this case, but it would be better to look the form up by name.
    form = forms[0]
    return form
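# The comment in open_form above asks for a better way to pick the form; a
# minimal sketch of a by-name lookup follows.  'searchform' is a purely
# illustrative placeholder -- use the actual name attribute of the target form.
def open_form_by_name(form_url, wanted_name='searchform'):
    page = mechanize.urlopen(mechanize.Request(form_url))
    forms = mechanize.ParseResponse(page, backwards_compat=False)
    for form in forms:
        if form.name == wanted_name:
            return form
    raise LookupError("no form named %r at %s" % (wanted_name, form_url))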
def secondButtonPress(widget):
    global flow
    coursecode_text = objects['coursecode'].get_text()
    serialno_text = objects['serialno'].get_text()
    theUrl = bnextUrl + coursecode_text
    value = 0
    ctr = 0
    if coursecode_text == 'CSE304':
        indexes = ['1', '2', '3', '4']
    else:
        indexes = [serialno_text]
    while True:
        r = br.open(theUrl)
        br.select_form(nr=value)
        br.submit()
        response = br.response()
        html = response.read()
        soup = getsoup(html)
        if value == 0:
            for xserialno_text in indexes:
                tds = [
                    a.renderContents()
                    for a in soup.findAll('table')[2].findAll(
                        'font', attrs={'color': 'black'})
                ]
                index = (eval(xserialno_text) - 1) * 9 + 8
                venue = (eval(xserialno_text) - 1) * 9 + 3
                print tds[venue], venue
                if eval(tds[index]) > 0:
                    body = coursecode_text + ":" + serialno_text
                    sendMail(body)
                    value = 1
                    continue
                else:
                    ctr += 1
                    print ctr
                    continue
            continue
        if value == 1:
            tds = soup.findAll('td', attrs={'align': 'center'})
            start = 5
            index = 0
            while start < len(tds):
                if venue == tds[start]:
                    break
                start += 5
            inputs = soup.findAll('input', attrs={'type': 'radio'})
            inp = inputs[index]
            val = inp['value']
            forms = mechanize.ParseResponse(br.response(),
                                            backwards_compat=False)
            form = forms[0]
            br.select_form(nr=0)
            br.form.set_value([val], name='clsnbr1')
            br.submit()
            print br.response().read()
            break
    exit(0)
def main(argv):
    search = argv

    # Google Login credentials
    username = '******'  # argv[1]
    password = '******'  # argv[2]

    # Where to save the CSV file
    pathname = 'trend_data/' + search + '_trends.csv'  # argv[3]

    queries = ('q=' + query for query in argv[1:])

    br = mechanize.Browser()

    # Create cookie jar
    cj = cookielib.LWPCookieJar()
    br.set_cookiejar(cj)

    # Act like we're a real browser
    br.addheaders = [(
        'User-agent',
        'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1'
    )]

    # Log in to Google
    response = br.open(
        'https://accounts.google.com/ServiceLogin?hl=en&continue=https://www.google.com/'
    )
    forms = mechanize.ParseResponse(response)
    form = forms[0]
    form['Email'] = username
    form['Passwd'] = password
    response = br.open(form.click())

    # Get CSV from Google Trends
    trends_url = 'http://www.google.com/trends/trendsReport?'
    query_params = '&'.join(queries)
    response = br.open(trends_url + query_params + '&export=1')

    # Remove headers and footers from Google's CSV
    # Use last date in date range
    reader = csv.reader(StringIO(response.read()))
    dates = []
    values = []
    for row in reader:
        try:
            date, value = row
        except ValueError:
            continue
        if re.search('[0-9]{4}-[0-9]{2}-[0-9]{2}', date):
            dates.append(date[-10:])  # Uses last date in time period
            values.append(value)

    with open(pathname, 'w') as f:
        writer = csv.writer(f)
        writer.writerow(['Date', search])
        for row in zip(dates, values):
            writer.writerow(row)
def SendRequestToGoogle(self, username, password):
    br = mechanize.Browser()

    # Create cookie jar
    cj = cookielib.LWPCookieJar()
    br.set_cookiejar(cj)
    br.set_handle_robots(False)

    # Act like we're a real browser
    br.addheaders = [(
        'User-agent',
        'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1'
    )]

    response = br.open(
        'https://accounts.google.com/ServiceLogin?hl=en&continue=https://www.google.com/'
    )
    forms = mechanize.ParseResponse(response)
    form = forms[0]
    form['Email'] = username
    form['Passwd'] = password
    response = br.open(form.click())

    keyList = self.queryDict.keys()
    keyCount = len(keyList)
    i = 0
    TermsDone = open(self.DoneTermsFile, 'ab')
    print "\n\n log-in success! \n\n"

    while i < keyCount:
        SearchTerm = keyList[i]
        Queries = self.queryDict[SearchTerm]
        WorldQuery = Queries[0]
        USQuery = Queries[1]
        FiscalEnd = Queries[2]  # ending month of fiscal year

        sleep(random.uniform(40, 70))
        WorldResponse = br.open(WorldQuery)
        WorldResult = csv.reader(StringIO(WorldResponse.read()))

        sleep(random.uniform(30, 60))
        USResponse = br.open(USQuery)  # searchterm : query
        USResult = csv.reader(StringIO(USResponse.read()))

        TempWorld = Queries[3]  # temporary file paths
        TempUS = Queries[4]

        # Send contents out for writing intermediate output CSV files
        W_Error = self.IntermediateCSV(WorldResult, TempWorld)
        US_Error = self.IntermediateCSV(USResult, TempUS)

        if W_Error == -1 or US_Error == -1:
            self.ErrorHandler(SearchTerm)  # quota limit
        else:
            i = i + 1
            # keep track of downloaded CSV files, prevent repeats
            TermsDone.write(SearchTerm + ',' + FiscalEnd + '\n')
            print "%s\t\tDONE" % (SearchTerm)  # for monitoring

    TermsDone.close()
def grab_redirect(link):
    response = mechanize.urlopen(link['href'])
    forms = mechanize.ParseResponse(response, backwards_compat=False)
    form = forms[0]
    data = mechanize.urlopen(form.click()).read()
    soup = BeautifulSoup.BeautifulSoup(data)
    for div in soup('div'):
        if 'class' in dict(div.attrs) and \
           div['class'] == 'urlworkaround':
            txt = ''.join([str(x) for x in div.contents])
            lsoup = BeautifulSoup.BeautifulSoup(txt)
            link = lsoup('a')[0]
            return link['href']
    raise Exception('no href')
def _get_form():
    # GET SERVER RESPONSE
    try:
        response = mechanize.urlopen(SPLAT_FORM_URL)
    except mechanize.URLError:
        raise Exception('No response from server : {0}'.format(SPLAT_FORM_URL))
    # PARSE SERVER RESPONSE
    forms = mechanize.ParseResponse(response, backwards_compat=False)
    response.close()
    # GET FORM
    form = forms[0]
    return form
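# _get_form() and _get_results() (further up) appear to belong to the same
# scraper; if so, a caller might combine them as sketched below.  The control
# names 'min_frequency' and 'max_frequency' are illustrative placeholders, not
# names taken from the real SPLAT form.
def query_splat(fmin, fmax):
    form = _get_form()
    form['min_frequency'] = str(fmin)  # hypothetical control name
    form['max_frequency'] = str(fmax)  # hypothetical control name
    return _get_results(form)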
def start_cloning(options):
    link = options['link']
    user = options['user']
    password = options['password']

    response = mechanize.urlopen(link)
    forms = mechanize.ParseResponse(response, backwards_compat=False)
    form = forms[0]
    form['txtIdentifiant'] = user
    form['txtMDP'] = password
    website = mechanize.urlopen(form.click())
    data = website.read()

    outfile = open('index.html', 'wt')
    print >> outfile, """
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html dir="ltr" lang="fr" xml:lang="fr" xmlns="http://www.w3.org/1999/xhtml" class="yui3-js-enabled" id="yui_3_2_0_1_1326674808791714">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
"""
    soup = BeautifulSoup.BeautifulSoup(data)
    title = soup('title')
    print >> outfile, str(title[0])
    divs = soup('div')
    for div in divs:
        if 'class' in dict(div.attrs):
            if div['class'] == 'course-content':
                vstr = '\n'.join([str(x) for x in div.contents[1:]])
                # Eliminate wrong divs
                lsoup = BeautifulSoup.BeautifulSoup(vstr)
                for ldiv in lsoup.findAll('div'):
                    if ('class' in dict(ldiv.attrs) and
                            ldiv['class'] in ['left side', 'right side', 'jumpmenu']):
                        ldiv.extract()
                replace = {}
                for link in lsoup.findAll('a'):
                    if 'href' in dict(link.attrs):
                        try:
                            replace[link['href']] = grab_redirect(link)
                        except:
                            pass
                page_txt = str(lsoup)
                for k, v in replace.items():
                    nw_key = str(k) + "&redirect=1"
                    page_txt = page_txt.replace(nw_key, str(v))
                    page_txt = page_txt.replace(str(k), str(v))
                print >> outfile, page_txt
    outfile.close()
def connect_to_form(url, formurl):
    print "\nConnecting to Web Page...",
    # Connect to URL (add error handling!!)
    request = mechanize.Request(mechanize.urljoin(url, formurl))
    response = mechanize.urlopen(request)
    monkeypatch_mechanize()
    print "Success."
    # Retrieve forms
    forms = mechanize.ParseResponse(response, backwards_compat=False)
    response.close()
    if len(forms) <= 0:
        raise FormNotFound('No Forms were found on the web page.')
    return forms
def login(self, username, password):
    " Login to Droptask "
    login_redirect = conf.login_redirect
    # Fetch url from conf file and concatenate '/login' to the base url
    login_url = conf.base_url + login_redirect
    my_params = {'email': username, 'password': password}
    params_encoded = urllib.urlencode(my_params)
    self.browser.method = 'POST'
    headers = {
        'Host': 'auth.droptask.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:46.0) Gecko/20100101 Firefox/46.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate, br',
        'Referer': 'https://auth.droptask.com/login?continue=https%3A%2F%2Fapp.droptask.com%2Fauth&clientId=5c8af5ea-fa7c-4cb3-80e8-04b361d9e297&source=web',
        'Cookie': ' _ga=GA1.2.2000288028.1449634785; connect.sid=s%3A08sLYE3zBHEDS4T72OeDhDzqUCW-MNxB.bU6Bdt5xDsHeg6yIyIQKPmL4NG%2BgqRaBOMTMkax44N0',
        'Connection': 'keep-alive'
    }
    login_response = self.post(url=login_url, data=params_encoded, headers=headers)
    forms = mechanize.ParseResponse(login_response, backwards_compat=False)
    result_flag = False
    # If a form with a password control is still present, the login failed
    if (len(forms) != 0) and (forms[0].find_control("password") != None):
        self.write(" -Login failed")
        result_flag = False
    else:
        self.write(" -Login success")
        result_flag = True
    return result_flag
def next_page(ind):
    # time.sleep(4)
    br = mechanize.Browser()
    for attempt in range(5):
        try:
            response = br.open(url)
            break
        except:
            pass
    # br.select_form()
    forms = mechanize.ParseResponse(response, backwards_compat=False)
    form = forms[0]
    form.set_all_readonly(False)
    form['startitem'] = str(ind * 5 + 5)
    form.set_all_readonly(True)
    response = form.click()
    # response1 = br.submit()
    # print response1
    # br.open(mechanize.urlopen(response).read())
    return mechanize.urlopen(response).read()
def write_page(self, page_name, new_contents, comment=""):
    """ write raw content to page """
    url = self.get_page_url(page_name)
    response = self.browser.open(url + "?action=edit")
    forms = mechanize.ParseResponse(response, backwards_compat=False)
    edit_form = self._find_edit_form(forms)
    old_contents = edit_form.get_value(name="text")
    if old_contents.replace("\r\n", "\n") == new_contents.replace("\r\n", "\n"):
        print "No changes to make"
        return
    edit_form.set_value(name="text", value=new_contents)
    edit_form.set_value(name="comment", value=comment)
    request = edit_form.click(name="save")
    response = self.browser.open(request)
    response_text = response.read()
    if "Your changes have been saved" not in response_text:
        # print "Previous content: '%s'" % old_contents
        # print "Attempted new content: '%s'" % new_contents
        raise Exception("error writing page")
def RA_prep_search(location, position, date, lenghtOfStay):
    scrape_url = 'http://www.reserveamerica.com/unifSearchResults.do'
    req = mechanize.Request(scrape_url)
    req.add_header("Referer", scrape_url)
    req.add_header(
        'user-agent',
        'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36')
    # req.add_header("Accept-Encoding", "gzip, deflate")
    req.add_header(
        'Accept',
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8')
    req.add_header('Origin', 'http://www.reserveamerica.com')
    r1 = mechanize.urlopen(req)

    forms = mechanize.ParseResponse(r1)
    form = forms[0]
    form.set_all_readonly(False)
    form['locationCriteria'] = location
    form['locationPosition'] = position
    form['interest'] = ["camping"]
    form['lookingFor'] = ['2003']
    form['camping_2003_3012'] = '3'
    # form['camping_2003_moreOptions'] = ['false']
    form['campingDate'] = date
    form['lengthOfStay'] = str(lenghtOfStay)
    # print form
    return form.click()
def RA_prep_search(location, position, date, lenghtOfStay, accessNeeds=False):
    scrape_url = 'http://www.reserveamerica.com/unifSearchResults.do'
    req = mechanize.Request(scrape_url)
    req.add_header("Referer", scrape_url)
    req.add_header(
        'user-agent',
        'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36'
    )
    # req.add_header("Accept-Encoding", "gzip, deflate")
    req.add_header(
        'Accept',
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
    )
    req.add_header('Origin', 'http://www.reserveamerica.com')
    r1 = mechanize.urlopen(req)

    forms = mechanize.ParseResponse(r1, backwards_compat=False)
    form = forms[0]
    form.set_all_readonly(False)
    form['locationCriteria'] = location
    form['locationPosition'] = position
    form['interest'] = ["camping"]
    form['lookingFor'] = ['2003']
    form['camping_2003_3012'] = '3'
    # print form
    # control = form.find_control("camping_2003_3009")
    # for item in control.items:
    #     print " name=%s values=%s" % (item.name, str([label.text for label in item.get_labels()]))
    # return
    if accessNeeds:
        form['camping_2003_moreOptions'] = ['true']
        form['camping_2003_3009'] = ['true']
    else:
        form['camping_2003_moreOptions'] = []
        form['camping_2003_3009'] = []
    form['campingDate'] = date
    form['lengthOfStay'] = str(lenghtOfStay)
    return form.click()
with open(sys.argv[1]) as input_file:
    for row_o in csv.reader(input_file):
        # first row is username, password, and pause length
        if any(row_o) & (line_o == 0):
            username, password, pause = row_o
            pause = int(pause) if (pause_override == None) else pause_override
            # echo login and pause length information
            logging.info('Username: ' + username)
            logging.info('Password: ' + password)
            logging.info('Pause: ' + str(pause))
            # login to Google with username and password
            response = br.open('https://accounts.google.com/ServiceLogin?hl=en&continue=https://www.google.com/')
            forms = mechanize.ParseResponse(response)
            form = forms[0]
            form['Email'] = username
            form['Passwd'] = password
            response = br.open(form.click())
        # remaining rows are query and countries
        elif any(row_o) & (line_o > 0) & (start_line <= line_o) & (line_o <= stop_line):
            # output filename root
            output_root = sys.argv[1].replace('.csv', '_' + str(line_o))
            # pause before subsequent queries
            if (line_o > 1):
                time.sleep(pause)
            # generate query url
<frame name="buttons" src="/ITE/common/html/buttons.html" marginwidth="0" marginheight="0" scrolling="no" frameborder="0" noresize="noresize"\>
<frame name="nada" src="/ITE/common/html/nada.htm" marginwidth="0" marginheight="0" scrolling="0" frameborder="0" noresize="noresize"\>
</frameset>
<noframes>
<body>
<p>
Seu Browser não suporta frames.
</p>
</body>
</noframes>
</html>
'''

url2 = 'https://wwws3.hsbc.com.br/HWB-SIMULADOR/servlets/SrvSimulador?ServletState=10'
url3 = 'https://wwws3.hsbc.com.br/HWB-SIMULADOR/servlets/SrvSimulador?ServletState=30'

import sys
import mechanize

request = mechanize.Request(url)
response = mechanize.urlopen(request)
forms = mechanize.ParseResponse(response, backwards_compat=False)
# response.close()

## f = open("example.html")
## forms = mechanize.ParseFile(f, "http://example.com/example.html",
##                             backwards_compat=False)
## f.close()

form = forms[0]
print form  # very useful!
def get_session(self, session_url=None, session_id=None):
    """
    Load session details for the given detail page URL or numeric ID
    """
    # Read either session_id or session_url from the opposite
    if session_id is not None:
        session_url = self.urls['SESSION_DETAIL_PRINT_PATTERN'] % session_id
    elif session_url is not None:
        parsed = parse.search(self.urls['SESSION_DETAIL_PARSE_PATTERN'],
                              session_url)
        session_id = parsed['session_id']

    logging.info("Getting session %d from %s", session_id, session_url)

    session = Session(numeric_id=session_id)
    time.sleep(self.config.WAIT_TIME)
    response = self.user_agent.open(session_url)
    # forms for later attachment download
    mechanize_forms = mechanize.ParseResponse(response, backwards_compat=False)
    # seek(0) is necessary to reset response pointer.
    response.seek(0)
    html = response.read()
    html = html.replace('&nbsp;', ' ')
    parser = etree.HTMLParser()
    dom = etree.parse(StringIO(html), parser)

    # check for page errors
    try:
        page_title = dom.xpath('//h1')[0].text
        if 'Fehlermeldung' in page_title:
            logging.info("Page %s cannot be accessed due to server error",
                         session_url)
            if self.options.verbose:
                print "Page %s cannot be accessed due to server error" % session_url
            return
        if 'Berechtigungsfehler' in page_title:
            logging.info("Page %s cannot be accessed due to permissions",
                         session_url)
            if self.options.verbose:
                print "Page %s cannot be accessed due to permissions" % session_url
            return
    except:
        pass
    try:
        error_h3 = dom.xpath('//h3[@class="smc_h3"]')[0].text.strip()
        if 'Keine Daten gefunden' in error_h3:
            logging.info("Page %s does not contain any agenda items",
                         session_url)
            if self.options.verbose:
                print "Page %s does not contain agenda items" % session_url
            return
    except:
        pass

    session.original_url = session_url

    # Session title
    try:
        session.title = dom.xpath(self.xpath['SESSION_DETAIL_TITLE'])[0].text
    except:
        logging.critical(
            'Cannot find session title element using XPath SESSION_DETAIL_TITLE')
        raise TemplateError(
            'Cannot find session title element using XPath SESSION_DETAIL_TITLE')

    # Committee link
    try:
        links = dom.xpath(self.xpath['SESSION_DETAIL_COMMITTEE_LINK'])
        for link in links:
            href = link.get('href')
            parsed = parse.search(
                self.urls['COMMITTEE_DETAIL_PARSE_PATTERN'], href)
            if parsed is not None:
                session.committee_id = parsed['committee_id']
    except:
        logging.critical(
            'Cannot find link to committee detail page using SESSION_DETAIL_COMMITTEE_LINK_XPATH')
        raise TemplateError(
            'Cannot find link to committee detail page using SESSION_DETAIL_COMMITTEE_LINK_XPATH')

    # Session identifier, date, address etc
    tds = dom.xpath(self.xpath['SESSION_DETAIL_IDENTIFIER_TD'])
    if len(tds) == 0:
        logging.critical(
            'Cannot find table fields using SESSION_DETAIL_IDENTIFIER_TD_XPATH')
        raise TemplateError(
            'Cannot find table fields using SESSION_DETAIL_IDENTIFIER_TD_XPATH')
    else:
        for n in range(0, len(tds)):
            try:
                tdcontent = tds[n].text.strip()
                nextcontent = tds[n + 1].text.strip()
            except:
                continue
            if tdcontent == 'Sitzung:':
                session.identifier = nextcontent
            elif tdcontent == 'Gremium:':
                session.committee_name = nextcontent
            elif tdcontent == 'Datum:':
                datestring = nextcontent
                if tds[n + 2].text == 'Zeit:':
                    if (n + 3) in tds and tds[n + 3].text is not None:
                        datestring + ' ' + tds[n + 3].text
                session.date_start = datestring
            elif tdcontent == 'Raum:':
                session.address = " ".join(tds[n + 1].xpath('./text()'))
            elif tdcontent == 'Bezeichnung:':
                session.description = nextcontent
        if not hasattr(session, 'identifier'):
            logging.critical(
                'Cannot find session identifier using XPath SESSION_DETAIL_IDENTIFIER_TD')
            raise TemplateError(
                'Cannot find session identifier using XPath SESSION_DETAIL_IDENTIFIER_TD')

    # Agendaitems
    found_attachments = []
    rows = dom.xpath(self.xpath['SESSION_DETAIL_AGENDA_ROWS'])
    if len(rows) == 0:
        logging.critical(
            'Cannot find agenda using XPath SESSION_DETAIL_AGENDA_ROWS')
        raise TemplateError(
            'Cannot find agenda using XPath SESSION_DETAIL_AGENDA_ROWS')
    else:
        agendaitems = {}
        agendaitem_id = None
        public = True
        for row in rows:
            row_id = row.get('id')
            row_classes = row.get('class').split(' ')
            fields = row.xpath('td')
            number = fields[0].xpath('./text()')
            if len(number) > 0:
                number = number[0]
            if number == []:
                number = None
            # print "number: %s" % number
            if row_id is not None:
                # Agendaitem main row
                agendaitem_id = row_id.rsplit('_', 1)[1]
                agendaitems[agendaitem_id] = {}
                agendaitems[agendaitem_id]['id'] = int(agendaitem_id)
                if number is not None:
                    agendaitems[agendaitem_id]['number'] = number
                agendaitems[agendaitem_id]['subject'] = "; ".join(
                    fields[1].xpath('./text()'))
                agendaitems[agendaitem_id]['public'] = public
                # submission links
                links = row.xpath(
                    self.xpath['SESSION_DETAIL_AGENDA_ROWS_SUBMISSION_LINK'])
                submissions = []
                for link in links:
                    href = link.get('href')
                    if href is None:
                        continue
                    parsed = parse.search(
                        self.urls['SUBMISSION_DETAIL_PARSE_PATTERN'], href)
                    if parsed is not None:
                        submission = Submission(
                            numeric_id=int(parsed['submission_id']),
                            identifier=link.text)
                        submissions.append(submission)
                        # Add submission to submission queue
                        if hasattr(self, 'submission_queue'):
                            self.submission_queue.add(
                                int(parsed['submission_id']))
                if len(submissions):
                    agendaitems[agendaitem_id]['submissions'] = submissions
                """
                Note: we don't scrape agendaitem-related attachments for now,
                based on the assumption that they are all found via
                submission detail pages.

                All we do here is get a list of attachment IDs in
                found_attachments
                """
                # attachments = []
                forms = row.xpath('.//form')
                for form in forms:
                    for hidden_field in form.xpath('input'):
                        if hidden_field.get('name') != 'DT':
                            continue
                        attachment_id = hidden_field.get('value')
                        # attachments.append(attachment_id)
                        found_attachments.append(attachment_id)
                # if len(attachments):
                #     agendaitems[agendaitem_id]['attachments'] = attachments

            elif 'smc_tophz' in row_classes:
                # additional (optional) row for agendaitem
                label = fields[1].text
                value = fields[2].text
                if label is not None and value is not None:
                    label = label.strip()
                    value = value.strip()
                    # print (label, value)
                    if label in ['Ergebnis:', 'Beschluss:']:
                        if value in self.config.RESULT_STRINGS:
                            agendaitems[agendaitem_id]['result'] = \
                                self.config.RESULT_STRINGS[value]
                        else:
                            logging.warn(
                                "String '%s' not found in configured RESULT_STRINGS",
                                value)
                            if self.options.verbose:
                                print "WARNING: String '%s' not found in RESULT_STRINGS\n" % value
                            agendaitems[agendaitem_id]['result'] = value
                    elif label == 'Bemerkung:':
                        agendaitems[agendaitem_id]['result_note'] = value
                    elif label == 'Abstimmung:':
                        agendaitems[agendaitem_id]['voting'] = value
                    else:
                        logging.critical(
                            "Agendaitem info label '%s' is unknown", label)
                        raise ValueError(
                            'Agendaitem info label "%s" is unknown' % label)

            elif 'smcrowh' in row_classes:
                # Subheading (public / nonpublic part)
                if fields[0].text is not None and \
                        "Nicht öffentlich" in fields[0].text.encode('utf-8'):
                    public = False
        # print json.dumps(agendaitems, indent=2)
        session.agendaitems = agendaitems.values()

    # session-related attachments
    containers = dom.xpath(self.xpath['SESSION_DETAIL_ATTACHMENTS'])
    for container in containers:
        classes = container.get('class')
        if classes is None:
            continue
        classes = classes.split(' ')
        if self.xpath['SESSION_DETAIL_ATTACHMENTS_CONTAINER_CLASSNAME'] not in classes:
            continue
        attachments = []
        rows = container.xpath('.//tr')
        for row in rows:
            forms = row.xpath('.//form')
            for form in forms:
                # print "Form: ", form
                name = " ".join(row.xpath('./td/text()')).strip()
                for hidden_field in form.xpath('input'):
                    if hidden_field.get('name') != 'DT':
                        continue
                    attachment_id = hidden_field.get('value')
                    # make sure to add only those which aren't agendaitem-related
                    if attachment_id not in found_attachments:
                        attachment = Attachment(identifier=attachment_id,
                                                name=name)
                        # Traversing the whole mechanize response to submit this form
                        for mform in mechanize_forms:
                            # print "Form found: '%s'" % mform
                            for control in mform.controls:
                                if control.name == 'DT' and control.value == attachment_id:
                                    # print "Found matching form: ", control.name, control.value
                                    attachment = self.get_attachment_file(
                                        attachment, mform)
                        attachments.append(attachment)
                        found_attachments.append(attachment_id)
        if len(attachments):
            session.attachments = attachments

    oid = self.db.save_session(session)
    if self.options.verbose:
        logging.info("Session %d stored with _id %s", session_id, oid)
def __init__(self):
    self.url = "https://store.steampowered.com/join/"
    self.site_data = urllib2.urlopen(self.url)
    self.forms = mechanize.ParseResponse(self.site_data,
                                         backwards_compat=False)
    # currently true, but this line will cause this script to eventually break
    self.form = self.forms[1]
    self.captchagid = self.form.find_control(id="captchagid").value
def getStars(age, zeta):
    """
    Returns a list of stars (for use by e.g. phoenix grid) with necessary
    parameters to determine their characteristics and magnitude.

    Inputs:
    -------
    age: float
        The age of the stellar population, in years
    zeta: float
        The metal content of the stars, where the Sun has 0.019 as its metal content.

    Output:
    -------
    star_list: list of lists
        The output stars. The list has the following columns:
            Z: [M/H]
            Age (Gyr)
            M_ini (M_\odot)
            M_act (M_\odot)
            Te
            log(g)
            int_IMF
            Johnson,I
    """
    result_str = re.compile(
        r"The results are available at <a href=(.*?)>output\d*\.dat</a>")
    request = mechanize.Request("http://stev.oapd.inaf.it/cgi-bin/cmd_2.5")
    response = mechanize.urlopen(request)
    forms = mechanize.ParseResponse(response, backwards_compat=False)
    response.close()
    form = forms[0]
    # The reasoning here is that I can *get* Johnson filters in JWST pysynphot,
    # but can't figure out how to do the Spitzer equivalents.
    # form["photsys_file"] = ["tab_mag_odfnew/tab_mag_2mass_spitzer_wise.dat"]
    form["photsys_file"] = ["tab_mag_odfnew/tab_mag_ubvrijhk.dat"]
    # 0 = single isochrone, single metallicity.
    # 1 = sequence of isochrones, different ages, constant metallicity
    # 2 = sequence of isochrones, constant age, different metallicities
    form["isoc_val"] = ["0"]
    # Age for single-single
    form["isoc_age"] = '%g' % (age)
    form["isoc_zeta"] = '%g' % (zeta)
    request2 = form.click()
    response2 = mechanize.urlopen(request2)
    response_value = response2.read()
    response_url = response2.geturl()
    match = result_str.search(response_value)
    star_list = []
    if match is not None:
        output_url = match.group(1)
        response_result = mechanize.urlopen(
            mechanize.urljoin(response_url, output_url))
        output_lines = response_result.read().split("\n")
        output_lines = output_lines[13:]
        for line in output_lines:
            if line != "":
                # Z, log(age/yr), M_ini, M_act, logL/Lo, logTe, logG, mbol, U, B, V, R, I, J, H, K, int_IMF, stage
                items = line.split()
                star = [None]
                star.append(getZ(float(items[0])))
                star.append(10**float(items[1]))
                star.append(float(items[2]))
                star.append(float(items[3]))
                star.append(10**float(items[5]))
                star.append(float(items[6]))
                star.append(float(items[6]))
                star.append(float(items[16]))
                star.append(float(items[12]))
                star_list.append(star)
    return star_list
def _getForm(self, response):
    forms = mechanize.ParseResponse(response, backwards_compat=False)
    return forms[0]
def calculate_RH(seq_list, pore_size=100, ion_pairing_agent='TFA', pH=2,
                 proxy=''):
    # Find cached RHs in database.
    output = dict([(seq, None) for seq in seq_list])
    database = shelve.open(DATABASE_FILENAME, writeback=True)
    for seq in seq_list:
        if (seq in database
                and (pore_size, ion_pairing_agent, pH) in database[seq]):
            output[seq] = database[seq][(pore_size, ion_pairing_agent, pH)]
    remaining_seq = []
    for seq, RH in output.items():
        if RH is None:
            remaining_seq.append(seq)
    if not remaining_seq:
        return output

    # If there are undefined RHs, obtain them at the SSRCalc site:
    if proxy:
        proxy_handler = urllib2.ProxyHandler({'http': proxy})
        opener = urllib2.build_opener(proxy_handler)
        opener.addheaders = [
            ('Host', '2ip.ru\n'),
            ('User-Agent',
             'Mozilla/5.0 (Windows; U; Windows NT 5.1; ru; rv:1.8.0.2) '
             'Gecko/20060308 Firefox/1.5.0.2\n'),
            ('Accept',
             'text/xml,application/xml,application/xhtml+xml,text/html;'
             'q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5\n'),
            ('Accept-Language', 'ru-ru,ru;q=0.8,en-us;q=0.5,en;q=0.3\n'),
            ('Accept-Charset', 'windows-1251,utf-8;q=0.7,*;q=0.7\n'),
            ('X-Forwarded-For', '44.55.66.77\n'),
            ('Pragma', 'no-cache\n'),
            ('Referer', 'http://www.test.com\n'),
            ('Keep-Alive', '500\n'),
            ('Connection', 'close\n'),
            ('Content-Type', 'application/x-www-form-urlencoded\r\n\r\n')
        ]
        urllib2.install_opener(opener)

    request = urllib2.Request('http://hs2.proteome.ca/SSRCalc/SSRCalcX.html')
    response = urllib2.urlopen(request)
    forms = mechanize.ParseResponse(response, backwards_compat=False)
    response.close()
    form = forms[0]

    if ion_pairing_agent == 'FA':
        form['sver'] = ['ssrFA']
    elif pH == 10:
        form['sver'] = ['ssrXT']
    elif pore_size == 100:
        form['sver'] = ['ssr100']
    elif pore_size == 300:
        form['sver'] = ['ssr300']

    form['seqs'] = "\n".join(remaining_seq)
    result = urllib2.urlopen(form.click())
    result = result.read()

    processed_seq_re = re.compile(
        r'(?<=\<tr class\=\"bodyText\"\>\<td\>)\S+\n?\S+')
    processed_RH_re = re.compile(r'(\(\d+\)\<\/td\>\n\<td\>\s+)(-?\d+.?\d+)')
    processed_seq = processed_seq_re.findall(result)
    processed_RH = [float(rh[1]) for rh in processed_RH_re.findall(result)]
    processed_data = dict(zip(processed_seq, processed_RH))

    # Caching obtained data.
    for seq, RH in processed_data.items():
        seq = remove_html_tags(seq)
        entry = database.get(seq, {})
        entry[(pore_size, ion_pairing_agent, pH)] = RH
        database[seq] = entry
        output[seq] = RH
    database.close()
    return output
def get_submission(self, submission_url=None, submission_id=None):
    """
    Load submission (Vorlage) details for the submission given by
    detail page URL or numeric ID
    """
    # Read either submission_id or submission_url from the opposite
    if submission_id is not None:
        submission_url = self.urls[
            'SUBMISSION_DETAIL_PRINT_PATTERN'] % submission_id
    elif submission_url is not None:
        parsed = parse.search(self.urls['SUBMISSION_DETAIL_PARSE_PATTERN'],
                              submission_url)
        submission_id = parsed['submission_id']

    logging.info("Getting submission %d from %s", submission_id,
                 submission_url)

    submission = Submission(numeric_id=submission_id)

    try_until = 1
    try_counter = 0
    try_found = False

    while (try_counter < try_until):
        try_counter += 1
        try_found = False
        time.sleep(self.config.WAIT_TIME)
        try:
            response = self.user_agent.open(submission_url)
        except urllib2.HTTPError, e:
            if e.code == 404:
                sys.stderr.write(
                    "URL not found (HTTP 404) error caught: %s\n" % submission_url)
                sys.stderr.write(
                    "Please check BASE_URL in your configuration.\n")
                sys.exit(1)
        mechanize_forms = mechanize.ParseResponse(response,
                                                  backwards_compat=False)
        response.seek(0)
        html = response.read()
        html = html.replace('&nbsp;', ' ')
        parser = etree.HTMLParser()
        dom = etree.parse(StringIO(html), parser)

        # Fetch the page again if an unknown, randomly occurring error is
        # returned without an error message (found in Duisburg, presumably a
        # broken server config)
        try:
            page_title = dom.xpath('//h1')[0].text
            if 'Fehler' in page_title:
                try_until = 3
                try_found = True
                logging.info(
                    "Original RIS Server Bug, restart scraping submission %s",
                    submission_url)
        except:
            pass

        if (try_found == False):
            # check for page errors
            try:
                if 'Fehlermeldung' in page_title:
                    logging.info(
                        "Page %s cannot be accessed due to server error",
                        submission_url)
                    if self.options.verbose:
                        print "Page %s cannot be accessed due to server error" % submission_url
                    return
                if 'Berechtigungsfehler' in page_title:
                    logging.info(
                        "Page %s cannot be accessed due to permissions",
                        submission_url)
                    if self.options.verbose:
                        print "Page %s cannot be accessed due to permissions" % submission_url
                    return
            except:
                pass

            submission.original_url = submission_url

            # Session title
            try:
                stitle = dom.xpath(self.xpath['SUBMISSION_DETAIL_TITLE'])
                submission.title = stitle[0].text
            except:
                logging.critical(
                    'Cannot find submission title element using XPath SUBMISSION_DETAIL_TITLE')
                raise TemplateError(
                    'Cannot find submission title element using XPath SUBMISSION_DETAIL_TITLE')

            # Submission identifier, date, type etc
            tds = dom.xpath(self.xpath['SUBMISSION_DETAIL_IDENTIFIER_TD'])
            if len(tds) == 0:
                logging.critical(
                    'Cannot find table fields using XPath SUBMISSION_DETAIL_IDENTIFIER_TD')
                logging.critical('HTML Dump:' + html)
                raise TemplateError(
                    'Cannot find table fields using XPath SUBMISSION_DETAIL_IDENTIFIER_TD')
            else:
                current_category = None
                for n in range(0, len(tds)):
                    try:
                        tdcontent = tds[n].text.strip()
                    except:
                        continue
                    if tdcontent == 'Name:':
                        submission.identifier = tds[n + 1].text.strip()
                    elif tdcontent == 'Art:':
                        submission.type = tds[n + 1].text.strip()
                    elif tdcontent == 'Datum:':
                        submission.date = tds[n + 1].text.strip()
                    elif tdcontent == 'Name:':
                        submission.identifier = tds[n + 1].text.strip()
                    elif tdcontent == 'Betreff:':
                        submission.subject = '; '.join(
                            tds[n + 1].xpath('./text()'))
                    elif tdcontent == 'Referenzvorlage:':
                        link = tds[n + 1].xpath('a')[0]
                        href = link.get('href')
                        parsed = parse.search(
                            self.urls['SUBMISSION_DETAIL_PARSE_PATTERN'], href)
                        submission.superordinate = {
                            'identifier': link.text.strip(),
                            'numeric_id': parsed['submission_id']
                        }
                        # add superordinate submission to queue
                        if hasattr(self, 'submission_queue'):
                            self.submission_queue.add(parsed['submission_id'])
                    # subordinate submissions are added to the queue
                    elif tdcontent == 'Untergeordnete Vorlage(n):':
                        current_category = 'subordinates'
                        for link in tds[n + 1].xpath('a'):
                            href = link.get('href')
                            parsed = parse.search(
                                self.urls['SUBMISSION_DETAIL_PARSE_PATTERN'],
                                href)
                            if hasattr(self, 'submission_queue') and parsed is not None:
                                # add subordinate submission to queue
                                self.submission_queue.add(
                                    parsed['submission_id'])
                    else:
                        if current_category == 'subordinates':
                            for link in tds[n + 1].xpath('a'):
                                href = link.get('href')
                                parsed = parse.search(
                                    self.urls['SUBMISSION_DETAIL_PARSE_PATTERN'],
                                    href)
                                if hasattr(self, 'submission_queue') and parsed is not None:
                                    self.submission_queue.add(
                                        parsed['submission_id'])

                if not hasattr(submission, 'identifier'):
                    logging.critical(
                        'Cannot find session identifier using SESSION_DETAIL_IDENTIFIER_TD_XPATH')
                    raise TemplateError(
                        'Cannot find session identifier using SESSION_DETAIL_IDENTIFIER_TD_XPATH')

            # "Beratungsfolge" (list of sessions for this submission)
            # This is currently not parsed for scraping, but only for
            # gathering session-attachment ids for later exclusion
            found_attachments = []
            rows = dom.xpath(self.xpath['SUBMISSION_DETAIL_AGENDA_ROWS'])
            for row in rows:
                formfields = row.xpath('.//input[@type="hidden"][@name="DT"]')
                if len(formfields):
                    attachment_id = formfields[0].get('value')
                    if attachment_id is not None:
                        found_attachments.append(attachment_id)

            # submission-related attachments
            submission.attachments = []
            containers = dom.xpath(self.xpath['SUBMISSION_DETAIL_ATTACHMENTS'])
            for container in containers:
                try:
                    classes = container.get('class').split(' ')
                except:
                    continue
                if self.xpath['SUBMISSION_DETAIL_ATTACHMENTS_CONTAINER_CLASSNAME'] not in classes:
                    continue
                rows = container.xpath('.//tr')
                for row in rows:
                    forms = row.xpath('.//form')
                    for form in forms:
                        name = " ".join(row.xpath('./td/text()')).strip()
                        for hidden_field in form.xpath('input[@name="DT"]'):
                            attachment_id = hidden_field.get('value')
                            if attachment_id in found_attachments:
                                continue
                            attachment = Attachment(identifier=attachment_id,
                                                    name=name)
                            # print attachment_id
                            # Traversing the whole mechanize response to submit this form
                            # print mechanize_forms
                            for mform in mechanize_forms:
                                # print "Form found: '%s'" % mform
                                for control in mform.controls:
                                    if control.name == 'DT' and control.value == attachment_id:
                                        attachment = self.get_attachment_file(
                                            attachment, mform)
                                        submission.attachments.append(
                                            attachment)

            # forcing overwrite=True here
            oid = self.db.save_submission(submission)
import re
import scraperwiki
import string
# The following imports are assumed; the snippet uses urljoin, urlopen and
# mechanize below, and these are their usual Python 2 sources.
import mechanize
from urlparse import urljoin
from urllib2 import urlopen

base = "http://www.dleg.state.mi.us/bcs_corp/"
search_page = "sr_corp.asp"
result_page = "dt_corp.asp"

# Next page link: rs_corp.asp?s_button=sname&v_search=a&hiddenField=&offset=40

# Get the main name search form
print urljoin(base, search_page)
main_page = urlopen(urljoin(base, search_page))
br = mechanize.Browser()
br.open(base)
forms = mechanize.ParseResponse(main_page, backwards_compat=False)
form = forms[0]
print form

# Search for something:
form.set_value("a", name="v_search")
br.open(form.click())

# Find all URLS that begin with 'dt_corp.asp'
# for letter in string.lowercase:
#     print letter

# You got cookie.
# So share it maybe?
from bs4 import BeautifulSoup