def getAndSaveAssetURL(button_value, first_index):
    page = parse('http://www.semic.eu/semic/view/snav/assetRepository.xhtml').getroot()
    if first_index != 0:
        page.forms[2].fields["j_id81:j_id119:0:j_id340"] = str(first_index)
        page.forms[2].fields["j_id81:j_id119:0:j_id481"] = button_value
        page.forms[2].fields["j_id81"] = "j_id81"
        page.forms[2].fields["navpage"] = "aece014c-5019-4751-a728-5090322ab3f9"
        page.forms[2].fields["j_id81:j_id103"] = ["ASSET"]
        page.forms[2].fields["j_id81:j_id119:0:j_id341"] = "sortBy_name_asc"
        result = parse(submit_form(
            page.forms[2],
            extra_values={'j_id81:j_id119:0:j_id481': button_value})).getroot()
        names = [a.text for a in result.xpath("//td[@class='textColumn']/a")]
        urls = [a.attrib['href'] for a in result.xpath("//td[@class='textColumn']/a")]
        saveInDataStore(names, urls)
        print len(urls)
    else:
        result = parse(submit_form(page.forms[2])).getroot()
        names = [a.text for a in result.xpath("//td[@class='textColumn']/a")]
        urls = [a.attrib['href'] for a in result.xpath("//td[@class='textColumn']/a")]
        saveInDataStore(names, urls)
def place_order(self, market, amount, rate):
    """Place an order to lend money on the market

    :param market: one of the names held in self.markets
    :param amount: amount to lend in GBP
    :param rate: offered rate in percentage
    """
    url = self._lending_url[market]
    logger.debug("GET request URL: %s", url)
    page = self._session.get(url)
    self._sleep_if_needed()
    tree = html.fromstring(page.text, base_url=page.url)
    form = tree.forms[0]
    # asp.net forms require the name of the button that was clicked
    form.fields["__EVENTTARGET"] = "ctl00$cphContentArea$btnSetRate"
    form.fields["ctl00$cphContentArea$tbAmount"] = str(amount)
    form.fields["ctl00$cphContentArea$tbRate"] = str(rate * 100)
    logger.debug("Submit form")
    page = html.submit_form(form, open_http=self._get_http_helper())
    self._sleep_if_needed()
    tree = html.fromstring(page.text, base_url=page.url)
    form = tree.forms[0]

    # Check if we have arrived on the confirmation page
    confirm_tag = tree.xpath('.//div[@class="singleContainer"]/descendant::h3')
    if len(confirm_tag) > 0 and confirm_tag[0].text.strip(' \n\r') == "Confirm your order":
        logger.debug("Request was accepted. Need to confirm")
    else:
        error_tag = tree.xpath('.//div[@class="contextError"]')
        message = error_tag[0].text.strip('- \n\r')
        logger.debug("Request refused: %s", message)
        raise RateSetterException(message)

    # asp.net forms require the name of the button that was clicked
    form.fields["__EVENTTARGET"] = "ctl00$cphContentArea$btnOrder"
    page = html.submit_form(form, open_http=self._get_http_helper())
    tree = html.fromstring(page.text, base_url=page.url)

    # Check if the order has been confirmed
    confirm_tag = tree.xpath('.//div[@class="singleContainer"]/descendant::h3')
    if len(confirm_tag) > 0 and confirm_tag[0].text.strip(' \n\r') == "Your order has been placed":
        logger.debug("Request was confirmed")
    else:
        logger.debug("Cannot confirm order")
        raise RateSetterException('Cannot confirm order')
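# Note: these RateSetter/Zopa snippets all pass open_http=self._get_http_helper(),
# but the helper itself never appears in this collection. A minimal sketch of what
# such an adapter could look like, assuming the client keeps a requests.Session in
# self._session: lxml.html.submit_form() calls open_http(method, url, values) and
# the callers above only need back an object exposing .text and .url, which a
# requests.Response provides.
def _get_http_helper(self):
    def open_http(method, url, values):
        # lxml passes method as 'GET'/'POST' and values as (name, value) pairs,
        # which requests accepts directly for params/data.
        if method.lower() == 'post':
            return self._session.post(url, data=values)
        return self._session.get(url, params=values)
    return open_http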
def connect(self): """Connect the client to RateSetter """ logger.debug("Authenticating ratesetter client") logger.debug("GET request URL: %s", home_page_url) page = self._session.get(home_page_url) tree = html.fromstring(page.text, base_url=page.url) self._sleep_if_needed() a = tree.xpath('.//a[contains(text(),"Login")]') page = self._session.get(a[0].attrib['href']) tree = html.fromstring(page.text, base_url=page.url) form = tree.forms[0] # asp.net form require the button that was clicked .. form.fields["__EVENTTARGET"] = "ctl00$cphContentArea$cphForm$btnLogin" form.fields["ctl00$cphContentArea$cphForm$txtEmail"] = self._email form.fields["ctl00$cphContentArea$cphForm$txtPassword"] = self._password logger.debug("Submit form") page = html.submit_form(form, open_http=self._get_http_helper()) if "login.aspx" in page.url: raise RateSetterException("Failed to connect") if "your_lending/summary" not in page.url: raise RateSetterException("Site has changed") self._dashboard_url = page.url tree = html.fromstring(page.text, base_url=page.url) self._extract_url(tree) self._connected = True
def cancel_order(self, order):
    """Cancel an order placed previously

    :param order: a DataFrame with one record. Must contain a cancel_url series
    :return:
    """
    logger.debug("Cancelling order %s", order.id)
    logger.debug("GET request URL: %s", order.cancel_url)
    page = self._session.get(order.cancel_url)
    tree = html.fromstring(page.text, base_url=page.url)

    # Check if we have arrived on the confirmation page
    confirm_tag = tree.xpath('.//form/descendant::h1')
    if len(confirm_tag) > 0 and confirm_tag[0].text.strip(' \n\r') == "Cancel Order":
        logger.debug("Request was accepted. Need to confirm")
        form = tree.forms[0]
        form.fields["__EVENTTARGET"] = "ctl00$cphContentArea$cphForm$btnConfirm"
        logger.debug("Submit form")
        page = html.submit_form(form, open_http=self._get_http_helper())
        self._sleep_if_needed()
        tree = html.fromstring(page.text, base_url=page.url)
        form = tree.forms[0]
    else:
        raise RateSetterException('Cannot cancel order')
def connect(self): """Connect the client to RateSetter """ logger.debug("Authenticating ratesetter client") logger.debug("GET request URL: %s", home_page_url) page = self._session.get(home_page_url) tree = html.fromstring(page.text, base_url=page.url) self._sleep_if_needed() a = tree.xpath('.//a[contains(text(),"Login")]') page = self._session.get(a[0].attrib['href']) tree = html.fromstring(page.text, base_url=page.url) form = tree.forms[0] # asp.net form require the button that was clicked .. form.fields["__EVENTTARGET"] = "ctl00$cphContentArea$cphForm$btnLogin" form.fields["ctl00$cphContentArea$cphForm$txtEmail"] = self._email form.fields[ "ctl00$cphContentArea$cphForm$txtPassword"] = self._password logger.debug("Submit form") page = html.submit_form(form, open_http=self._get_http_helper()) if "login.aspx" in page.url: raise RateSetterException("Failed to connect") if "your_lending/summary" not in page.url: raise RateSetterException("Site has changed") self._dashboard_url = page.url tree = html.fromstring(page.text, base_url=page.url) self._extract_url(tree) self._connected = True
def submit_form(self, form=None, url=None, extra_values=None):
    if form is None:
        raise ValueError(f'form cannot be None; url={url}')

    def submit(method, form_action_url, values):
        values = dict(values)
        if 'csrfmiddlewaretoken' not in values:
            raise ValueError('Possibly the wrong form. Could not find '
                             'csrfmiddlewaretoken: {}'.format(repr(values)))
        response = self.client.post(
            url or form_action_url,
            values,
            allow_redirects=False,
            catch_response=True,
        )
        if response.status_code not in (301, 302):
            # This probably means the form failed and is displaying errors.
            response.failure(
                'Form submission did not redirect; status={}'.format(
                    response.status_code))

    return submit_form(form, open_http=submit, extra_values=extra_values)
def copy_document(file_id, ticket1, ticket2, auth=None):
    logger.debug('[Document] Starting')
    if not auth:
        return
    from lxml.html import fromstring, submit_form
    import requests
    try:
        olddoc = ticket1.get_document(file_id)
    except AssemblaError:
        logger.debug('[Document] not found (probably deleted) %s', file_id)
        return
    login_url = 'https://www.assembla.com/do_login'
    client = requests.session()
    login_form = fromstring(client.get(login_url).content).forms[0]
    login_form.fields['user[login]'] = auth[0]
    login_form.fields['user[password]'] = auth[1]
    login_response = submit_form(login_form, open_http=client.request)
    if login_response.status_code != 200:
        raise AssemblaError('Failed on file download, status: %s'
                            % login_response.status_code,
                            response=login_response.content)
    response = client.get(olddoc.url)
    logger.debug('[Document] Attaching %s', olddoc.name)
    try:
        newdoc = ticket2.attach_file(response, olddoc)
    except AssemblaError, e:
        logger.debug('[Document] AssemblaError body:\n\n%s\n', e.response.content)
        logger.debug('[Document] Failed to attach source document: %s', olddoc.id)
def login(self, username, password):
    doc = parse(self.request('http://www.wingdns.com/')).getroot()
    form = doc.forms[0]
    form.fields['username'] = username.encode()
    form.fields['pwd'] = password.encode()
    form.fields['checkbox'] = b'checkbox'
    r = submit_form(form, open_http=self.http_open).read().decode()
    for u in authCookiePageURL.findall(r):
        self.request(u).read()
def connect(self): """Connect the client from Zopa""" # create a new http session each time we attempt a new connection self._init_session() # pull zopaclient signup page logger.debug("GET request URL: %s", zopa_url) page = self._session.get(zopa_url) self._sleep_if_needed() # fill the signup form tree = html.fromstring(page.text, base_url=page.url) form = tree.forms[0] form.fields["email"] = self._email form.fields["password"] = self._password logger.debug("Submit form") page = html.submit_form(form, open_http=self._get_http_helper()) self._sleep_if_needed() # check if we have landed on the secret verification page url = page.url if not "login/confirm" in url: raise ZopaExceptionSiteChanged("Unexpected page") # fill the idea verification form tree = html.fromstring(page.text, base_url=page.url) form = tree.forms[0] form.fields["answer"] = self._security_questions[form.fields["question"]] logger.debug("Submit form") page = html.submit_form(form, open_http=self._get_http_helper()) self._sleep_if_needed() # check if we have landed on the dashboard page url = page.url if not "/dashboard" in url: raise ZopaExceptionSiteChanged("Unexpected page")() self._connected = True self._dashboard_url = url tree = html.fromstring(page.text, base_url=page.url) tree.make_links_absolute(page.url) self._extract_url(tree)
def SendForm(self):
    self.FuncName = self.SendForm.__name__
    self.__RegisterDef__()
    try:
        Aurl = urllib2.Request(self.WebBuffer['lxml'].forms[0].action)
        ConfirmHost = Aurl.get_host()
    except ValueError:
        # the form action is relative: prepend the scheme and host taken
        # from the page URL before submitting
        Burl = urllib2.Request(self.Url)
        self.WebBuffer['lxml'].forms[0].action = (
            self.URI + Burl.get_host() + self.WebBuffer['lxml'].forms[0].action)
    self.WebBuffer['lxml_result'] = parse(
        submit_form(self.WebBuffer['lxml'].forms[0])).getroot()
def submit(self, form, **kwargs):
    try:
        response = submit_form(form._form, open_http=self._open_http, **kwargs)
        self.contents = response.read()
        self.url = response.url
        self._tree = fromstring(self.contents, base_url=self.url)
        self.info = response.info()
        self.code = 200
        return response
    except urllib2.URLError, e:
        self.code = e.code
        self.contents = ''
def submit_form(self, form=None, url=None, extra_values=None):
    if form is None:
        raise ValueError('form cannot be None; url={}'.format(url))

    def submit(method, form_action_url, values):
        values = dict(values)
        if 'csrfmiddlewaretoken' not in values:
            raise ValueError(
                'Possibly the wrong form. Could not find '
                'csrfmiddlewaretoken: {}'.format(repr(values)))
        with self.client.post(
                url or form_action_url,
                values,
                allow_redirects=False,
                catch_response=True) as response:
            if response.status_code not in (301, 302):
                # This probably means the form failed and is displaying
                # errors.
                # TODO: scrape out the errors.
                response.failure(
                    'Form submission did not redirect; status={}'
                    .format(response.status_code))

    submit_form(form, open_http=submit, extra_values=extra_values)
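# A hedged usage sketch for the helper above, assuming a Locust TaskSet whose
# target renders a Django login form; the URL and field names are illustrative.
from lxml import html

def log_in(self):
    # Fetch the page, parse the form (which carries the csrfmiddlewaretoken),
    # fill in the credentials, and hand the form to the helper above.
    response = self.client.get('/accounts/login/')
    form = html.fromstring(response.text, base_url=response.url).forms[0]
    form.fields['username'] = 'demo'
    form.fields['password'] = 'secret'
    self.submit_form(form=form, url='/accounts/login/')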
def safari_login(user, password):
    """Login to the Safari website to load a session cookie."""
    logging.debug("Connecting to the Safari website at %s." % URL_SAFARI_LOGIN)
    doc = html.fromstring(urllib2.urlopen(URL_SAFARI_LOGIN).read(),
                          base_url=URL_SAFARI_LOGIN)
    login_forms = doc.cssselect('form[name="login"]')
    if not login_forms:
        logging.critical('Unable to find the login form, can\'t continue.')
        raise EOFError
    login_form = login_forms[0]
    login_form.fields['login'] = user
    login_form.fields['password'] = password

    # For the purpose of this script, we assume success, so we don't care
    # about the result. We really should verify this, though.
    logging.info("Logging into the account")
    logging.debug('Submitting login form')
    # lxml uses urllib by default for downloads. Since we've patched it for
    # cookie support, this is sufficient for our needs.
    html.submit_form(login_form)
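# The "patched it for cookie support" comment above refers to code not shown
# in this collection. One common way to do it, sketched under that assumption:
# install a cookie-aware global opener, so every urllib2.urlopen() call --
# including the ones lxml.html.submit_form() makes by default -- shares one
# cookie jar.
import cookielib
import urllib2

cookie_jar = cookielib.CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie_jar))
urllib2.install_opener(opener)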
def run(self, days_back=7):
    url = 'http://www.portsmouthva.gov/ppd/ArrestIncidents/incidentsearch.aspx'
    doc = parse(url).getroot()
    doc = parse(submit_form(doc.forms[0])).getroot()
    rows = doc.cssselect('tr')
    min_date = datetime.date.today() - datetime.timedelta(days_back)
    for row in rows[1:]:
        data = self.parse_row(row, min_date)
        # parse_row will return False if the row is before min_date
        if not data:
            break
        self.save(data)
def submit_form(self): """Submit the form filled with form_data property dict Raise: oct.core.exceptions.NoFormWaiting :return: Response object after the submit """ if not self._form_waiting: raise NoFormWaiting('No form waiting to be send') self.form.fields = self.form_data r = lh.submit_form(self.form, open_http=self._open_session_http) resp = self._parse_html(r) if self._history is not None: self._history.append_item(resp) self._url = resp.url self.form_data = None self.form = None return resp
def make_request():
    login_url = "http://login.weibo.cn/login/"
    response = requests.get(login_url)
    print response

    html_tree = fromstring(response.text.encode(ENCODING))
    form = html_tree.forms[0]

    # extract the password field (the exact field name varies per page load).
    password_field = None
    username_field = b"mobile"
    for key in form.fields.keys():
        if "password" in key:
            password_field = key
            break

    form.fields[username_field] = "*****@*****.**"
    form.fields[password_field] = "huntzhan"

    # fill form.
    encode = lambda x: x.encode(ENCODING)
    login_form = {}
    for key, value in form.form_values():
        print key, value
        login_form[encode(key)] = encode(value)
    # login_form['submit'] = encode("登录")
    form.fields = login_form

    print "####################"
    print login_form
    print login_url + form.action,
    print response.cookies
    print "####################"

    # login_response = requests.post(
    #     login_url,
    #     data=login_form,
    #     headers=response.headers,
    #     cookies=response.cookies,
    # )
    login_response = submit_form(form)
    return login_response
def releasesData(url):
    page = parse(url).getroot()
    result = parse(submit_form(
        page.forms[2],
        extra_values={"j_id82:j_id510": "Release downloads"})).getroot()
    data = [a.attrib["href"]
            for a in result.xpath("//table[@class='releases']/tbody/tr/td[1]/a")]
    print data
    for link in data:
        release_page = parse("http://www.semic.eu/semic/view/Asset/" + link).getroot()
        # Publication date and state
        pub_date_state = [a.text for a in release_page.xpath(
            "//div[@class='releaseDownloads']/div/div[@class='alignLeft widthAuto']")]
        # Release no
        no = [a.text for a in release_page.xpath(
            "//dl[@class='assetDefinition clearfix']/dd[1]")]
        print no[0].replace("\t", "")
        print no[0].replace("\n", "")
        # Documentation language(s)
        lang = [a.text for a in release_page.xpath(
            "//dl[@class='assetDefinition clearfix']/dd[3]/a/span")]
        # Release contents
        release = [a.text for a in release_page.xpath(
            "//dl[@class='assetDefinition clearfix']/dd[4]")]
        # Licence class
        lisence_class = [a.text for a in release_page.xpath(
            "//dl[@class='assetDefinition clearfix']/dd[5]")]
        # Licence
        lisence = [a.text for a in release_page.xpath(
            "//dl[@class='assetDefinition clearfix']/dd[6]")]
        # Notes
        notes = [a.text for a in release_page.xpath(
            "//dl[@class='assetDefinition clearfix']/dd[9]")]
        data = {
            "pub_date_state": pub_date_state[0].replace("\n", "").replace("\t", ""),
            "url": url,
            "number": no[0].replace("\n", "").replace("\t", ""),
            "lang": lang,
            "release": release,
            "lisence_class": lisence_class[0].replace("\n", "").replace("\t", ""),
            "lisence": lisence[0].replace("\n", "").replace("\t", ""),
            "notes": notes[0].replace("\n", "").replace("\t", ""),
        }
        scraperwiki.datastore.save(
            ["pub_date_state", "url", "number", "lang", "release",
             "lisence_class", "lisence", "notes"],
            data,
            table_name="release",
        )
def get_loan_book(self):
    """Download and return the full loan book

    :return: loan book in csv format
    :rtype: str
    """
    logger.debug("GET request URL: %s", self._loanbook_url)
    page = self._session.get(self._loanbook_url)
    self._sleep_if_needed()
    tree = html.fromstring(page.text, base_url=page.url)
    form = tree.forms[0]
    # submit the two following values through the extra_values parameter,
    # as they are not part of the initial form
    values = {"_template$MainControl$Content$MyLoanBookControl$btnDownloadCSV.x": "132",
              "_template$MainControl$Content$MyLoanBookControl$btnDownloadCSV.y": "7"}
    logger.debug("Submit form")
    page = html.submit_form(form, extra_values=values,
                            open_http=self._get_http_helper())
    self._sleep_if_needed()
    return page.text
def get_statement(self, year, month):
    """Download and return the monthly statement for a given period

    :param int year: year for which the statement is required
    :param int month: month within the year for which the statement is required
    :return: statement in csv format
    :rtype: str
    """
    logger.debug("GET request URL: %s", self._statement_url)
    page = self._session.get(self._statement_url)
    self._sleep_if_needed()
    tree = html.fromstring(page.text, base_url=page.url)
    form = tree.forms[0]
    form.fields["date[month]"] = str(month) if type(month) == int else month
    form.fields["date[year]"] = str(year) if type(year) == int else year
    logger.debug("Submit form")
    page = html.submit_form(form, open_http=self._get_http_helper())
    self._sleep_if_needed()
    return page.text
r = tree.xpath('//input[@type="hidden"][@name="id"]/@value')
data['id'] = r[0]
r = tree.xpath('//input[@type="hidden"][@name="di"]/@value')
data['di'] = r[0]
r = tree.xpath('//input[@type="hidden"][@name="verifycode"]/@value')
data['verifycode'] = r[0]
return post(url, data, url)

# Sample of the captcha form this code targets:
#<img src="http://verify.baidu.com/cgi-bin/genimg?405F337975374C617586AF6AA757444CEEA46E9BDF5C13A3BC49F126894C7A587D76771ADF9ADB60A443FB00F7B88833A46703767504" width="120" height="40">
#<form action="http://verify.baidu.com/verify">
#<input type="hidden" name="url" value="http://image.baidu.com/i?tn=baiduimagejson&ct=201326592&cl=2&lm=-1&st=-1&fm=result&fr=&sf=1&fmq=1349413075627_R&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&word=3B大战">
#<input type="hidden" name="vcode" value="405F337975374C617586AF6AA757444CEEA46E9BDF5C13A3BC49F126894C7A587D76771ADF9ADB60A443FB00F7B88833A46703767504">
#<input type="hidden" name="id" value="1380087316">
#<input type="hidden" name="di" value="d026d20ba1148bdb">
#<input type="text" size="6" maxlength="10" name="verifycode" id="kw">
#<input type="submit" value="提交">

if __name__ == "__main__":
    file = open("verify.html")
    xml = ""
    for line in file:
        xml += line
    tree = fromstring(xml)
    #print xml
    # r = tree.xpath('//form')
    form = tree.forms[0]
    print tree.xpath('//div[@id="vf"]/img/@src')[0]
    #submit_form(page.forms[1])
    #print "--" + form.attrib['action']
    verifyCode = raw_input("Enter the verification code: ")
    form.fields["verifycode"] = verifyCode
    print submit_form(form).read()
import requests
from lxml import html

page = html.parse("http://www.suzukimotorcycle.co.in/tab1.aspx").getroot()
form = page.forms[0]
form.fields = {
    "ddlZone": "3",
    "ddlstate": "2",
    "ddlcity": "408",
    "Submit": "Go"
}
page2 = html.parse(html.submit_form(form)).getroot()
print html.tostring(page2)
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# generate start url
from lxml import html, etree
from urllib2 import urlparse
from pprint import pprint

home_url = 'http://bbs.hefei.cc'
get_url = 'search.php?mod=forum'
# the anchor text "按时间排序" means "sort by time"
url_xpath = u'//ul[@class="tabs"]//a[.="按时间排序"]/@href'

dom = html.parse(home_url).getroot()
form = dom.xpath('//form[@id="scbar_form"]')[0]
form.set('action', get_url)
form.set('method', 'post')
form.fields.update({
    'srchtxt': '',
})
dom2 = html.parse(html.submit_form(form)).getroot()
form2 = dom2.xpath('//form[@id="topSearchBar"]')[0]
form2.fields.update({'q': 'hello'})
dom3 = html.parse(html.submit_form(form2)).getroot()
parsed = urlparse.urlparse(get_url)
print '{0.scheme}://{0.netloc}{1}'.format(parsed, dom3.xpath(url_xpath)[0])
states = {
    "name": main_page.xpath("//select[@id='ddlstates']/option/text()"),
    "value": main_page.xpath("//select[@id='ddlstates']/option/@value")
}
states["name"].pop(0)
states["value"].pop(0)
form = main_page.forms[0]
for i in range(len(states["name"])):
    print "*************************************************"
    print "Processing state: " + states["name"][i]
    form.fields["ddlstates"] = states["value"][i]
    citypage = html.parse(html.submit_form(form)).getroot()
    cities = {
        "name": citypage.xpath("//select[@id='ddlcities']/option/text()"),
        "value": citypage.xpath("//select[@id='ddlcities']/option/@value")
    }
    cities["name"].pop(0)
    cities["value"].pop(0)
    for j in range(len(cities["name"])):
        form = citypage.forms[0]
        form.fields["ddlcities"] = cities["value"][j]
        typepage = html.parse(html.submit_form(form)).getroot()
        types = {
            "name": typepage.xpath("//select[@id='ddlnetwork']/option/text()"),
            "value": typepage.xpath("//select[@id='ddlnetwork']/option/@value")
def assetData(url):
    page = parse(url).getroot()
    result = parse(submit_form(page.forms[2],
                               extra_values={'j_id82:j_id131': 'Asset'})).getroot()
    #result = parse(submit_form(page.forms[2], extra_values={'j_id82:j_id510': 'Release downloads'})).getroot()
    print [a.text for a in result.xpath("//a")]
    # asset title
    title = [a.attrib['title']
             for a in result.xpath("//div[@class='aboveTabContent clearfix']/h1")]
    # Asset ID
    asset_id = [a.text for a in result.xpath(
        "//div[@class='alignLeft editAssetButton']/span[1]")]
    # Initial publication
    pub_date = [a.text for a in result.xpath(
        "//div[@class='alignLeft editAssetButton']/span[2]/a")]
    # Last change
    last_change = [a.text for a in result.xpath(
        "//div[@class='alignLeft editAssetButton']/span[3]")]
    # spec
    spec = [a.text for a in result.xpath(
        "//div[@class='noedit']/p[@class='specTabText']")]
    # Represented countries
    country = [a.text for a in result.xpath(
        "//dl[@class='assetDefinition clearfix']/dd[1]/a/span")]
    # Keywords
    tags = [a.text for a in result.xpath(
        "//dl[@class='assetDefinition clearfix']/dd[2]/a/span")]
    # Domains
    domain = [a.text for a in result.xpath(
        "//dl[@class='assetDefinition clearfix']/dd[3]/a/span")]
    # Related assets
    rel_col1 = [a.text for a in result.xpath("//dd[@class='relatedAssets']/div[1]")]
    rel_name1 = [a.attrib['href']
                 for a in result.xpath("//dd[@class='relatedAssets']/div[1]/a")]
    rel_col2 = [a.text for a in result.xpath("//dd[@class='relatedAssets']/div[2]")]
    rel_name2 = [a.attrib['href']
                 for a in result.xpath("//dd[@class='relatedAssets']/div[2]/a")]
    rel_col3 = [a.text for a in result.xpath("//dd[@class='relatedAssets']/div[3]")]
    rel_name3 = [a.attrib['href']
                 for a in result.xpath("//dd[@class='relatedAssets']/div[3]/a")]
    rel_col4 = [a.text for a in result.xpath("//dd[@class='relatedAssets']/div[4]")]
    rel_name4 = [a.attrib['href']
                 for a in result.xpath("//dd[@class='relatedAssets']/div[4]/a")]
    rel_col5 = [a.text for a in result.xpath("//dd[@class='relatedAssets']/div[5]")]
    rel_name5 = [a.attrib['href']
                 for a in result.xpath("//dd[@class='relatedAssets']/div[5]/a")]
    rel_col6 = [a.text for a in result.xpath("//dd[@class='relatedAssets']/div[6]")]
    rel_name6 = [a.attrib['href']
                 for a in result.xpath("//dd[@class='relatedAssets']/div[6]/a")]
    # status
    states = [a.attrib['alt'] for a in result.xpath(
        "//div[@class='states']/img[@alt!='This state is in the past']")]
    # Related projects
    rel_projects = [a.text for a in result.xpath(
        "//dl[@class='assetDefinition clearfix']/dd[5]/div/a")]
    # Provided by / asset agent
    agent = [a.attrib['href'] for a in result.xpath(
        "//dl[@class='assetDefinition clearfix']/dd[6]/div/div/a")]
    # description
    descr = [a.text for a in result.xpath("//div[@class='noedit']/p[2]")]
    # Asset owner
    owner = [a.text for a in result.xpath(
        "//dl[@class='assetDefinition clearfix']/dd[6]/div[2]")]
    print owner
    data = {
        "name": title[0].replace('\n', '').replace('\t', ''),
        "url": url,
        "asset_id": asset_id[0].replace('\n', '').replace('\t', ''),
        "pub_date": pub_date[0].replace('\t', '').replace('\n', ''),
        "last_change": last_change[0].replace('\t', '').replace('\n', ''),
        "country": country,
        "tags": tags,
        "domain": domain,
        "descr": descr,
        "rel_projects": rel_projects,
        "agent": agent,
        "owner": owner,
        "state": states[0],
        "rel_col1": rel_col1,
        "rel_col2": rel_col2,
        "rel_col3": rel_col3,
        "rel_col4": rel_col4,
        "rel_col5": rel_col5,
        "rel_col6": rel_col6,
        "rel_name1": rel_name1,
        "rel_name2": rel_name2,
        "rel_name3": rel_name3,
        "rel_name4": rel_name4,
        "rel_name5": rel_name5,
        "rel_name6": rel_name6
    }
    scraperwiki.datastore.save(["url"], data, table_name="Asset")
def submit(step):
    world.dom.forms[0].action = 'http://localhost:8000/login/'
    world.dom = fromstring(submit_form(world.dom.forms[0]).read())
    print world.dom
def releasesData(url):
    page = parse(url).getroot()
    result2 = parse(submit_form(
        page.forms[2],
        extra_values={'j_id82:j_id510': 'Release downloads'})).getroot()
    result3 = [a.attrib['href'] for a in result2.xpath("//a[@id='j_id82:j_id636']")]
    if not result3:
        print 'empty'
    else:
        print result3[0]
        result = parse('http://www.semic.eu' + result3[0]).getroot()
        data = [a.attrib['href'] for a in result.xpath(
            "//table[@class='releases']/tbody/tr/td[1]/a")]
        print data
        for link in data:
            release_page = parse('http://www.semic.eu/semic/view/Asset/' + link).getroot()
            # Publication date and state
            pub_date_state = [a.text for a in release_page.xpath(
                "//div[@class='releaseDownloads']/div/div[@class='alignLeft widthAuto']")]
            if not pub_date_state:
                pub_date_state = []
            # Release no
            no = [a.text for a in release_page.xpath(
                "//dl[@class='assetDefinition clearfix']/dd[1]")]
            if not no:
                no = []
            # Documentation language(s)
            lang = [a.text for a in release_page.xpath(
                "//dl[@class='assetDefinition clearfix']/dd[3]/a/span")]
            if not lang:
                lang = []
            # Release contents
            release = [a.text for a in release_page.xpath(
                "//dl[@class='assetDefinition clearfix']/dd[4]")]
            # Licence class
            lisence_class = [a.text for a in release_page.xpath(
                "//dl[@class='assetDefinition clearfix']/dd[5]")]
            if not lisence_class:
                lisence_class = []
            # Licence
            lisence = [a.text for a in release_page.xpath(
                "//dl[@class='assetDefinition clearfix']/dd[6]")]
            if not lisence:
                lisence = []
            # Notes
            notes = [a.text for a in release_page.xpath(
                "//dl[@class='assetDefinition clearfix']/dd[9]")]
            if not notes:
                notes = []
            data = {"link": link}
            data["pub_date_state"] = pub_date_state[0].replace('\n\t\t\t\t\t\t\t', '')
            data["url"] = url
            data["number"] = no[0].replace('\n\t\t\t\t\t\t\t', '')
            data["lang"] = lang
            data["lisence_class"] = lisence_class[0].replace('\n\t\t\t\t\t\t\t', '')
            data["lisence"] = lisence[0].replace('\n\t\t\t\t\t\t\t', '')
            data["notes"] = notes[0].replace('\n\t\t\t\t\t\t\t', '')
            print release
            if not release:
                release = []
            else:
                for kinds in release:
                    kindarr = kinds.split(',')
                    for kind in kindarr:
                        print kind
                        if kind.find('Other') != -1:
                            data["content"] = kind
                        if (kind.find('UML') != -1 or kind.find('UMM') != -1
                                or kind.find('BPMN') != -1 or kind.find('FMC') != -1):
                            data["Model"] = kind
                        if (kind.find('XML Schema') != -1 or kind.find('Relax NG') != -1
                                or kind.find('Schematron') != -1 or kind.find('WSDL') != -1
                                or kind.find('ebXML Process Def') != -1):
                            data["Syntax"] = kind
                        if (kind.find('Codelists') != -1 or kind.find('Mappings') != -1
                                or kind.find('Taxonomy') != -1 or kind.find('Ontology') != -1):
                            data["Semantic"] = kind
                        if kind.find('Core Components') != -1:
                            data["Abstract"] = kind
            print 'url :' + url
            if not data:
                print 'data is empty'
            else:
                print data
            scraperwiki.datastore.save(["link"], data, table_name="Release")
#!/usr/bin/env python3
from lxml.html import fromstring, tostring
from lxml.html import parse, submit_form
import requests

response = requests.get('https://duckduckgo.com')
form_page = fromstring(response.text)
form = form_page.forms[0]
print(tostring(form))

page = parse('http://duckduckgo.com').getroot()
page.forms[0].fields['q'] = 'python'
result = parse(submit_form(page.forms[0])).getroot()
print(tostring(result))
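# Variant sketch of the same search, routed through a requests.Session instead
# of lxml's default urllib opener by supplying an open_http callback. Assumes,
# as above, that DuckDuckGo's first form is the search form.
import requests
from lxml.html import fromstring, submit_form

session = requests.Session()

def open_http(method, url, values):
    # lxml passes method as 'GET'/'POST' and values as (name, value) pairs.
    if method.lower() == 'post':
        return session.post(url, data=values)
    return session.get(url, params=values)

start = session.get('https://duckduckgo.com')
page = fromstring(start.text, base_url=start.url)
page.forms[0].fields['q'] = 'python'
result = submit_form(page.forms[0], open_http=open_http)
print(result.url)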
def assetData(url):
    page = parse(url).getroot()
    result = parse(submit_form(page.forms[2],
                               extra_values={'j_id82:j_id131': 'Asset'})).getroot()
    #result = parse(submit_form(page.forms[2], extra_values={'j_id82:j_id510': 'Release downloads'})).getroot()
    print [a.text for a in result.xpath("//a")]
    # asset title
    title = [a.attrib['title']
             for a in result.xpath("//div[@class='aboveTabContent clearfix']/h1")]
    # Asset ID
    asset_id = [a.text for a in result.xpath(
        "//div[@class='alignLeft editAssetButton']/span[1]")]
    # Initial publication
    pub_date = [a.text for a in result.xpath(
        "//div[@class='alignLeft editAssetButton']/span[2]/a")]
    # Last change
    last_change = [a.text for a in result.xpath(
        "//div[@class='alignLeft editAssetButton']/span[3]")]
    # spec
    spec = [a.text for a in result.xpath(
        "//div[@class='noedit']/p[@class='specTabText']")]
    # Represented countries
    country = [a.text for a in result.xpath(
        "//dl[@class='assetDefinition clearfix']/dd[1]/a/span")]
    # Keywords
    tags = [a.text for a in result.xpath(
        "//dl[@class='assetDefinition clearfix']/dd[2]/a/span")]
    # Domains
    domain = [a.text for a in result.xpath(
        "//dl[@class='assetDefinition clearfix']/dd[3]/a/span")]
    # Related assets
    print [a.text for a in result.xpath("//dd[@class='relatedAssets']/div[1]")]
    print [a.text for a in result.xpath("//dd[@class='relatedAssets']/div[1]/a")]
    print [a.text for a in result.xpath("//dd[@class='relatedAssets']/div[2]")]
    print [a.text for a in result.xpath("//dd[@class='relatedAssets']/div[2]/a")]
    print [a.text for a in result.xpath("//dd[@class='relatedAssets']/div[3]")]
    print [a.text for a in result.xpath("//dd[@class='relatedAssets']/div[3]/a")]
    print [a.text for a in result.xpath("//dd[@class='relatedAssets']/div[4]")]
    print [a.text for a in result.xpath("//dd[@class='relatedAssets']/div[4]/a")]
    print [a.text for a in result.xpath("//dd[@class='relatedAssets']/div[5]")]
    print [a.text for a in result.xpath("//dd[@class='relatedAssets']/div[5]/a")]
    print [a.text for a in result.xpath("//dd[@class='relatedAssets']/div[6]")]
    print [a.text for a in result.xpath("//dd[@class='relatedAssets']/div[6]/a")]
    # Related projects
    rel_projects = [a.text for a in result.xpath(
        "//dl[@class='assetDefinition clearfix']/dd[5]/div/a")]
    # Provided by / asset agent
    agent = [a.text for a in result.xpath(
        "//dl[@class='assetDefinition clearfix']/dd[6]/div/div/a")]
    # Asset owner
    owner = [a.text for a in result.xpath(
        "//dl[@class='assetDefinition clearfix']/dd[6]/div[2]")]
    data = {
        "name": title[0].replace('\n', '').replace('\t', ''),
        "url": url,
        "asset_id": asset_id[0].replace('\n', '').replace('\t', ''),
        "pub_date": pub_date[0].replace('\t', '').replace('\n', ''),
        "last_change": last_change[0].replace('\t', '').replace('\n', ''),
        "country": country,
        "tags": tags,
        "domain": domain,
        "rel_projects": rel_projects,
        "agent": agent,
        "owner": owner
    }
    scraperwiki.datastore.save(
        ["name", "url", "asset_id", "pub_date", "last_change", "country",
         "tags", "domain", "rel_projects", "agent", "owner"],
        data, table_name="Asset")
option = parser.add_mutually_exclusive_group()
parser.add_argument("movie", type=str)
option.add_argument("-n", "--new", action="store_true",
                    help="Store the plot in a new epub file")
option.add_argument("-a", "--app", action="store_true",
                    help="Append to an existing epub file")
args = parser.parse_args()

reAnchor = re.compile("<a.+?>")
reTable = re.compile("<table[\w\W]+<\/table>")

schpage = urlopen("https://www.wikipedia.org")
schpage = html.parse(schpage).getroot()
schpage.forms[0].fields['search'] = args.movie
wiki = html.parse(html.submit_form(schpage.forms[0])).getroot()
print "Movie plot obtained"

# getting the movie title
ttlcode = wiki.xpath("//h1")[0]
soup = html.tostring(ttlcode)
try:
    title = ttlcode.xpath("i/text()")[0]
except IndexError:
    sys.exit("Error: No such movie found. Try adding '(film)' to the end.")

# Getting the plot paragraphs
renodes = wiki.xpath("//h2[span='Plot']/following-sibling::*")
for renode in renodes:
    if renode.tag != 'h2':
        soup += html.tostring(renode)
def fetch(self, url, data=None, headers={}, referer=True, cache_control=None,
          retries=None):
    '''
    Fetches the data at the given url.

    If ``data`` is ``None``, we use a GET request, otherwise, we use a POST
    request. ``data`` can either be a dictionary or a urlencoded string, or
    an ordered list of two-tuples [('key1', 'value1'), ('key2', 'value2'), ...]

    If ``referer`` is False or empty, we don't send the http referer header;
    otherwise, we send the specified referer. If ``referer`` is True, we try
    to figure out the correct referer from the ``url``.

    If a ``cache_control`` parameter is specified, only cached pages with
    the same cache_control will be used.

    If a ``retries`` integer argument is specified, page fetches will be
    retried ``retries`` times on page-load errors.
    '''
    if self.cache_control and not cache_control:
        cache_control = self.cache_control.next()

    # determine the correct referer to use
    if referer is True:  # yes, this is right
        # try to determine the correct referer from the url
        if hasattr(url, 'base_url'):
            referer = url.base_url
        elif (hasattr(url, 'getparent')
              and hasattr(url.getparent(), 'base_url')):
            referer = url.getparent().base_url
        else:
            # nope... can't determine referer from url
            referer = None
    if referer:
        headers['Referer'] = referer

    if retries is None:
        retries = self.retries

    # if we are passed a 'form' object instead of a url, submit the form!
    if not isinstance(url, basestring):
        from lxml import html  # don't import lxml.html until we need it!
        if isinstance(url, html.FormElement):
            # IMPORTANT: if any arguments are added to the fetch()
            # function, then they must be added here, as well!!!
            http = partial(self._open_http, headers=headers, referer=referer,
                           cache_control=cache_control, retries=retries)
            return html.submit_form(url, extra_values=data, open_http=http)
        else:
            raise ValueError('Crawler.fetch expects url of type '
                             '<basestring> or <FormElement>. Got: %s'
                             % type(url))

    # the post-data needs to be url-encoded if it isn't a string
    if data is not None and not isinstance(data, basestring):
        data = urlencode(data, doseq=1)

    # alright, we're ready to download the page!
    request = urllib2.Request(url, data=data, headers=headers)
    if cache_control is not None:
        request.cache_control = str(cache_control)

    # download multiple times in case of url-errors...
    import time
    import random
    error = None
    for retry in xrange(retries + 1):
        try:
            result = self.opener.open(request)
            result.__enter__ = lambda: result
            result.__exit__ = lambda x, y, z: result.close()
            return result
        except urllib2.HTTPError, e:
            if 500 <= e.code < 600:
                # if many errors happen, retain the first one
                print 'passing_HTTP_ERROR:' + url + ':retry:' + str(retry)
                error = error or e
                time.sleep(5 * 2 ** min(retry, 8) * random.random())
            else:
                raise
        except urllib2.URLError, e:
            # check whether we should re-try fetching the page
            if e.reason.strerror not in ('Connection refused',):
                # don't retry downloading the page if a non-http error happened
                raise e
            else:
                # if many errors happen, retain the first one
                print 'passing_CONNECTION_ERROR:' + url + ':retry:' + str(retry)
                time.sleep(5 * 2 ** min(retry, 8) * random.random())
                error = error or e
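# Hypothetical usage of the fetch() dispatch above (Crawler construction and
# the URL/field names are illustrative): passing a FormElement instead of a
# url string takes the submit_form branch, with `data` forwarded as
# extra_values.
from lxml import html

crawler = Crawler()
page = html.parse(crawler.fetch('http://example.com/login')).getroot()
form = page.forms[0]
form.fields['user'] = 'demo'
form.fields['password'] = 'secret'
result = crawler.fetch(form, data={'remember': 'on'})
print result.read()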
soup = BeautifulSoup(response.text)
links = soup.findAll('input', {'id': 'mail'})
temp_mail_id = links[0].get('value')
return temp_mail_id

#def account_creator():
E_MAIL = email_id_fetcher()
resp_signup = requests.get(twitter_url)
tree = html.fromstring(resp_signup.text)

# form submission: submit_form returns the response of the signup POST
# directly, so it must not be wrapped in another requests.get() call.
links_forms = tree.forms[3]
links_forms.fields['user[name]'] = 'tony.blair.righthand1'
links_forms.fields['user[user_password]'] = 'Great_wall_2'
links_forms.fields['user[email]'] = E_MAIL
new_response = submit_form(tree.forms[3])
##links_forms.fields = {
##    'user[name]': 'tony.blair.righthand1',
##    'user[user_password]': 'Great_wall_2',
##    'user[email]': bytes(E_MAIL),
####    'authenticity_token': tree.forms[3].fields['authenticity_token']
##}
states = []
districts = []
pumpdump = []
jsonfile = open("address2.json", 'w')

opener = urlopen("https://www.iocl.com/Retails.aspx")
mainpage = html.parse(opener).getroot()
states = mainpage.xpath("//select[@id='cmbState']/option/@value")
states.pop(0)
for state in states:
    mainpage.forms[0].fields['cmbState'] = state
    distpage = html.parse(html.submit_form(mainpage.forms[0])).getroot()
    print "Processing state: " + state
    districts = distpage.xpath("//select[@id='cmbDistrict']/option/@value")
    districts.pop(0)
    for district in districts:
        distpage.forms[0].fields['cmbDistrict'] = district
        try:
            resultpage = html.parse(html.submit_form(distpage.forms[0])).getroot()
        except UnicodeEncodeError:
            print "!!!!!ERROR!!!!!" + state + district
            break
        for row in range(1, 50):
            pump = Address()