def parse_from(self, response):
    selector = Selector(response)
    stop_ids = selector.css(
        '#confirm1_ddlTravellingTo option[value!="-1"]::attr(value)').extract()
    for stop_id in stop_ids:
        request = FormRequest.from_response(
            response,
            formname='ctl01',
            formdata={
                'confirm1$ddlLeavingFromMap': response.meta['stop_from'],
                'confirm1$ddlTravellingTo': stop_id,
                'confirm1$btnSearch': 'Search',
            },
            callback=self.parse_to_stop)
        request.meta['stop_to'] = stop_id
        request.meta['stop_from'] = response.meta['stop_from']
        yield request
    # Repeat once more to get the first stop (from); note that this
    # reuses the last stop_id left over from the loop above.
    request = FormRequest.from_response(
        response,
        formname='ctl01',
        formdata={
            'confirm1$ddlLeavingFromMap': response.meta['stop_from'],
            'confirm1$ddlTravellingTo': stop_id,
            'confirm1$btnSearch': 'Search',
        },
        callback=self.parse_from_stop)
    request.meta['stop_to'] = stop_id
    request.meta['stop_from'] = response.meta['stop_from']
    yield request
def listing_parser(self, response):
    """Given a paginated listing of alumni, parse the members on each
    page and queue the next set of pages."""
    x = HtmlXPathSelector(response)
    # Numbered page links, if any
    pages_hrefs = [i.extract().split("'")[1] for i in
                   x.select("//div[contains(@class,'rgNumPart')]/a/@href")]
    pages = x.select("//div[contains(@class,'rgNumPart')]/a")
    requests = []
    # If the first link is "Previous Pages", drop it
    if "Previous Pages" in pages[0].extract():
        del pages_hrefs[0]
    # If there is a "Next Pages" link, pop it and rerun listing_parser on it
    if "Next Pages" in pages[-1].extract():
        requests.append(FormRequest.from_response(
            response,
            formdata={'__EVENTTARGET': pages_hrefs.pop()},
            callback=self.listing_parser))
    # Parse each page, including the first, with the scraper
    for href in pages_hrefs:
        requests.append(FormRequest.from_response(
            response,
            formdata={'__EVENTTARGET': href},
            callback=self.listing_scraper))
    return requests
def parse_study(self, response):
    study = response.meta['study']
    areaId = response.meta['areaId']
    sel_options = response.xpath('//select[@name="disciplineId"]/option')
    study_desc = response.xpath(
        '//p[@class="tablehead"]/following::p/text()')[0].extract()
    discipline_form_requests = []
    item = StudyItem()
    item['name'] = study
    item['description'] = study_desc
    for sel in sel_options[1:]:  # skip the placeholder option
        discipline = sel.xpath('text()').extract()[0]
        disciplineId = sel.xpath('@value').extract()[0]
        discipline_form_request = FormRequest(
            url='http://www2.assist.org/exploring-majors/findDiscipline.do',
            formdata={'areaId': str(areaId), 'disciplineId': str(disciplineId)},
            callback=self.parse_discipline)
        discipline_form_request.meta['study'] = study
        discipline_form_request.meta['discipline'] = discipline
        discipline_form_requests.append(discipline_form_request)
    return discipline_form_requests + [item]
def parse_item(self, response):
    hxs = HtmlXPathSelector(response)
    items = hxs.select("//div[@id='ProductDetails']/div[@class='BlockContent']")
    for item in items:
        title = item.select("h2/text()").extract()[0]
        url = response.url
        product_id = item.select(
            "div[@class='ProductMain']/div[@class='productAddToCartRight']/"
            "form[@id='productDetailsAddToCartForm']/input[@name='product_id']/@value"
        ).extract()[0]
        select_el = item.select(
            "div[@class='ProductMain']/div[@class='productAddToCartRight']/"
            "form[@id='productDetailsAddToCartForm']/div[@class='ProductDetailsGrid ProductAddToCart']/"
            "div[@class='productAttributeList']/div/"
            "div[@class='productAttributeValue']/div[@class='productOptionViewSelect']/select")
        field_name = select_el.select("@name").extract()[0]
        options = select_el.select('option')
        for option in options:
            option_name = option.select("text()").extract()[0]
            option_value = option.select("@value").extract()[0]
            if not option_value:
                continue
            item_options = json_api_request_args.copy()
            item_options[field_name] = option_value
            item_options['product_id'] = product_id
            new_item_name = title + " " + option_name
            request = FormRequest(
                url=json_api_url,
                formdata=item_options,
                callback=self._parse_item_json
            )
            request.meta['item_name'] = new_item_name
            request.meta['item_url'] = url
            request.meta['subtype_id'] = option_value
            yield request
def build_form_request(self, search_term, formdata):
    form_request = FormRequest(
        "https://searchwww.sec.gov/EDGARFSClient/jsp/EDGAR_MainAccess.jsp",
        callback=self.parse_search_results_follow_next_page,
        formdata=formdata)
    form_request.meta['search_term'] = search_term
    form_request.meta['page_num'] = 1
    return form_request
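A minimal sketch of how this builder might be driven, assuming the spider keeps a `search_terms` list and builds per-term form data; the attribute and field names here are hypothetical, not from the source:

def start_requests(self):
    # Hypothetical driver: one EDGAR full-text search request per term.
    for term in self.search_terms:       # assumed spider attribute
        formdata = {'search_text': term}  # assumed form field name
        yield self.build_form_request(term, formdata)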
def parse(self, response):
    # Delete stale output files from previous runs
    try:
        if os.path.exists("newPost.txt"):
            os.remove("newPost.txt")
        if os.path.exists("newgetLinks.txt"):
            os.remove("newgetLinks.txt")
        if os.path.exists("scrappedurls.txt"):
            os.remove("scrappedurls.txt")
    except OSError:
        pass
    # print("Status:", response.status)
    # print("Request Headers")
    # print(response.request.headers.items())
    # print("Response Headers")
    # print(response.headers.items())
    login_user = self.credentials[response.request.url][0]
    print(login_user)
    login_pass = self.credentials[response.request.url][1]
    print(login_pass)
    args, url, method, name, number = fill_login_form(
        response.url, response.body, login_user, login_pass)
    if name:
        yield FormRequest.from_response(
            response,
            method=method,
            formdata=args,
            formname=name,
            callback=self.after_login
        )
    else:
        yield FormRequest.from_response(
            response,
            method=method,
            formdata=args,
            formnumber=number,
            callback=self.after_login
        )
def parse(self, response):
    args, url, method, name, number = fill_login_form(
        response.url, response.body, self.login_user, self.login_pass)
    credentials = list()
    tmpparam = dict()
    print(args)
    # Guess which form fields hold the username and password
    for a in args:
        if a[0].find("user") > -1 or a[0].find("admin") > -1:
            tmpparam["userid"] = a[0]
        if a[0].find("password") > -1:
            tmpparam["passwordid"] = a[0]
    # tmpparam["submit"] = "submit"
    tmpparam["url"] = self.start_urls[0]
    tmpparam["login"] = ""
    credentials.append(tmpparam)
    with open("json/credentials.json", 'w') as f:
        f.write(json.dumps(credentials, indent=4, sort_keys=True))
    if name:
        yield FormRequest.from_response(
            response, method=method, formdata=args, formname=name,
            dont_filter=True, callback=self.after_login)
    else:
        yield FormRequest.from_response(
            response, method=method, formdata=args, formnumber=number,
            dont_filter=True, callback=self.after_login)
def start_requests(self):
    scraperwiki.sql.execute('DROP TABLE IF EXISTS subsidaries')
    for i in range(1, 16000):
        uid = '91/{:05d}'.format(i)
        request = FormRequest(
            'https://www.ird.gov.hk/charity/view_detail.php',
            formdata={'org_id': uid},
            callback=self.parse_page)
        request.meta['uid'] = uid
        yield request
def parse_start_url(self, response):
    base_data = {
        'name': '',
        'country': '',
        '_state': '1',
        'language': '0',
        'qualification': '0--',
        'level': '0',
        'speciality': '0',
        '_isTrainer': 'on'
    }
    # Build one search request per profession
    requests = []
    for profession in PROFESSIONS:
        data = base_data.copy()
        data['profession'] = str(profession)
        form_request = FormRequest.from_response(response, formdata=data)
        requests.append(form_request)
    return requests
def parseVoteSearchResults(self, response):
    sel = Selector(response)
    validationstr = self.getValidationString(response)
    # Parse the first page of results
    for voteItem in self.parseVoteTableResults(response):
        yield voteItem
    pages = 1
    # Grab the vote table
    voteTable = sel.css('#SelectVoteHistoryView_GridView1')
    rows = voteTable.css('tr')
    # The last row contains the page links
    paginationRow = rows[-1]
    firstCellElement = paginationRow.css('td>span::text')
    if not firstCellElement:
        # Can't find the navigation bar
        return
    firstCellContent = firstCellElement.extract()[0]
    # Check if there are any pages
    if not str(firstCellContent).isdigit():
        return
    thisPage = int(firstCellContent) - 1
    # The last cell contains some js to skip to the final page
    lastCell = paginationRow.css('td')[-1]
    lastPageElem = lastCell.css('a::attr(onclick)')
    if len(lastPageElem) != 1:
        # We're on the last set of pages, so there's no last-page link
        return
    lastPageLink = lastPageElem[0].extract()
    # Pull the page number out of something like:
    #   javascript:__gvSelectVoteHistoryView_GridView1.callback("69|0|/wFlpZg1Fw7NYQJzL0vuw+Y57ZPx1ug=|"); return false;
    # It's the first value in the param string.
    # TODO: Error handling...
    pages = int(re.split(r'"|\|', lastPageLink)[1]) + 1
    # Only iterate over pages if this is a new table
    if response.meta['fetch']:
        for i in range(thisPage, min([pages, thisPage + 11])):
            fetch = (i == thisPage + 10)
            if validationstr:
                # Having a validationstr means we need a surrogate form
                # (this is a callback)
                form = response.meta['form']
                yield FormRequest.from_response(
                    form,
                    callback=self.parseVoteSearchResults,
                    formdata={
                        '__CALLBACKID': 'SelectVoteHistoryView$GridView1',
                        '__CALLBACKPARAM': str(i) + '|0|/wFlpZg1Fw7NYQJzL0vuw+Y57ZPx1ug=||' + str(thisPage) + '|0|/wFlpZg1Fw7NYQJzL0vuw+Y57ZPx1ug=|/wFk7SDVa18iduwa7ivhFHIM55t+AhI=',
                        '__EVENTVALIDATION': validationstr},
                    meta={'fetch': fetch, 'form': form})
            else:
                # Should only get here on the first page
                yield FormRequest.from_response(
                    response,
                    callback=self.parseVoteSearchResults,
                    formdata={
                        '__CALLBACKID': 'SelectVoteHistoryView$GridView1',
                        '__CALLBACKPARAM': str(i) + '|0|/wFlpZg1Fw7NYQJzL0vuw+Y57ZPx1ug=||' + str(thisPage) + '|0|/wFlpZg1Fw7NYQJzL0vuw+Y57ZPx1ug=|/wFk7SDVa18iduwa7ivhFHIM55t+AhI='},
                    meta={'fetch': fetch, 'form': response})
def search(self, response):
    # Fill in the search form with the next field/value pair and submit;
    # recurse through self.search until one pair remains.
    if 'data_form' in response.meta:
        data_form = response.meta['data_form']
    else:
        data_form = self.get_input_data(self.input_file)
    if not data_form:
        return
    elif len(data_form) == 1:
        key, value = data_form.popitem()
        return FormRequest.from_response(
            response,
            formdata={"newNtk": key, "newNtt": value},
            callback=self.parse_results)
    else:
        key, value = data_form.popitem()
        form_request = FormRequest.from_response(
            response,
            formdata={"newNtk": key, "newNtt": value},
            callback=self.search)
        form_request.meta['data_form'] = data_form
        return form_request
def parse(self, response):
    state_codes = ['AZ', 'TX']
    for state in state_codes:
        request = FormRequest(
            url='http://nssf.org/retailers/find/index.cfm',
            formdata={'txtState': state, 'Submit': 'Submit'},
            callback=self.result_page)
        request.meta['state'] = state
        yield request
def parse(self, response):
    args, url, method, name, number = fill_login_form(
        response.url, response.body, self.login_user, self.login_pass)
    if name:
        yield FormRequest.from_response(
            response, method=method, formdata=args, formname=name,
            dont_filter=True, callback=self.after_login)
    else:
        yield FormRequest.from_response(
            response, method=method, formdata=args, formnumber=number,
            dont_filter=True, callback=self.after_login)
def parse(self, response):
    for n in range(2006, 2016):
        for y in range(1, 13):
            # 'tijiao' is the submit button ("统计", i.e. "tally")
            formdata = {'nian': str(n), 'yue1': str(y), 'yue2': str(y),
                        'tijiao': '统计'}
            req = FormRequest(url='http://www.qyfgj.cn/gz/ti_result.asp',
                              formdata=formdata, callback=self.parseData)
            req.meta['date'] = datetime(n, y, 1)
            yield req
def test_from_response_override_method(self):
    response = _buildresponse(
        '''<html><body>
        <form action="/app"></form>
        </body></html>''')
    request = FormRequest.from_response(response)
    self.assertEqual(request.method, 'GET')
    request = FormRequest.from_response(response, method='POST')
    self.assertEqual(request.method, 'POST')
def start_requests(self):
    for i, id in enumerate(self.items):
        # if i > 9: break
        url = self.link_template + id
        request = FormRequest(url, headers=self.headers,
                              cookies=self.cookies,
                              callback=self.parse_item)
        request.meta['id'] = id
        yield request
def test_from_response_override_method(self):
    response = _buildresponse(
        """<html><body>
        <form action="/app"></form>
        </body></html>"""
    )
    request = FormRequest.from_response(response)
    self.assertEqual(request.method, "GET")
    request = FormRequest.from_response(response, method="POST")
    self.assertEqual(request.method, "POST")
def __init__(self, url=None, callback=None, method=None, formdata=None,
             body=None, **kwargs):
    # First init FormRequest to get url, body and method
    if formdata:
        FormRequest.__init__(self, url=url, method=method, formdata=formdata)
        url, method, body = self.url, self.method, self.body
    # Then pass all other kwargs to SplashRequest
    SplashRequest.__init__(
        self, url=url, callback=callback, method=method, body=body, **kwargs)
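This __init__ matches the shape of scrapy-splash's SplashFormRequest, which lets a form submission be rendered through a Splash service. Assuming that class context, usage looks like a plain FormRequest; the URL and field names below are placeholders:

from scrapy_splash import SplashFormRequest

def parse_login(self, response):
    # Placeholder URL and credentials; real form fields depend on the site.
    yield SplashFormRequest('http://example.com/login',
                            formdata={'user': 'u', 'pass': 'p'},
                            callback=self.after_login)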
def generate_new_category_request(self, index):
    if index < len(self.values):
        request = FormRequest(
            url=self.result_url.replace("results", "refine_results"),
            formdata={'profile[electricity_plan_type]': str(self.values[index]),
                      'profile[discounts][EP]': '1',
                      'profile[discounts][PP]': '1'},
            callback=self.step_results,
            dont_filter=True)
        request.meta['next'] = index + 1
        return request
    return None
def test_from_response_override_url(self):
    response = _buildresponse(
        '''<html><body>
        <form action="/app"></form>
        </body></html>''')
    request = FormRequest.from_response(response)
    self.assertEqual(request.url, 'http://example.com/app')
    request = FormRequest.from_response(response, url='http://foo.bar/absolute')
    self.assertEqual(request.url, 'http://foo.bar/absolute')
    request = FormRequest.from_response(response, url='/relative')
    self.assertEqual(request.url, 'http://example.com/relative')
def get_media_requests(self, item, info):
    if item['attach']:
        log.msg('-----++++++++++++++')
        if item['attach'][0] == 'post':
            req = FormRequest(
                url=item['attach'][1],
                formdata=item['attach'][2]
            )
            req.meta['item'] = item
            return [req]
        # yield Request(link)
    return
def parse_zhaopin(self, response):
    sel = Selector(response)
    regex_rule = r'http://www.lagou.com/zhaopin/(.*?)/'
    result = re.match(regex_rule, response.url)
    formdata = {'first': 'false', 'pg': str(1)}
    if result:
        formdata['kd'] = result.group(1)
    request = FormRequest(url=self.json_url, method='POST',
                          formdata=formdata, callback=self.parse_first_json)
    # Assumes the URL matched; otherwise 'kd' is missing here
    request.meta['kd'] = formdata['kd']
    return request
def parse_browse(self, response):
    sel_options = response.xpath('//select[@name="areaId"]/option')
    study_form_requests = []
    for sel in sel_options[1:]:  # skip the placeholder option
        study = sel.xpath('text()').extract()[0]
        areaId = sel.xpath('@value').extract()[0]
        study_form_request = FormRequest(
            url='http://www2.assist.org/exploring-majors/findAreaOfStudyOverview.do',
            formdata={'areaId': str(areaId)},
            callback=self.parse_study)
        study_form_request.meta['study'] = study
        study_form_request.meta['areaId'] = areaId
        study_form_requests.append(study_form_request)
    return study_form_requests
def parse(self, response):
    # `data` is assumed to be defined elsewhere (e.g. at module level);
    # see the sketch below.
    return FormRequest.from_response(
        response,
        formdata=data,
        method="GET",
        callback=self.search_result
    )
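The `data` dict above is not defined in the snippet; a hypothetical definition for a simple keyword search form might look like this (the field names are placeholders, not from the source):

# Hypothetical module-level search parameters; with method="GET",
# FormRequest encodes these into the URL query string.
data = {
    'q': 'scrapy',
    'page': '1',
}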
def login(self, response):
    with open(self.login_path, encoding='utf8') as f:
        login_data = json.load(f)
    yield FormRequest.from_response(response,
                                    formdata=login_data,
                                    formid='login-nav',
                                    callback=self.check_login)
def after_login(self, response):
    if 'BIN Lookup' in response.body:
        for binnum in self.bins:
            binnum = binnum.strip()
            yield FormRequest.from_response(
                response,
                formdata={'search': binnum},
                callback=self.got_bin)
def parse(self, response):
    if 'login' in response.url:
        return [FormRequest.from_response(
            response,
            formdata={'j_username': self.username,
                      'j_password': self.password},
            callback=self.after_login)]
def login(self, response):
    self._log_page(response, 'amazon_login.html')
    # Submit the login form; on success continue to parse_item
    return [FormRequest.from_response(response,
                                      formdata=self.formdata,
                                      headers=self.headers,
                                      meta={'cookiejar': response.meta['cookiejar']},
                                      callback=self.parse_item)]
def parse(self, response):
    yield FormRequest.from_response(
        response,
        formname='frmConsulta',
        formdata={'data': '(@DTDE >="' + self.iDate + '") E (@DTDE <="' + self.fDate + '")',
                  'b': 'ACOR'},
        callback=self.parseSearchResults
    )
def course_terms_page(self, response):
    for term in self.terms:
        yield FormRequest.from_response(
            response,
            formxpath='/html/body/div[5]/form',
            formdata={'TERM': term},
            meta={'term': term})
def traffic_fines_details(self, response):
    """Fines page with details.

    Choose the records between start_date and end_date;
    if not specified, choose all records."""
    renavam = response.selector.xpath(
        "//span[@id='lblRenavam']/text()").get("").strip()
    placa = response.selector.xpath(
        "//span[@id='lblPlaca']/text()").get("").strip()
    file_type = self.remove_diacritics(
        response.selector.xpath(
            "//span[@id='LblCabecalho01']/text()").get("").strip())
    print("renavam:", renavam)
    print("placa:", placa)
    print("file_type:", file_type)
    # Get the hidden ASP.NET state fields needed for the request
    EVENTTARGET = response.selector.xpath(
        "//input[@id='__EVENTTARGET']/@value").get("")
    EVENTARGUMENT = response.selector.xpath(
        "//input[@id='__EVENTARGUMENT']/@value").get("")
    PageProdamSPOnChange = response.selector.xpath(
        "//input[@id='PageProdamSPOnChange']/@value").get("")
    PageProdamSPPosicao = response.selector.xpath(
        "//input[@id='PageProdamSPPosicao']/@value").get("")
    PageProdamSPFocado = response.selector.xpath(
        "//input[@id='PageProdamSPFocado']/@value").get("")
    VIEWSTATE = response.selector.xpath(
        "//input[@id='__VIEWSTATE']/@value").get("")
    VIEWSTATEGENERATOR = response.selector.xpath(
        "//input[@id='__VIEWSTATEGENERATOR']/@value").get("")
    EVENTVALIDATION = response.selector.xpath(
        "//input[@id='__EVENTVALIDATION']/@value").get("")
    btnGerarDocumento = response.selector.xpath(
        "//input[@id='btnGerarDocumento']/@value").get("")
    txthvalor_total = response.selector.xpath(
        "//input[@id='txthvalor_total']/@value").get("")
    txthqtd_total = response.selector.xpath(
        "//input[@id='txthqtd_total']/@value").get("")
    frm_data = {
        'PageProdamSPOnChange': PageProdamSPOnChange,
        'PageProdamSPPosicao': PageProdamSPPosicao,
        'PageProdamSPFocado': PageProdamSPFocado,
        '__EVENTTARGET': EVENTTARGET,
        '__EVENTARGUMENT': EVENTARGUMENT,
        '__VIEWSTATE': VIEWSTATE,
        '__VIEWSTATEGENERATOR': VIEWSTATEGENERATOR,
        '__EVENTVALIDATION': EVENTVALIDATION,
        'chkSelecionarTodos': 'on',
        'btnGerarDocumento': btnGerarDocumento,
        'txthvalor_total': txthvalor_total,
        'txthqtd_total': txthqtd_total
    }
    rows = response.selector.xpath("//table[@id='grdDados']//tr[@class]")
    all_rows_data = []
    for row in rows:
        # Check if infringement_date falls between start_date and end_date;
        # if they are not specified, get all records
        infringement_date = row.xpath(".//td[6]/text()").get("").strip()
        infringement_datetime = dt.strptime(infringement_date, "%d/%m/%Y")
        if self.start_date <= infringement_datetime <= self.end_date:
            # Select the record
            chkMulta = row.xpath(".//td[1]/span/input/@name").get("")
            hdnSituacaoPPM = row.xpath(".//td[1]/input/@name").get("")
            frm_data.update({chkMulta: "on", hdnSituacaoPPM: ""})
            # Get the fields
            notification = row.xpath(".//td[3]/text()").get("").strip()
            infringement = row.xpath(".//td[4]/text()").get("").strip()
            description = row.xpath(".//td[5]/text()").get("").strip()
            infringement_time = row.xpath(".//td[7]/text()").get("").strip()
            location = row.xpath(".//td[8]/text()").get("").strip()
            due_date = row.xpath(".//td[9]/text()").get("").strip()
            value = row.xpath(".//td[10]/span/text()").get("").strip()
            debt_situation = row.xpath(".//td[11]/text()").get("").strip()
            installment_code = row.xpath(".//td[12]/text()").get("").strip()
            situation_description = row.xpath(".//td[13]/text()").get("").strip()
            date = row.xpath(".//td[14]/text()").get("").strip()
            row_data = {
                "notificacao": notification,
                "auto_infracao": infringement,
                "descricao": description,
                "data_infracao": infringement_date,
                "hora": infringement_time,
                "local_da_infracao": location,
                "vencimento": due_date,
                "valor": value,
                "situacao_na_divida_ativa": debt_situation,
                "codigo_do_parcelamento": installment_code,
                "descricao_da_situacao": situation_description,
                "data": date
            }
            all_rows_data.append(row_data)
    # Add data to the result
    if all_rows_data:
        self.result.update({file_type: all_rows_data})
        # Check if get_files is True
        if self.get_files:
            report_url = ("https://meuveiculo.prefeitura.sp.gov.br/"
                          "forms/frmResumoMultasDetalhe.aspx")
            # Note: `notification` holds the value from the last row parsed
            yield FormRequest(url=report_url,
                              formdata=frm_data,
                              meta={"file_type": "boleto",
                                    "result_key": file_type,
                                    "notification": notification},
                              callback=self.report_table,
                              dont_filter=True)
    else:
        error_msg = "traffic_fines_details doesn't contain any data."
        self.logger.warning(error_msg)
def parse_product(self, response):
    """ Main parsing product method """
    reqs = []
    product = response.meta['product']
    # self._populate_from_html(response, product)
    # self._populate_from_js(response, product)
    # Product ID
    id = re.findall(r'\/(\d+)', response.url)
    product_id = id[-1] if id else None
    response.meta['product_id'] = product_id
    if response.status in self.default_hhl:
        product = response.meta.get("product")
        product.update({"locale": 'en_CA'})
        return product
    self._populate_from_js(response, product)
    # Send a request to check limited online status
    try:
        skus = [{"skuid": sku} for sku in response.meta['skus']]
        request_data = [{
            "productid": product_id,
            "skus": [skus]  # note: wraps the sku list in another list
        }]
        request_data = json.dumps(request_data).replace(' ', '')
        reqs.append(FormRequest(
            url="http://www.walmart.ca/ws/online/products",
            formdata={"products": request_data},
            callback=self._parse_online_status,
            headers={'X-Requested-With': 'XMLHttpRequest'}
        ))
    except KeyError:
        pass
    if response.xpath('//span[@class="infoText"]/'
                      'text()').re('This product is not available'):
        product['no_longer_available'] = True
    self._populate_from_html(response, product)
    cond_set_value(product, 'locale', 'en_CA')  # Default locale.
    # Get featured products from the generated JS script,
    # evaluating the parent script
    RR_entity = RR(response.url, product_id, response)
    featured_products_url = RR_entity.js()
    reqs.append(Request(
        url=featured_products_url,
        callback=self._parse_related_products
    ))
    # Get product base info, QA and reviews straight from the JS script
    product_info_url = self.PRODUCT_INFO_URL.format(product_id=product_id)
    reqs.append(Request(
        url=product_info_url,
        callback=self._parse_product_info
    ))
    regex = r"\/(\d+)\??"
    reseller_id = re.findall(regex, response.url)
    reseller_id = reseller_id[0] if reseller_id else None
    cond_set_value(product, "reseller_id", reseller_id)
    if reqs:
        return self.send_next_request(reqs, response)
    return product
def start_requests(self):
    return [FormRequest("https://slashdot.org/my/login",
                        formdata={"op": "userlogin",
                                  "returnto": "",
                                  "unickname": "<username>",
                                  "upasswd": "<passwd>",
                                  "userlogin": "******"})]
def get_recaptchaClientToken(self, response):
    resp_cnt = response.meta['resp_cnt']
    print("\t[{}] {}".format(resp_cnt, response.text))
    if response.text.split('|')[0] == 'OK':
        self.recaptchaClientToken = response.text.split('|')[1]
        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
            'Content-Type': 'application/x-www-form-urlencoded',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'same-origin',
            'Sec-Fetch-User': '******',
            'cookie': self.cookie
        }
        payload = {
            '__RequestVerificationToken': self.__RequestVerificationToken,
            'Registreringsnummer': self.Registreringsnummer,
            'recaptchaClientToken': self.recaptchaClientToken,
            'Captcha.CaptchaResponse': ''
        }
        request = FormRequest(url=self.post_url, method='POST',
                              headers=headers, formdata=payload,
                              callback=self.get_details,
                              errback=self.fail_details,
                              dont_filter=True, meta={})
        yield request
    else:
        resp_cnt += 1
        if resp_cnt >= self.max_resp_cnt:
            # Give up waiting and submit a fresh captcha job
            formdata = {
                'key': self.api_key,
                'method': 'userrecaptcha',
                'googlekey': self.google_key,
                'pageurl': self.post_url,
            }
            headers = make_headers_1()
            headers['cookie'] = self.cookie
            request = FormRequest(url=self.captcha_in_url, method='POST',
                                  formdata=formdata, headers=headers,
                                  callback=self.get_captcha_id,
                                  errback=self.fail_captcha_id,
                                  dont_filter=True, meta={})
            yield request
        else:
            # Wait and poll the captcha result again
            sleep(self.resp_time)
            fetch_url = self.captcha_res_url.format(self.captcha_id)
            request = FormRequest(url=fetch_url, method='GET',
                                  headers=make_headers_1(),
                                  callback=self.get_recaptchaClientToken,
                                  errback=self.fail_recaptchaClientToken,
                                  dont_filter=True,
                                  meta={'resp_cnt': resp_cnt})
            yield request
def send_myapks_request(url, **kwargs):
    apk_name = kwargs['apk_name']  # raises KeyError if the caller omits it
    return FormRequest(url, method='GET', meta=kwargs,
                       callback=get_myapks_search_formdata)
def parse(self, response):
    # open_in_browser(response)
    allyears = ret0IfExist(Selector(response).xpath(YEARS_XPATH).extract())
    curyear = Selector(response).xpath(CURYEAR_XPATH).extract()
    if (curyear[0] and len(curyear[0])):
        curyear = curyear[0]
        curyear = curyear[:4]
    if (len(curyear) == 0):
        return
    data = Selector(response).xpath(TEXT_XPATH8).extract()
    if (len(data) == 0):
        data = Selector(response).xpath(TEXT_XPATH6).extract()
    if (len(data) == 0):
        return
    data = ret0IfExist(data)
    conn = MySQLdb.connect(user='******', passwd='stocks_pass',
                           db='stocks', host='localhost',
                           charset="utf8", use_unicode=True)
    cursor = conn.cursor()
    finData = MoneycontrolItem()
    finData['symbol'] = str(response.meta['symbol'])
    finData['type'] = 'ds'  # directors' speech
    finData['year'] = str(curyear)
    finData['data'] = data
    print('inserting')
    print(finData['symbol'])
    print(finData['type'])
    print(finData['year'])
    print('trying now')
    try:
        cursor.execute(
            "insert into stock_txt_data (symbol,data,type,year) "
            "values (%s,%s,%s,%s)",
            (finData['symbol'], finData['data'], finData['type'],
             finData['year']))
    except:
        print("Unexpected error:", sys.exc_info())
    cursor.close()
    conn.commit()
    if (allyears is None):
        return
    allyears = allyears.split(",")
    # All the other years have already been yielded in the GET request,
    # so a POST request should not yield more requests
    if (response.meta['posted']):
        return
    for y in allyears:
        if (len(y) == 0):
            continue
        try:
            tempy = int(y[:4])
            y = int(y)
            curyear = int(curyear)
            if (tempy <= curyear):
                continue
        except:
            continue
        headers = {
            'content-type': "application/x-www-form-urlencoded",
            'cache-control': "no-cache",
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/50.0.2661.102 Chrome/50.0.2661.102 Safari/537.36',
            'Content-Type': 'application/x-www-form-urlencoded',
            'Referer': 'http://www.moneycontrol.com/annual-report/infosys/chairmans-speech/IT',
        }
        formdata = {
            'sel_year': str(y),
            'sc_did': str(response.meta['mc_symbol'])
        }
        print(formdata)
        response.meta['posted'] = True  # flag: whether to post again
        yield FormRequest(response.url, method='POST', meta=response.meta,
                          headers=headers, formdata=formdata,
                          callback=self.parse)
def parse_product(self, response):
    products = response.xpath(
        '//div[contains(@class, "product")]//a[div[@class="name"]]/@href'
    ).extract()
    if products:
        for product in products:
            yield Request(response.urljoin(product),
                          callback=self.parse_product)
        pages = response.xpath(
            '//a[contains(@class, "pageNumber")]/text()').extract()
        for page in pages:
            page = response.urljoin(page)
            yield Request(page)
        return
    name = response.xpath('//div/h1/text()').extract()
    try:
        price = response.xpath(
            '//div[@class="bigprice GBP"]/@data-price').extract()[0]
    except IndexError:
        for p in self.parse(response):
            yield p
        return
    brand = ''
    categories = response.xpath(
        '//ul[@class="breadcrumb"]/li/a/text()').extract()[1:]
    l = ProductLoader(item=Product(), response=response)
    image_url = response.xpath('//div[@id="mainImage"]/img/@src').extract()
    image_url = response.urljoin(image_url[0]) if image_url else ''
    l.add_value('image_url', image_url)
    l.add_value('url', response.url)
    l.add_value('name', name)
    l.add_value('price', extract_price(price))
    l.add_value('brand', brand)
    l.add_value('category', categories)
    sku = response.xpath('//p[@class="partcode"]/text()').re(
        'Quick Code: (.*)')
    sku = sku[0] if sku else ''
    l.add_value('sku', sku)
    l.add_xpath('identifier', '//input[@name="product_id"]/@value')
    item = l.load_item()
    promotions = response.xpath(
        '//div[contains(@class, "price_box")]//div[@class="GBP"]/span[@class="desktop_rrp" or @class="saving"]/text()'
    ).extract()
    corner_promotion = response.xpath(
        '//img[@class="cornerflash"]/@src').re('Empire/(.*).png')
    corner_promotion = corner_promotion[0] if corner_promotion else ''
    corner_promotions = {
        'pricedrop': 'Price Drop',
        'deal': 'Deal',
        'freedel': 'Free Delivery',
        'newarrival': 'New Arrival',
        'sale': 'Sale',
        'bestseller': 'Bestseller',
        'wasteincluded': 'Waste Included',
        'trayincluded': 'Tray Included',
        'clearance': 'Clearance',
        'pricedropred': 'Price Drop',
        'asseenontv': 'As Seen On T.V'
    }
    metadata = MetaData()
    metadata['corner_promotion'] = corner_promotions.get(corner_promotion, '')
    metadata['Promotions'] = ' '.join(promotions) if promotions else ''
    item['metadata'] = metadata
    stock_url = "http://soak.com/includes/ajax/in_stock.php"
    part_code = response.xpath(
        '//div[contains(@class, "stock_report")]/@data-partcode').extract()[0]
    manufacturers_id = response.xpath(
        '//div[contains(@class, "stock_report")]/@data-manufacturers_id'
    ).extract()[0]
    formdata = {
        'action': 'in_stock',
        'manufacturers_id': manufacturers_id,
        'part_code': part_code
    }
    yield FormRequest(stock_url, formdata=formdata,
                      callback=self.parse_stock, meta={'item': item})
def start_requests(self):
    for contentID in self.crawlID:
        self.finishID.add(contentID)
        self.SelectedID = contentID
        # Note: returning inside the loop means only the first ID is searched
        return [FormRequest(url="http://weibo.cn/search/",
                            formdata={'keyword': contentID,
                                      'smblog': '搜微博'},
                            callback=self.parse_Content)]
def parse_make(self, response):
    hxs = HtmlXPathSelector(response)
    row = response.meta['row']
    form = hxs
    base_data = self.get_post_data(response)
    base_data['ctl00$ContentPlaceHolder1$FindTyreSliderHome1$ValidCar'] = 'rbYes'
    base_data['__EVENTTARGET'] = \
        'ctl00$ContentPlaceHolder1$FindTyreSliderHome1$lnkbtnSizeGo2'
    base_data['ctl00$ContentPlaceHolder1$FindTyreSliderHome1$txtPostCodeL'] = \
        row['IP code']
    base_data['ctl00$ContentPlaceHolder1$FindTyreSliderHome1$ddlSpeedL'] = \
        row['Speed rating']
    base_data['ctl00$ContentPlaceHolder1$FindTyreSliderHome1$ddlDiameterL'] = \
        row['Rim']
    base_data['ctl00$ContentPlaceHolder1$FindTyreSliderHome1$ddlProfileL'] = \
        row['Aspect Ratio']
    base_data['ctl00$ContentPlaceHolder1$FindTyreSliderHome1$ddlWidthL'] = \
        row['Width']
    # Overwrites the postcode set from row['IP code'] above
    base_data['ctl00$ContentPlaceHolder1$FindTyreSliderHome1$txtPostCodeL'] = \
        '2151953'
    rewrite_keys = [
        ('ctl00$ContentPlaceHolder1$FindTyreSliderHome1$ddlWidthL',
         'ctl00$ContentPlaceHolder1$FindTyreSliderHome1$hvWidth'),
        ('ctl00$ContentPlaceHolder1$FindTyreSliderHome1$ddlProfileL',
         'ctl00$ContentPlaceHolder1$FindTyreSliderHome1$hvProfile'),
        ('ctl00$ContentPlaceHolder1$FindTyreSliderHome1$ddlDiameterL',
         'ctl00$ContentPlaceHolder1$FindTyreSliderHome1$hvDiameter'),
        ('ctl00$ContentPlaceHolder1$FindTyreSliderHome1$ddlSpeedL',
         'ctl00$ContentPlaceHolder1$FindTyreSliderHome1$hvSpeed'),
    ]
    for field1, field2 in rewrite_keys:
        base_data[field2] = base_data[field1]
    if not self.makes:
        makes = form.select(
            ".//select[@id='ddlMake']/option/@value").extract()
        self.makes = [x for x in makes if x != '0']
    for i, make in enumerate(self.makes):
        if make in self.current_row_processed_makes:
            continue
        self.log("Crawling row for make: %s" % make)
        self.current_row_processed_makes.add(make)
        data = base_data.copy()
        data['ctl00$ContentPlaceHolder1$FindTyreSliderHome1$ddlMake'] = make
        req = FormRequest(response.url,
                          formdata=sorted(data.items()),
                          callback=self.pre_parse_search,
                          dont_filter=True,
                          meta={'row': response.meta['row'],
                                'formdata': data})
        yield req
    return self.prepare_for_next_row()
def parse(self, response):
    row = self.current_row
    if not row:
        row = self.get_next_row()
    if not row:
        self.done = True
        return
    self.log("[CARTYRES] Searching row: %s" % str(row))
    hxs = HtmlXPathSelector(response)
    form = hxs.select("//form[@id='form1']")
    data = {}
    for el in form.select(".//input"):
        key = el.select("@name").extract()
        value = el.select("@value").extract()
        if key:
            key = key[0]
            if not key.startswith('__'):
                continue
            if value:
                value = value[0]
            else:
                value = ''
            data[key] = value
    for el in form.select(".//select"):
        key = el.select("@name").extract()
        value = el.select(".//option[@selected]/@value").extract()
        if key:
            key = key[0]
            if value:
                value = value[0]
            else:
                value = ''
            data[key] = value
    data['__ASYNCPOST'] = 'true'
    data['ctl00$ContentPlaceHolder1$FindTyreSliderHome1$ValidCar'] = 'rbYes'
    data['ctl00$ContentPlaceHolder1$FindTyreSliderHome1$ddlMake'] = '0'
    data['__EVENTTARGET'] = \
        'ctl00$ContentPlaceHolder1$FindTyreSliderHome1$ddlWidthL'
    data['ctl00$sp'] = \
        'ctl00$ContentPlaceHolder1$FindTyreSliderHome1$upPanel|ctl00$ContentPlaceHolder1$FindTyreSliderHome1$ddlWidthL'
    # Use the row to fill in the data
    data['ctl00$ContentPlaceHolder1$FindTyreSliderHome1$ddlWidthL'] = row['Width']
    data['ctl00$ContentPlaceHolder1$FindTyreSliderHome1$ddlDiameterL'] = row['Rim']
    data['ctl00$ContentPlaceHolder1$FindTyreSliderHome1$ddlProfileL'] = row['Aspect Ratio']
    data['ctl00$ContentPlaceHolder1$FindTyreSliderHome1$ddlSpeedL'] = 'V'
    data['ctl00$ContentPlaceHolder1$FindTyreSliderHome1$hvDiameter'] = ""
    data['ctl00$ContentPlaceHolder1$FindTyreSliderHome1$hvProfile'] = ""
    data['ctl00$ContentPlaceHolder1$FindTyreSliderHome1$hvSpeed'] = ""
    data['ctl00$ContentPlaceHolder1$FindTyreSliderHome1$hvWidth'] = ""
    data['ctl00$ContentPlaceHolder1$FindTyreSliderHome1$txtPostCodeL'] = ""
    data['ctl00$ContentPlaceHolder1$FindTyreSliderHome1$txtPostCodeR'] = ""
    data['ctl00$ContentPlaceHolder1$FindTyreSliderHome1$txtRegNo'] = ""
    data['ctl00$ContentPlaceHolder1$FindTyreSliderHome1$txtValidRegNo'] = ""
    req = FormRequest(response.url,
                      formdata=sorted(data.items()),
                      callback=self.parse2,
                      errback=self.error_callback,
                      dont_filter=True,
                      meta={'row': row, 'formdata': data})
    yield req
def parse_form(self, response):
    yield FormRequest.from_response(response,
                                    formdata={'RdoTimeLimit': '42'},
                                    dont_filter=True,
                                    formxpath='(//form)[2]',
                                    callback=self.parse_pages)
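from_response can also pick the target form by index; a sketch of the same request using Scrapy's formnumber argument (0-based) instead of formxpath:

def parse_form(self, response):
    # formnumber=1 selects the second <form> on the page, like '(//form)[2]'.
    yield FormRequest.from_response(response,
                                    formnumber=1,
                                    formdata={'RdoTimeLimit': '42'},
                                    dont_filter=True,
                                    callback=self.parse_pages)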
def input_process(self, response):
    action = response.xpath('//form[@name="Bkkn001Form"]/@action').extract()[0]
    req_url = urljoin(response.url, action)
    token = response.xpath(
        '//input[@name="org.apache.struts.taglib.html.TOKEN"]/@value').extract()[0]
    self.headers['Referer'] = response.url
    random_id = response.xpath('//input[@id="randomID"]/@value').extract()[0]
    seni_gen_gameen_id = response.xpath(
        '//input[@name="seniGenGamenID"]/@value').extract()[0]
    # Note: the original dict literal repeated the bkknShmkDispList1/2 keys
    # several times; duplicate keys collapse in a dict, so a single entry is
    # kept here (an iterable of (key, value) pairs would preserve repeats).
    form_data = {
        'org.apache.struts.taglib.html.TOKEN': token,
        'randomID': random_id,
        'contextPath': '/reins',
        'event': 'forward_searchbabi',
        'bbTtKbn': '1', 'stateMode': '', 'stWttBg': '', 'hzMi': '',
        'zkSyKbn': '2', 'stJyk': '', 'bkknShmk1': '', 'bkknShmk2': '',
        'bkknShbt1': '03', 'bkknShmkDispList1': '',
        'bkknShbt2': '', 'bkknShmkDispList2': '',
        'shtkChkKbnShti': '0', 'shkcknShriSti': '0', 'shgUmKbn': '1',
        'trhkJyukyu': '0',
        'tdfkMi1': '東京都', 'shzicmi1_1': '', 'shzicmi2_1': '',
        'shzicJyk_1': '1', 'ttmnmi_1': '', 'ttmnJyk_1': '1',
        'tdfkMi2': '', 'shzicmi1_2': '', 'shzicmi2_2': '',
        'shzicJyk_2': '1', 'ttmnmi_2': '', 'ttmnJyk_2': '1',
        'tdfkMi3': '', 'shzicmi1_3': '', 'shzicmi2_3': '',
        'shzicJyk_3': '1', 'ttmnmi_3': '', 'ttmnJyk_3': '1',
        'ensnmi1': '', 'ekmiFrom1': '', 'ekmiTo1': '', 'thNyrkt1': '',
        'thMbKbn1': '', 'krmKm1': '', 'bsB1': '',
        'ensnmi2': '', 'ekmiFrom2': '', 'ekmiTo2': '', 'thNyrkt2': '',
        'thMbKbn2': '', 'krmKm2': '', 'bsB2': '',
        'ensnmi3': '', 'ekmiFrom3': '', 'ekmiTo3': '', 'thNyrkt3': '',
        'thMbKbn3': '', 'krmKm3': '', 'bsB3': '',
        'bsRsmi': '', 'bsTmiSh': '', 'tihNyrkt': '', 'tihMbKbn': '',
        'sotKtu': '', 'sotKtuNyrkt': '', 'sotKtuMbKbn': '',
        'kkkuCnryuFrom': '', 'kkkuCnryuTo': '',
        'siykKkkuCnryuFrom': '', 'siykKkkuCnryuTo': '',
        'tbTnkFrom': '', 'tbTnkTo': '', 'siykTbTnkFrom': '', 'siykTbTnkTo': '',
        'tcMnskFrom': '', 'tcMnskTo': '', 'ttmnMnskFrom': '', 'ttmnMnskTo': '',
        'snyuMnskFrom': '', 'snyuMnskTo': '', 'mdrHysuFrom': '', 'mdrHysuTo': '',
        'shzikiFrom': '', 'shzikiTo': '', 'blcnyHuku': '', 'stdoHuku': '',
        'stdoJyukyu': '', 'stdoStmn': '', 'stdoFkin': '', 'tskikk': '',
        'yutCik': '', 'sitkYut': '', 'ktcJok': '', 'chushjyuZih': '',
        'cknngtYearFrom': '', 'cknngtMonthFrom': '',
        'cknngtYearTo': '', 'cknngtMonthTo': '',
        'kjkrngGgFrom': 'R', 'kjkrngYearFrom': '', 'kjkrngMonthFrom': '',
        'kjkrngGgTo': 'R', 'kjkrngYearTo': '', 'kjkrngMonthTo': '',
        'optId': '', 'strStbJok': '', 'bk1': '', 'shuhnKnkyu': '',
        'turkKknFlg': '1',
        'turkNngppGgFrom': 'R', 'turkNngppNenFrom': '',
        'turkNngppGatuFrom': '', 'turkNngppHiFrom': '',
        'turkNngppGgTo': 'R', 'turkNngppNenTo': '',
        'turkNngppGatuTo': '', 'turkNngppHiTo': '',
        'hcKknFlg': '1',
        'hnkuNngppGgFrom': 'R', 'hnkuNngppNenFrom': '',
        'hnkuNngppGatuFrom': '', 'hnkuNngppHiFrom': '',
        'hnkuNngppGgTo': 'R', 'hnkuNngppNenTo': '',
        'hnkuNngppGatuTo': '', 'hnkuNngppHiTo': '',
        'siykKknFlg': '5',
        'siykNngppGgFrom': 'R', 'siykNngppNenFrom': '1',
        'siykNngppGatuFrom': '7', 'siykNngppHiFrom': '1',
        'siykNngppGgTo': 'R', 'siykNngppNenTo': '1',
        'siykNngppGatuTo': '7', 'siykNngppHiTo': '2',
        'siykTurkKknFlg': '1',
        'siykTurkNngppGgFrom': 'R', 'siykTurkNngppNenFrom': '',
        'siykTurkNngppGatuFrom': '', 'siykTurkNngppHiFrom': '',
        'siykTurkNngppGgTo': 'R', 'siykTurkNngppNenTo': '',
        'siykTurkNngppGatuTo': '', 'siykTurkNngppHiTo': '',
        'seniMotFlg': '',
        'seniGenGamenID': seni_gen_gameen_id
    }
    return FormRequest(
        url=req_url,
        method='POST',
        formdata=form_data,
        callback=self.parse_list_page,
        headers=self.headers,
        dont_filter=True
    )
def select(self, response):
    yield FormRequest.from_response(response,
                                    formdata=self.params,
                                    dont_filter=True,
                                    callback=self.submit)
def parse_get_email_read(self, response):
    # Read mails (20 per page)
    print('Returned URL:')
    print(response.url)
    # Get the page index from the inline script
    _page_no_mark = 'var pageNo = '
    _pageNo = response.xpath('/html/head/script[5]').extract_first()
    pageNo = _pageNo[_pageNo.find(_page_no_mark):_pageNo.find(
        ';', _pageNo.find(_page_no_mark))].replace(_page_no_mark, '')
    pageNo = int(pageNo)
    base_url = "https://mail.263.net/wm2e/mail/mailOperate/mailOperateAction_mailInfo.do"
    indexNum = 1  # index of the selected mail in the cache
    for item in range(20):
        index = item + 1
        emailIdentity = response.xpath(
            '//*[@id="contList2"]/ul[{0}]/li[1]/span[2]/input/@value'.format(index)).extract_first()
        indexNum = (pageNo - 1) * 20 + index
        if indexNum > self.emial_num:
            break
        params_url = "mailOperateType=read"
        params_url = params_url + "&emailIdentity=" + emailIdentity
        params_url = params_url + "&selfFolderId=10"
        params_url = params_url + "&usr=" + self.user_email + "&sid=" + self.sid
        params_url = params_url + "&statFlag=2"
        params_url = params_url + "&starred=0"
        params_url = params_url + "&waited=0"
        params_url = params_url + "&floderType=10"
        params_url = params_url + "&indexNum=" + str(indexNum)
        params_url = params_url + "&reachStoragePoint=true"
        params_url = params_url + "&undoSend="
        params_url = params_url + "&encryptMail=false"
        params_url = params_url + "&mailPasswdStatus=0"
        params_url = params_url + "&securityMark=0"
        params_url = params_url + "&securityType=0"
        params_url = params_url + "&frameJump=1"
        url = base_url + '?' + params_url
        formdata = {}
        if self.Emailmode == EmailMode.FILED:
            # Fetch from the archived inbox
            formdata = {
                "pageNo": str(pageNo),
                "qstr": "",
                "sortStr": '{"time":"desc"}',
                "fstr": "{}",
                "folderId": "10",
                "type": "10",
                "fullSearchIfmIsHide": "null",
            }
        elif self.Emailmode == EmailMode.QUERY:
            # Fetch from the filtered inbox
            formdata = {
                "pageNo": str(pageNo),
                "qstr": '{ "ifQuick" : "0" , "sender" : "sales08" }',
                "sortStr": '{"time":"desc"}',
                "fstr": "{}",
                "folderId": "",
                "type": "",
                "fullSearchIfmIsHide": "null",
            }
        else:
            pass
        email_list = FormRequest(url, formdata=formdata,
                                 callback=self.parse_get_email_info)
        yield email_list
    return
def parse_get_email_list_by_emial_filed(self, response):
    # Get the mail list info from the archived mails
    # Get the page index from the inline script
    _page_no_mark = 'var pageNo = '
    _pageNo = response.xpath('/html/head/script[5]').extract_first()
    pageNo = _pageNo[_pageNo.find(_page_no_mark):_pageNo.find(
        ';', _pageNo.find(_page_no_mark))].replace(_page_no_mark, '')
    pageNo = int(pageNo)
    _email_num_mark = 'var total ='
    _emial_num = response.xpath('/html/head/script[5]').extract_first()
    emial_num = _emial_num[_emial_num.find(_email_num_mark):_emial_num.find(
        ';', _emial_num.find(_email_num_mark))].replace(_email_num_mark, '')
    self.emial_num = int(emial_num)
    print('Number of mails found: %s' % self.emial_num)
    base_url = "https://mail.263.net/wm2e/mail/mailIndex/mailIndexAction_indexList.do"
    url = (base_url + "?usr={0}&sid={1}&12").format(self.user_email, self.sid)
    for item in range(self.page_num):
        pageNo = item + 1
        email_list = FormRequest(url, formdata={
            "pageNo": str(pageNo),
            "qstr": '{ "ifQuick" : "0" , "sender" : "sales08" }',
            "sortStr": '{"time":"desc"}',
            "fstr": "{}",
            "folderId": "",
            "type": "",
            "fullSearchIfmIsHide": "null",
        }, callback=self.parse_get_email_read)
        yield email_list
    # The per-mail read logic lives in parse_get_email_read above.
    return
def parse(self, response): yield FormRequest.from_response(response, formname='naam', formdata={"naam": artist_name_search}, callback=self.parse1)
def login(self, response):
    return FormRequest.from_response(
        response,
        formdata={'user': input('Username: '),
                  'pwd': getpass.getpass('Password: ')},
        callback=self.check_login_response
    )
def start_requests(self):
    # Jump to the login page, one cookiejar per start URL
    for i, url in enumerate(self.start_urls):
        yield FormRequest(url, meta={'cookiejar': i},
                          headers=self.headers,
                          cookies=self.cookies,
                          callback=self.parse_item)
def get_details(self, response):
    if response.xpath(
            '//script[contains(@src, "https://www.google.com/recaptcha/api")]/@src'):
        print(f"\t[{self.Registreringsnummer}] Captcha is found")
        formdata = {
            'key': self.api_key,
            'method': 'userrecaptcha',
            'googlekey': self.google_key,
            'pageurl': self.post_url,
        }
        headers = make_headers_1()
        headers['cookie'] = self.cookie
        request = FormRequest(url=self.captcha_in_url, method='POST',
                              formdata=formdata, headers=headers,
                              callback=self.get_captcha_id,
                              errback=self.fail_captcha_id,
                              dont_filter=True, meta={})
        yield request
    else:
        try:
            Försäkringsbolag = [
                elm.strip() for elm in response.xpath(
                    '//strong[contains(text(), "kringsbolag")]/../text()'
                ).extract() if elm.strip()][0].strip()
        except:
            Försäkringsbolag = ''
        try:
            Försäkringsdatum = [
                elm.strip() for elm in response.xpath(
                    '//strong[contains(text(), "kringsdatum")]/../text()'
                ).extract() if elm.strip()][0].strip()
        except:
            Försäkringsdatum = ''
        try:
            Fordonsstatus = [
                elm.strip() for elm in response.xpath(
                    '//a[@href="#ts-fordonsstatus"]/../../text()'
                ).extract() if elm.strip()][0].strip()
        except:
            Fordonsstatus = ""
        try:
            Besiktigas_senast_8 = [
                elm.strip() for elm in response.xpath(
                    '//strong[contains(text(), "Besiktigas senast")]/../text()'
                ).extract() if elm.strip()][-2].strip()
        except:
            Besiktigas_senast_8 = ""
        try:
            Upplysningar = [
                elm.strip() for elm in response.xpath(
                    '//strong[contains(text(), "Upplysningar")]/../text()'
                ).extract() if elm.strip()][0].strip()
        except:
            Upplysningar = ""
        try:
            Import_införsel = [
                elm.strip() for elm in response.xpath(
                    '//a[@href="#ts-import"]/../../text()'
                ).extract() if elm.strip()][0].strip()
        except:
            Import_införsel = ""
        try:
            Besiktigas_senast = [
                elm.strip() for elm in response.xpath(
                    '//strong[contains(text(), "Besiktigas senast")]/../text()'
                ).extract() if elm.strip()][-1].strip()
        except:
            Besiktigas_senast = ""
        try:
            Senast_godkända_besiktning = [
                elm.strip() for elm in response.xpath(
                    '//strong[contains(text(), "Senast god") and contains(text(), "besiktning")]/../text()'
                ).extract() if elm.strip()][0].strip()
        except:
            Senast_godkända_besiktning = ""
        try:
            Mätarställning = [
                elm.strip() for elm in response.xpath(
                    '//a[@href="#ts-matarstallning"]/../../text()'
                ).extract() if elm.strip()][0].strip()
        except:
            Mätarställning = ""
        item = FuRegnrItem()
        item['Registreringsnummer'] = self.Registreringsnummer
        item['Försäkringsbolag'] = Försäkringsbolag
        item['Försäkringsdatum'] = Försäkringsdatum
        item['Fordonsstatus'] = Fordonsstatus
        item['Besiktigas_senast_8'] = Besiktigas_senast_8
        item['Upplysningar'] = Upplysningar
        item['Import_införsel'] = Import_införsel
        item['Besiktigas_senast'] = Besiktigas_senast
        item['Senast_godkända_besiktning'] = Senast_godkända_besiktning
        item['Mätarställning'] = Mätarställning
        yield item
        result_row = [
            self.Registreringsnummer, Försäkringsbolag, Försäkringsdatum,
            Fordonsstatus, Besiktigas_senast_8, Upplysningar,
            Import_införsel, Besiktigas_senast,
            Senast_godkända_besiktning, Mätarställning
        ]
        self.total_cnt += 1
        print("\t[Result {}] {}".format(self.total_cnt, result_row))
        self.insert_row(result_row=result_row)
        self.total_scraping_done = True
        # Advance to the next non-empty input line
        while self.input_data:
            line = self.input_data.pop()
            if line:
                break
        self.Registreringsnummer = line[18]
        self.total_cnt += 1
        print("[{}] Scanning ...".format(self.Registreringsnummer))
        self.total_scraping_done = False
        request = FormRequest(url=self.get_url, method='GET',
                              headers=make_headers_1(),
                              callback=self.get__RequestVerificationToken,
                              errback=self.fail__RequestVerificationToken,
                              dont_filter=True, meta={})
        yield request
def send_cmb_request(url, **kwargs):
    apk_name = u'招商银行Android客户端'  # "China Merchants Bank Android client"
    kwargs['apk_name'] = apk_name
    return FormRequest(url, method='GET', meta=kwargs,
                       callback=get_cmb_detail)
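When a FormRequest is built with method='GET' and a formdata argument, Scrapy urlencodes the fields into the URL query string rather than the request body; a minimal sketch with placeholder names:

from scrapy import FormRequest

# Roughly equivalent to requesting
# http://example.com/search?apk_name=demo
req = FormRequest('http://example.com/search',
                  method='GET',
                  formdata={'apk_name': 'demo'})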
def parse_welcome(self, response):
    return FormRequest.from_response(response,
                                     formdata={
                                         "user": "******",
                                         "pass": "******"
                                     })
def start_requests(self):
    key_words = ["python"]
    for key_word in key_words:
        url = ("https://www.lagou.com/jobs/positionAjax.json"
               "?city=%E4%B8%8A%E6%B5%B7&needAddtionalResult=false")
        request = FormRequest(url,
                              formdata={"first": "true", "pn": "1",
                                        "kd": key_word},
                              headers=self.headers)
        yield request
def login(self, response):
    return FormRequest.from_response(response,
                                     formdata={'id': '0000',
                                               'password': '******'},
                                     callback=self.check_login_response)
def parse_review(self, response):
    if response.status == 200:
        yield {"packageName": response.meta["package_name"],
               "review": response.body}
        # Queue the next page of reviews for the same package
        yield FormRequest(self.review_url,
                          callback=self.parse_review,
                          formdata={
                              "id": response.meta["package_name"],
                              "reviewType": '0',
                              "reviewSortOrder": '0',
                              "pageNum": str(response.meta["page"] + 1)
                          },
                          meta={"package_name": response.meta["package_name"],
                                "page": response.meta["page"] + 1})
def parse(self, response):
    sel = Selector(response)
    url = 'http://www.roadrunnersports.com/rrs/product-detail/build-selections.jsp'
    item = BigCItem()
    pname = response.xpath("//meta[@property='og:title']/@content").extract()[0]
    item["Product_Name"] = pname
    if "Trail" in pname:
        item["Product_Name"] = pname + " Running Shoe"
    mrp = float(sel.xpath(
        "//span[@class='prod_detail_reg_price']/span/text()").extract()[0])
    item["Retail_Price"] = str(
        (mrp * 65 + mrp * 30 / 100 * 70 / 100 * 65) * 112.5 / 100 + mrp * 65 * 15 / 100)
    item_sp = response.xpath(
        "//span[@class='prod_detail_sale_price']/span/text()").extract()
    if item_sp:
        sp = float(sel.xpath(
            "//span[@class='prod_detail_sale_price']/span/text()"
        ).extract()[0].split("-")[-1].replace("$", ""))
        item["Sale_Price"] = str(
            (sp * 65 + 30 / 100 * 70 * 65) * 112.5 / 100 + sp * 65 * 15 / 100)
    else:
        item["Sale_Price"] = ''
    # Categorization: classify by the "Women's" marker in the product title
    cat = response.xpath("//div[@id='grp_1']/p/span[1]/text()")
    sex = "Women's" if "Women's" in pname else "Men's"
    item["Product_Description"] = (
        response.xpath("//div[@id='grp_1']/p").extract() +
        response.xpath("//div[@id='grp_1']/ul/li").extract())
    if cat:
        cat = (";Shoes/" + sex + " Running Shoes/" +
               response.xpath("//div[@id='grp_1']/p/span[1]/text()"
                              ).extract()[0].replace("+", "") +
               " Running Shoes")
        item["Product_Name"] = (
            pname + " " +
            response.xpath("//div[@id='grp_1']/p/span[1]/text()").extract()[0] +
            " Running Shoe")
    else:
        cat = ""
    desc = item["Product_Description"]
    if any("hiking" in s for s in desc) or any("Hiking" in s for s in desc):
        item["Category"] = ("Run & Cycle/Running/Running Shoes;Shoes/" +
                            sex + " Shoes/Hiking Shoes" + cat)
    elif any("trail" in s for s in desc) or any("Trail" in s for s in desc):
        item["Category"] = ("Run & Cycle/Running/Running Shoes;Shoes/" +
                            sex + " Running Shoes/Trail Running Shoes" + cat)
    elif (any("minimalist" in s for s in desc) or
          any("barefoot" in s for s in desc) or
          any("Barefoot" in s for s in desc)):
        item["Category"] = ("Run & Cycle/Running/Running Shoes;Shoes/" +
                            sex + " Running Shoes/Barefoot Running Shoes" + cat)
    elif any("spike" in s for s in desc):
        item["Category"] = ("Run & Cycle/Running/Running Shoes;Shoes/" +
                            sex + " Running Shoes/Racing Spikes" + cat)
    elif (any("cross-train" in s for s in desc) or
          any("trainer" in s for s in desc) or
          any("training shoe" in s for s in desc) or
          any("gym" in s for s in desc) or
          any("workout" in s for s in desc)):
        item["Category"] = ("Run & Cycle/Running/Running Shoes;Shoes/" +
                            sex + " Shoes/Cross Training Shoes" + cat)
    else:
        if cat:
            item["Category"] = "Run & Cycle/Running/Running Shoes" + cat
        else:
            item["Category"] = "NULL"
    item["Brand_Name"] = response.xpath(
        "//span[@itemprop='brand']/text()").extract()[0]
    if item["Brand_Name"] in ("Asics", "Mizuno", "Brooks", "Saucony",
                              "New Balance"):
        item["Sort_Order"] = str(-300 - (20 / 100 * mrp))
    elif item["Brand_Name"] in ("Under Armour", "Altra", "Hoka One One",
                                "Inov8", "Salomon", "Vibram FiveFingers"):
        item["Sort_Order"] = str(-270 - (20 / 100 * mrp))
    else:
        item["Sort_Order"] = str(-250 - (20 / 100 * mrp))
    item["Product_Availability"] = "12-17 Working Days"
    item["Current_Stock"] = "100"
    item["Free_Shipping"] = "N"
    item["Product_Image_Is_Thumbnail_1"] = "Y"
    item["Track_Inventory"] = "By Option"
    item["Product_Image_Sort_1"] = "1"
    item["Product_Image_Sort_2"] = "2"
    item["Product_Image_Sort_3"] = "3"
    item["Product_Image_Sort_4"] = "4"
    item["Product_Image_Sort_5"] = "5"
    item["imageSetUrls"] = {}
    item["imageSetUrls2"] = {}
    colors = response.xpath("//a[@class='ref2QIColor']/@name").extract()
    item["Product_Image_File1"] = {}
    hrefs = response.xpath("//a[@class='ref2QIColor']/@href").extract()
    item["color"] = {}
    for idx, href in enumerate(hrefs):
        # Create links to image sets
        image_set_url = (
            "http://roadrunnersports.scene7.com/is/image/roadrunnersports/" +
            href.split('/')[-1].split('_')[0] + "-IS?req=set,json&scl=1")
        if colors[idx] not in item["imageSetUrls"]:
            item["imageSetUrls"][colors[idx]] = []
        item["imageSetUrls"][colors[idx]].append(image_set_url)
        if colors[idx] not in item["imageSetUrls2"]:
            item["imageSetUrls2"][colors[idx]] = []
        item["imageSetUrls2"][colors[idx]].append(image_set_url)
        item["color"][href.split('/')[-1].split('_')[0].split('-')[1]] = colors[idx]
    # Request product info as JSON
    item["sku"] = response.url.strip('/').split('/')[-2]
    payload = {'id': item["sku"]}
    request = FormRequest(url, formdata=payload,
                          callback=self.parseJsonProduct)
    request.meta['item'] = item
    return request
def _parse_online_status(self, response):
    """Gets limited_stock and is_out_of_stock fields
    for the product and its variants."""
    meta = response.meta.copy()
    reqs = meta.get('reqs')
    product = meta['product']
    data = json.loads(response.body_as_unicode())
    try:
        product_info = data['products'][0]
        variants_info = product_info['skus']
        variants = product['variants']
        final_variants = []
        list_out_of_stock = ['70', '80', '85', '87', '90']
        list_not_sold_online = ['85', '87', '90']
        # Set limited status for the main product
        availability = product_info['availability']
        product['is_out_of_stock'] = availability in list_out_of_stock
        product['is_in_store_only'] = availability in list_not_sold_online
        product['limited_stock'] = bool(product_info['isLimitedStock'])
        # Take the right price and availability status
        try:
            currency = re.findall('priceCurrency=(.*?),',
                                  str(product['price']))[0]
        except:
            currency = 'CAD'
        price = product_info['minCurrentPrice']
        if not price:
            prod_data = [{"productid": response.meta['product_id'],
                          "skus": [{"skuid": str(product['upc']),
                                    "storeeligible": True}]}]
            prod_data = json.dumps(prod_data).replace(' ', '')
            store_data = json.dumps(['1104', '3057', '1192', '5777']).replace(' ', '')
            reqs.append(FormRequest(
                url="http://www.walmart.ca/ws/store/products",
                formdata={'stores': store_data, 'products': prod_data},
                callback=self._parse_store_status,
                headers={'X-Requested-With': 'XMLHttpRequest'}
            ))
        else:
            product['price'] = Price(priceCurrency=currency, price=price)
        # Set limited status for the product variants
        if variants:
            for var in variants_info:
                sku_id = var['skuId']
                availability = var['availability']
                variants[sku_id]['is_out_of_stock'] = \
                    availability in list_out_of_stock
                variants[sku_id]['is_in_store_only'] = \
                    availability in list_not_sold_online
                final_variants.append(variants[sku_id])
            product['variants'] = final_variants
    except (KeyError, ValueError):
        self.log(
            "Failed to extract limited stock info from %r." % response.url,
            WARNING
        )
    if reqs:
        return self.send_next_request(reqs, response)
    return product
def test_request_class(self):
    r = FormRequest("http://www.example.com")
    self._assert_serializes_ok(r, spider=self.spider)
    r = CustomRequest("http://www.example.com")
    self._assert_serializes_ok(r, spider=self.spider)
def get_login_page(self, response):
    """Builds the login request. Gets the ReCaptcha token
    and the image captcha value."""
    # Get the captcha options
    sitekey = response.selector.xpath(
        "//div[@class='g-recaptcha']/@data-sitekey").get("")
    imgcaptcha = response.selector.xpath(
        "//img[@id='imgCaptcha']/@src").get("")
    img_url = ("https://meuveiculo.prefeitura.sp.gov.br" +
               imgcaptcha.replace("..", ""))
    # Get cookies to download the captcha image
    cookies = response.headers.getlist('Set-Cookie')
    c = SimpleCookie()
    for cookie in cookies:
        c.load(cookie.decode("utf-8"))
    cookies_list = [{"name": key, "value": c[key].value} for key in c]
    # Set the cookies on a requests session
    session = requests.Session()
    for cookie in cookies_list:
        session.cookies.set(**cookie)
    # Save the captcha image
    r = session.get(img_url, stream=True)
    with open("captcha.jpg", 'wb') as f:
        f.write(r.content)
    imgcaptcha_txt, gcaptcha_txt = self.solve_captcha(sitekey, response.url)
    if not imgcaptcha_txt or not gcaptcha_txt:
        return
    # Get the hidden ASP.NET state fields needed for the request
    EVENTTARGET = response.selector.xpath(
        "//input[@id='__EVENTTARGET']/@value").get("")
    EVENTARGUMENT = response.selector.xpath(
        "//input[@id='__EVENTARGUMENT']/@value").get("")
    LASTFOCUS = response.selector.xpath(
        "//input[@id='__LASTFOCUS']/@value").get("")
    PageProdamSPOnChange = response.selector.xpath(
        "//input[@id='PageProdamSPOnChange']/@value").get("")
    PageProdamSPPosicao = response.selector.xpath(
        "//input[@id='PageProdamSPPosicao']/@value").get("")
    PageProdamSPFocado = response.selector.xpath(
        "//input[@id='PageProdamSPFocado']/@value").get("")
    VIEWSTATE = response.selector.xpath(
        "//input[@id='__VIEWSTATE']/@value").get("")
    VIEWSTATEGENERATOR = response.selector.xpath(
        "//input[@id='__VIEWSTATEGENERATOR']/@value").get("")
    EVENTVALIDATION = response.selector.xpath(
        "//input[@id='__EVENTVALIDATION']/@value").get("")
    tpAudio = response.selector.xpath(
        "//input[@id='__tpAudio']/@value").get("")
    strVal = response.selector.xpath(
        "//input[@id='__strVal']/@value").get("")
    frm_data = {
        '__EVENTTARGET': EVENTTARGET,
        '__EVENTARGUMENT': EVENTARGUMENT,
        '__LASTFOCUS': LASTFOCUS,
        'PageProdamSPOnChange': PageProdamSPOnChange,
        'PageProdamSPPosicao': PageProdamSPPosicao,
        'PageProdamSPFocado': PageProdamSPFocado,
        '__VIEWSTATE': VIEWSTATE,
        '__VIEWSTATEGENERATOR': VIEWSTATEGENERATOR,
        '__EVENTVALIDATION': EVENTVALIDATION,
        'txtRenavam': self.renavam,
        'txtplaca': self.placa,
        '__tpAudio': tpAudio,
        '__strVal': strVal,
        'txtValidacao': imgcaptcha_txt,
        'g-recaptcha-response': gcaptcha_txt,
        'btnMultas': 'Consultar'
    }
    login_url = "https://meuveiculo.prefeitura.sp.gov.br/forms/frmPesquisarRenavam.aspx"
    yield FormRequest(url=login_url, formdata=frm_data,
                      callback=self.login_me, errback=self.errback_func,
                      dont_filter=True)
def parse(self, response):
    item = BilibiliItem()
    content = json.loads(response.body)
    data = content['data']
    try:
        item['status'] = content['status'] if 'status' in data.keys() else 'False'
        item['mid'] = data['mid']
        item['name'] = data['name']
        item['sex'] = data['sex']
        item['rank'] = data['rank']
        item['face'] = data['face']
        try:
            item['regtime'] = time.strftime(
                "%Y-%m-%d %H:%M:%S", time.localtime(data['regtime']))
        except:
            item['regtime'] = 'miss'
        item['spacesta'] = data['spacesta']
        item['birthday'] = data['birthday'] if 'birthday' in data.keys() else 'miss'
        item['sign'] = data['sign']
        item['level'] = data['level_info']['current_level']
        item['officialverify_type'] = data['official_verify']['type']
        item['officialverify_desc'] = data['official_verify']['desc']
        item['viptype'] = data['vip']['vipType']
        item['vipstatus'] = data['vip']['vipStatus']
        item['toutu'] = data['toutu']
        item['toutuid'] = data['toutuId']
        item['coins'] = data['coins']
        print('successful1 get userinfo:' + str(data['mid']))
    except Exception as e:
        print('error1:', item['mid'], e)
    try:
        header = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Cookie': 'sid=kh3nfx8z; UM_distinctid=160f901ca0540-077f672036c946-3c604504-130980-160f901ca061d; buvid3=18869AAC-BC92-43DF-8929-333117E24C5231000infoc; fts=1516006134; LIVE_BUVID=AUTO5115160061339503; pgv_pvi=7086867456; rpdid=oqxiqwmllodosomkwxoxw; finger=edc6ecda',
            'Host': 'api.bilibili.com',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36'
        }
        url1 = ('https://api.bilibili.com/x/relation/stat?vmid=%d&jsonp=jsonp'
                % int(data['mid']))
        content1 = json.loads(requests.get(url=url1, headers=header).text)
        item['following'] = content1['data']['following']
        item['follower'] = content1['data']['follower']
        url2 = ('https://api.bilibili.com/x/space/upstat?mid=%d&jsonp=jsonp'
                % int(data['mid']))
        content2 = json.loads(requests.get(url=url2, headers=header).text)
        item['archiveview'] = content2['data']['archive']['view']
        item['article'] = content2['data']['article']['view']
        print('successful2 get userinfo:' + str(data['mid']))
    except Exception as e:
        item['following'] = 0
        item['follower'] = 0
        item['archiveview'] = 0
        item['article'] = 0
        print('miss2:', item['mid'], e)
    yield item
    for i in range(2, 100):
        form_data = {
            'mid': str(i),
            'csrf': '',
        }
        yield FormRequest(url=self.url, callback=self.parse,
                          headers=self.head, formdata=form_data)
def submit(self, response):
    # formdata must be a dict of field/value pairs, not a set
    # ('选课提交' means "submit course selection")
    yield FormRequest.from_response(response,
                                    formdata={'btnSubmit': '选课提交'},
                                    dont_filter=True)