def parse(self, response):
    html = response.body
    tree = build_div_tree(html)
    soup = tree.get_root().soup_get()
    write2file = self.write2file
    divs = []
    #@ carhome
    # Label constants (__navigation__, __picture__, __header_nav__, __other__)
    # are assumed to be defined elsewhere in the module.
    # __navigation__ = 0
    leaf = soup.find('div', class_='cartree')
    if leaf is not None:
        write2file(leaf, 'cartree', __navigation__)
        for a in leaf.find_all('a'):
            url = a['href']
            url = urljoin(self.base_url, url)
            yield scrapys.SplashRequest(url, callback=self.parse, args={'wait': 0.5})
        divs.append(leaf)
    leaf = soup.find('div', class_='uibox-con')
    if leaf is not None:
        write2file(leaf, 'uibox-con', __navigation__)
        for a in leaf.find_all('a'):
            url = a['href']
            url = urljoin(self.base_url, url)
            yield scrapys.SplashRequest(url, callback=self.parse, args={'wait': 0.5})
        divs.append(leaf)
    leaf = soup.find('div', class_='uibox-con-search')
    if leaf is not None:
        write2file(leaf, 'uibox-con-search', __navigation__)
        for a in leaf.find_all('a'):
            url = a['href']
            url = urljoin(self.base_url, url)
            yield scrapys.SplashRequest(url, callback=self.parse, args={'wait': 0.5})
        divs.append(leaf)
    ### __picture__ = 1
    # Alternative idea: mark everything as 0 first, then search for img tags directly
    # and label the pictures as 1; just mind the label column when reading the data back.
    leaf = soup.find('div', class_='main')
    if leaf is not None:
        write2file(leaf, 'main', __picture__)
        divs.append(leaf)
    ### __header_nav__ = 2
    leaf = soup.find('div', class_='header-nav')
    if leaf is not None:
        write2file(leaf, 'header-nav', __header_nav__)
        # appends a reference, but no deep copy is needed: leaf gets a new id once it is reassigned
        divs.append(leaf)
    ### __other__ = 3
    # every div not collected above is labelled as "other"
    for leaf in soup.find_all(lambda tag: tag.name == 'div' and tag not in divs):
        write2file(leaf, 'other', __other__)
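# The write2file helper called in parse() above is not included in this snippet; the
# sketch below is only an assumption of what it might do, inferred from the four-column
# training files (classify_<label>.data) loaded in start_requests and the feature names
# (link_num, img_num, content_length, tag_num) initialised there. The `name` argument
# (the div class) might be logged or stored as well; it is omitted here.
def write2file(self, leaf, name, label):
    # count simple structural features of the div with BeautifulSoup
    features = [
        len(leaf.find_all('a')),           # link_num
        len(leaf.find_all('img')),         # img_num
        len(leaf.get_text(strip=True)),    # content_length
        len(leaf.find_all(True)),          # tag_num
    ]
    # append "f1,f2,f3,f4,label" to the training-data file for this label
    with open('classify_{}.data'.format(label), 'a') as f:
        f.write(','.join(str(v) for v in features + [label]) + '\n')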
def after_login(self, response):
    """Makes the first requests to get movements after a successful login."""
    if "error" in response.url:
        msg = "Failed login"
        logger.warning(msg)
        raise scrapy.exceptions.CloseSpider(reason=msg)

    last_movement_date = Movement.get_last_date()
    # If no Movement has ever been parsed, set the start date to June 2015;
    # otherwise start a few days before the last movement date.
    if last_movement_date == date.min:
        last_movement_date = date(2015, 6, 1)
    else:
        last_movement_date = last_movement_date - timedelta(days=3)

    # Starts parsing of losses
    losses_request = scrapy_splash.SplashRequest(
        url=self.LOSSES_URL,
        callback=self.parse_movements,
        endpoint="execute",
        cache_args=["lua_source"],
        dont_filter=True,
        args={
            "lua_source": self.movements_lua,
            "moneymap_url": self.MONEYMAP_URL,
            "meseanno": last_movement_date.strftime("%m%Y"),
            "dopoAggiornamento": "false",
            "idBrand": "",
        },
        meta={"date": last_movement_date},
    )

    # Starts parsing of revenues
    revenues_request = scrapy_splash.SplashRequest(
        url=self.REVENUES_URL,
        callback=self.parse_movements,
        endpoint="execute",
        cache_args=["lua_source"],
        dont_filter=True,
        args={
            "lua_source": self.movements_lua,
            "moneymap_url": self.MONEYMAP_URL,
            "meseanno": last_movement_date.strftime("%m%Y"),
            "dopoAggiornamento": "false",
            "idBrand": "",
        },
        meta={"date": last_movement_date},
    )
    revenues_request.meta["date"] = last_movement_date

    return [losses_request, revenues_request]
def new_request(self, url, depth, retry, pagelink, country, territory, retries, links=None):
    """Return a new request object."""
    request = scrapy_splash.SplashRequest(
        url=self.get_next_page(url, pagelink),
        callback=self.parse,
        endpoint='execute',
        args={
            'lua_source': self.lua_script,
            'timeout': 90
        })
    request.meta['depth'] = depth
    request.dont_filter = retry
    request.meta['pagelink'] = pagelink
    request.meta['country'] = country
    request.meta['territory'] = territory
    request.meta['retries'] = retries
    return request
def parse(self, response):
    # Store CSV file
    filename = 'companies.csv'
    with open(filename, 'wb') as f:
        f.write(response.body)
    self.log('Saved file %s' % filename)

    # Read CSV file
    with open(filename) as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        line_count = 0
        for row in csv_reader:
            if line_count == 0:
                # skip the header row
                line_count += 1
            else:
                # for test only: process company codes that are a three-letter symbol
                if len(row) == 3:
                    if len(row[1]) == 3:
                        if line_count >= self.limit:
                            pass
                        else:
                            self.log('COMPANY ' + row[1])
                            request = scrapy_splash.SplashRequest(
                                url='https://www.asx.com.au/asx/share-price-research/company/' + row[1],
                                callback=self.parse_price,
                                args={
                                    "wait": 10,
                                    'timeout': 1800,
                                    'images': 0,
                                })
                            request.meta['code'] = row[1]
                            line_count += 1
                            yield request
def start_requests(self):
    urls = [
        self.facebook_base_url +
        f'/results/?q&content_types[0]=publication&sort_by=relevance&view=list&page={i}'
        for i in range(1, 60)
    ]
    lua_script = """
    function main(splash, args)
        splash.private_mode_enabled = false
        assert(splash:go(args.url))
        assert(splash:wait(10.0))
        return {
            html = splash:html(),
            png = splash:png()
        }
    end
    """
    for url in urls:
        yield scrapy_splash.SplashRequest(url=url,
                                          callback=self.parse,
                                          endpoint='execute',
                                          args={
                                              'lua_source': lua_script,
                                              'html': 1,
                                              'wait': 30
                                          })
def parse(self, response):
    for item in response.css('div.imgbox'):
        href = item.css('a::attr(href)').extract_first()
        href = urljoin(self.base_url, href)
        yield scrapys.SplashRequest(href, callback=self.img_parse, args={'wait': 0.5})
def parse_word(self, response):
    # with open('word_page.html', 'w+b') as f:
    #     f.write(response.body)

    # get the JSON payload embedded in the HTML
    json_text = response.xpath('//pre/text()').extract()[0]
    # get words
    data_dict = json.loads(json_text)
    total = data_dict['total']
    page = data_dict['page']

    # check whether a next page exists
    if page < total:
        url = urlunparse(
            ('http', 'dict.eudic.net', '/StudyList/GridData', '',
             'catid=&_search=false&rows=50&page={}&sidx=&sord=asc'.format(page + 1),
             ''))
        yield scrapy_splash.SplashRequest(
            url=url,
            callback=self.parse_word,
            headers=self.headers_word,
            endpoint='execute',
            cache_args=['lua_source'],
            args={'lua_source': script},
            meta={'cookiejar': response.meta['cookiejar']})

    # save each word
    for word_dict in data_dict['rows']:
        word = EudicWordsItem()
        word['word'] = word_dict['id']
        yield word
def start_requests(self):
    self.link_server = redis.StrictRedis(host='127.0.0.1', port=6379, db=1)
    self.img_server = redis.StrictRedis(host='127.0.0.1', port=6379, db=2)

    # load the data and build the model
    data = np.loadtxt('classify_0.data', dtype=float, delimiter=',',
                      usecols=(0, 1, 2, 3, 4))
    x, y = np.split(data, (4,), axis=1)
    # parameter setup
    for i in range(1, 4):
        # whether to skip the "other" class: it is hard to tell apart; "other" is in classify_3
        data = np.loadtxt('classify_{}.data'.format(i), dtype=float,
                          delimiter=',', usecols=(0, 1, 2, 3, 4))
        if len(data) > 150:
            data = data[0:150, :]
        x_temp, y_temp = np.split(data, (4,), axis=1)
        # parameter setup
        x = np.vstack((x, x_temp))
        y = np.vstack((y, y_temp))
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, random_state=1, train_size=0.8)

    # svm
    # self.model = svm.SVC(kernel='rbf')
    # decision tree
    # self.model = tree.DecisionTreeClassifier()
    # knn
    # self.model = neighbors.KNeighborsClassifier()
    # bayes
    self.model = naive_bayes.MultinomialNB()
    # MLP
    # self.model = neural_network.MLPClassifier(
    #     solver='lbfgs', activation='tanh')

    # fit the model
    self.model.fit(x_train, y_train.ravel())

    for item in self.start_urls:
        yield scrapys.SplashRequest(
            item, callback=self.parse, args={'wait': 0.5})
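# A minimal, self-contained sketch (assumption only -- the spider's own prediction step
# is not shown here) of how a classifier trained as above would be queried. The four
# feature columns mirror the classify_*.data files: link_num, img_num, content_length,
# tag_num; the labels follow the parse() comments (0 navigation, 1 picture, 2 header-nav,
# 3 other). The feature rows below are made up purely for illustration.
import numpy as np
from sklearn import naive_bayes

X_demo = np.array([[12.0, 0.0, 340.0, 25.0],
                   [3.0, 18.0, 90.0, 40.0]])
y_demo = np.array([0, 1])
demo_model = naive_bayes.MultinomialNB()
demo_model.fit(X_demo, y_demo)
predicted_label = demo_model.predict([[10.0, 1.0, 300.0, 20.0]])  # array with one label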
def parse_colleges(self, response):
    #-------------------------------------------#
    startUrl = "https://sports.usatoday.com/ncaa/salaries/"
    # basketball
    # startUrl = "https://sports.usatoday.com/ncaa/salaries/mens-basketball/coach/"
    #-------------------------------------------#
    # next step is to simply create a dict that maps
    all_them = "/html/body/div[6]/div[4]/div[2]/div[1]/div/div[1]/div/section/div[2]/table/tbody/tr/td[@class='']"
    lol = response.selector.xpath(all_them).extract()
    count = 0
    # click every second cell out of four
    to_click = True
    for one in lol:
        count += 1
        # print(str(count) + " " + one)
        if (count % 2 == 0 and to_click):
            # request and click on button
            to_click = False
            # print(str(count) + " " + one)
        if (count % 4 == 0):
            to_click = True
    # count is now the total number of colleges; I need 1..count
    # total_num = 131
    total_num = 81  # basketball
    for one in range(1, total_num):
        self.listofcolleges.add(one)

    while (len(self.listofcolleges) != 0):
        counter = self.listofcolleges.pop()
        button = self.return_css_lol(counter)
        time.sleep(3)
        LUA_SCRIPT = """
        function main(splash)
            assert(splash:go(splash.args.url))
            local element = splash:select('%s')
            local bounds = element:bounds()
            assert(element:mouse_click{x=bounds.width/3, y=bounds.height/3})
            assert(splash:wait(5))
            return splash:html()
        end
        """ % (button)
        # print(button)
        # print(LUA_SCRIPT)
        SCRAPY_ARGS = {'lua_source': LUA_SCRIPT}
        the_request = scrapy_splash.SplashRequest(
            url=startUrl,
            callback=self.parse_college,
            endpoint='execute',
            args=SCRAPY_ARGS)
        the_request.meta['counter'] = counter
        yield the_request
def start_requests(self):
    for beerId in list(
            filter(lambda x: x not in self.ids_seen, range(200, 300))):
        yield scrapy_splash.SplashRequest(
            self.start_url + str(beerId) + '/',
            self.parse,
            endpoint='execute',  # lua_source is only honoured by the 'execute' endpoint
            args={'lua_source': self.lua_script},
            meta={'id': beerId})
def start_requests(self):
    for i in range(len(self.start_urls)):
        yield scrapy_splash.SplashRequest(url=self.start_urls[i],
                                          callback=self.collect_data,
                                          dont_filter=True,
                                          endpoint='render.html',
                                          args={'wait': 4},
                                          meta={'url': self.start_urls[i]})
def start_requests(self): """ Handle start_requests directly so we can explicitly return SplashRequest objects with a three second wait. """ for url in self.start_urls: yield scrapy_splash.SplashRequest(url, self.parse, endpoint="execute", args={"lua_source": lua_script})
def request(self, url, domain):
    return scrapy_splash.SplashRequest(
        url,
        endpoint='render.json',
        args={
            'png': 1,
            'html': 1
        },
        callback=partial(self.parse, domain=domain),
    )
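# A sketch of a matching callback (assumption only; the real self.parse is not shown).
# With endpoint='render.json', scrapy-splash delivers a SplashJsonResponse whose .data
# attribute holds the decoded JSON result, with the screenshot base64-encoded.
import base64

def parse(self, response, domain=None):
    rendered_html = response.data.get('html')                     # rendered DOM as a string
    screenshot = base64.b64decode(response.data.get('png', ''))   # PNG bytes
    self.logger.info('rendered %s (%s), %d bytes of PNG',
                     response.url, domain, len(screenshot))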
def start_requests(self): """ initial requests for the crawler :return: """ urls = self.build_urls() for url in urls: yield scrapy_splash.SplashRequest(url=url, callback=self.parse_zone)
def start_requests(self):
    self.web_parameter = {
        'link_num': 0,
        'img_num': 0,
        'content_length': 0,
        'tag_num': 0,
    }
    for item in self.start_urls:
        yield scrapys.SplashRequest(item, callback=self.parse, args={'wait': 0.5})
def parse(self, response):
    for item in response.css('a.tsla-header-nav--list_link'):
        url = item.css('::attr(href)').extract_first()
        next_page = response.urljoin(url)
        key = url.split('/')[-1]
        if key in self.callbacks:
            request = scrapys.SplashRequest(
                next_page, callback=self.callbacks[key], args={'wait': 0.5})
            request.meta['key'] = key
            yield request
        else:
            yield None
def img_parse(self, response):
    # click through each image
    for item in response.css('.uibox-con ul li'):
        url = item.css('a::attr(href)').extract_first()
        url = urljoin(self.base_url, url)
        request = scrapys.SplashRequest(url, callback=self.return_item, args={'wait': 0.5})
        request.meta['brand'] = response.meta['brand']
        request.meta['series'] = response.meta['series']
        request.meta['kind'] = response.meta['kind']
        yield request
def start_requests(self): urls = ['https://research.yandex.com/publications'] for url in urls: yield scrapy_splash.SplashRequest(url=url, callback=self.parse, endpoint='/execute', args={ 'html': 1, 'lua_source': self.lua_script, 'wait': 30, })
def brand_parse(self, response):
    # car-series options
    for item in response.css('.cartree ul li.current dl dd a'):
        # for item in response.css('#series_2368'):  # for debugging; crawl with the selector above
        url = item.css('::attr(href)').extract_first()
        url = urljoin(self.base_url, url)
        series = item.css('::text').extract_first()
        request = scrapys.SplashRequest(url, callback=self.series_parse, args={'wait': 0.5})
        request.meta['brand'] = response.meta['brand']
        request.meta['series'] = series.strip()
        yield request
def kind_parse(self, response):
    # pick the exterior ("车身外观") option
    for item in response.css('div.search-pic li'):
        if item.css('::text').extract_first() == '车身外观':
            url = item.css('a::attr(href)').extract_first()
            url = urljoin(self.base_url, url)
            request = scrapys.SplashRequest(url, callback=self.img_parse, args={'wait': 0.5})
            request.meta['brand'] = response.meta['brand']
            request.meta['series'] = response.meta['series']
            request.meta['kind'] = response.meta['kind']
            yield request
def start_requests(self):
    urls = [
        'https://www.instagram.com/explore/tags/fashion/',
    ]
    for url in urls:
        yield scrapy_splash.SplashRequest(
            url,
            self.parse,
            endpoint='execute',
            args={
                'wait': 2,
                'lua_source': script2
            },
        )
def parse(self, response):
    pages = response.xpath(r"//a[re:test(@id, '\d$')]")
    for page_aid in pages:
        page_id = page_aid.xpath("text()").extract_first()
        req_url = self.start_urls[0] + page_aid.xpath('@href').extract()[0]
        request = scrapy_splash.SplashRequest(
            req_url,
            self.parse_images,
            args={'wait': 2},
            slot_policy=scrapy_splash.SlotPolicy.PER_DOMAIN,
        )
        request.meta['page'] = int(page_id)
        request.meta['n_pages'] = len(pages)
        yield request
def start_requests(self):
    # start_urls = ['https://item.jd.com/6946605.html']  # encoded as 'gbk'
    start_urls = ['https://music.163.com']
    change_url = 'https://music.163.com/artist?id=3684'
    args = {
        'html': 1,
        'png': 1,
        'wait': 0.5,
    }
    for url in start_urls:
        # scrapy-splash usage, method one
        # note: splash_url overrides the Splash server address (SPLASH_URL), not the page being rendered
        yield scrapy_splash.SplashRequest(url,
                                          self.parse,
                                          args={'wait': 2.5},
                                          splash_url=change_url)
def start_requests(self):
    if (len(self.still_need) == 0):
        print("All good I'm done")
        quit()
    #-----------------------------------------#
    startUrl = "https://sports.usatoday.com/ncaa/salaries/"
    # basketball
    # startUrl = "https://sports.usatoday.com/ncaa/salaries/mens-basketball/coach/"
    #-----------------------------------------#
    yield scrapy_splash.SplashRequest(url=startUrl,
                                      callback=self.parse_colleges,
                                      args={'wait': 0.5})
def parse(self, response):
    # brand options
    for item in response.css('.cartree ul li a'):
        # for item in response.css('#b134'):  # for debugging; crawl with the selector above
        url = item.css('::attr(href)').extract_first()
        url = urljoin(self.base_url, url)
        if self.bf.isContains(url):
            continue
        else:
            self.bf.insert(url)
        brand = item.css('::text').extract_first()
        request = scrapys.SplashRequest(url, callback=self.brand_parse, args={'wait': 0.5})
        request.meta['brand'] = brand.strip()
        yield request
def series_parse(self, response):
    # pick the car-model options
    for item in response.css('div.search-pic dl'):
        urls = item.css('dd a::attr(href)').extract()
        kinds = item.css('dd a::text').extract()
        years = item.css('dt::text').extract()
        for url, year, kind in zip(urls, years, kinds):
            url = urljoin(self.base_url, url)
            kind = year + kind
            request = scrapys.SplashRequest(url, callback=self.kind_parse, args={'wait': 0.5})
            request.meta['brand'] = response.meta['brand']
            request.meta['series'] = response.meta['series']
            request.meta['kind'] = kind.strip()
            yield request
def start_requests(self):
    url = 'http://www.inveno.cn/'
    yield scrapy_splash.SplashRequest(
        url,
        self.parse_result,
        args={
            # optional; parameters passed to Splash HTTP API
            'wait': 0.5,
            # 'url' is prefilled from request url
            # 'http_method' is set to 'POST' for POST requests
            # 'body' is set to request body for POST requests
        },
        splash_url='http://192.168.1.125:8050/',  # optional; overrides SPLASH_URL
        slot_policy=scrapy_splash.SlotPolicy.PER_DOMAIN,  # optional
    )
def after_login(self, response):
    # with open('login_page.html', 'w+b') as f:
    #     f.write(response.body)

    # http://dict.eudic.net/StudyList/GridData?catid=&_search=false&rows=5&page=1&sidx=&sord=asc
    url = urlunparse(
        ('http', 'dict.eudic.net', '/StudyList/GridData', '',
         'catid=&_search=false&rows=50&page=1&sidx=&sord=asc', ''))
    return scrapy_splash.SplashRequest(
        url=url,
        callback=self.parse_word,
        headers=self.headers_word,
        endpoint='execute',
        cache_args=['lua_source'],
        args={'lua_source': script},
        meta={'cookiejar': response.meta['cookiejar']})
def parse_price(self, response):
    dividens = response.xpath('//td[@class="overview-dividends"]/table//tr')
    json = {}  # note: this local shadows the json module within this method
    json[response.meta['code']] = {
        "summary": {
            "Summary_Value":
            response.xpath('//span[@ng-show="share.last_price"]/text()').extract_first(),
            "market_cap":
            response.xpath('//div[@ng-switch="share.market_cap"]/span/text()').extract_first(),
            "dividens": {
                "most_recent":
                dividens[0].xpath('td[2]/span//text()').extract_first(),
                "Dividend ex-date":
                dividens[1].xpath('td[2]//text()').extract_first(),
                "Dividend pay date":
                dividens[2].xpath('td[2]//text()').extract_first(),
                "Franking":
                dividens[3].xpath('td[2]//text()').extract_first(),
                "Annual dividend yield":
                dividens[4].xpath('td[2]/span//text()').extract_first(),
            }
        }
    }
    request = scrapy_splash.SplashRequest(
        url='https://www.asx.com.au/asx/share-price-research/company/' +
            response.meta['code'] + '/statistics/shares',
        callback=self.parse_statistic,
        args={
            # optional; parameters passed to Splash HTTP API
            'timeout': 1800,
            "wait": 10,
            'images': 0,
            # 'url' is prefilled from request url
            # 'http_method' is set to 'POST' for POST requests
            # 'body' is set to request body for POST requests
        },
    )
    request.meta['json'] = json
    request.meta['code'] = response.meta['code']
    yield request
def parse_page(self, response):
    content_selector = content = response.css('div#content div.wiki-content')
    title = response.css('#title-text a::text')[0].extract()
    bread_crumbs = response.css('ol#breadcrumbs a::text').extract()
    content = content_selector[0].extract()
    path = ''
    for bread_crumb in bread_crumbs:
        path = path + bread_crumb + '/'

    # image info
    content = content.replace('&amp;', '&')
    imgs = content_selector.css('img')
    i = 1
    for img in imgs:
        src = img.css('::attr(src)')[0].extract()
        img_name = title + str(i) + '.png'
        # content = content.decode('utf-8').replace(src.decode('utf-8'), img_name).encode('utf-8')
        content = content.replace(src, img_name)
        i += 1
        img_url = self.base_url + src
        yield scrapy_splash.SplashRequest(
            url=img_url,
            callback=self.parse_img,
            args={
                'wait': 0.1,
                'lua_source': script_img,
            },
            meta={
                'img_name': img_name,
                'path': path
            },
            endpoint='execute',  # optional; default is render.html
            splash_url='http://127.0.0.1:8050',  # optional; overrides SPLASH_URL
            slot_policy=scrapy_splash.SlotPolicy.PER_DOMAIN,  # optional
        )

    item = ConfluenceItem()
    item['name'] = title
    item['path'] = path
    item['content'] = content
    yield item