def parse(self, response):
    """
    The lines below are a spider contract. For more info see:
    http://doc.scrapy.org/en/latest/topics/contracts.html

    @url http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/
    @scrapes name
    """
    sites = response.xpath('//a[@href]')
    print "sites", sites
    items = []
    for site in sites:
        item = Website()
        item['name'] = site.xpath('@alt').extract()
        item['url'] = site.xpath('@href').extract()
        # print item['url']
        # item['url'] = site.xpath(
        #     'a/@href').extract_first().strip()
        item['description'] = site.xpath('text()').extract()
        for i in item['description']:
            print i
        items.append(item)
    return items
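# Note: every parse callback in this collection fills a `Website` item that is
# defined elsewhere in its own project. As a minimal sketch (an assumption, not
# any of the original definitions), an item declaring the fields used by the
# simpler DMOZ-style spiders could look like this:
import scrapy


class Website(scrapy.Item):
    name = scrapy.Field()
    url = scrapy.Field()
    description = scrapy.Field()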
def parse(self, response):
    """
    The lines below are a spider contract. For more info see:
    http://doc.scrapy.org/en/latest/topics/contracts.html

    @url http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/
    @scrapes name
    """
    sel = Selector(response)
    # sites = sel.xpath('//ul[@class="directory-url"]/li')
    sites = sel.xpath(self.rules.filters['root']['0'])
    items = []
    for site in sites:
        item = Website()
        # item['name'] = site.xpath('a/text()').extract()
        item['quarks_title'] = site.xpath(
            self.rules.filters['title']['0']).extract()
        item['quarks_link'] = site.xpath(
            self.rules.filters['link']['0']).extract()
        item['quarks_description'] = site.xpath(self.rules.filters['description']['0'])\
            .re(self.rules.filters['description']['1'])
        # item['quarks_pubdate'] = site.xpath(self.rules.filters['pubDate']['0'])
        items.append(item)
    return items
def parse(self, response):
    """
    The lines below are a spider contract. For more info see:
    http://doc.scrapy.org/en/latest/topics/contracts.html

    @url http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/
    @scrapes name
    """
    sel = Selector(response)
    sites = sel.xpath('//ul[@class="directory-url"]/li')
    items = []
    for site in sites:
        item = Website()
        item['name'] = site.xpath('a/text()').extract()
        item['url'] = site.xpath('a/@href').extract()
        item['description'] = site.xpath('text()').re('-\s[^\n]*\\r')
        items.append(item)
    # The original script only had "return items". The lines below were added
    # to also write the scraped items to a file; they must run before the
    # return statement, otherwise they are unreachable.
    saveFile = open('crawl_data.txt', 'w')
    saveFile.write(str(items))
    saveFile.close()
    return items
def parse(self, response):
    """
    The lines below are a spider contract. For more info see:
    http://doc.scrapy.org/en/latest/topics/contracts.html

    @url http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/
    @scrapes name
    """
    # sel = Selector(response)
    # sites = sel.xpath('//ul[@class="directory-url"]/li')
    sites = response.css(
        '#site-list-content > div.site-item > div.title-and-desc')
    items = []
    for site in sites:
        item = Website()
        item['name'] = site.css(
            'a > div.site-title::text').extract_first().strip()
        item['url'] = site.xpath('a/@href').extract_first().strip()
        item['description'] = site.css(
            'div.site-descr::text').extract_first().strip()
        items.append(item)
    return items
def parse(self, response):
    """
    The lines below are a spider contract. For more info see:
    http://doc.scrapy.org/en/latest/topics/contracts.html
    Use the `check` command to run the contract checks.

    @url http://dmoztools.net/Computers/Programming/Languages/Python/Resources/
    @scrapes name
    """
    nodes = response.xpath(
        '//div[contains(@class, "site-item")]/div[contains(@class, "title-and-desc")]'
    )
    items = []
    for node in nodes:
        item = Website()
        item['name'] = node.xpath(
            'a/div[contains(@class, "site-title")]/text()').re_first(
                '^[\s\r\n]*(.*[^\s])[\s\r\n]*')
        item['url'] = node.xpath('a/@href').extract_first()
        item['description'] = node.xpath(
            'div[contains(@class, "site-descr")]/text()').re_first(
                '^[\s\r\n]*(.*)[\s\r\n]*')
        items.append(item)
    return items
def parse(self, response):
    hxs = HtmlXPathSelector(response)
    items = []
    global page_request, articl_request
    item = Website()
    item['articl_url'] = self.parse_articl_url(response)
    page_url = self.parse_page_url(response)
    item['page_url'] = []
    if page_url != 0:
        item['page_url'].append(page_url)
    item['content'] = []
    item['articl_name'] = []
    # print "articl url:%s\nlen:%d\n" % (item['articl_url'], len(item['articl_url']))
    for t_count in range(len(item['articl_url'])):
        articl_url = item['articl_url'][t_count]
        print "##################request url = %s\n" % (articl_url)
        articl_request = Request(articl_url, callback=self.parse_articl_content)
        articl_request.meta['item'] = item
        yield articl_request
        if t_count == len(item['articl_url']) - 1:
            if item['page_url']:
                # print '$$$$$$$$$$$$$$$$$$$$$$$$page url %s\n' % (item['page_url'][0])
                page_request = Request(item['page_url'][0], callback=self.parse)
                yield page_request
            else:
                # Yielding a plain string from a spider callback is not valid
                # in Scrapy, so just report that the last page was reached.
                print 'go to the last page, done!!!\n'
def parse_items(self, response):
    # print response
    sel = Selector(response)
    sites = sel.xpath('//ul/li')
    if response.status in [404, 500, 303]:
        raise CloseSpider("Met the page which doesn't exist")
    url = response.request.meta['url']
    print "ss"
    print url
    items = []
    for site in sites:
        item = Website()
        item['name'] = site.xpath('a/text()').extract()
        item['url'] = site.xpath('a/@href').extract()
        item['description'] = site.xpath('text()').re('-\s[^\n]*\\r')
        if len(item['url']) != 0:
            if len(str(item['url'][0])) != 1:
                new_url = str(self.sta[0]) + str(item['url'][0])
                yield Request(new_url, meta={'item': item}, callback=self.parse_items)
        self.dic[url] = items
        items.append(item)
    yield self.collect_item(self.dic)
def parse_dir_contents(self, response):
    genre_text = response.xpath(
        "//body//div[@id='mw-pages']//h2//span//text()").extract()[0]
    genre = genre_text.split("Genre/")[-1].strip('"')
    artists = response.xpath(
        '//body//div[@id="mw-pages"]//div[@class="mw-content-ltr"]')
    artists = artists.xpath('//tr//ul//li//a')
    for sel in artists:
        item = Website()
        url = sel.xpath('@href').extract()[0]
        url = response.urljoin(url)
        title = sel.xpath('@title').extract()[0]
        item['url'] = url
        item['title'] = title
        item['genre'] = genre
        yield item
def parse1(self, response):
    print response
    sel = Selector(response)
    sites = sel.xpath('//ul/li')
    items = []
    urls = []
    for site in sites:
        item = Website()
        item['name'] = site.xpath('a/text()').extract()
        item['url'] = site.xpath('a/@href').extract()
        item['description'] = site.xpath('text()').re('-\s[^\n]*\\r')
        new_url = str(self.start_urls[0]) + str(item['url'][0])
        yield Request(new_url, meta={
            'item': item,
            'url': new_url
        }, callback=self.parse_items)
        # print item['name'], item['url'], item['description']
        items.append(item)
    self.dic[str(self.start_urls[0])] = items
    print self.dic
    print "result"
def parse(self, response):
    hxs = HtmlXPathSelector(response)
    sites = hxs.select('//ul/li')
    items = []
    for site in sites:
        item = Website()
        item['name'] = site.select('a/text()').extract()
        item['url'] = site.select('a/@href').extract()
        item['description'] = site.select('text()').extract()
        items.append(item)
    return items
def parse(self, response):
    """
    The lines below are a spider contract. For more info see:
    http://doc.scrapy.org/en/latest/topics/contracts.html

    @url http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/
    @scrapes name
    """
    sel = Selector(response)
    # sites = sel.xpath('//ul[@class="directory-url"]/li')
    # sites = sel.xpath(self.rules.filters['root']['0'])
    bodies = sel.xpath(self.rules.filters['body']['0'])
    items = []
    # for body in bodies:
    body = bodies
    for body in bodies:
        item = Website()
        # item['name'] = site.xpath('a/text()').extract()
        # jd_root_nav
        item['jddj_root_nav'] = body.xpath(
            self.rules.filters['root_nav']['0']).extract()
        # These do not need two separate variables, because 'jd_root_nav' is
        # already a list:
        # item['jd_fenlie'] = body.xpath(self.rules.filters['fenlie']['0']).extract()
        # item['jd_fenglie2'] = body.xpath(self.rules.filters['fenglie2']['0']).extract()
        # jd_product_intro
        # item['jd_product_intro'] = body.xpath(self.rules.filters['product_intro']['0'])
        item['jddj_spec_n1'] = body.xpath(
            self.rules.filters['spec_n1']['0']).extract()
        # The 'p_ad' rule looks correct but does not select anything; reason unknown.
        item['jddj_p_ad'] = body.xpath(
            self.rules.filters['p_ad']['0']).extract()
        # The 'jd_price' rule looks correct but does not select anything; reason unknown.
        item['jddj_jd_price'] = body.xpath(
            self.rules.filters['jd_price']['0']).extract()
        # item['jd_product_detail_1'] = body.xpath(self.rules.filters['product_detail_1']['0'])
        # item['jd_parameter2'] = body.xpath(self.rules.filters['parameter2']['0'])
        item['jddj_canshu'] = body.xpath(
            self.rules.filters['canshu']['0']).extract()
        # jd_promises
        # item['jd_promises'] = body.xpath(self.rules.filters['promises']['0'])
        # item['jd_zhengpin'] = body.xpath(self.rules.filters['zhengpin']['0']).extract()
        # jd_comment, this is the comment list
        # item['jd_comment'] = body.xpath(self.rules.filters['comment']['0'])
        # add to the items list
        items.append(item)
    return items
def parse(self, response):
    """
    The lines below are a spider contract. For more info see:
    http://doc.scrapy.org/en/latest/topics/contracts.html

    @url http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/
    @scrapes name
    """
    sel = Selector(response)
    sites = sel.xpath('//ul[@class="directory-url"]/li')
    items = []
    for site in sites:
        item = Website()
        item['name'] = site.xpath('a/text()').extract()
        item['url'] = site.xpath('a/@href').extract()
        item['description'] = site.xpath('text()').re('-\s[^\n]*\\r')
        items.append(item)
        item.self_print()
    return items
def parse_single(self, response):
    w = Website()
    path = response.url.split('/')
    number = path[-1].split('.')[0].lstrip('p0')
    author = path[-2]
    image_base = path[:-1]
    hxs = Selector(response)
    image = hxs.xpath('//img/@src').extract()[0]
    image_base.append(image)
    w['number'] = number
    w['author'] = author
    w['image_urls'] = ['/'.join(image_base)]
    return w
def parse(self, response):
    sel = Selector(response)
    # sites = sel.xpath('//ul/li')
    sites = response.css('a')
    items = []
    for site in sites:
        item = Website()
        item['name'] = site.xpath('@title').extract()
        item['url'] = site.xpath('@href').extract()
        items.append(item)
    return items
def parse(self, response):
    item = None
    print("===============")
    """
    The lines below are a spider contract. For more info see:
    http://doc.scrapy.org/en/latest/topics/contracts.html

    @url http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/
    @scrapes name
    """
    if 'item' in response.meta:
        item = response.meta['item']
    if item is None:
        item = Website()
        item['url'] = self.start_urls[0]
        item['depth'] = 0
    hxs = Selector(response)
    hrefs = hxs.xpath("/html/body//@href").extract()
    item['description'] = '测试'  # placeholder text (Chinese for "test")
    item['title'] = 'title'
    yield item
    for a in hrefs:
        item_detail = Website()
        item_detail['depth'] = item['depth'] + 1
        item_detail['url'] = a
        if item_detail['depth'] > self.depth:
            return
        else:
            if a.startswith('http') and a.find('python') >= 0:
                yield scrapy.Request(url=a,
                                     meta={'item': item_detail},
                                     callback=self.parse,
                                     dont_filter=True)
def parse(self, response):
    """
    The lines below are a spider contract. For more info see:
    http://doc.scrapy.org/en/latest/topics/contracts.html

    @url http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/
    @scrapes name
    """
    sites = response.selector.xpath('//h3')
    items = []
    for site in sites:
        item = Website()
        # `site` is already a Selector, so query it directly with a relative
        # XPath; the original `site.selector.xpath('//p')` raises an
        # AttributeError.
        item['name'] = site.xpath('.//p').extract_first().strip()
        items.append(item)
        print(items)
def parse(self, response):
    """
    The lines below are a spider contract. For more info see:
    http://doc.scrapy.org/en/latest/topics/contracts.html

    @url http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/
    @scrapes name
    """
    sel = Selector(response)
    sites = sel.xpath('//ul[@class="directory-url"]/li')
    items = []
    for site in sites:
        item = Website()
        item['name'] = site.xpath('title/text()').extract()
        items.append(item)
    return items
def parse(self, response):
    """
    The lines below are a spider contract. For more info see:
    http://doc.scrapy.org/en/latest/topics/contracts.html

    @url http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/
    @scrapes name
    """
    hxs = HtmlXPathSelector(response)
    sites = hxs.select('//div[@id="main-content"]//td')
    items = []
    for site in sites:
        item = Website()
        item['url'] = site.select('a/@href').extract()
        items.append(item)
        # dict(item)
    return items
def parse_item(self, response):
    p = Pinyin()
    items = []
    sel = Selector(response)
    base_url = get_base_url(response)
    sites_even = sel.css('table.tablelist tr.even')
    for site in sites_even:
        item = Website()
        item['name'] = site.css('.l.square a').xpath('text()').extract()[0]
        item['description'] = site.css(
            'tr > td:nth-child(2)::text').extract()[0]
        url = site.css('tr > td:nth-child(4)::text').extract()[0]
        item['url'] = p.get_pinyin(url, u'')
        item['address'] = url
        item['num'] = int(
            site.css('tr > td:nth-child(3)::text').extract()[0])
        item['date'] = site.css('tr > td:nth-child(5)::text').extract()[0]
        item['uid'] = item['date'] + '-' + url + '-' + item['name']
        items.append(item)
    return items
def parse(self, response):
    """
    The lines below are a spider contract. For more info see:
    http://doc.scrapy.org/en/latest/topics/contracts.html

    @url http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/
    @scrapes name
    """
    hxs = HtmlXPathSelector(response)
    sites = hxs.select('//ul[@class="directory-url"]/li')
    items = []
    for site in sites:
        item = Website()
        item['name'] = site.select('a/text()').extract()
        item['url'] = site.select('a/@href').extract()
        item['description'] = site.select('text()').re('-\s([^\n]*?)\\n')
        items.append(item)
    return items
def parse(self, response):
    sel = Selector(response)
    sites = sel.xpath('//table/tbody')
    for site in sites:
        item = Website()
        # Query sub-selectors with .xpath(); the .select() alias is a
        # deprecated HtmlXPathSelector API and is not available here.
        item['program_name'] = site.xpath(
            './tr[1]/td[3]/a/text()').extract()
        item['license_no'] = site.xpath(
            './tr[1]/td[5]/a/text()').extract()
        item['area'] = site.xpath('./tr[2]/td[2]/text()').extract()
        item['open_time'] = site.xpath('./tr[2]/td[4]/text()').extract()
        item['program_type'] = site.xpath(
            './tr[3]/td[2]/text()').extract()
        item['sale_phone_no'] = site.xpath(
            './tr[3]/td[4]/text()').extract()
        item['program_addr'] = site.xpath(
            './tr[4]/td[2]/text()').extract()
        yield item
    sm = send_email()
    sm.send_email()
def parse_items(self, response):
    """
    The lines below are a spider contract. For more info see:
    http://doc.scrapy.org/en/latest/topics/contracts.html

    @url http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/
    @scrapes name
    """
    # print response
    sel = Selector(response)
    sites = sel.xpath('//ul/li')
    """print "sunny"
    print sites
    print "prakash"
    """
    if response.status in [404, 500, 303]:
        raise CloseSpider("Met the page which doesn't exist")
    url = response.request.meta['url']
    print "ss"
    print url
    # urls = []
    items = []
    for site in sites:
        item = Website()
        item['name'] = site.xpath('a/text()').extract()
        item['url'] = site.xpath('a/@href').extract()
        item['description'] = site.xpath('text()').re('-\s[^\n]*\\r')
        new_url = str(self.start_urls[0]) + str(item['url'][0])
        # print new_url
        # urls.append(new_url)
        yield Request(new_url, meta={'item': item}, callback=self.parse_items)
        # print item['name'], item['url'], item['description']
        items.append(item)
    self.dic[url] = items
    print "final"
    print self.dic
def parse(self, response):
    """
    The lines below are a spider contract. For more info see:
    http://doc.scrapy.org/en/latest/topics/contracts.html

    @url http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/
    @scrapes name
    """
    sel = Selector(response)
    sites = sel.xpath('//div[@class="title-and-desc"]')
    items = []
    for site in sites:
        item = Website()
        item['name'] = site.xpath('a/div/text()').extract()
        item['url'] = site.xpath('a/@href').extract()
        item['description'] = site.xpath(
            'div[@class="site-descr "]/text()').extract()
        items.append(item)
    return items
def parse_category(self, response):
    # The main selector we're using to extract data from the page
    main_selector = HtmlXPathSelector(response)

    # The XPath to website links in the directory page
    xpath = '//td[descendant::a[contains(@href, "#pagerank")]]/following-sibling::td/font'

    # Get a list of (sub) selectors to each website node pointed by the XPath
    sub_selectors = main_selector.select(xpath)

    # Iterate over the sub-selectors to extract data for each website
    for selector in sub_selectors:
        item = Website()
        l = XPathItemLoader(item=item, selector=selector)
        l.add_xpath('name', 'a/text()')
        l.add_xpath('url', 'a/@href')
        l.add_xpath('description', 'font[2]/text()')

        # Here we populate the item and yield it
        yield l.load_item()
def parse_dir_contents(self, response):
    """
    The lines below are a spider contract. For more info see:
    http://doc.scrapy.org/en/latest/topics/contracts.html

    @url http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/
    @scrapes name
    """
    sites = response.xpath(
        '//table[@id="ctl00_cphMain_gridResult"]/tr/td[3]')
    # sites = response.css('#site-list-content > div.site-item > div.title-and-desc')
    items = []
    for site in sites:
        item = Website()
        item['name'] = site.xpath('a/text()').extract()
        item['url'] = site.xpath('a/@href').extract()
        # 'a/@target' is an XPath expression, so it must be queried with
        # .xpath() rather than .css().
        item['description'] = site.xpath('a/@target').extract()
        items.append(item)
    return items
def parse1(self, response):
    sel = Selector(response)
    sites = sel.xpath('//ul/li')
    sites1 = sel.xpath('//a/@href').extract()
    print sites
    print "ssssssss"
    print sites1
    items = []
    urls = []
    for site in sites:
        item = Website()
        item['name'] = site.xpath('a/text()').extract()
        item['url'] = site.xpath('a/@href').extract()
        item['description'] = site.xpath('text()').re('-\s[^\n]*\\r')
        # yield self.collect_item(item)
        if len(item['url']) != 0:
            if len(str(item['url'][0])) != 1:
                new_url = str(self.sta[0]) + str(item['url'][0])
                yield Request(new_url, meta={'item': item, 'url': new_url},
                              callback=self.parse_items)
        items.append(item)
        yield self.collect_item(item)
    self.dic[str(self.start_urls[0])] = items
def parse1(self, response):
    print response
    sel = Selector(response)
    texts = sel.xpath("//input[@type='text']")
    print "ttttttt"
    print texts
    print "sssssss"
    sites = sel.xpath("//ul/li[@onclick]")
    print sites
    items = []
    urls = []
    for site in sites:
        item = Website()
        item['name'] = site.xpath('a/text()').extract()
        item['url'] = site.xpath('a/@href').extract()
        item['description'] = site.xpath('text()').re('-\s[^\n]*\\r')
        yield self.collect_item(item)
        if len(item['url']) != 0:
            if len(str(item['url'][0])) != 1:
                new_url = str(self.sta[0]) + str(item['url'][0])
                print "new_url :" + new_url
                yield Request(new_url, meta={'item': item, 'url': new_url},
                              callback=self.parse_items)
        items.append(item)
    self.dic[str(self.start_urls[0])] = items
def parse_page(self, response):
    for ext in self.filetypes:
        if (ext[1:] in response.headers['Content-Type'].upper()
                or ('Content-Disposition' in response.headers
                    and ext in response.headers['Content-Disposition'].upper())):
            print "Detected a downloadable, generated file"
            item = Website()
            item['URL_Datei'] = response.url
            item['Stadt_URL'] = unicode(self.domain, 'utf-8')
            # Not applicable
            item['URL_Text'] = unicode('', 'utf-8')
            if 'Content-Disposition' in response.headers:
                item['URL_Dateiname'] = unicode(
                    response.headers['Content-Disposition'], 'utf-8')
            else:
                item['URL_Dateiname'] = unicode(
                    item['URL_Datei']).split('/')[-1]
            item['Format'] = ext[1:]
            # If we just have e.g. "json" and we are dealing with DKAN, then we are
            # probably dealing with an API description and not a file
            if (item['URL_Dateiname'].upper() == item['Format']) and 'node' in item['URL_Datei']:
                return []
            if ext in self.geofiletypes:
                item['geo'] = 'x'
            else:
                item['geo'] = u''
            item['URL_PARENT'] = u'Nicht moeglich kann aber nachtraeglich ermittelt werden'
            item['Title_PARENT'] = u'Nicht moeglich kann aber nachtraeglich ermittelt werden'
            self.writerdata.writerow(item)
            # Done
            return []

    if ('Content-Type' in response.headers
            and 'text/html' not in response.headers['Content-Type']):
        print "Not HTML or anything else of interest, giving up"
        print response.headers
        return []

    # Otherwise, it's HTML and we process all links on the page
    sel = Selector(response)

    # Title of the page we are on (this will be the 'parent')
    parent_title = sel.xpath('//title/text()').extract()
    if len(parent_title) > 0:
        parent_title = parent_title[0]
    # URL of the page we are on (parent)
    parent_url = response.url

    # Get all links
    sites = sel.xpath('//body//a')
    # items = []
    for site in sites:
        item = Website()
        item['URL_Datei'] = unicode('', 'utf-8')
        url_file = site.xpath('@href').extract()
        if len(url_file) > 0:
            item['URL_Datei'] = url_file[0]
        item['Stadt_URL'] = unicode(self.domain, 'utf-8')

        # Get ALL text of everything inside the link.
        # First any sub-elements like <span>
        textbits = site.xpath('child::node()')
        item['URL_Text'] = unicode('', 'utf-8')
        for text in textbits:
            thetext = text.xpath('text()').extract()
            if len(thetext) > 0:
                item['URL_Text'] += thetext[0]
        # Then the actual text
        directText = site.xpath('text()').extract()
        # If there's something there and it isn't a repetition, use it
        if (len(directText) > 0) and (directText != thetext):
            item['URL_Text'] += directText[0]
        item['URL_Text'] = item['URL_Text'].replace("\t", " ").replace("\n", "").strip()

        # If that got us nothing, then fall back to the title and alt attributes
        title_text = site.xpath('@title').extract()
        if (len(title_text) > 0) and (item['URL_Text'] == u''):
            item['URL_Text'] = title_text[0]
        alt_text = site.xpath('@alt').extract()
        if (len(alt_text) > 0) and (item['URL_Text'] == u''):
            item['URL_Text'] = alt_text[0]

        item['URL_Dateiname'] = unicode(item['URL_Datei']).split('/')[-1]
        item['Format'] = u'Not interesting'
        item['geo'] = u''
        item['URL_PARENT'] = parent_url
        item['Title_PARENT'] = parent_title

        # Is it a file? If the filename contains one of the extensions
        # (including the '.'), store the format without the '.'.
        for ext in self.filetypes:
            if ext in item['URL_Dateiname'].encode('ascii', errors='ignore').upper():
                item['Format'] = ext[1:]
                # And is it one of our special geo filetypes?
                if ext in self.geofiletypes:
                    item['geo'] = 'x'
        self.writerdata.writerow(item)
        self.writer.writerow(item)
        # items.append(item)
    return []
def parse_address(self, response):
    webpage = Website()
    webpage['url'] = response.url
    webpage['body'] = response.css(
        "#ctl00_PlaceHolderMain_ctl00_resultsPanel").extract()
    return webpage