def parse(self, response):
    super(ClearedconnectionsSpider, self).parse(response)
    table = response.xpath('//table[@class="jstext"]')
    # the first three rows are not job listings, skip them
    rows = table.xpath('.//tr')[3:]
    if len(rows) < 1:
        append(self.fail_url_path, 'no data:' + response.url)
        return
    for row in rows:
        item = self.init_item(response)
        cols = row.xpath('.//td')
        col_url = cols[1]
        # the job URL is hidden inside the onclick handler of the title link
        onclick = col_url.xpath('.//a/@onclick').extract()[0]
        m = re.search(r"PopupViewWindow\('(.+?)'", onclick)
        if not m:
            # no job URL in this row, nothing to request
            continue
        item['item_url'] = ('https://www.clearedconnections.com/JobSeekerX/'
                            + m.group(1))
        item['title'] = col_url.xpath('.//a/text()').extract()[0]
        col_published = cols[0]
        published = col_published.xpath('.//p/text()').extract()[0].strip()
        item['published'] = datetime2datetimestr(
            datetime.strptime(published, '%m/%d/%Y'))
        col_company = cols[3]
        try:
            item['company'] = col_company.xpath(
                './/a/text()').extract()[0].strip()
        except IndexError:
            # some companies appear as plain text instead of a link
            item['company'] = col_company.xpath(
                './/p/text()').extract()[0].strip()
        col_loc = cols[2]
        loc = col_loc.xpath('./text()').extract()[0]
        try:
            item['locality'], item['region'], _ = loc.split('-')
        except ValueError:
            item['locality'] = loc
        # data not available on this website
        item['short_description'] = ''
        item['salary'] = ''
        item['department'] = ''
        self.logger.debug('title %s' % item['title'])
        yield Request(item['item_url'], callback=self.parse_item,
                      meta={'item': item})
    next_page = response.css('.pagination').xpath(
        './/a[contains(.,"Next")]/@href').extract()
    if next_page:
        self.logger.debug('next url: %s' % (self.base_url + next_page[0]))
        yield Request(self.base_url + next_page[0], callback=self.parse,
                      meta={'keyword': response.meta['keyword'],
                            'location': response.meta['location']})
    else:
        self.logger.debug('no next url')
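# The parse() methods in this file call two module-level helpers, append() and
# datetime2datetimestr(), that are defined elsewhere in the project. The
# minimal sketch below only shows their assumed behaviour; the real
# implementations may differ. The methods also rely on the usual module-level
# imports: re, datetime, and scrapy's Request.

import re
from datetime import datetime

from scrapy import Request


def append(path, line):
    # Assumed behaviour: append one line to a plain-text log, e.g. the
    # fail-URL log referenced as self.fail_url_path above.
    with open(path, 'a') as f:
        f.write(line + '\n')


def datetime2datetimestr(dt):
    # Assumed behaviour: serialize a datetime into the string stored on the
    # item; an ISO-8601-like format is used here as a placeholder.
    return dt.strftime('%Y-%m-%dT%H:%M:%S')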
def parse(self, response):
    super(ClearedjobsSpider, self).parse(response)
    table = response.xpath('//table[@class="search_res"]//tbody')
    rows = table.xpath('.//tr')
    if len(rows) < 1:
        append(self.fail_url_path, 'no data:' + response.url)
        return
    for row in rows:
        item = self.init_item(response)
        cols = row.xpath('.//td')
        col0 = cols[0]
        url = col0.xpath('.//a[@class="search"]/@href').extract()[0]
        item['item_url'] = self.base_url + '/' + url
        # the link text can contain a nested span, so take its string value
        item['title'] = col0.xpath(
            'string(.//a[@class="search"])').extract()[0].strip()
        item['company'] = col0.xpath(
            './/div[@class="info"]//a/text()').extract()[0]
        try:
            item['clearance'] = col0.xpath(
                './/div[@class="desc"]/text()').extract()[0]
        except IndexError:
            item['clearance'] = ''
        # e.g. "Posted - July 8, 2015"
        published = col0.xpath(
            './/div[@class=""]/text()').extract()[0].replace('Posted - ', '')
        item['published'] = datetime2datetimestr(
            datetime.strptime(published, '%B %d, %Y'))
        col1 = cols[1]
        loc = col1.xpath('./text()').extract()[0].strip()
        try:
            item['locality'], item['region'] = loc.split(', ')
        except ValueError:
            item['locality'] = loc
            item['region'] = ''
        # data not available on this website
        item['short_description'] = ''
        item['department'] = ''
        self.logger.debug('title %s' % item['title'])
        yield Request(item['item_url'], callback=self.parse_item,
                      meta={'item': item})
    next_page = response.css('.navbar_bottom').xpath(
        './/a[text()=">"]/@href').extract()
    if next_page:
        # add a slash only when the relative href does not already start with one
        if next_page[0].startswith('/'):
            next_url = self.base_url + next_page[0]
        else:
            next_url = self.base_url + '/' + next_page[0]
        self.logger.debug('next url: %s' % next_url)
        yield Request(next_url, callback=self.parse,
                      meta={'keyword': response.meta['keyword'],
                            'location': response.meta['location']})
    else:
        self.logger.debug('no next url')
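# Each parse() above starts with super(...).parse(response) and builds its item
# with self.init_item(response), both inherited from a shared base spider that
# is not part of this section. The class below is only an assumed sketch of
# that base class; the name JobSpiderBase and the exact item fields are
# hypothetical and shown just to make the methods above easier to follow.

import scrapy


class JobSpiderBase(scrapy.Spider):
    base_url = ''        # site root, set by each concrete spider
    fail_url_path = ''   # log file for result pages that contained no rows

    def parse(self, response):
        # Shared hook run by every concrete spider before it scrapes the page.
        self.logger.debug('parsing %s' % response.url)

    def init_item(self, response):
        # Return a fresh item pre-filled with the request context; a plain
        # dict stands in here, the project may use a scrapy.Item subclass.
        return {
            'source_url': response.url,
            'keyword': response.meta.get('keyword', ''),
            'location': response.meta.get('location', ''),
        }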
def parse(self, response):
    super(ClearancejobsSpider, self).parse(response)
    results = response.css('#search-results')  # div
    rows = results.css('.cj-search-result-item')
    if len(rows) < 1:
        append(self.fail_url_path, 'no data:' + response.url)
        return
    for row in rows:
        item = self.init_item(response)
        item['title'] = row.xpath(
            './/strong[@class="cj-search-result-item-title"]/a/text()'
        ).extract()[0].strip()
        item['item_url'] = row.xpath(
            './/strong[@class="cj-search-result-item-title"]/a/@href'
        ).extract()[0]
        try:
            item['company'] = row.xpath(
                './/span[@class="cj-company-name"]/a/text()').extract()[0].strip()
        except IndexError:
            item['company'] = ''
        # location = row.xpath('.//span[@class="cj-company-name"]/text()').extract()[0].strip()
        location = row.css('.cj-job-primary-info::text').extract()[1].strip()
        try:
            item['locality'], item['region'] = location.split(', ')
        except ValueError:
            item['locality'] = location
            item['region'] = ''
        # attributes not provided by this website
        item['short_description'] = ''
        item['salary'] = ''
        item['department'] = ''
        # updated = row.css('.cj-text-sm::text').extract()[1].strip()
        updated = row.xpath(
            './/span[@class="cj-text-sm cj-color-mediumgray"]//text()'
        ).extract()[1].strip()
        try:
            dt = datetime.strptime(updated, '%m/%d/%y')
        except ValueError:
            # FIXME: the ValueError is not necessarily caused by "Today";
            # other strings such as "Yesterday" may also appear here
            dt = datetime.now()
        item['published'] = datetime2datetimestr(dt)
        # item['clearance'] = row.xpath('.//div[@class="cj-card-data"]').extract()[0].strip()
        item['clearance'] = row.css('.cj-card-data::text').extract()[1]
        self.logger.debug('title %s' % item['title'])
        yield Request(item['item_url'], callback=self.parse_item,
                      meta={'item': item})
    # FIXME: the next button has no <a> link; pagination is driven by JavaScript
    next_page = response.xpath(
        '//button[@class="cj-table-pagination-next"]/a/@href').extract()
    if next_page:
        self.logger.debug('next url: %s' % next_page[0])
        yield Request(next_page[0], callback=self.parse,
                      meta={'keyword': response.meta['keyword'],
                            'location': response.meta['location']})
    else:
        self.logger.debug('no next url')
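# The ClearancejobsSpider method above carries a FIXME: the "updated" cell is
# usually a %m/%d/%y date but can also read "Today" (and possibly "Yesterday"),
# and the current fallback treats every unparsable string as now(). The sketch
# below is one way to make that handling explicit; parse_updated is a
# hypothetical helper name, not part of the existing code.

from datetime import datetime, timedelta


def parse_updated(text):
    # Map the site's relative labels to concrete dates, otherwise parse the
    # numeric format; anything still unrecognised falls back to today.
    text = text.strip().lower()
    if text == 'today':
        return datetime.now()
    if text == 'yesterday':
        return datetime.now() - timedelta(days=1)
    try:
        return datetime.strptime(text, '%m/%d/%y')
    except ValueError:
        return datetime.now()

# Usage in the loop above would then be:
#     item['published'] = datetime2datetimestr(parse_updated(updated))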