def parse_item(self, response):
    """Parse a job-detail page and emit the completed item.

    Expects the partially-filled item in ``response.meta['item']``;
    adds the security-clearance field scraped from the detail table.
    """
    # BUG FIX: ``log_level`` is not a valid kwarg of ``Logger.debug`` (it
    # belonged to the old scrapy ``log.msg`` API) and raises TypeError.
    # Lazy %-args also skip formatting when DEBUG logging is disabled.
    self.logger.debug('parse_item %s', response.url)
    item = response.meta['item']
    soup = bs4.BeautifulSoup(response.body)
    # FIXME: uncomment when not debugging
    #item['description'] = soup.select('div.jobdetail')[0].text
    # NOTE(review): assumes table2dict maps label cells to value cells --
    # confirm against its definition elsewhere in the project.
    infodict = table2dict(soup, 'div#jobinfo2')
    item['clearance'] = infodict.get('SECURITY CLEARANCE')
    yield item
def parse_item(self, response):
    """Complete the item built by ``parse`` with description and clearance.

    On a page without a ``div.jobdetail`` element, records the URL in the
    failure log instead of yielding an item.
    """
    item = super(UsajobsSpider, self).parse_item(response)
    soup = bs4.BeautifulSoup(response.body)
    # item['description'] = soup.select('div.jobdetail')[0].text # with soup, plan text
    detail_nodes = response.css('div.jobdetail')
    if detail_nodes:
        item['description'] = detail_nodes[0].extract()
        infodict = table2dict(soup, 'div#jobinfo2')
        item['clearance'] = infodict.get('SECURITY CLEARANCE')
        yield item
    else:
        # no detail section found -- remember the URL for later inspection
        append(self.fail_url_path, 'failed to parse:' + response.url)
def parse(self, response):
    """Parse a search-results page.

    Yields one detail-page ``Request`` per job posting (carrying the
    partially-filled item in ``meta``) and a pagination ``Request`` when a
    next-page link exists; logs pages that contain no results.
    """
    # NOTE(review): the return value of the parent ``parse`` is discarded;
    # if it is a generator, its body never runs -- confirm base-class intent.
    super(UsajobsSpider, self).parse(response)
    soup = bs4.BeautifulSoup(response.body)
    soupitems = soup.select('div#jobResultNew')
    if len(soupitems) < 1:
        append(self.fail_url_path, 'no data:' + response.url)
        return
    for soupitem in soupitems:
        item = self.init_item(response)
        # hoist the repeated title-link lookup
        title_link = soupitem.select('a.jobTitleLink')[0]
        item['item_url'] = self.base_url + title_link.attrs.get('href')
        item['title'] = title_link.text
        item['short_description'] = soupitem.select('p.summary')[0].text.strip()
        details = table2dict(soupitem, 'table.joaResultsDetailsTable')
        item['company'] = details.get('Agency', '')
        location_region = details.get('Location(s)', '').split(', ')
        item['locality'] = location_region[0]
        try:
            item['region'] = location_region[1]
        except IndexError:
            pass  # single-part location: no region component
        item['salary'] = details.get('Salary', '')
        item['department'] = details.get('Department', '')
        # data not available in this website
        item['published'] = ''
        # lazy %-args: no formatting cost when DEBUG is disabled
        self.logger.debug('title %s', item['title'])
        yield Request(item['item_url'],
                      callback=self.parse_item,
                      meta={'item': item}
                      )
    # next = soup.select('a.nextPage') # with soup
    # renamed from ``next`` to avoid shadowing the builtin
    next_page = response.css('a.nextPage::attr(href)').extract()
    if next_page:
        self.logger.debug('next url: %s', self.base_url + next_page[0])
        yield Request(
            # self.base_url + next_page[0]['href'], # with soup
            self.base_url + next_page[0],
            callback=self.parse,
            meta={'keyword': response.meta['keyword'],
                  'location': response.meta['location']}
            )
    else:
        self.logger.debug('no next url')
def parse(self, response):
    """Parse a search-results page into items and detail-page requests.

    Builds a ``ScrapyscrappersItem`` per ``div#jobResultNew`` result and
    yields a ``Request`` for each item's detail page with the item in
    ``meta`` for ``parse_item`` to complete.
    """
    # BUG FIX: ``log_level`` is not a valid kwarg of ``Logger.debug`` (it
    # belonged to the old scrapy ``log.msg`` API) and raises TypeError.
    self.logger.debug('in parse')
    soup = bs4.BeautifulSoup(response.body)
    soupitems = soup.select('div#jobResultNew')
    for soupitem in soupitems:
        item = ScrapyscrappersItem()
        item['keyword'] = response.meta['keyword']
        item['date_search'] = current_datetime()
        # hoist the repeated title-link lookup
        title_link = soupitem.select('a.jobTitleLink')[0]
        item['item_url'] = self.base_url + title_link.attrs.get('href')
        item['title'] = title_link.text
        item['short_description'] = soupitem.select('p.summary')[0].text.strip()
        details = table2dict(soupitem, 'table.joaResultsDetailsTable')
        item['company'] = details.get('Agency', '')
        location_region = details.get('Location(s)', '').split(', ')
        item['locality'] = location_region[0]
        try:
            item['region'] = location_region[1]
        except IndexError:
            pass  # single-part location: no region component
        item['salary'] = details.get('Salary', '')
        item['department'] = details.get('Department', '')
        # item.published = ''
        # lazy %-args: no formatting cost when DEBUG is disabled
        self.logger.debug('title %s', item['title'])
        yield Request(item['item_url'],
                      callback=self.parse_item,
                      meta={'item': item}
                      )