def extract_links(self, prsdobj, pagenum):
    linkdict = {'docs': []}
    for tr in prsdobj.findAll('tr'):
        # Pager rows yield the 'next' link; other rows may hold a judgment
        pageblock, nextlink = utils.check_next_page(tr, pagenum)
        if nextlink:
            linkdict['next'] = nextlink
        elif not pageblock:
            link = self.get_judgment_info(tr)
            if link:
                linkdict['docs'].append(link)
    return linkdict
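# utils.check_next_page() is not defined in this section; the sketch below is
# a hypothetical reconstruction, inferred only from how the callers use it.
# Both extract_links() and result_page() expect a (pageblock, nextlink) pair:
# pageblock is truthy when the row is the pager row, and nextlink is a dict
# carrying at least 'href' (result_page dereferences nextlink['href']).
def check_next_page(tr, pagenum):
    pageblock = False
    nextlink = None
    for link in tr.findAll('a'):
        text = link.string and link.string.strip()
        # Assumption: the pager row is the one whose anchors are page numbers
        if text and text.isdigit():
            pageblock = True
            if int(text) == pagenum + 1:
                nextlink = {'href': link.get('href'), 'title': text}
    return pageblock, nextlink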
def result_page(self, relpath, webpage, dateobj, pagenum):
    newdls = []
    parsedobj = utils.parse_webpage(webpage)
    if not parsedobj:
        self.logger.error(u'Could not parse the result page')
        return newdls

    # Locate the results grid by its ASP.NET control id
    rtable = None
    for table in parsedobj.findAll('table'):
        if table.get('id') == 'ctl00_ContentPlaceHolder1_OrderGridView':
            rtable = table
            break
    if not rtable:
        self.logger.error(u'Result table not found')
        return newdls

    postdata = self.get_post_data(parsedobj, dateobj)

    # Pager rows tell us whether more pages follow; every other row is a
    # judgment entry to be processed.
    pageblock = False
    nextlink = None
    for tr in rtable.findAll('tr'):
        p, n = utils.check_next_page(tr, pagenum)
        if p:
            pageblock = p
            nextlink = n
        else:
            relurl = self.process_judgment_row(tr, relpath, postdata, dateobj)
            if relurl:
                newdls.append(relurl)

    # Recurse into the next result page if the pager advertises one
    if pageblock:
        if nextlink:
            self.logger.info(u'Recursing to page %d' % (pagenum + 1))
            self.download_url(self.cookieurl,
                              savecookies=self.cookiefile.name)
            webpage = self.download_link(postdata, nextlink['href'])
            newdls.extend(self.result_page(relpath, webpage, dateobj,
                                           pagenum + 1))
        else:
            self.logger.info(u'Last page %d. No more recursing' % pagenum)
    return newdls
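# Hypothetical driver, for context only: the class around result_page() is
# not shown here, and names like self.baseurl and download_oneday are
# assumptions, not part of this section. The pattern is to fetch the first
# result page and let result_page() follow the pager links from pagenum 1.
def download_oneday(self, relpath, dateobj):
    webpage = self.download_url(self.baseurl,
                                savecookies=self.cookiefile.name)
    return self.result_page(relpath, webpage, dateobj, 1)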
def extract_links(self, prsdobj, pagenum):
    linkdict = {'docs': []}

    # Find the results table. table.get('class') may be None or, under
    # BeautifulSoup 4, a list of class names, hence the guarded className[0].
    grid = None
    for table in prsdobj.findAll('table'):
        className = table.get('class')
        if className and className[0] == 'Grid':
            grid = table
            break

    if grid:
        for tr in grid.findAll('tr'):
            link = self.get_judgment_info(tr)
            if link and 'title' in link:
                linkdict['docs'].append(link)
            else:
                pageblock, nextlink = utils.check_next_page(tr, pagenum)
                if nextlink:
                    linkdict['next'] = nextlink
                else:
                    self.logger.debug(u'Ignoring tr: %s' % tr)
    return linkdict
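# Why the className[0] guard above: BeautifulSoup 4 treats class as a
# multi-valued attribute, so table.get('class') returns a list such as
# ['Grid'], and None when the attribute is absent. A standalone check,
# assuming bs4 is the parser in use:
from bs4 import BeautifulSoup

soup = BeautifulSoup('<table class="Grid"></table><table></table>',
                     'html.parser')
for table in soup.findAll('table'):
    className = table.get('class')
    print(className)  # ['Grid'] for the first table, None for the second
    if className and className[0] == 'Grid':
        print('matched the results grid')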