def parse_search_results(self, webpage, dateobj, curr_page):
    metainfos = []
    nextpage = None
    d = utils.parse_webpage(webpage, self.parser)
    if not d:
        self.logger.warn('Unable to parse search result page for %s', dateobj)
        return metainfos, nextpage

    tables = d.find_all('table', {'id': self.result_table})
    if len(tables) != 1:
        self.logger.warn('Could not find the result table for %s', dateobj)
        return metainfos, nextpage

    order = None
    for tr in tables[0].find_all('tr'):
        # the first row that yields a recognizable header gives the column order
        if not order:
            order = self.get_column_order(tr)
            continue

        if nextpage == None:
            nextpage = self.find_next_page(tr, curr_page)
            if nextpage != None:
                continue

        # skip rows that carry neither a form input nor a link
        if tr.find('input') == None and tr.find('a') == None:
            continue

        self.process_result_row(tr, metainfos, dateobj, order)

    return metainfos, nextpage
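# Hedged sketch (an assumption, not the original code): one plausible shape for
# the get_column_order() helper that parse_search_results() above depends on.
# The idea is to turn the header row into a list of field names so that
# process_result_row() can pick cells by position. The header texts matched
# here ('Gazette No', 'Date', 'Subject') and the use of <td> cells are
# illustrative guesses.
def get_column_order_sketch(self, tr):
    order = []
    for td in tr.find_all('td'):
        txt = utils.get_tag_contents(td)
        if txt and re.search('Gazette\s*No', txt, re.IGNORECASE):
            order.append('gznum')
        elif txt and re.search('Date', txt, re.IGNORECASE):
            order.append('date')
        elif txt and re.search('Subject', txt, re.IGNORECASE):
            order.append('subject')
        else:
            order.append(None)

    # treat the row as a header only if it yielded at least one known field
    if any([x != None for x in order]):
        return order
    return None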
def download_extraordinary(self, dls, relpath, dateobj):
    ex_url = urllib.basejoin(self.baseurl, self.extraordinary_url % dateobj.year)
    response = self.download_url(ex_url)
    if not response or not response.webpage:
        self.logger.warn('Unable to download Extraordinary gazette for year %d', \
                         dateobj.year)
        return

    d = utils.parse_webpage(response.webpage, self.parser)
    if not d:
        self.logger.warn('Unable to parse Extraordinary gazette list for year %d', \
                         dateobj.year)
        return

    # the 2010 listing uses the generic layout; later years have their own parser
    if dateobj.year == 2010:
        minfos = self.parse_listing_webpage(ex_url, d, dateobj, None, \
                                            'Extraordinary')
    else:
        minfos = self.parse_extraordinary_webpage(d, dateobj, ex_url)

    self.download_metainfos(minfos, dls, relpath)
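# Hedged sketch (an assumption): a minimal download_metainfos() consistent with
# the call in download_extraordinary() above. save_gazette() and the 'url' key
# are taken from neighbouring functions in this file; get_title() is assumed to
# mirror MetaInfo.set_title() used elsewhere, and the filename derivation is
# purely illustrative.
def download_metainfos_sketch(self, minfos, dls, relpath):
    for metainfo in minfos:
        gzurl = metainfo.pop('url', None)
        if not gzurl:
            self.logger.warn('No url in metainfo: %s', metainfo)
            continue

        # derive a filesystem-safe name from the title (illustrative)
        filename = re.sub("[\s/']+", '-', metainfo.get_title())
        relurl = os.path.join(relpath, filename)
        if self.save_gazette(relurl, gzurl, metainfo):
            dls.append(relurl)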
def download_info_page(self, url):
    webpage = self.download_url(url)
    d = utils.parse_webpage(webpage)
    if not d:
        self.logger.error(u'Could not parse the date search page')
        return [], None

    links = d.findAll('a')
    infolist = []
    previousurl = None
    for link in links:
        href = link.get('href')
        if previousurl == None and href:
            anchortext = utils.get_tag_contents(link)
            if anchortext and re.search('Previous >>', anchortext):
                previousurl = urllib.basejoin(url, href)

        if href and re.match('judgements', href):
            # walk up to the enclosing row to pick up the metadata columns
            node = link
            while node != None and node.name != 'tr':
                node = node.parent
            if node:
                metainfo = self.get_meta_info(node)
                metainfo['href'] = href
                infolist.append(metainfo)
                self.logger.debug('metainfo: %s' % metainfo)

    return infolist, previousurl
def process_result_page(self, relpath, dateobj, webpage):
    newdls = []
    d = utils.parse_webpage(webpage)
    if not d:
        self.logger.info(u'Could not parse result page for date %s' % dateobj)
        return newdls

    trs = d.findAll('tr')
    for tr in trs:
        pagetype = self.page_type(tr)
        if pagetype == 'nextlink':
            nextlink = self.next_link(tr.findAll('a'))
            if nextlink:
                self.logger.info(u'Going to the next page: %s' % \
                                 utils.get_tag_contents(nextlink))
                rels = self.process_next_link(relpath, dateobj, nextlink)
                newdls.extend(rels)
        elif pagetype == 'judgment':
            rel = self.handle_judgment_link(relpath, dateobj, tr)
            if rel:
                newdls.append(rel)
        else:
            self.logger.info(u'Not processing %s' % tr)

    return newdls
def parse_html_page(self, htmlPage):
    result = {}
    docs = []
    d = utils.parse_webpage(htmlPage)
    if d:
        trs = d.findAll('tr')
        for tr in trs:
            # skip container rows that nest another table
            if tr.find('table'):
                continue

            links = tr.findAll('a')
            href = None
            if len(links) > 1:
                for a in links:
                    h = a.get(self.HREF)
                    if h and re.search('pdf$', h):
                        href = h
            elif len(links) == 1 and links[0].get(self.HREF):
                href = links[0].get(self.HREF)

            if href and (not re.search('/itat/upload/blank.htm$', href)):
                metainfo = self.get_meta_info(tr)
                if metainfo:
                    href = links[0].get(self.HREF)
                    if href:
                        metainfo[self.HREF] = href
                    docs.append(metainfo)

    if docs:
        result[self.DOCS] = docs
    return result
def download_oneday(self, relpath, dateobj):
    dls = []
    postdata = self.get_post_data(dateobj)
    response = self.download_url(self.searchurl, postdata=postdata)
    if not response or not response.webpage:
        self.logger.warn('Could not download search result for date %s', \
                         dateobj)
        return dls

    d = utils.parse_webpage(response.webpage, self.parser)
    if not d:
        self.logger.warn('Could not parse search result for date %s', \
                         dateobj)
        return dls

    minfos = self.get_metainfos(d, dateobj)
    for metainfo in minfos:
        if 'docid' not in metainfo:
            self.logger.warn('Ignoring metainfo: %s', metainfo)
            continue

        # the docid doubles as the filename on disk
        filename = metainfo.pop('docid')
        relurl = os.path.join(relpath, filename)
        gzurl = self.get_doc_url(filename)
        if self.save_gazette(relurl, gzurl, metainfo):
            dls.append(relurl)

    return dls
def download_oneday(self, relpath, dateobj):
    dls = []
    datestr = utils.dateobj_to_str(dateobj, '-')
    searchurl = self.searchurl % (datestr, datestr)
    response = self.download_url(searchurl)
    if not response or not response.webpage:
        self.logger.warn('Could not download search result for date %s', \
                         dateobj)
        return dls

    d = utils.parse_webpage(response.webpage, self.parser)
    if not d:
        self.logger.warn('Could not parse search result for date %s', \
                         dateobj)
        return dls

    minfos = self.parse_results(d, dateobj)
    for metainfo in minfos:
        if 'download' not in metainfo:
            self.logger.warn('No link. Ignoring metainfo: %s', metainfo)
            continue

        relurl = self.download_gazette(metainfo, searchurl, relpath)
        if relurl:
            dls.append(relurl)

    return dls
def result_page(self, relpath, url, dateobj, linkdict):
    newdls = []
    webpage = self.download_url(url, loadcookies = self.cookiefile.name)
    d = utils.parse_webpage(webpage)
    if not d:
        self.logger.error(u'Could not parse html of the result page for date %s' % dateobj)
        return newdls

    for link in d.findAll('a'):
        href = link.get('href')
        title = utils.get_tag_contents(link)
        if (not href) or (not title) or linkdict.has_key(href):
            self.logger.warning(u'Could not process %s' % link)
            continue

        # remember the link so recursive calls do not revisit it
        linkdict[href] = 1
        action = self.action_on_link(href, title)
        self.logger.info(u'Action %s on link %s title %s' % \
                         (action, href, title))
        newurl = urllib.basejoin(url, href)
        if action == 'judgmentlink':
            relurl = self.process_judgment_page(relpath, newurl, dateobj)
            if relurl:
                newdls.append(relurl)
            else:
                self.logger.warning(u'Judgment link not working %s' % newurl)
        elif action == 'recurse':
            newdls.extend(self.result_page(relpath, newurl, dateobj, linkdict))

    return newdls
def handle_result_page(self, resultpage, relpath, dateobj):
    dls = []
    d = utils.parse_webpage(resultpage)
    if not d:
        self.logger.error(u'Could not parse result page %s' % dateobj)
        return dls

    # download judgments
    trs = d.findAll('tr')
    for tr in trs:
        links = tr.findAll('a')
        if len(links) == 1:
            relurl = self.dl_judgment(relpath, tr, links[0], dateobj)
            if relurl:
                dls.append(relurl)
        else:
            self.logger.warning(u'No action for %s' % tr)

    # next page
    links = d.findAll('a')
    for link in links:
        href = link.get('href')
        t = utils.get_tag_contents(link)
        if href and t == 'Next':
            nexturl = urllib.basejoin(self.resulturl, href)
            resultpage = self.download_url(nexturl, \
                                           loadcookies = self.cookiefile.name)
            if resultpage:
                self.logger.info(u'Recursing to %s' % nexturl)
                dls.extend(self.handle_result_page(resultpage, relpath, \
                                                   dateobj))

    return dls
def download_oneday(self, relpath, dateobj):
    dls = []
    webpage = self.download_url(self.dateurl, savecookies = \
                                self.cookiefile.name)
    d = utils.parse_webpage(webpage)
    if not d:
        self.logger.error(u'Could not parse date search page')
        return dls

    forms = d.findAll('form')
    action = None
    for form in forms:
        if form.get('name') == 'WebJudgmentDateSearchForm':
            action = form.get('action')
            break

    if not action:
        self.logger.error(u'Could not find date search form')
        return dls

    posturl = urllib.basejoin(self.baseurl, action)
    postdata = self.post_data(d, dateobj)
    webpage = self.download_url(posturl, postdata = postdata, \
                                loadcookies = self.cookiefile.name)
    dls = self.dl_result_page(relpath, posturl, webpage, dateobj, {})
    return dls
def datequery_result(self, dateqryUrl, webpage, relpath, \
                     dateobj, currentpage):
    newdls = []
    d = utils.parse_webpage(webpage)
    if not d:
        self.logger.error(u'Could not parse html of the result page for date %s' % dateobj)
        return newdls

    stateval = self.extract_state(d)
    if stateval != None and stateval != self.stateval:
        self.stateval = stateval
        self.logger.info(u'stateval changed')

    docs = self.extract_docs(d)
    for doc in docs:
        if doc.has_key('href'):
            href = doc['href']
            # the trailing number in the href doubles as the filename
            reobj = re.search('\d+$', href)
            if reobj:
                filename = href[reobj.start():reobj.end()]
                dlurl = urllib.basejoin(dateqryUrl, href)
                rel = os.path.join(relpath, filename)
                success = self.download_debate(rel, dlurl, doc, dateobj)
                if success:
                    newdls.append(rel)

    nextpage = self.get_next_page(d, dateqryUrl, currentpage)
    if nextpage:
        newdls.extend(self.datequery_result(dateqryUrl, nextpage, relpath, \
                                            dateobj, currentpage + 1))

    return newdls
def download_info_page(self, url):
    dls = []
    nextPage = None
    webpage = self.download_url(url)
    if webpage:
        d = utils.parse_webpage(webpage)
        if not d:
            self.logger.error(u'Could not parse the date search page')
            return [], None

        nextPage = self.get_next_page(d, url)

        # pick the table with the most rows as the main listing
        maxtr = -1
        mainTable = None
        tables = d.findAll('table')
        for table in tables:
            numtrs = len(table.findAll('tr'))
            if numtrs > maxtr:
                mainTable = table
                maxtr = numtrs

        if mainTable:
            trs = mainTable.findAll('tr')
            for tr in trs:
                metainfo = self.get_meta_info(tr)
                if metainfo and metainfo.has_key(self.DATE):
                    self.logger.debug(u'metainfo: %s' % metainfo)
                    dls.append(metainfo)

    dls.sort(cmp = lambda x, y: cmp(x[self.DATE], y[self.DATE]), \
             reverse = True)
    return dls, nextPage
def download_orders(self, relpath, ccin, dateobj, webpage):
    parsedDoc = utils.parse_webpage(webpage)
    if not parsedDoc:
        self.logger.warning(u'Could not parse judgments list for doc: ccin %s date: %s' % (ccin, dateobj))
        return []

    trs = parsedDoc.findAll('tr')
    fieldOrder = self.get_order_of_fields(parsedDoc)
    newdls = []
    if 'view' in fieldOrder and 'caseno' in fieldOrder \
            and 'date' in fieldOrder:
        for tr in trs:
            # skip the header row
            if tr.find('th'):
                continue
            relurl = self.process_order_tr(ccin, relpath, dateobj, \
                                           tr, fieldOrder)
            if relurl:
                newdls.append(relurl)
            else:
                self.logger.warning(u'Could not get judgment in tr: %s' % tr)
    else:
        self.logger.warning(u'Could not get field ordering in ccin %s date: %s' % (ccin, dateobj))

    return newdls
def download_info_page(self, url):
    webpage = self.download_url(url)
    dls = []
    if webpage:
        d = utils.parse_webpage(webpage)
        if not d:
            self.logger.error(u'Could not parse the date search page')
            return [], None

        # pick the table with the most rows as the main listing
        maxtr = -1
        mainTable = None
        tables = d.findAll('table')
        for table in tables:
            numtrs = len(table.findAll('tr'))
            if numtrs > maxtr:
                mainTable = table
                maxtr = numtrs

        if mainTable:
            trs = mainTable.findAll('tr')
            for tr in trs:
                metainfos = self.get_meta_info(tr, url)
                if metainfos:
                    dls.extend(metainfos)

    return dls, None
def process_judgment_page(self, relpath, url, dateobj):
    webpage = self.download_url(url, loadcookies = self.cookiefile.name)
    if not webpage:
        self.logger.warning(u'Could not download %s' % url)
        return None

    d = utils.parse_webpage(webpage)
    if not d:
        self.logger.warning(u'Could not parse %s' % url)
        return None

    metainfo = self.get_meta_info(d, dateobj)
    for link in d.findAll('a'):
        href = link.get('href')
        title = utils.get_tag_contents(link)
        if (not href) or (not title):
            self.logger.warning(u'Could not process %s' % link)
            continue

        action = self.action_on_link(href, title)
        newurl = urllib.basejoin(url, href)
        if action == 'save':
            self.logger.info(u'Downloading %s' % newurl)
            return self.get_judgment(relpath, newurl, title, metainfo)

    return None
def download_oneday(self, relpath, dateobj):
    dateurl = urllib.basejoin(self.baseurl, '/hcjudge/date_output.php')
    postdata = [('d1', dateobj.day), ('m1', dateobj.month), \
                ('y1', dateobj.year), ('d2', dateobj.day), \
                ('m2', dateobj.month), ('y2', dateobj.year), \
                ('button', 'Submit')]
    webpage = self.download_url(dateurl, postdata = postdata)
    if not webpage:
        self.logger.warning(u'No webpage for %s date: %s' % \
                            (dateurl, dateobj))
        return []

    d = utils.parse_webpage(webpage)
    if not d:
        self.logger.error(u'HTML parsing failed for date: %s' % dateobj)
        return []

    newdls = []
    for link in d.findAll('a'):
        href = link.get('href')
        title = utils.get_tag_contents(link)
        if (not href) or (not title):
            self.logger.warning(u'Could not process %s' % link)
            continue

        words = href.split('/')
        filename = words[-1]
        url = urllib.basejoin(dateurl, href)
        self.logger.info(u'link: %s title: %s' % (href, title))

        relurl = os.path.join(relpath, filename)
        filepath = os.path.join(self.rawdir, relurl)
        metapath = os.path.join(self.metadir, relurl)

        if not os.path.exists(filepath):
            webpage = self.download_url(url)
            if not webpage:
                self.logger.warning(u'No webpage %s' % url)
            else:
                utils.save_file(filepath, webpage)
                self.logger.info(u'Saved %s' % url)
                newdls.append(relurl)

        if os.path.exists(filepath) and \
                (self.updateMeta or not os.path.exists(metapath)):
            metainfo = self.get_meta_info(title, dateobj)
            if metainfo:
                utils.print_tag_file(metapath, metainfo)

    return newdls
def get_stateval(self, url):
    webpage = self.download_url(url, \
                                savecookies = self.cookiefile.name)
    d = utils.parse_webpage(webpage)
    if d:
        return self.extract_state(d)
    else:
        return None
def datequery_result(self, webpage, relpath, pagenum, dateobj):
    downloaded = []
    d = utils.parse_webpage(webpage)
    if not d:
        self.logger.error(u'Could not parse html of the result page for date %s' % dateobj)
        return downloaded

    stateval = self.extract_state(d)
    if stateval != None and stateval != self.stateval:
        self.stateval = stateval
        self.logger.info(u'stateval changed')

    linkdict = self.extract_links(d, pagenum)
    for link in linkdict['docs']:
        if (not link.has_key('title')) or (not link.has_key('href')):
            continue

        self.logger.info(u'Processing link: %s href: %s' % \
                         (link['title'], link['href']))
        # the title doubles as the filename; strip characters that
        # cannot appear in a path
        filename = re.sub('/', '|', link['title'])
        filename = re.sub("'", ' ', filename)
        tmprel = os.path.join(relpath, filename)
        rawpath = os.path.join(self.rawdir, tmprel)
        metapath = os.path.join(self.metadir, tmprel)

        if not os.path.exists(rawpath):
            webpage = self.download_judgment(link)
            if webpage:
                utils.save_file(rawpath, webpage)
            else:
                self.logger.warning(u'Could not download %s' % \
                                    link['title'])

        if os.path.exists(rawpath) and not os.path.isdir(rawpath):
            if not os.path.exists(metapath) or self.updateMeta:
                self.save_meta_tags(metapath, link, dateobj)
            downloaded.append(tmprel)

    if linkdict.has_key('next'):
        link = linkdict['next']
        self.logger.info(u'Following page: %s href: %s' % \
                         (link['title'], link['href']))
        webpage = self.download_link(link)
        if webpage:
            nextdownloads = self.datequery_result(webpage, relpath, \
                                                  pagenum + 1, dateobj)
            downloaded.extend(nextdownloads)
        else:
            self.logger.warning(u'Could not download %s' % link['title'])

    return downloaded
def datequery_result(self, webpage, relpath, pagenum, dateobj):
    downloaded = []
    d = utils.parse_webpage(webpage)
    if not d:
        self.logger.error(u'Could not parse html of the result page for date %s' % dateobj)
        return downloaded

    stateval = self.extract_state(d)
    if stateval != None and stateval != self.stateval:
        self.stateval = stateval
        self.logger.info(u'stateval changed')

    linkdict = self.extract_links(d, pagenum)
    for link in linkdict['docs']:
        if (not link.has_key('title')) or (not link.has_key('href')):
            continue

        self.logger.info(u'Processing link: %s href: %s' % \
                         (link['title'], link['href']))
        filename = re.sub('/', '|', link['title'])
        filename = re.sub("'", ' ', filename)
        tmprel = os.path.join(relpath, filename)
        rawpath = os.path.join(self.rawdir, tmprel)
        metapath = os.path.join(self.metadir, tmprel)

        if not os.path.exists(rawpath):
            webpage = self.download_link(link)
            if webpage:
                utils.save_file(rawpath, webpage)
            else:
                self.logger.warning(u'Could not download %s' % \
                                    link['title'])

        if os.path.exists(rawpath) and not os.path.isdir(rawpath):
            if not os.path.exists(metapath) or self.updateMeta:
                self.save_meta_tags(metapath, link, dateobj)
            downloaded.append(tmprel)

    if linkdict.has_key('next'):
        link = linkdict['next']
        self.logger.info(u'Following page: %s href: %s' % \
                         (link['title'], link['href']))
        webpage = self.download_link(link)
        if webpage:
            nextdownloads = self.datequery_result(webpage, relpath, \
                                                  pagenum + 1, dateobj)
            downloaded.extend(nextdownloads)
        else:
            self.logger.warning(u'Could not download %s' % link['title'])

    return downloaded
def result_url(self, url, relpath, pagelist, dateobj):
    newdls = []
    webpage = self.download_url(url, loadcookies=self.cookiefile.name)
    if not webpage:
        self.logger.warning(u'Could not download %s' % url)
        return newdls

    # fix doubled quotes and cut out just the result table before parsing
    webpage = re.sub('""', '"', webpage)
    startobj = re.search('<table', webpage)
    endobj = re.search('</table>', webpage)
    if not startobj or not endobj or startobj.start() >= endobj.end():
        self.logger.warning(u'No table found')
        return newdls

    d = utils.parse_webpage(webpage[startobj.start():endobj.end()])
    if not d:
        self.logger.error(u'Could not parse html of the result page for url %s' % url)
        return newdls

    trs = d.findAll('tr')
    for tr in trs:
        tds = tr.findAll('td')
        links = tr.findAll('a')
        metainfo = self.get_meta_info(tds, dateobj)
        for link in links:
            relurl = link.get('href')
            if not relurl:
                self.logger.warning(u'No href in %s' % link)
                continue

            action = self.action_on_url(relurl)
            self.logger.info(u'Action %s on %s' % (action, relurl))
            if action == 'ignore':
                continue

            url = urllib.basejoin(self.courturl, relurl)
            if action == 'save':
                rel = self.get_judgment(url, relpath, metainfo)
                if rel:
                    newdls.append(rel)
            elif action == 'recurse':
                page = self.find_page(url)
                if page and (page not in pagelist):
                    self.logger.info(u'recursing %s' % url)
                    pagelist.append(page)
                    newdls.extend(self.result_url(url, relpath, pagelist, \
                                                  dateobj))
                else:
                    self.logger.info(u'Not recursing %s' % url)

    return newdls
def get_stateval(self, url):
    webpage = self.download_url(url, \
                                savecookies = self.cookiefile.name)
    d = utils.parse_webpage(webpage)
    if d:
        return self.extract_state(d)
    else:
        self.logger.error(u'Could not parse the date webpage')
        return None
def download_captcha(self, search_url, webpage, cookiejar):
    d = utils.parse_webpage(webpage, self.parser)
    if d == None:
        return None

    # the captcha image is served through CaptchaImage.axd
    imgs = d.find_all('img')
    for img in imgs:
        src = img.get('src')
        if src and src.find('CaptchaImage.axd') >= 0:
            captcha_url = urllib.basejoin(search_url, src)
            return self.download_url(captcha_url, loadcookies=cookiejar)

    return None
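# Hedged usage sketch (an assumption): how download_captcha() above could feed
# the submit_captcha_form() used by get_search_results() further below.
# solve_captcha() and the 'CaptchaText' form field are hypothetical
# placeholders, not part of the original code.
def submit_captcha_form_sketch(self, search_url, webpage, cookiejar, dateobj):
    img_response = self.download_captcha(search_url, webpage, cookiejar)
    if img_response == None or not img_response.webpage:
        return None

    captcha_val = solve_captcha(img_response.webpage)  # hypothetical solver
    postdata = self.get_post_data(dateobj)             # seen elsewhere in this file
    postdata.append(('CaptchaText', captcha_val))      # hypothetical field name
    return self.download_url(search_url, postdata = postdata, \
                             loadcookies = cookiejar, savecookies = cookiejar)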
def get_search_form(self, webpage, dateobj):
    if webpage == None:
        self.logger.warn('Unable to download the starting search page for day: %s', dateobj)
        return None

    d = utils.parse_webpage(webpage, self.parser)
    if d == None:
        self.logger.warn('Unable to parse the search page for day: %s', dateobj)
        return None

    search_form = self.find_search_form(d)
    return search_form
def result_page(self, relpath, webpage, dateobj, pagenum):
    newdls = []
    parsedobj = utils.parse_webpage(webpage)
    if not parsedobj:
        self.logger.error(u'Could not parse the result page')
        return newdls

    tables = parsedobj.findAll('table')
    rtable = None
    for table in tables:
        id = table.get('id')
        if id == 'ctl00_ContentPlaceHolder1_OrderGridView':
            rtable = table

    if not rtable:
        self.logger.error(u'Result table not found')
        return newdls

    postdata = self.get_post_data(parsedobj, dateobj)
    trs = rtable.findAll('tr')
    pageblock = False
    nextlink = None
    for tr in trs:
        p, n = utils.check_next_page(tr, pagenum)
        if p:
            pageblock = p
            nextlink = n
        else:
            relurl = self.process_judgment_row(tr, relpath, postdata, \
                                               dateobj)
            if relurl:
                newdls.append(relurl)

    # check if we need to recurse
    if pageblock:
        if nextlink:
            self.logger.info(u'Recursing to pagenum %d' % (pagenum + 1))
            self.download_url(self.cookieurl, savecookies=self.cookiefile.name)
            webpage = self.download_link(postdata, nextlink['href'])
            newdls.extend(self.result_page(relpath, webpage, \
                                           dateobj, pagenum + 1))
        else:
            self.logger.info(u'Last page %d. No more recursing' % pagenum)

    return newdls
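# Hedged sketch (an assumption): the contract of utils.check_next_page() as
# used in result_page() above -- return (pageblock, nextlink), where pageblock
# says whether the row is the pager and nextlink carries the href of page
# pagenum + 1, if there is one. The pager detection below is illustrative only.
def check_next_page_sketch(tr, pagenum):
    links = tr.findAll('a')
    texts = [utils.get_tag_contents(link) for link in links]

    # assume a pager row contains only numeric page links
    if not links or not all([t and re.match('\d+$', t) for t in texts]):
        return False, None

    for link, t in zip(links, texts):
        if int(t) == pagenum + 1:
            return True, {'href': link.get('href')}

    # pager row without a further page: this was the last page
    return True, None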
def download_oneday(self, relpath, dateobj):
    self.download_url(self.cookieurl, savecookies=self.cookiefile.name)
    webpage = self.download_url(self.dateurl, loadcookies=self.cookiefile.name)
    parsedobj = utils.parse_webpage(webpage)
    if not parsedobj:
        self.logger.error(u'Could not parse the date search page')
        return []

    postdata = self.get_post_data(parsedobj, dateobj)
    webpage = self.download_url(self.dateurl, postdata=postdata, \
                                loadcookies=self.cookiefile.name, \
                                referer=self.dateurl)
    return self.result_page(relpath, webpage, dateobj, 1)
def result_page(self, webpage, relpath, dateobj):
    newdls = []
    if not webpage:
        return newdls

    d = utils.parse_webpage(webpage)
    if not d:
        self.logger.error(u'Could not parse html of the result page for date %s' % dateobj)
        return newdls

    trs = d.findAll('tr')
    for tr in trs:
        link = tr.find('a')
        if not link:
            continue

        href = link.get('href')
        title = utils.get_tag_contents(link)
        if (not href) or (not title):
            self.logger.info(u'Could not process %s' % link)
            continue

        # skip the pager links; everything else is a judgment row
        if not re.match('\d+$', title) and not re.search('PREV|NEXT', title):
            self.logger.info(u'link: %s title: %s' % (href, title))
            rel = self.handle_judgment_link(relpath, tr, dateobj, href, title)
            if rel:
                newdls.append(rel)

    if newdls:
        links = d.findAll('a')
        for link in links:
            href = link.get('href')
            title = utils.get_tag_contents(link)
            if title and href and re.match('NEXT', title):
                self.logger.info(u'Following next page link: %s' % link)
                webpage = self.download_url(urllib.basejoin(self.baseurl, href), \
                                            loadcookies = self.cookiefile.name)
                newdls.extend(self.result_page(webpage, relpath, dateobj))

    return newdls
def datepage_metainfos(self, url, dateobj):
    minfos = []
    response = self.download_url(url)
    if not response or not response.webpage:
        self.logger.warn('Unable to download %s. Skipping', url)
        return minfos

    d = utils.parse_webpage(response.webpage, self.parser)
    if not d:
        self.logger.warn('Unable to parse %s. Skipping.', url)
        return minfos

    partnum = None
    dept = None
    for td in d.find_all('td'):
        bgcolor = td.get('bgcolor')
        links = td.find_all('a')
        if bgcolor == '#91BAE8' and len(links) == 0:
            # a highlighted cell with no links starts a new part
            partnum = utils.get_tag_contents(td)
            partnum = utils.remove_spaces(partnum)
            dept = None
        elif len(links) > 0:
            reobj = re.compile('^(strong|a)$')
            for x in td.find_all(reobj):
                if x.name == 'strong':
                    dept = utils.get_tag_contents(x)
                    dept = utils.remove_spaces(dept)
                elif x.name == 'a' and partnum:
                    href = x.get('href')
                    if not href or not href.startswith('pdf'):
                        continue

                    title = utils.get_tag_contents(x)
                    title = utils.remove_spaces(title)

                    metainfo = utils.MetaInfo()
                    minfos.append(metainfo)
                    metainfo.set_title(title)
                    metainfo.set_date(dateobj)
                    metainfo['partnum'] = partnum
                    if dept:
                        metainfo['department'] = dept
                    gzurl = urllib.basejoin(url, href)
                    metainfo['url'] = gzurl

    return minfos
def parse_result_page(self, posturl, webpage, dateobj):
    judgments = []
    d = utils.parse_webpage(webpage)
    if not d:
        self.logger.error(u'Could not parse result page %s' % dateobj)
        return judgments

    # get judgments
    trs = d.findAll('tr')
    for tr in trs:
        judgment = {}
        metainfo = {'date': utils.date_to_xml(dateobj)}
        links = tr.findAll('a')
        for link in links:
            href = link.get('href')
            if href and re.search('WebShowJudgment.do', href):
                # the anchor text looks like 'n : Petitioner vs. Respondent'
                t = utils.get_tag_contents(link)
                colon = t.find(':')
                if colon >= 0:
                    title = t[colon + 1:]
                    title = title.strip()
                    metainfo['title'] = title
                    reobj = re.search(' vs\. ', title, re.IGNORECASE)
                    if reobj:
                        metainfo['petitioner'] = title[:reobj.start()]
                        metainfo['respondent'] = title[reobj.end():]
            if href and re.search('WebDownloadJudgmentDocument.do', href):
                judgment['link'] = urllib.basejoin(posturl, href)

        if judgment:
            judgment['metainfo'] = metainfo
            judgments.append(judgment)

    # next link
    links = d.findAll('a')
    for link in links:
        t = utils.get_tag_contents(link)
        if t and re.search('Next', t):
            href = link.get('href')
            if href:
                judgment = {'link': urllib.basejoin(posturl, href)}
                judgment['next'] = True
                judgments.append(judgment)

    return judgments
def parse_metainfos(self, webpage, year, fromdate, todate):
    minfos = []
    nextpage = None
    d = utils.parse_webpage(webpage, self.parser)
    if not d:
        self.logger.warn('Unable to parse results page for year %d', year)
        return minfos, nextpage

    for td in d.find_all('td'):
        link = td.find('a')
        if link == None:
            continue

        # the pager cell is marked by an img titled 'Next'
        img = td.find('img')
        if img:
            title = img.get('title')
            if title == 'Next' and nextpage == None:
                nextpage = link
            continue

        metainfo = self.get_metainfo(link, td)
        if metainfo:
            dateobj = metainfo.get_date()
            if dateobj and dateobj >= fromdate and dateobj <= todate:
                minfos.append(metainfo)

                paras = td.find_all('p')
                if len(paras) >= 2:
                    p = paras[1]
                    txt = utils.get_tag_contents(p)
                    reobj = re.search('Department:\s*(?P<dept>.+)\s+Order\s+Nos:\s*(,Othres\s*:)?(?P<ordernum>.*)', txt)
                    if reobj:
                        groupdict = reobj.groupdict()
                        ordernum = groupdict['ordernum'].strip()
                        metainfo['department'] = groupdict['dept'].strip()
                        # keep only comma-separated numeric order numbers
                        if re.match('(\d+(,\s*)?)+$', ordernum):
                            metainfo['ordernum'] = ordernum

                if len(paras) >= 3:
                    p = paras[2]
                    txt = utils.get_tag_contents(p)
                    if txt:
                        metainfo.set_subject(txt)

    return minfos, nextpage
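# Worked example (illustrative input, not taken from the site): what the
# Department/Order Nos regex in parse_metainfos() above extracts.
# >>> txt = 'Department: Revenue Order Nos: 123, 456'
# >>> reobj = re.search('Department:\s*(?P<dept>.+)\s+Order\s+Nos:\s*'
# ...                   '(,Othres\s*:)?(?P<ordernum>.*)', txt)
# >>> (reobj.groupdict()['dept'].strip(), reobj.groupdict()['ordernum'].strip())
# ('Revenue', '123, 456')
# '123, 456' then passes the (\d+(,\s*)?)+$ check, so it is kept as ordernum.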
def download_oneday(self, relpath, dateobj):
    newdls = []
    datestr = utils.dateobj_to_str(dateobj, '/')
    subrelpath = '/'.join(relpath.split('/')[:-1])
    postdata = [('hcjudgecode', ''), ('fromdate', datestr), \
                ('todate', datestr), ('counter', '1')]
    webpage = self.download_url(self.pageurl, referer = self.baseurl, \
                                loadcookies = self.cookiefile.name, \
                                postdata = postdata)
    if not webpage:
        self.logger.warning(u'No webpage for %s' % self.pageurl)
        return newdls

    d = utils.parse_webpage(webpage)
    if not d:
        self.logger.error(u'Could not parse html of the result page for date %s' % dateobj)
        return newdls

    trs = d.findAll('tr')
    for tr in trs:
        if tr.find('th'):
            continue

        # the case id (ccin) is embedded in the row's onclick handler
        onclick = tr.get('onclick')
        if not onclick:
            self.logger.info(u'No onclick in %s' % tr)
            continue

        reobj = re.search('\d+', onclick)
        if not reobj:
            continue

        ccin = reobj.group(0)
        webpage = self.download_url(self.caseurl, referer = self.baseurl, \
                                    loadcookies = self.cookiefile.name, \
                                    postdata = [('ccin', ccin)])
        if not webpage:
            self.logger.error(u'Could not get case for %s on date %s' % (ccin, dateobj))
            continue

        newdls.extend(self.download_orders(subrelpath, ccin, dateobj, webpage))

    return newdls
def result_page(self, webpage, relpath, dateobj, linkdict):
    newdls = []
    if not webpage:
        return newdls

    courtParser = utils.parse_webpage(webpage)
    if not courtParser:
        self.logger.error(u'Could not parse html of the result page for date %s' % dateobj)
        return newdls

    trs = courtParser.findAll('tr')
    for tr in trs:
        link = tr.find('a')
        if link:
            title = utils.get_tag_contents(link)
            href = link.get('href')
            if (not title) or (not href):
                self.logger.warning(u'Could not process %s' % link)
                continue

            if linkdict.has_key(href):
                continue

            if not re.search('first|prev|next|last|acroread', title, \
                             re.IGNORECASE):
                linkdict[href] = 1
                dl = self.handle_link(relpath, href, title, tr, dateobj)
                if dl:
                    newdls.append(dl)
            elif title == 'Next':
                self.logger.info(u'Following Next page %s' % href)
                newlink = urllib.basejoin(self.baseurl, href)
                webpage = self.download_url(newlink, \
                                            loadcookies = self.cookiefile.name)
                newdls.extend(self.result_page(webpage, relpath, dateobj, \
                                               linkdict))
            else:
                self.logger.info(u'No action for %s' % href)

    return newdls
def download_info_page(self, url):
    infolist = []
    webpage = self.download_url(url)
    if webpage:
        d = utils.parse_webpage(webpage)
        if not d:
            self.logger.error(u'Could not parse the date search page')
            return [], None

        tables = d.findAll('table')
        for table in tables:
            # only leaf tables hold the listing rows
            if not table.find('table'):
                trs = table.findAll('tr')
                for tr in trs:
                    metainfo = self.get_meta_info(tr)
                    if metainfo:
                        self.logger.debug('metainfo: %s' % metainfo)
                        infolist.append(metainfo)

    return infolist, None
def get_search_results(self, search_url, dateobj, cookiejar):
    response = self.download_url(search_url, savecookies = cookiejar, \
                                 loadcookies = cookiejar)
    while response and response.webpage:
        response = self.submit_captcha_form(search_url, response.webpage, \
                                            cookiejar, dateobj)
        if not response or not response.webpage:
            break

        d = utils.parse_webpage(response.webpage, self.parser)
        if d and not self.is_form_webpage(d):
            # the search form is gone, so this is the result page
            break
        else:
            self.logger.warn('Failed in solving captcha. Retrying.')
            cookiejar = CookieJar()
            response = self.download_url(search_url, savecookies=cookiejar)

    return response
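# Hedged sketch (an assumption): one plausible is_form_webpage() check for
# get_search_results() above -- treat the page as still being the search form
# while the captcha image is present. The CaptchaImage.axd marker is reused
# from download_captcha() earlier in this file.
def is_form_webpage_sketch(self, d):
    for img in d.find_all('img'):
        src = img.get('src')
        if src and src.find('CaptchaImage.axd') >= 0:
            return True
    return False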
def download_oneday(self, relpath, dateobj):
    dls = []
    response = self.download_url(self.baseurl)
    if not response or not response.webpage:
        self.logger.error('Unable to download the webpage for %s', dateobj)
        return dls

    d = utils.parse_webpage(response.webpage, self.parser)
    if not d:
        self.logger.error('Unable to parse the webpage for %s', dateobj)
        return dls

    categories = self.find_categories(d)
    for category in categories:
        # only the first category is processed
        dls.extend(self.download_onecat(relpath, dateobj, category))
        break

    return dls
def download_oneday(self, relpath, dateobj):
    dls = []
    postdata = self.get_post_data(dateobj)
    response = self.download_url(self.baseurl, postdata=postdata)
    if not response or not response.webpage:
        self.logger.warn('Unable to get result page for date %s', dateobj)
        return dls

    d = utils.parse_webpage(response.webpage, self.parser)
    if not d:
        self.logger.warn('Unable to parse result page for date %s', dateobj)
        return dls

    # locate the table whose header row yields a field order
    result_table = None
    order = None
    for table in d.find_all('table'):
        for tr in table.find_all('tr'):
            order = self.find_field_order(tr)
            if order:
                result_table = table
                break
        if result_table != None:
            break

    if result_table == None:
        self.logger.warn('Unable to find the result table for %s', dateobj)
        return dls

    minfos = []
    for tr in result_table.find_all('tr'):
        if tr.find('a') == None:
            continue
        metainfo = self.process_row(tr, order, dateobj)
        if metainfo:
            minfos.append(metainfo)

    for metainfo in minfos:
        href = metainfo.pop('href')
        url = urllib.basejoin(self.baseurl, href)
        relurl = os.path.join(relpath, metainfo['gznum'])
        if self.save_gazette(relurl, url, metainfo):
            dls.append(relurl)

    return dls
def download_orders_from_page(self, relpath, dateobj, webpage):
    newdls = []
    d = utils.parse_webpage(webpage)
    if not d:
        self.logger.error(u'HTML parsing failed for date: %s' % dateobj)
        return []

    for tr in d.findAll('tr'):
        href = None
        for link in tr.findAll('a'):
            title = utils.get_tag_contents(link)
            if title and re.search('view\s+order', title, re.IGNORECASE):
                href = link.get('href')
                break

        if not href:
            self.logger.warning(u'Could not process %s' % tr)
            continue

        words = href.split('/')
        filename = words[-1]
        url = urllib.basejoin(self.dateurl, href)
        self.logger.info(u'link: %s' % href)

        relurl = os.path.join(relpath, filename)
        filepath = os.path.join(self.rawdir, relurl)
        metapath = os.path.join(self.metadir, relurl)

        if not os.path.exists(filepath):
            webpage = self.download_url(url)
            if not webpage:
                self.logger.warning(u'No webpage %s' % url)
            else:
                utils.save_file(filepath, webpage)
                self.logger.info(u'Saved %s' % url)

        if os.path.exists(filepath):
            newdls.append(relurl)
            if self.updateMeta or not os.path.exists(metapath):
                metainfo = self.get_meta_info(tr, dateobj)
                self.logger.info(u'relurl: %s metainfo: %s' % (relurl, metainfo))
                if metainfo:
                    utils.print_tag_file(metapath, metainfo)

    # follow the 'next' pager link, if present
    for link in d.findAll('a'):
        text = utils.get_tag_contents(link)
        href = link.get('href')
        if href and text and re.match('\s*next\s*$', text, re.IGNORECASE):
            url = urllib.basejoin(self.dateurl, href)
            webpage = self.download_url(url)
            if webpage:
                self.logger.info(u'Recursing to the nextpage: %s' % url)
                nextPageDls = self.download_orders_from_page(relpath, dateobj, webpage)
                newdls.extend(nextPageDls)
            else:
                self.logger.warning(u'Could not download the next webpage: %s' % url)

    return newdls
def download_oneday(self, relpath, dateobj):
    newdls = []
    pageurl = urllib.basejoin(self.baseurl, '/gujarathc/')
    datestr = utils.dateobj_to_str(dateobj, '-')
    dateurl = pageurl + 'orderdatewisedata.jsp?fdate=%s&tdate=%s' % \
                        (datestr, datestr)
    webpage = self.download_url(dateurl, referer = self.baseurl, \
                                loadcookies = self.cookiefile.name)
    if not webpage:
        self.logger.warning(u'No webpage for %s' % dateurl)
        return newdls

    # neutralize the window.open() javascript so the html parses cleanly
    webpage = re.sub('(?P<windowopen>window.open\([^)]+\))', \
                     self.sanitize_windowopen, webpage)
    d = utils.parse_webpage(webpage)
    if not d:
        self.logger.error(u'Could not parse html of the result page for date %s' % dateobj)
        return newdls

    trs = d.findAll('tr')
    for tr in trs:
        link = tr.find('a')
        if not link:
            self.logger.info(u'No link in %s' % tr)
            continue

        # the actual page url is buried in the onclick handler
        href = link.get('onclick')
        if not href:
            self.logger.info(u'No href in %s' % tr)
            continue

        reobj = re.search("showoj\.jsp\?[^'\s]+", href)
        if not reobj:
            continue

        (start, end) = reobj.span()
        pagerelurl = href[start:end]
        url = urllib.basejoin(pageurl, pagerelurl)
        filename = utils.url_to_filename(url, False, ['caseyr', 'caseno', \
                                                      'casetype'])
        if not filename:
            self.logger.error(u'Could not get filename for %s' % url)
            continue

        relurl = os.path.join(relpath, filename)
        filepath = os.path.join(self.rawdir, relurl)
        metapath = os.path.join(self.metadir, relurl)

        if not os.path.exists(filepath):
            self.logger.info(u'Downloading %s %s' % (url, filename))
            j = self.download_url(url, loadcookies = self.cookiefile.name)
            if not j:
                self.logger.warning(u'No webpage: %s' % url)
            else:
                self.logger.info(u'Saving %s' % filepath)
                utils.save_file(filepath, j)
                newdls.append(relurl)

        if os.path.exists(filepath) and \
                (self.updateMeta or not os.path.exists(metapath)):
            metainfo = self.get_meta_info(link, tr, dateobj)
            if metainfo:
                utils.print_tag_file(metapath, metainfo)

    return newdls
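# Hedged sketch (an assumption): a sanitize_windowopen() callback compatible
# with the re.sub() call in download_oneday() above. It only strips the double
# quotes that break parsing; the cleanup in the original may differ.
def sanitize_windowopen_sketch(self, matchobj):
    return re.sub('"', ' ', matchobj.group('windowopen'))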