def get_judgment(self, url, relpath, metainfo):
    """Fetch a judgment page and store it (plus metainfo) under rawdir/metadir.

    The filename is derived from the url query parameters. Returns the
    rawdir-relative path of the saved file, or None when no filename could
    be derived or the download failed.
    """
    filename = utils.url_to_filename(url, False, ["yID", "nID", "ID"])
    if not filename:
        self.logger.warning(u"No filename for %s" % url)
        return

    relurl  = os.path.join(relpath, filename)
    rawpath = os.path.join(self.rawdir, relurl)

    if not os.path.exists(rawpath):
        self.logger.info(u"Downloading %s" % url)
        doc = self.download_url(url, loadcookies=self.cookiefile.name)
        if not doc:
            self.logger.warning(u"Could not download %s" % url)
            return
        utils.save_file(rawpath, doc)
        self.logger.info(u"Saved %s" % rawpath)
    else:
        self.logger.info(u"Already exists %s" % rawpath)

    # Write metainfo only when the raw file is actually on disk.
    if os.path.exists(rawpath):
        metapath = os.path.join(self.metadir, relurl)
        if metainfo and (self.updateMeta or not os.path.exists(metapath)):
            utils.print_tag_file(metapath, metainfo)
    return relurl
def get_judgment(self, relpath, judgment):
    """Download one judgment PDF and its metainfo.

    *judgment* is a dict with at least 'link' and 'metainfo' keys. Returns
    the rawdir-relative path of the saved PDF, or None when the judgment
    id could not be extracted or the file is not on disk afterwards.
    """
    # Raw string: '\d' inside a non-raw literal is an invalid escape in
    # modern Python and only works by accident.
    reobj = re.search(r'judgmentID=(?P<id>\d+)', judgment['link'])
    if not reobj:
        self.logger.warning(u'No judgment id in %s' % judgment['link'])
        # Early return so the path variables below are never used
        # while undefined.
        return None

    judgmentId = reobj.groupdict()['id']
    relurl   = os.path.join(relpath, judgmentId)
    filepath = os.path.join(self.rawdir, relurl)
    metapath = os.path.join(self.metadir, relurl)

    if not os.path.exists(filepath):
        pdfdoc = self.download_url(judgment['link'], \
                                   loadcookies = self.cookiefile.name)
        if pdfdoc:
            utils.save_file(filepath, pdfdoc)
            self.logger.info(u'Saved %s' % relurl)
        else:
            self.logger.info(u'Did not download %s' % judgment['link'])

    if os.path.exists(filepath) and \
            (self.updateMeta or not os.path.exists(metapath)):
        utils.print_tag_file(metapath, judgment['metainfo'])
        self.logger.info(u'Saved metainfo %s' % relurl)

    if not os.path.exists(filepath):
        return None
    return relurl
def save_meta_tags(self, metapath, judgedict, dateobj):
    """Write meta tags for a judgment to *metapath*.

    Splits the title into petitioner/respondent around ' vs '/' vs.',
    splits the bench string into individual judge names, and records
    the date.
    """
    tagdict = {}
    # dict.has_key() was removed in Python 3; the 'in' operator is the
    # equivalent that works on Python 2 as well.
    if 'title' in judgedict:
        title = judgedict['title']
        tagdict['title'] = title
        # Raw string for the regex escape '\.'.
        reobj = re.search(r'( vs | vs\.)', title, re.IGNORECASE)
        if reobj:
            if reobj.start() > 1:
                tagdict['petitioner'] = title[:reobj.start()]
            if reobj.end() + 1 < len(title):
                tagdict['respondent'] = title[reobj.end() + 1:]
    if 'bench' in judgedict:
        bench = judgedict['bench'].split(',')
        if bench:
            # Same structure as before: {'name': [judge, ...]}
            tagdict['bench'] = {'name': list(bench)}
    tagdict['date'] = utils.date_to_xml(dateobj)
    utils.print_tag_file(metapath, tagdict)
def handle_judgment_link(self, relpath, dateobj, tr):
    """Download the judgment linked from table row *tr* and save its metainfo.

    The last <a> in the row is taken as the judgment link; the filename is
    built from caseno (preferred) or petitioner+respondent. Returns the
    rawdir-relative path, or None when no link/name/file is available.
    """
    links = tr.findAll('a')
    if len(links) >= 1:
        href = links[-1].get('href')
    else:
        return None

    metainfo = self.get_meta_info(tr, dateobj)

    rel = ''
    # dict.has_key() was removed in Python 3; 'in' is the portable form.
    if 'caseno' in metainfo:
        rel += metainfo['caseno']
    else:
        if 'petitioner' in metainfo:
            rel += metainfo['petitioner']
        if 'respondent' in metainfo:
            rel += metainfo['respondent']
    if not rel:
        return None

    # str.replace instead of the string-module function (removed in Py3,
    # identical behavior on Py2).
    rel = rel.replace('/', '-')
    tmprel   = os.path.join(relpath, rel)
    filepath = os.path.join(self.rawdir, tmprel)
    if not os.path.exists(filepath):
        self.download_judgment(href, filepath)

    if os.path.exists(filepath):
        metapath = os.path.join(self.metadir, tmprel)
        if metainfo and (self.updateMeta or not os.path.exists(metapath)):
            utils.print_tag_file(metapath, metainfo)
        return tmprel
    else:
        return None
def get_judgment(self, url, relpath, metainfo):
    """Save the judgment at *url* and, when wanted, its metainfo.

    Returns the rawdir-relative path of the saved file or None on failure.
    """
    filename = utils.url_to_filename(url, False, ['yID', 'nID', 'ID'])
    if not filename:
        self.logger.warning(u'No filename for %s' % url)
        return

    rel      = os.path.join(relpath, filename)
    filepath = os.path.join(self.rawdir, rel)

    already = os.path.exists(filepath)
    if already:
        self.logger.info(u'Already exists %s' % filepath)
    else:
        self.logger.info(u'Downloading %s' % url)
        page = self.download_url(url, loadcookies=self.cookiefile.name)
        if not page:
            self.logger.warning(u'Could not download %s' % url)
            return
        utils.save_file(filepath, page)
        self.logger.info(u'Saved %s' % filepath)

    # Metainfo is written only when the raw file is actually present.
    if os.path.exists(filepath):
        metapath = os.path.join(self.metadir, rel)
        if metainfo and (self.updateMeta or not os.path.exists(metapath)):
            utils.print_tag_file(metapath, metainfo)
    return rel
def download_oneday(self, relpath, dateobj):
    """Fetch the judgment list for one day and download every linked file.

    Posts the date range (from == to == dateobj) to date_output.php and
    walks all anchors on the result page. Returns the list of rawdir-
    relative paths that were newly downloaded.
    """
    dateurl = urllib.basejoin(self.baseurl, '/hcjudge/date_output.php')
    postdata = [('d1', dateobj.day), ('m1', dateobj.month), \
                ('y1', dateobj.year), ('d2', dateobj.day), \
                ('m2', dateobj.month), ('y2', dateobj.year), \
                ('button', 'Submit')]
    listpage = self.download_url(dateurl, postdata = postdata)
    if not listpage:
        self.logger.warning(u'No webpage for %s date: %s' % \
                            (dateurl, dateobj))
        return []

    d = utils.parse_webpage(listpage)
    if not d:
        self.logger.error(u'HTML parsing failed for date: %s' % dateobj)
        return []

    newdls = []
    for link in d.findAll('a'):
        href  = link.get('href')
        title = utils.get_tag_contents(link)
        if (not href) or (not title):
            self.logger.warning(u'Could not process %s' % link)
            continue

        filename = href.split('/')[-1]
        url = urllib.basejoin(dateurl, href)
        self.logger.info(u'link: %s title: %s' % (href, title))

        relurl   = os.path.join(relpath, filename)
        filepath = os.path.join(self.rawdir, relurl)
        metapath = os.path.join(self.metadir, relurl)

        if not os.path.exists(filepath):
            doc = self.download_url(url)
            if not doc:
                self.logger.warning(u'No webpage %s' % url)
            else:
                utils.save_file(filepath, doc)
                self.logger.info(u'Saved %s' % url)
                newdls.append(relurl)

        if os.path.exists(filepath) and \
                (self.updateMeta or not os.path.exists(metapath)):
            metainfo = self.get_meta_info(title, dateobj)
            if metainfo:
                utils.print_tag_file(metapath, metainfo)
    return newdls
def download_order(self, relpath, dateobj, metainfo, onclick):
    """Download one court order described by a myfunViewDownLoad onclick.

    Extracts ccin/orderno/flag/casedetail from the onclick javascript,
    POSTs them to self.orderurl and saves the result. Returns the rawdir-
    relative path when the file is on disk afterwards, None otherwise.
    """
    # Raw string: the pattern is full of \s/\d/\w escapes, which are only
    # valid by accident in a non-raw literal.
    reobj = re.search(r'myfunViewDownLoad\s*\(\s*"(?P<ccin>\d+)"\s*,\s*"(?P<orderno>\d+)"\s*,\s*"(?P<flag>\w+)"\s*,\s*"(?P<casedetail>.+)"\s*,\s*"\w+"', onclick)
    if not reobj:
        self.logger.warning(u'Could not get parameters in onclick: %s' % onclick)
        return None

    groupdict  = reobj.groupdict()
    ccin       = groupdict['ccin']
    orderno    = groupdict['orderno']
    flag       = groupdict['flag']
    casedetail = groupdict['casedetail']

    metainfo['caseno'] = casedetail
    filename = self.get_filename(casedetail)
    if not filename:
        self.logger.warning(u'Could not get filename from %s' % casedetail)
        return None

    datestr = str(dateobj)  # idiomatic for dateobj.__str__()
    utils.mk_dir(os.path.join(self.rawdir, self.name, datestr))
    utils.mk_dir(os.path.join(self.metadir, self.name, datestr))

    relurl   = os.path.join(relpath, datestr, filename)
    filepath = os.path.join(self.rawdir, relurl)
    metapath = os.path.join(self.metadir, relurl)

    if os.path.exists(filepath):
        self.logger.warning(u'Raw file already exists, skipping: %s ' % relurl)
    else:
        # Example query the server expects:
        # ccin_no=001016200801769&order_no=2&flag=v&casedetail=MISC.CIVIL+APPLICATION%2F1769%2F2008&download_token_value_id=1367853726545
        self.logger.info(u'Downloading %s' % relurl)
        postdata = [('ccin_no', ccin), ('order_no', orderno), \
                    ('flag', flag), ('casedetail', casedetail), \
                    ('download_token_value_id', int(time.time())) ]
        webpage = self.download_url(self.orderurl, \
                                    referer=self.caseurl,\
                                    loadcookies = self.cookiefile.name,\
                                    postdata = postdata)
        if webpage:
            self.logger.info(u'Saving %s' % filepath)
            utils.save_file(filepath, webpage)
        else:
            self.logger.warning(u'Could not download ccin: %s number: %s ' % (ccin, orderno))

    if os.path.exists(filepath) and metainfo and \
            (self.updateMeta or not os.path.exists(metapath)):
        self.logger.info(u'Metainfo: %s' % metainfo)
        utils.print_tag_file(metapath, metainfo)

    if os.path.exists(filepath):
        return relurl
    return None
def handle_judgment_link(self, relpath, tr, dateobj, href, title):
    """Download the judgment behind *href* and store its parsed metainfo.

    The filename is the title with '/' replaced by '-'. Returns the
    rawdir-relative path, or None when no file ends up on disk.
    """
    tmprel   = os.path.join(relpath, re.sub('/', '-', title))
    filepath = os.path.join(self.rawdir, tmprel)

    if not os.path.exists(filepath):
        self.get_judgment(href, filepath)

    # Guard clause instead of the original if/else tail.
    if not os.path.exists(filepath):
        return None

    metapath = os.path.join(self.metadir, tmprel)
    metainfo = self.parse_meta_info(tr, dateobj)
    if metainfo and (self.updateMeta or not os.path.exists(metapath)):
        utils.print_tag_file(metapath, metainfo)
    return tmprel
def get_judgment(self, relpath, url, filename, metainfo):
    """Download one judgment to rawdir/<relpath>/<filename> plus metainfo.

    Records the source url in the metainfo before writing it. Returns the
    rawdir-relative path, or None when the download fails.
    """
    relurl   = os.path.join(relpath, filename)
    filepath = os.path.join(self.rawdir, relurl)
    metapath = os.path.join(self.metadir, relurl)

    if not os.path.exists(filepath):
        doc = self.download_url(url, loadcookies=self.cookiefile.name)
        if not doc:
            self.logger.warning(u"Could not download judgment %s" % url)
            return None
        utils.save_file(filepath, doc)
        self.logger.info(u"Saved %s" % filepath)

    if not os.path.exists(filepath):
        return None

    if self.updateMeta or not os.path.exists(metapath):
        metainfo["url"] = url
        utils.print_tag_file(metapath, metainfo)
    return relurl
def get_judgment(self, relpath, url, filename, metainfo):
    """Fetch *url* into rawdir under relpath/filename and write metainfo.

    Adds the download url to metainfo before saving it. Returns the
    rawdir-relative path on success and None when the judgment could not
    be downloaded.
    """
    relurl   = os.path.join(relpath, filename)
    rawfile  = os.path.join(self.rawdir, relurl)
    metafile = os.path.join(self.metadir, relurl)

    if not os.path.exists(rawfile):
        payload = self.download_url(url, loadcookies = self.cookiefile.name)
        if not payload:
            self.logger.warning(u'Could not download judgment %s' % url)
            return None
        utils.save_file(rawfile, payload)
        self.logger.info(u'Saved %s' % rawfile)

    if not os.path.exists(rawfile):
        return None

    if self.updateMeta or not os.path.exists(metafile):
        metainfo['url'] = url
        utils.print_tag_file(metafile, metainfo)
    return relurl
def download_orders_from_page(self, relpath, dateobj, webpage):
    """Download every 'View Order' link on *webpage*, then recurse through
    any 'next' pagination link.

    Returns the list of rawdir-relative paths of files present after the
    pass (newly downloaded or already on disk).
    """
    newdls = []
    d = utils.parse_webpage(webpage)
    if not d:
        self.logger.error(u'HTML parsing failed for date: %s' % dateobj)
        return []

    for tr in d.findAll('tr'):
        href = None
        for link in tr.findAll('a'):
            title = utils.get_tag_contents(link)
            # Raw strings: '\s' in a non-raw literal is an invalid escape.
            if re.search(r'view\s+order', title, re.IGNORECASE):
                href = link.get('href')
                break
        if (not href):
            self.logger.warning(u'Could not process %s' % tr)
            continue

        filename = href.split('/')[-1]
        url = urllib.basejoin(self.dateurl, href)
        self.logger.info(u'link: %s' % href)

        relurl   = os.path.join(relpath, filename)
        filepath = os.path.join(self.rawdir, relurl)
        metapath = os.path.join(self.metadir, relurl)

        if not os.path.exists(filepath):
            # Fresh name: the original clobbered the 'webpage' argument here.
            orderdoc = self.download_url(url)
            if not orderdoc:
                self.logger.warning(u'No webpage %s' % url)
            else:
                utils.save_file(filepath, orderdoc)
                self.logger.info(u'Saved %s' % url)

        if os.path.exists(filepath):
            newdls.append(relurl)
            if self.updateMeta or not os.path.exists(metapath):
                metainfo = self.get_meta_info(tr, dateobj)
                self.logger.info(u'relurl: %s metainfo: %s' % (relurl, metainfo))
                if metainfo:
                    utils.print_tag_file(metapath, metainfo)

    for link in d.findAll('a'):
        text = utils.get_tag_contents(link)
        href = link.get('href')
        if href and text and re.match(r'\s*next\s*$', text, re.IGNORECASE):
            url = urllib.basejoin(self.dateurl, href)
            nextpage = self.download_url(url)
            if nextpage:
                self.logger.info(u'Recursing to the nextpage: %s' % url)
                newdls.extend(self.download_orders_from_page(
                    relpath, dateobj, nextpage))
            else:
                self.logger.warning(
                    u'Could not download the next webpage: %s' % url)
    return newdls
def download_oneday(self, relpath, dateobj):
    """Download all orders for one day from the Gujarat HC site.

    Fetches the date-wise listing, extracts the showoj.jsp link from each
    row's onclick handler and saves the order page. Returns the list of
    rawdir-relative paths that were newly downloaded.
    """
    newdls = []
    pageurl = urllib.basejoin(self.baseurl, '/gujarathc/')
    datestr = utils.dateobj_to_str(dateobj, '-')
    dateurl = pageurl + 'orderdatewisedata.jsp?fdate=%s&tdate=%s' % \
              (datestr, datestr)
    webpage = self.download_url(dateurl, referer = self.baseurl, \
                                loadcookies = self.cookiefile.name)
    if not webpage:
        self.logger.warning(u'No webpage for %s' % dateurl)
        return newdls

    # Neutralize javascript window.open() calls before parsing.
    webpage = re.sub(r'(?P<windowopen>window.open\([^)]+\))', \
                     self.sanitize_windowopen, webpage)
    d = utils.parse_webpage(webpage)
    if not d:
        self.logger.error(u'Could not parse html of the result page for date %s' % dateobj)
        return newdls

    for tr in d.findAll('tr'):
        link = tr.find('a')
        if not link:
            self.logger.info(u'No link in %s' % tr)
            continue
        href = link.get('onclick')
        if not href:
            self.logger.info(u'No href in %s' % tr)
            continue

        reobj = re.search(r"showoj.jsp?[^'\s]+", href)
        # BUG FIX: the original called reobj.span() unconditionally and
        # crashed with AttributeError when the onclick contained no
        # showoj.jsp link. Skip such rows instead.
        if not reobj:
            self.logger.warning(u'Could not process %s' % tr)
            continue
        (start, end) = reobj.span()
        pagerelurl = href[start:end]
        url = urllib.basejoin(pageurl, pagerelurl)

        filename = utils.url_to_filename(url, False, ['caseyr', 'caseno', \
                                                      'casetype'])
        if not filename:
            self.logger.error(u'Could not get filename for %s' % url)
            continue

        relurl = os.path.join(relpath, filename)
        filepath = os.path.join(self.rawdir, relurl)
        metapath = os.path.join(self.metadir, relurl)

        if not os.path.exists(filepath):
            self.logger.info(u'Downloading %s %s' % (url, filename))
            j = self.download_url(url, loadcookies = self.cookiefile.name)
            if not j:
                self.logger.warning(u'No webpage: %s' % url)
            else:
                self.logger.info(u'Saving %s' % filepath)
                utils.save_file(filepath, j)
                newdls.append(relurl)

        if os.path.exists(filepath) and \
                (self.updateMeta or not os.path.exists(metapath)):
            metainfo = self.get_meta_info(link, tr, dateobj)
            if metainfo:
                utils.print_tag_file(metapath, metainfo)
    return newdls
def save_meta_tags(self, metapath, judgedict, dateobj):
    """Write *judgedict* as meta tags, adding the date and dropping HREF."""
    tagdict = {'date': utils.date_to_xml(dateobj)}
    for key in judgedict.keys():
        # self.HREF is internal bookkeeping, not metadata.
        if key != self.HREF:
            tagdict[key] = judgedict[key]
    utils.print_tag_file(metapath, tagdict)
def save_meta_tags(self, metapath, debatedict, dateobj):
    """Write *debatedict* as meta tags, adding the date and skipping 'href'."""
    tagdict = {'date': utils.date_to_xml(dateobj)}
    for key in debatedict.keys():
        # 'href' is the download link, not metadata worth persisting.
        if key != 'href':
            tagdict[key] = debatedict[key]
    utils.print_tag_file(metapath, tagdict)
def download_orders_from_page(self, relpath, dateobj, webpage):
    """Download every 'View Order' link on *webpage* and follow the 'next'
    pagination link recursively.

    Returns the list of rawdir-relative paths present after the pass.
    """
    newdls = []
    d = utils.parse_webpage(webpage)
    if not d:
        self.logger.error(u'HTML parsing failed for date: %s' % dateobj)
        return []

    for tr in d.findAll('tr'):
        href = None
        for link in tr.findAll('a'):
            title = utils.get_tag_contents(link)
            # Raw strings: '\s' in a non-raw literal is an invalid escape.
            if re.search(r'view\s+order', title, re.IGNORECASE):
                href = link.get('href')
                break
        if (not href):
            self.logger.warning(u'Could not process %s' % tr)
            continue

        filename = href.split('/')[-1]
        url = urllib.basejoin(self.dateurl, href)
        self.logger.info(u'link: %s' % href)

        relurl   = os.path.join(relpath, filename)
        filepath = os.path.join(self.rawdir, relurl)
        metapath = os.path.join(self.metadir, relurl)

        if not os.path.exists(filepath):
            # Fresh name: the original clobbered the 'webpage' argument here.
            orderdoc = self.download_url(url)
            if not orderdoc:
                self.logger.warning(u'No webpage %s' % url)
            else:
                utils.save_file(filepath, orderdoc)
                self.logger.info(u'Saved %s' % url)

        if os.path.exists(filepath):
            newdls.append(relurl)
            if self.updateMeta or not os.path.exists(metapath):
                metainfo = self.get_meta_info(tr, dateobj)
                self.logger.info(u'relurl: %s metainfo: %s' % (relurl, metainfo))
                if metainfo:
                    utils.print_tag_file(metapath, metainfo)

    for link in d.findAll('a'):
        text = utils.get_tag_contents(link)
        href = link.get('href')
        if href and text and re.match(r'\s*next\s*$', text, re.IGNORECASE):
            url = urllib.basejoin(self.dateurl, href)
            nextpage = self.download_url(url)
            if nextpage:
                self.logger.info(u'Recursing to the nextpage: %s' % url)
                newdls.extend(self.download_orders_from_page(relpath, dateobj, nextpage))
            else:
                self.logger.warning(u'Could not download the next webpage: %s' % url)
    return newdls
def store_meta_tags(self, metapath, metainfo):
    """Write *metainfo* to *metapath* as a tag file (delegates to utils)."""
    utils.print_tag_file(metapath, metainfo)