def get_meta_info(self, tr):
    """Extract case metadata from one row of a results table.

    The ``href`` of the first ``a`` tag in the row is recorded, then the
    text of every ``td`` is scanned in order:

    * columns 2 and 3 (1-based) are tried as the petitioner/respondent
      cell until a petitioner has been found;
    * once a petitioner is known, every further non-empty cell is tried
      as a date (the last cell that parses wins).

    If no cell produced a date, the ``href`` itself is tried as a date
    string.  Missing petitioner and/or date are reported at INFO level.

    Returns a dict that may contain ``'href'``, ``self.PETITIONER``,
    ``self.RESPONDENT``, ``self.CASENO`` and ``self.DATE``.  The dict is
    empty when the row has no ``a`` tag at all.
    """
    tds = tr.findAll('td')
    metainfo = {}

    link = tr.find('a')
    if not link:
        # Rows without any link are not processed further.
        return metainfo
    href = link.get('href')
    if href:
        metainfo['href'] = href

    valueList = [utils.get_tag_contents(td) for td in tds]

    for i, value in enumerate(valueList, 1):
        if not value:
            continue
        value = value.strip()
        if i in (2, 3) and self.PETITIONER not in metainfo:
            pet, res = utils.get_petitioner_respondent(value)
            if pet:
                metainfo[self.PETITIONER] = pet
                # NOTE(review): with a 1-based ``i`` this is the raw text
                # of the very cell that was just parsed, not the previous
                # column -- confirm this is the intended case number.
                metainfo[self.CASENO] = valueList[i - 1]
            if res:
                metainfo[self.RESPONDENT] = res
        elif self.PETITIONER in metainfo:
            dateobj = utils.datestr_to_obj(value)
            if dateobj:
                metainfo[self.DATE] = dateobj

    # try one more heuristics
    if self.DATE not in metainfo and 'href' in metainfo:
        dateobj = utils.datestr_to_obj(metainfo['href'])
        if dateobj:
            metainfo[self.DATE] = dateobj

    has_date = self.DATE in metainfo
    has_petitioner = self.PETITIONER in metainfo
    if not has_date and not has_petitioner:
        self.logger.info(u'No petitioner/date found: %s %s' %
                         (metainfo, valueList))
    elif not has_petitioner:
        self.logger.info(u'No petitioner found: %s %s' %
                         (metainfo, valueList))
    elif not has_date:
        self.logger.info(u'No date found: %s %s' %
                         (metainfo, valueList))

    return metainfo
def get_meta_info(self, tr):
    """Build the metadata dict for a single table row.

    Steps, in order:

    1. Take the first ``a`` tag of the row; return an empty dict if there
       is none, otherwise remember its ``href`` (when it has one).
    2. Read the text of every ``td``.  The second and third cells are
       offered to ``utils.get_petitioner_respondent`` until a petitioner
       is found; after that, each non-empty cell is offered to
       ``utils.datestr_to_obj`` and a successful parse overwrites any
       earlier date.
    3. If there is still no date, try to parse one out of the ``href``.
    4. Log at INFO level which of petitioner/date could not be found.

    Returns a dict keyed by ``'href'``, ``self.PETITIONER``,
    ``self.RESPONDENT``, ``self.CASENO`` and ``self.DATE`` (each only
    when available).
    """
    tds = tr.findAll('td')
    metainfo = {}

    link = tr.find('a')
    if link:
        href = link.get('href')
        if href:
            metainfo['href'] = href
    else:
        return metainfo

    valueList = [utils.get_tag_contents(td) for td in tds]

    for idx, value in enumerate(valueList):
        if not value:
            continue
        value = value.strip()
        # idx is 0-based: 1 and 2 are the second and third columns.
        if idx in (1, 2) and self.PETITIONER not in metainfo:
            pet, res = utils.get_petitioner_respondent(value)
            if pet:
                metainfo[self.PETITIONER] = pet
                # NOTE(review): this stores the unstripped text of the
                # current cell as the case number -- verify that the case
                # number is not expected from a different column.
                metainfo[self.CASENO] = valueList[idx]
            if res:
                metainfo[self.RESPONDENT] = res
        elif self.PETITIONER in metainfo:
            dateobj = utils.datestr_to_obj(value)
            if dateobj:
                metainfo[self.DATE] = dateobj

    # try one more heuristics
    if self.DATE not in metainfo and 'href' in metainfo:
        dateobj = utils.datestr_to_obj(metainfo['href'])
        if dateobj:
            metainfo[self.DATE] = dateobj

    if self.DATE not in metainfo and self.PETITIONER not in metainfo:
        self.logger.info(u'No petitioner/date found: %s %s' %
                         (metainfo, valueList))
    elif self.PETITIONER not in metainfo:
        self.logger.info(u'No petitioner found: %s %s' %
                         (metainfo, valueList))
    elif self.DATE not in metainfo:
        self.logger.info(u'No date found: %s %s' %
                         (metainfo, valueList))

    return metainfo
def get_meta_info(self, tr):
    """Read case number, parties and date from fixed columns of a row.

    Column positions are 0-based over the row's ``td`` cells:

    * column 1 -> ``'caseno'`` (whitespace runs collapsed to one space)
    * column 2 -> ``'petitioner'`` / ``'respondent'``
    * last column -> ``self.DATE``

    The checks are an ``if``/``elif`` chain, so when the last column is
    column 1 or 2 it is handled by that earlier branch and no date is
    parsed.  Empty cells are skipped.  A missing petitioner or an
    unparseable date is logged at INFO level.

    Returns the (possibly empty) metadata dict.
    """
    metainfo = {}
    tds = tr.findAll('td')
    lastcolumn = len(tds) - 1

    for i, td in enumerate(tds):
        content = utils.get_tag_contents(td)
        if not content:
            continue

        if i == 1:
            metainfo['caseno'] = u' '.join(content.split())
        elif i == 2:
            petitioner, respondent = \
                utils.get_petitioner_respondent(content)
            if petitioner:
                metainfo['petitioner'] = petitioner
            else:
                self.logger.info(u'Petitioner not found in %s' % content)
            if respondent:
                metainfo['respondent'] = respondent
        elif i == lastcolumn:
            dateobj = utils.datestr_to_obj(content)
            if dateobj:
                metainfo[self.DATE] = dateobj
            else:
                self.logger.info(u'No date in %s' % (content))

    return metainfo
def get_meta_info(self, tr):
    """Extract link and case details from one table row.

    The first ``a`` tag with a non-empty ``href`` supplies
    ``metainfo['href']``; a row without such a link yields ``{}``.
    Otherwise the first three ``td`` cells (0-based) are read as:

    * column 0 -> ``self.CASENO``
    * column 1 -> ``self.PETITIONER`` / ``self.RESPONDENT``
    * column 2 -> ``self.DATE``

    Empty cells and values that fail to parse are simply left out.

    Returns the metadata dict.
    """
    metainfo = {}
    tds = tr.findAll('td')

    for link in tr.findAll('a'):
        href = link.get('href')
        if href:
            metainfo['href'] = href
            break

    if 'href' not in metainfo:
        return {}

    for i, td in enumerate(tds):
        value = utils.get_tag_contents(td)
        if not value:
            continue

        if i == 0:
            metainfo[self.CASENO] = value
        elif i == 1:
            pet, res = utils.get_petitioner_respondent(value)
            if pet:
                metainfo[self.PETITIONER] = pet
            if res:
                metainfo[self.RESPONDENT] = res
        elif i == 2:
            dateobj = utils.datestr_to_obj(value)
            if dateobj:
                metainfo[self.DATE] = dateobj

    return metainfo
def process_order_tr(self, ccin, relpath, dateobj, tr, fieldOrder):
    """Process one row of an orders table and download the order.

    Parameters:
        ccin:       value stored under ``'ccin'`` in the order metadata.
        relpath:    passed through to ``self.download_order``.
        dateobj:    only used in warning messages.
        tr:         the table row; its ``td`` cells are looked up by index.
        fieldOrder: dict mapping field names to column indexes.  ``'view'``
                    and ``'date'`` are required; ``'bench'`` and
                    ``'judgment'`` are optional.

    Returns whatever ``self.download_order`` returns, or ``None`` (after
    logging a warning) when the view/date column is out of range, the
    date cell is empty or unparseable, or the view cell has no
    ``onclick`` attribute.
    """
    tds = tr.findAll('td')
    viewIndex = fieldOrder['view']
    dateIndex = fieldOrder['date']

    if viewIndex >= len(tds) or dateIndex >= len(tds):
        self.logger.warning(u'Could not get date or view in tr: %s' % tr)
        return None

    viewTd = tds[viewIndex]
    dateTd = tds[dateIndex]

    datestr = utils.get_tag_contents(dateTd)
    if not datestr:
        self.logger.warning(u'Date: %s Could not get date in %s' %
                            (dateobj, tr))
        return None

    subdateobj = utils.datestr_to_obj(datestr)
    if not subdateobj:
        self.logger.warning(u'Date: %s Could not get date in %s tr: %s' %
                            (dateobj, datestr, tr))
        return None

    # Only the calendar date of the parsed value is used from here on.
    subdateobj = subdateobj.date()
    metainfo = {'date': utils.date_to_xml(subdateobj), 'ccin': ccin}

    # store bench in metainfo
    if 'bench' in fieldOrder and fieldOrder['bench'] < len(tds):
        contents = utils.get_tag_contents(tds[fieldOrder['bench']])
        if contents:
            # NOTE(review): each entry is everything after a 'JUSTICE '
            # marker up to the end of the cell text, so with several
            # markers the earlier entries also contain the later names --
            # confirm this is intended.
            names = [contents[reobj.end():]
                     for reobj in re.finditer('JUSTICE ', contents)]
            if names:
                metainfo['bench'] = {'name': names}

    # store isJudgment in metainfo
    if 'judgment' in fieldOrder and fieldOrder['judgment'] < len(tds):
        contents = utils.get_tag_contents(tds[fieldOrder['judgment']])
        if contents:
            metainfo['judgment'] = contents

    onclick = viewTd.get('onclick')
    if not onclick:
        self.logger.warning(u'No onclick attribute in viewTd: %s' % viewTd)
        return None

    return self.download_order(relpath, subdateobj, metainfo, onclick)
def get_meta_info(self, tr, baseurl):
    """Return one metadata dict per link found in a table row.

    The row's ``td`` cells are read by 1-based position:

    * column 1 -> ``self.CASENO``
    * column 2 -> ``self.PETITIONER`` / ``self.RESPONDENT``; when no
      petitioner can be split out, the whole cell text is stored as the
      petitioner
    * columns 3 and 4 -> ``self.DATE`` (a later successful parse
      overwrites an earlier one)

    Every ``a`` tag with a non-empty ``href`` then gets its own copy of
    that metadata, extended with ``'href'`` and an absolute ``'url'``
    resolved against ``baseurl``.

    Returns a list of dicts; it is empty when the row has no usable link.
    """
    # ``urllib.basejoin`` only exists on Python 2, where it is an alias of
    # ``urlparse.urljoin``; import the same function under whichever name
    # the running interpreter provides.
    try:
        from urllib.parse import urljoin
    except ImportError:
        from urlparse import urljoin

    metainfo = {}
    tds = tr.findAll('td')

    for i, td in enumerate(tds, 1):
        value = utils.get_tag_contents(td)
        if not value:
            continue

        if i == 1:
            metainfo[self.CASENO] = value
        elif i == 2:
            pet, res = utils.get_petitioner_respondent(value)
            if pet:
                metainfo[self.PETITIONER] = pet
            else:
                # No petitioner/respondent split: keep the raw cell text.
                metainfo[self.PETITIONER] = value
            if res:
                metainfo[self.RESPONDENT] = res
        elif i in (3, 4):
            dateobj = utils.datestr_to_obj(value)
            if dateobj:
                metainfo[self.DATE] = dateobj

    if self.DATE not in metainfo:
        self.logger.info(u'No date found %s' % metainfo)

    if metainfo:
        self.logger.debug(u'metainfo: %s' % metainfo)

    ms = []
    for link in tr.findAll('a'):
        href = link.get('href')
        if href:
            # Each link gets an independent copy of the shared row fields.
            m = metainfo.copy()
            m['href'] = href
            m['url'] = urljoin(baseurl, href)
            ms.append(m)

    return ms