Example #1
    def get_judgment_info(self, tr):
        judgedict = {}
        link = tr.find('a')
        if link:
            href = link.get('href')
            if href:
                judgedict['href'] = href

        tds = tr.findAll('td')
        i = 0
        for td in tds:
            tdContent = utils.get_tag_contents(td)
            if tdContent:
                if i == 0:
                    judgedict['casetype'] = tdContent
                elif i == 1:
                    judgedict['caseno'] = tdContent
                elif i == 2:
                    judgedict['caseyear'] = tdContent
                i += 1
        if 'caseno' in judgedict and 'caseyear' in judgedict:
            title = u'%sof%s' % (judgedict['caseno'], judgedict['caseyear'])
            judgedict['title'] = title
        return judgedict
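
A note on what follows: these examples are drawn from Python 2 court and gazette scrapers built on BeautifulSoup (hence findAll alongside find_all, and urllib.basejoin), and utils.get_tag_contents(tag) is the project's helper for flattening a tag into its text. For orientation, here is a minimal, self-contained Python 3 / bs4 sketch of the row-parsing pattern in Example #1, with get_text() standing in for get_tag_contents and an invented HTML row:

    from bs4 import BeautifulSoup

    html = ('<tr><td><a href="/judgment/123">W.P.</a></td>'
            '<td>4521</td><td>2009</td></tr>')
    tr = BeautifulSoup(html, 'html.parser').tr

    info = {}
    link = tr.find('a')
    if link and link.get('href'):
        info['href'] = link.get('href')

    # Map the first three cells to casetype/caseno/caseyear (Example #1
    # advances its index only on non-empty cells; this sketch is positional).
    cols = ['casetype', 'caseno', 'caseyear']
    for i, td in enumerate(tr.find_all('td')[:3]):
        text = td.get_text(strip=True)   # stand-in for utils.get_tag_contents
        if text:
            info[cols[i]] = text

    if 'caseno' in info and 'caseyear' in info:
        info['title'] = '%sof%s' % (info['caseno'], info['caseyear'])
    print(info)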
Example #2
    def process_result_row(self, tr, metainfos, dateobj, order):
        download = None
        for link in tr.find_all('a'):
            txt = utils.get_tag_contents(link)
            if txt and re.match('\s*Download', txt, re.IGNORECASE):
                download = link.get('href')
                break

        if not download:
            return

        metainfo = utils.MetaInfo()
        metainfos.append(metainfo)
        metainfo.set_date(dateobj)
        metainfo['download'] = download

        i = 0
        for td in tr.find_all('td'):
            if len(order) > i:
                col = order[i]
                txt = utils.get_tag_contents(td)
                if txt:
                    txt = txt.strip()
                else:
                    continue

                if col == 'gznum':
                    metainfo['gznum'] = txt.splitlines()[0]

                elif col in ['subject', 'department', 'notification_num', \
                             'gztype']:
                    metainfo[col] = txt

            i += 1
Example #3
    def get_meta_info(self, link, tr, dateobj):
        metainfo = {'date':utils.date_to_xml(dateobj)}
        metainfo['caseno'] = utils.get_tag_contents(link)
        tds = tr.findAll('td')
        for td in tds:
            contents = utils.get_tag_contents(td)
            if not contents:
                continue
            reobj = re.search('JUSTICE ', contents)
            if reobj:
                metainfo['author'] = contents[reobj.end():]

        return metainfo
Example #4
    def process_order_tr(self, ccin, relpath, dateobj, tr, fieldOrder):
        tds = tr.findAll('td')
        viewIndex  = fieldOrder['view']
        dateIndex  = fieldOrder['date']
        if viewIndex >= len(tds) or dateIndex >= len(tds):
            self.logger.warning(u'Could not get date or view in tr: %s' % tr)
            return None

        viewTd  = tds[viewIndex]
        dateTd  = tds[dateIndex]

        datestr = utils.get_tag_contents(dateTd)

        if not datestr:
            self.logger.warning(u'Date: %s Could not get date in %s' % (dateobj, tr))
            return None

        subdateobj = utils.datestr_to_obj(datestr)
        if not subdateobj:
            self.logger.warning(u'Date: %s Could not get date in %s tr: %s' % (dateobj, datestr, tr))
            return None

        subdateobj = subdateobj.date() 
        metainfo = {'date':utils.date_to_xml(subdateobj), 'ccin': ccin}

        # store bench in metainfo
        if 'bench' in fieldOrder and fieldOrder['bench'] < len(tds):
            benchIndex = fieldOrder['bench']
            benchTd = tds[benchIndex]
            contents = utils.get_tag_contents(benchTd)
            if contents:
                names = []
                for reobj in re.finditer('JUSTICE ', contents):
                    names.append(contents[reobj.end():])
                if names:
                    metainfo['bench'] = {} 
                    metainfo['bench']['name'] = names

        # store isJudgment in metainfo
        if 'judgment' in fieldOrder and fieldOrder['judgment'] < len(tds):
            jTd = tds[fieldOrder['judgment']]
            contents = utils.get_tag_contents(jTd)
            if contents:
                metainfo['judgment'] = contents

        onclick  = viewTd.get('onclick')
        if onclick:
            relurl = self.download_order(relpath, subdateobj, \
                                             metainfo, onclick)
            return relurl
        else:
            self.logger.warning(u'No onclick attribute in viewTd: %s' % viewTd)
        return None
Example #5
    def result_page(self, webpage, relpath, dateobj):
        newdls = []

        if not webpage:
            return newdls

        d = utils.parse_webpage(webpage)

        if not d:
            self.logger.error(
                u'Could not parse html of the result page for date %s' %
                dateobj)
            return newdls

        trs = d.findAll('tr')

        for tr in trs:
            link = tr.find('a')

            if not link:
                continue

            href = link.get('href')
            title = utils.get_tag_contents(link)

            if (not href) or (not title):
                self.logger.info(u'Could not process %s' % link)
                continue

            if not re.match('\d+$', title) and not re.search(
                    'PREV|NEXT', title):
                self.logger.info(u'link: %s title: %s' % (href, title))
                rel = self.handle_judgment_link(relpath, tr, dateobj, href,
                                                title)
                if rel:
                    newdls.append(rel)

        if newdls:
            links = d.findAll('a')
            for link in links:
                href = link.get('href')
                title = utils.get_tag_contents(link)
                if title and href and re.match('NEXT', title):
                    self.logger.info(u'Following next page link: %s' % link)
                    webpage = self.download_url(urllib.basejoin(self.baseurl,href),\
                                                loadcookies = self.cookiefile.name)

                    newdls.extend(self.result_page(webpage, relpath, dateobj))
        return newdls
Example #6
    def get_meta_info(self, tr, dateobj):
        metainfo = { 'date': utils.date_to_xml(dateobj)}

        tds = tr.findAll('td')

        if len(tds) >= 3:
            metainfo['caseno'] = utils.get_tag_contents(tds[2])

        if len(tds) >= 4:
            metainfo['petitioner'] = utils.get_tag_contents(tds[3])

        if len(tds) >= 5:
            metainfo['respondent'] = utils.get_tag_contents(tds[4])

        return metainfo
Example #7
    def datepage_metainfos(self, url, dateobj):
        minfos = []
        response = self.download_url(url)

        if not response or not response.webpage:
            self.logger.warn('Unable to download %s. Skipping', url)
            return minfos

        d = utils.parse_webpage(response.webpage, self.parser)
        if not d:
            self.logger.warn('Unable to parse %s. Skipping.', url)
            return minfos

        partnum = None
        dept    = None
        for td in d.find_all('td'):
            bgcolor = td.get('bgcolor')
            links   = td.find_all('a')
            if bgcolor == '#91BAE8' and len(links) == 0:
                partnum = utils.get_tag_contents(td)
                partnum = utils.remove_spaces(partnum)
                dept    = None
            elif len(links) > 0:
                reobj  = re.compile('^(strong|a)$')
                for x in td.find_all(reobj):
                    if x.name == 'strong':
                        dept = utils.get_tag_contents(x)
                        dept = utils.remove_spaces(dept)
                    elif x.name == 'a' and partnum:
                        href = x.get('href')
                        if not href or not href.startswith('pdf'):
                            continue

                        title = utils.get_tag_contents(x)
                        title = utils.remove_spaces(title)

                        metainfo = utils.MetaInfo()
                        minfos.append(metainfo)

                        metainfo.set_title(title)
                        metainfo.set_date(dateobj)     
                        metainfo['partnum'] = partnum
                        if dept:
                            metainfo['department']    = dept
                        gzurl = urllib.basejoin(url, href)
                        metainfo['url'] = gzurl

        return minfos    
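
Example #7 above threads running state across cells: a td with bgcolor #91BAE8 and no links opens a new gazette part, and every pdf link that follows inherits the most recent part number and department. A stripped-down sketch of that state machine (bs4 assumed, HTML invented):

    from bs4 import BeautifulSoup

    html = '''<table>
    <tr><td bgcolor="#91BAE8">PART I</td></tr>
    <tr><td><strong>Home Department</strong>
            <a href="pdf/notif1.pdf">Notification 1</a></td></tr>
    </table>'''
    d = BeautifulSoup(html, 'html.parser')

    partnum = dept = None
    for td in d.find_all('td'):
        links = td.find_all('a')
        if td.get('bgcolor') == '#91BAE8' and not links:
            # a part header resets the department
            partnum, dept = td.get_text(strip=True), None
        else:
            for x in td.find_all(['strong', 'a']):
                if x.name == 'strong':
                    dept = x.get_text(strip=True)
                elif partnum and x.get('href', '').startswith('pdf'):
                    print(partnum, dept, x['href'])
    # prints: PART I Home Department pdf/notif1.pdf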
Example #8
    def parse_result_page(self, posturl, webpage, dateobj):
        judgments = []
        d = utils.parse_webpage(webpage)
        if not d:
            self.logger.error(u'Could not parse result page %s' % dateobj)
            return judgments

        # get judgments
        trs = d.findAll('tr')
        for tr in trs:
            judgment = {}
            metainfo = {'date': utils.date_to_xml(dateobj)}

            links = tr.findAll('a')
            for link in links:
                href = link.get('href')
                if href and re.search('WebShowJudgment.do', href):
                    t = utils.get_tag_contents(link)
                    # str.find returns -1 when missing; a bare 'if colon:'
                    # would also reject a colon at position 0
                    colon = t.find(':') if t else -1
                    if colon != -1:
                        title = t[colon + 1:]
                        title = title.strip()
                        metainfo['title'] = title
                        reobj = re.search(' vs\. ', title, re.IGNORECASE)
                        if reobj:
                            metainfo['petitioner'] = title[:reobj.start()]
                            metainfo['respondent'] = title[reobj.end():]
                if href and re.search('WebDownloadJudgmentDocument.do', href):
                    judgment['link'] = urllib.basejoin(posturl, href)

            if judgment:
                judgment['metainfo'] = metainfo
                judgments.append(judgment)

        # next link
        links = d.findAll('a')
        for link in links:
            t = utils.get_tag_contents(link)
            if t and re.search('Next', t):
                href = link.get('href')

                if href:
                    judgment = {'link': urllib.basejoin(posturl, href)}
                    judgment['next'] = True
                    judgments.append(judgment)

        return judgments
Example #9
    def get_field_order(self, tr):
        order = []
        valid = False
        for th in tr.find_all('th'):
            txt = utils.get_tag_contents(th)
            if txt and re.search('gazette\s+type', txt, re.IGNORECASE):
                order.append('gztype')
            elif txt and re.search('department', txt, re.IGNORECASE):
                order.append('department')
            elif txt and re.search('abstract', txt, re.IGNORECASE):
                order.append('subject')
            elif txt and re.search('Issue\s+No', txt, re.IGNORECASE):
                order.append('gznum')
            elif txt and re.search('Notification\s+No', txt, re.IGNORECASE):
                order.append('notification_num')
            elif txt and re.search('Download', txt, re.IGNORECASE):
                order.append('download')
                valid = True
            else:
                order.append('')

        if valid:
            return order
        return None
Example #10
    def parse_row(self, tr, order, dateobj):
        metainfo = utils.MetaInfo()
        metainfo.set_date(dateobj)

        i = 0
        for td in tr.find_all('td'):
            txt = utils.get_tag_contents(td)
            if i < len(order) and txt:
                txt = txt.strip()
                col = order[i]
                if col == 'gztype':
                    words = txt.split('/')
                    metainfo['gztype'] = words[0].strip()
                    if len(words) > 1:
                        metainfo['partnum'] = words[1].strip()
                    if len(words) > 2:
                        metainfo['district'] = words[2].strip()
                elif col == 'download':
                    inp = td.find('input')
                    if inp and inp.get('onclick'):
                        metainfo['download'] = inp.get('onclick')
                elif col in ['notification_num', 'gznum', 'department']:
                    metainfo[col] = txt
                elif col == 'subject':
                    metainfo.set_subject(txt)
            i += 1
        return metainfo
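
Examples #9 and #10 are two halves of one handshake: the header row is scanned once into a positional order list, and each data row is then decoded by walking its cells against that list. A compact sketch of the idea (bs4 assumed, th labels invented):

    import re
    from bs4 import BeautifulSoup

    header = BeautifulSoup(
        '<tr><th>Department</th><th>Abstract</th>'
        '<th>Issue No</th><th>Download</th></tr>', 'html.parser').tr

    patterns = [('department', r'department'), ('subject', r'abstract'),
                ('gznum', r'issue\s+no'), ('download', r'download')]
    order = []
    for th in header.find_all('th'):
        txt = th.get_text(strip=True)
        for name, pat in patterns:
            if re.search(pat, txt, re.IGNORECASE):
                order.append(name)
                break
        else:
            order.append('')   # unknown column: keep the slot filled
    print(order)   # ['department', 'subject', 'gznum', 'download']

The empty-string placeholders matter: they keep the recognized column names aligned with the cell positions in each data row.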
Example #11
    def get_judgment_info(self, tr):
        judgedict = {}
        if tr.findAll("table"):
            return {}

        link = tr.find("a")
        if link:
            href = link.get("href")
            if href:
                judgedict["href"] = href

        tds = tr.findAll("td")
        i = 0
        caseno = ""
        for td in tds:
            i += 1
            txt = utils.get_tag_contents(td)
            if not txt:
                continue
            reobj = re.search("JUSTICE ", txt)
            if reobj:
                author = txt[reobj.end() :]
                if author:
                    judgedict["author"] = author
            elif i == 2:
                judgedict["casetype"] = txt
            elif i == 3:
                caseno += txt
            elif i == 4:
                caseno += "/%s" % txt
        if caseno:
            judgedict["caseno"] = caseno
            judgedict["title"] = caseno

        return judgedict
Example #12
    def get_order_of_fields(self, table):
        fieldOrder = {}
        thead = table.find('thead')
        if not thead:
            return fieldOrder

        ths = thead.findAll('th')
        i = 0
        for th in ths:
            text = utils.get_tag_contents(th)
            
            if text:
                if re.search('CASEDETAIL', text):
                    fieldOrder['caseno'] = i
                elif re.search('JUDGE NAME', text):
                    fieldOrder['bench'] = i
                elif re.search('DATE', text):
                    fieldOrder['date'] = i
                elif re.search('VIEW', text):
                    fieldOrder['view'] = i
                elif re.search('JUDGEMENT', text):
                    fieldOrder['judgment'] = i

            i += 1
        return fieldOrder
Example #13
    def process_result_row(self, tr, metainfos, dateobj, order):
        metainfo = utils.MetaInfo()
        metainfo.set_date(dateobj)

        i = 0
        for td in tr.find_all('td'):
            if len(order) > i:
                col = order[i]
                txt = utils.get_tag_contents(td)
                if txt:
                    txt = txt.strip()
                else:
                    continue

                if col == 'gztype':
                    metainfo.set_gztype(txt)
                elif col == 'download':
                    link = td.find('a')
                    if link:
                        href = link.get('href')
                        if href:
                            metainfo['download'] = href
                elif col in ['partnum', 'division', 'subject']:
                    metainfo[col] = txt
  
            i += 1
        if 'download' not in metainfo:
            self.logger.warn('No download link, ignoring: %s', tr)
        else:
            metainfos.append(metainfo)
Example #14
    def process_result_row(self, tr, metainfos, dateobj, order):
        tds = tr.find_all('td')
        if len(tds) != len(order):
            return

        metainfo = utils.MetaInfo()
        metainfos.append(metainfo)
        metainfo.set_date(dateobj)

        i = 0
        for td in tds:
            if len(order) > i:
                col = order[i]
                txt = utils.get_tag_contents(td)
                if txt:
                    txt = txt.strip()
                else:
                    continue

                if col == 'gztype':
                    metainfo.set_gztype(txt)

                elif col == 'gznum':
                    metainfo['gznum'] = txt
                    link = td.find('a')
                    if link and link.get('href'):
                        metainfo['download'] = link.get('href')

            i += 1
Example #15
    def parse_meta_info(self, tr, dateobj):
        metainfo = { 'date': utils.date_to_xml(dateobj)}

        tds = tr.findAll('td')
        i = 0
        for td in tds:
            c = utils.get_tag_contents(td)
            if c:
                if i == 0:
                    contents = utils.tag_contents_without_recurse(td)
                    names = []
                    for content in contents:
                        reobj = re.search('JUSTICE ', content)
                        if reobj:
                            names.append(content[reobj.end():])

                    if names: 
                        metainfo['bench'] = {}
                        metainfo['bench']['name']  = names

                elif i == 1:
                    metainfo['category'] = c
                elif i == 3:
                    metainfo['caseno']   = c
                    
                i += 1

        return metainfo
Example #16
    def process_result_page(self, relpath, dateobj, webpage):
        newdls = []
        d = utils.parse_webpage(webpage)
        if not d:
            self.logger.info(u'Could not parse result page for date %s' %
                             dateobj)
            return newdls

        trs = d.findAll('tr')
        for tr in trs:
            pagetype = self.page_type(tr)
            if pagetype == 'nextlink':
                nextlink = self.next_link(tr.findAll('a'))
                if nextlink:
                    self.logger.info(u'Going to the next page: %s' \
                                       % utils.get_tag_contents(nextlink))

                    rels = self.process_next_link(relpath, dateobj, nextlink)
                    newdls.extend(rels)
            elif pagetype == 'judgment':
                rel = self.handle_judgment_link(relpath, dateobj, tr)
                if rel:
                    newdls.append(rel)
            else:
                self.logger.info(u'Not processing %s' % tr)
        return newdls
Example #17
    def handle_result_page(self, resultpage, relpath, dateobj):
        dls = []
        d = utils.parse_webpage(resultpage)
        if not d:
            self.logger.error(u'Could not parse result page %s' % dateobj)
            return dls

        # download judgments
        trs = d.findAll('tr')
        for tr in trs:
            links = tr.findAll('a')
            if len(links) == 1:
                relurl = self.dl_judgment(relpath, tr, links[0], dateobj)
                if relurl:
                    dls.append(relurl)
            else:
                self.logger.warning(u'No action for %s' % tr)

        # next page
        links = d.findAll('a')
        for link in links:
            href = link.get('href')
            t = utils.get_tag_contents(link)
            if href and t == 'Next':
                nexturl = urllib.basejoin(self.resulturl, href)  
                resultpage = self.download_url(nexturl, \
                                            loadcookies = self.cookiefile.name)
                if resultpage:
                    self.logger.info(u'Recursing  to %s' % nexturl)
                    dls.extend(self.handle_result_page(resultpage, relpath, \
                                                       dateobj))
        return dls  
Example #18
    def process_result_row(self, tr, metainfos, dateobj, order):
        metainfo = utils.MetaInfo()
        metainfos.append(metainfo)
        metainfo.set_date(dateobj)

        i        = 0
        for td in tr.find_all('td'):
            if len(order) > i:
                col = order[i]
                txt = utils.get_tag_contents(td)
                if txt:
                    txt = txt.strip()

                if col == 'ministry':
                    metainfo.set_ministry(txt)
                elif col == 'subject':
                    metainfo.set_subject(txt)
                elif col == 'gztype':
                    metainfo.set_gztype(txt)    
                elif col == 'download':
                    inp = td.find('input')
                    if inp:
                        name = inp.get('name')
                        if name:
                            metainfo[col] = name
                    else:
                        link = td.find('a')
                        if link:
                            metainfo[col] = link 
                                            
                elif col in ['office', 'department', 'partnum', 'refnum']:
                    metainfo[col] = txt
            i += 1
Example #19
    def download_info_page(self, url):
        webpage = self.download_url(url)
        d = utils.parse_webpage(webpage)
        if not d:
            self.logger.error(u'Could not parse the date search page')
            return [], None

        links = d.findAll('a')
        infolist = []
        previousurl = None
        for link in links:
            href = link.get('href')

            if previousurl is None and href:
                anchortext = utils.get_tag_contents(link)
                if anchortext and re.search('Previous >>', anchortext):
                    previousurl = urllib.basejoin(url, href)

            if href:
                if re.match('judgements', href):
                    node = link
                    while node and node.name != 'tr':
                        node = node.parent
                    if node:
                        metainfo = self.get_meta_info(node)
                        metainfo['href'] = href
                        infolist.append(metainfo)
                        self.logger.debug('metainfo: %s' % metainfo)

        return infolist, previousurl
Example #20
    def get_meta_info(self, tr, dateobj):
        metainfo = {'date': utils.date_to_xml(dateobj)}

        for td in tr.findAll('td'):
            text = utils.get_tag_contents(td)
            if text:
                reobj = re.search('\s+vs\s+', text, re.IGNORECASE)
                if reobj:
                    caseReobj = re.search('(?P<num>\d+)\s+of\s+(?P<year>\d+)', text, re.IGNORECASE)
                    if caseReobj and caseReobj.end() < reobj.start():
                        groupdict = caseReobj.groupdict()
                        metainfo['caseno'] = u'%s/%s' % (groupdict['num'], groupdict['year'])
                       
                        petitioner = text[caseReobj.end():reobj.start()]
                    else:
                        petitioner = text[:reobj.start()]

                    if reobj.end() < len(text):
                        respondent = text[reobj.end():]
                        metainfo['respondent'] = respondent.strip()

                    metainfo['petitioner'] = petitioner.strip()
                    break
                    
        return metainfo
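
The split in Example #20 rests on two regexes: one finds the " vs " separator, the other a "num of year" case number that must precede it. Worked through on an invented cell:

    import re

    text = 'O.P. 4521 of 2009  State of Kerala vs John Doe'
    vs = re.search(r'\s+vs\s+', text, re.IGNORECASE)
    case = re.search(r'(?P<num>\d+)\s+of\s+(?P<year>\d+)', text, re.IGNORECASE)
    if vs and case and case.end() < vs.start():
        print('%s/%s' % (case.group('num'), case.group('year')))   # 4521/2009
        print(text[case.end():vs.start()].strip())   # State of Kerala
        print(text[vs.end():].strip())               # John Doe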
Example #21
    def process_result_row(self, tr, metainfos, dateobj, order):
        metainfo = utils.MetaInfo()
        metainfo.set_gztype(self.gazette_type)
        metainfos.append(metainfo)
        metainfo.set_date(dateobj)
        i = 0
        for td in tr.find_all('td'):
            if len(order) > i:
                col = order[i]
                txt = utils.get_tag_contents(td)
                if txt:
                    txt = txt.strip()
                if col == 'subject':
                    metainfo.set_subject(txt)
                elif col == 'gznum':
                    reobj = re.search('\w+', txt)
                    if reobj:
                        metainfo['gznum'] = txt[reobj.start():reobj.end()]

                elif col == 'notification_date':
                    d = utils.parse_datestr(txt)
                    if d:
                        metainfo[col] = d

                elif col in ['department', 'notification_num']:
                    metainfo[col] = txt
                elif col == 'download':
                    inp = tr.find('input')
                    if inp:
                        name = inp.get('name')
                        if name:
                            metainfo[col] = name

            i += 1
Example #22
    def get_debate_info(self, table):
        info = {}
        trs = table.findAll('tr')
        for tr in trs:
            tds = tr.findAll('td')
            if len(tds) == 2:
                hl     = utils.get_tag_contents(tds[0])
                value  = utils.get_tag_contents(tds[1])
                hltype = self.get_headline_type(hl)
                if hltype:
                    info[hltype] = value
                    if hltype == 'title':
                        href = self.get_link(tds[1])
                        if href:
                            info['href'] = href
        return info
Example #23
    def process_row(self, tr, order, dateobj):
        metainfo = utils.MetaInfo()
        i = 0
        for td in tr.find_all('td'):
            if i >= len(order):
                break
            txt = utils.get_tag_contents(td)
            if txt and order[i] in ['subject', 'department', 'gznum']:
                txt, n = re.subn('\s+', ' ', txt)
                metainfo[order[i]] = txt.strip()
            elif txt and order[i] == 'date':
                nums = re.split('[./-]+', txt)
                if len(nums) < 3:
                    self.logger.warn(
                        'Couldn\'t get date from %s for extraordinary gazette list',
                        txt)
                    i += 1
                    continue

                nums = [re.subn('\s+', '', n)[0] for n in nums]
                nums = [n for n in nums if n]
                try:
                    # construct the date inside the try: int() or an
                    # out-of-range day/month raises here, not in set_date
                    d = datetime.date(int(nums[2]), int(nums[1]), int(nums[0]))
                    metainfo.set_date(d)
                except (ValueError, IndexError):
                    self.logger.warn('Could not parse date %s', txt)

            i += 1
        if metainfo.get_date() == dateobj:
            metainfo.set_gztype('Extraordinary')
            return metainfo

        return None
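
Example #23's date handling splits the cell on any run of . / - separators and rebuilds a datetime.date in day-month-year order. On an invented value:

    import datetime
    import re

    txt = '03-07-2021'
    nums = [n for n in re.split('[./-]+', txt) if n.strip()]
    print(datetime.date(int(nums[2]), int(nums[1]), int(nums[0])))   # 2021-07-03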
Example #24
    def process_judgment_page(self, relpath, url, dateobj):
        webpage = self.download_url(url, loadcookies = self.cookiefile.name)
        if not webpage:
            self.logger.warning(u'Could not download %s' % url)
            return None

        d = utils.parse_webpage(webpage)
        if not d:
            self.logger.warning(u'Could not parse %s' % url)
            return None

        metainfo = self.get_meta_info(d, dateobj)

        for link in d.findAll('a'):
            href = link.get('href')
            title = utils.get_tag_contents(link)

            if (not href) or (not title):
                self.logger.warning(u'Could not process %s' % link)
                continue

            action = self.action_on_link(href, title)
            newurl = urllib.basejoin(url, href)
            if action == 'save':
                self.logger.info(u'Downloading %s' % newurl)
                return self.get_judgment(relpath, newurl, title, metainfo)

        return None
Example #25
    def result_page(self, relpath, url, dateobj, linkdict):
        newdls = []
        webpage = self.download_url(url, loadcookies = self.cookiefile.name)

        d = utils.parse_webpage(webpage)

        if not d:
            self.logger.error(u'Could not parse html of the result page for date %s' % dateobj)
            return newdls

        for link in d.findAll('a'):
            href = link.get('href')
            title = utils.get_tag_contents(link)

            if (not href) or (not title) or href in linkdict:
                self.logger.warning(u'Could not process %s' % link)
                continue

            linkdict[href] = 1

            action = self.action_on_link(href, title)
            self.logger.info(u'Action %s on link %s title %s' %\
                                     (action, href, title))       
            newurl = urllib.basejoin(url, href)
            if action == 'judgmentlink':
                relurl = self.process_judgment_page(relpath, newurl, dateobj)
                if relurl:
                    newdls.append(relurl)
                else:
                    self.logger.warning(u'Judgment link not working %s' % newurl)
            elif action == 'recurse':
                newdls.extend(self.result_page(relpath, newurl, dateobj, 
                                               linkdict))
           
        return newdls
Example #26
    def find_field_order(self, tr):
        order = []
        for td in tr.find_all('td'):
            txt = utils.get_tag_contents(td)
            if txt and re.search('Department', txt):
                order.append('department')
            elif txt and re.search('Notification\s+Number', txt):
                order.append('notification_num')
            elif txt and re.search('Gazette\s+Number', txt):
                order.append('gznum')
            elif txt and re.search('Subject', txt):
                order.append('subject')
            elif txt and re.search('File', txt):
                order.append('download')
            elif txt and re.search('Gazette\s+Date', txt):
                order.append('gzdate')
            else:
                order.append('')

        for field in [
                'department', 'download', 'subject', 'gznum',
                'notification_num'
        ]:
            if field not in order:
                return None
        return order
Example #27
    def get_meta_info(self, tr):
        metainfo = {}
        tds = tr.findAll('td')

        i = 0
        lastcolumn = len(tds) - 1
        for td in tds:
            content = utils.get_tag_contents(td)
            if content:
                if i == 1:
                    content = u' '.join(content.split())
                    metainfo['caseno'] = content
                elif i == 2:
                    petitioner, respondent = \
                            utils.get_petitioner_respondent(content)
                    if petitioner:
                        metainfo['petitioner'] = petitioner
                    else:
                        self.logger.info(u'Petitioner not found in %s' %
                                         content)
                    if respondent:
                        metainfo['respondent'] = respondent
                elif i == lastcolumn:
                    dateobj = utils.datestr_to_obj(content)
                    if dateobj:
                        metainfo[self.DATE] = dateobj
                    else:
                        self.logger.info(u'No date in %s' % (content))
                i += 1
        return metainfo
Example #28
    def process_result_row(self, tr, metainfos, dateobj, order):
        gznum = None
        i = 0
        for td in tr.find_all('td'):
            if len(order) > i:
                col = order[i]
                txt = utils.get_tag_contents(td)
                if txt:
                    txt = txt.strip()
                if col == 'gznum':
                    gznum = txt
                elif col.startswith('partnum'):
                    h, partnum = col.split('|')
                    metainfo = utils.MetaInfo()
                    metainfos.append(metainfo)
                    metainfo.set_date(dateobj)
                    metainfo.set_gztype(self.gazette_type)

                    if gznum:
                        metainfo['gznum'] = gznum
                    metainfo['partnum'] = partnum
                    inp = td.find('input')
                    if inp:
                        name = inp.get('name')
                        if name:
                            metainfo['download'] = name
            i += 1
Example #29
    def get_judgment_info(self, tr):
        links = tr.findAll('a')
        judgedict = {}
        for link in links:
            href = link.get('href')
            if href and re.search('imgst.aspx', href):
                judgedict['href'] = urllib.basejoin(self.webformUrl, href)

        tds = tr.findAll('td')
        maxTxt = ''
        for td in tds:
            txt = utils.get_tag_contents(td)
            if not txt:
                continue
            txt = txt.strip()
            reobj = re.search('Coram\s*:', txt)
            if reobj and reobj.end() + 1 < len(txt):
                bench = txt[reobj.end() + 1:]
                judgedict['bench'] = bench
            else:
                reobj = re.search(' vs\.? ', txt, re.IGNORECASE)
                if reobj:
                    judgedict['title'] = txt
                    judgedict['petitioner'] = txt[:reobj.start()] 
                    judgedict['respondent'] = txt[reobj.end():]
                elif len(maxTxt) < len(txt):
                    maxTxt = txt
        if 'title' not in judgedict and maxTxt:
            judgedict['title'] = maxTxt
        return judgedict
Example #30
    def process_row(self, tr, order, dateobj):
        metainfo = utils.MetaInfo()
        metainfo.set_date(dateobj)
        i = 0
        for td in tr.find_all('td'):
            if len(order) > i:
                txt = utils.get_tag_contents(td)
                txt = txt.strip() if txt else ''
                if order[i] in [
                        'gznum', 'department', 'notification_num', 'subject'
                ]:
                    metainfo[order[i]] = txt
                elif order[i] == 'gzdate':
                    nums = re.findall('\d+', txt)
                    if len(nums) == 3:
                        try:
                            d = datetime.date(int(nums[2]), int(nums[1]),
                                              int(nums[0]))
                            metainfo['gzdate'] = d
                        except ValueError:
                            self.logger.warn('Unable to form date for %s', txt)
                elif order[i] == 'download':
                    link = td.find('a')
                    if link and link.get('href'):
                        metainfo['href'] = link.get('href')

            i += 1
        if 'href' in metainfo and 'gznum' in metainfo:
            return metainfo
        return None
Example #31
    def get_meta_info(self, tr):
        metainfo = {}
        tds = tr.findAll('td')

        for link in tr.findAll('a'):
            href = link.get('href')
            if href:
                metainfo['href'] = href
                break
        if 'href' not in metainfo:
            return {}
        i = 0
        for td in tds:
            value = utils.get_tag_contents(td)
            if value:
                if i == 0:
                    metainfo[self.CASENO] = value
                elif i == 1:
                    pet, res = utils.get_petitioner_respondent(value)
                    if pet:
                        metainfo[self.PETITIONER] = pet
                    if res:
                        metainfo[self.RESPONDENT] = res
                elif i == 2:
                    dateobj = utils.datestr_to_obj(value)
                    if dateobj:
                        metainfo[self.DATE] = dateobj
                i += 1
        return metainfo
Example #32
    def parse_metainfos(self, webpage, year, fromdate, todate):
        minfos = []
        nextpage = None

        d = utils.parse_webpage(webpage, self.parser)
        if not d:
            self.logger.warn('Unable to parse results page for year %d', year)
            return minfos, nextpage

        for td in d.find_all('td'):
            link = td.find('a')
            if link is None:
                continue
            img = td.find('img')
            if img:
                title = img.get('title')
                if title == 'Next' and nextpage is None:
                    nextpage = link
                continue

            metainfo = self.get_metainfo(link, td)
            if metainfo:
                dateobj = metainfo.get_date()
                if dateobj and dateobj >= fromdate and dateobj <= todate:
                    minfos.append(metainfo)
                paras = td.find_all('p')

                if len(paras) >= 2:
                    p = paras[1]
                    txt = utils.get_tag_contents(p)
                    reobj = re.search(
                        'Department:\s*(?P<dept>.+)\s+Order\s+Nos:\s*(,Othres\s*:)?(?P<ordernum>.*)',
                        txt or '')
                    if reobj:
                        groupdict = reobj.groupdict()
                        ordernum = groupdict['ordernum'].strip()
                        metainfo['department'] = groupdict['dept'].strip()
                        if re.match(r'\d+(\s*,\s*\d+)*,?$', ordernum):
                            metainfo['ordernum'] = ordernum

                if len(paras) >= 3:
                    p = paras[2]
                    txt = utils.get_tag_contents(p)
                    if txt:
                        metainfo.set_subject(txt)

        return minfos, nextpage
Example #33
    def page_type(self, tr):
        text = utils.get_tag_contents(tr)
        if re.search(' vs ', text, re.IGNORECASE):
            return 'judgment'
        elif self.next_link(tr.findAll('a')):
            return 'nextlink'
        else:
            return 'unknown'
Example #34
    def download_oneday(self, relpath, dateobj):
        dateurl = urllib.basejoin(self.baseurl, '/hcjudge/date_output.php')
        postdata = [('d1', dateobj.day), ('m1', dateobj.month),  \
                    ('y1', dateobj.year), ('d2', dateobj.day),   \
                    ('m2', dateobj.month), ('y2', dateobj.year), \
                    ('button', 'Submit')]

        webpage = self.download_url(dateurl, postdata = postdata)

        if not webpage:
            self.logger.warning(u'No webpage for %s date: %s' % \
                                 (dateurl, dateobj))
            return []

        d = utils.parse_webpage(webpage)

        if not d:
            self.logger.error(u'HTML parsing failed for date: %s' %  dateobj)
            return []

        newdls = []

        for link in d.findAll('a'):
            href = link.get('href')
            title = utils.get_tag_contents(link)

            if (not href) or (not title):
                self.logger.warning(u'Could not process %s' % link)
                continue

            words = href.split('/')
            filename = words[-1]

            url = urllib.basejoin(dateurl, href)

            self.logger.info(u'link: %s title: %s' % (href, title))

            relurl = os.path.join(relpath, filename)
            filepath = os.path.join(self.rawdir, relurl)
            metapath = os.path.join(self.metadir, relurl)

            if not os.path.exists(filepath):
                webpage = self.download_url(url)

                if not webpage:
                    self.logger.warning(u'No webpage %s' % url)
                else:
                    utils.save_file(filepath, webpage)
                    self.logger.info(u'Saved %s' % url)
                    newdls.append(relurl)

            if os.path.exists(filepath) and \
                    (self.updateMeta or not os.path.exists(metapath)):
                metainfo = self.get_meta_info(title, dateobj)
                if metainfo:
                    utils.print_tag_file(metapath, metainfo)

        return newdls     
Example #35
    def get_next_page(self, d, baseurl):
        nextPage = None
        for link in d.findAll('a'):
            value = utils.get_tag_contents(link)
            href = link.get('href')
            if href and value and re.search('\s*Next', value):
                nextPage = urllib.basejoin(baseurl, href) 

        return nextPage
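
Several scrapers above (Examples #5, #17, #25 and #40) paginate by recursing on a Next link like the one get_next_page extracts. A minimal iterative equivalent, with an explicit seen set to guard against link cycles; download is a hypothetical callable returning HTML or None:

    import re
    from urllib.parse import urljoin   # Python 3 stand-in for urllib.basejoin
    from bs4 import BeautifulSoup

    def iter_result_pages(first_url, download):
        url, seen = first_url, set()
        while url and url not in seen:
            seen.add(url)
            page = download(url)
            if not page:
                return
            d = BeautifulSoup(page, 'html.parser')
            yield d
            url = None
            for link in d.find_all('a'):
                txt = link.get_text()
                if link.get('href') and re.search(r'\s*Next', txt):
                    url = urljoin(first_url, link['href'])
                    break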
Example #36
    def get_judgment_info(self, tr):
        judgedict = {}
        link = tr.find('a') 
        if link:
            title = utils.get_tag_contents(link)
            href  = link.get('href')
            if title:
                judgedict['title'] = title

            if href:
                judgedict['href']  = href

        tds = tr.findAll('td')
        for td in tds:
            txt = utils.get_tag_contents(td)
            if not txt:
                continue
            reobj = re.search('Coram\s*:', txt)
            if reobj and reobj.end() + 1 < len(txt):
                bench = txt[reobj.end() + 1:]
                judgedict['bench'] = bench
        return judgedict
Example #37
    def get_meta_info(self, tr):
        tds = tr.findAll('td')
        metainfo = {}
        link = tr.find('a')
        if link:
            href = link.get('href')
            if href:
                metainfo['href'] = href

        else:
            return metainfo

        valueList = []
        for td in tds:
            value = utils.get_tag_contents(td)
            valueList.append(value)

        i = 0
        for value in valueList:
            i += 1
            if value:
                value = value.strip()
                if (i == 2 or i == 3) and self.PETITIONER not in metainfo:
                    pet, res = utils.get_petitioner_respondent(value)
                    if pet:
                        metainfo[self.PETITIONER] = pet
                        metainfo[self.CASENO] = valueList[i-1]
                    if res:
                        metainfo[self.RESPONDENT] = res
                elif self.PETITIONER in metainfo:
                    dateobj = utils.datestr_to_obj(value)
                    if dateobj:
                        metainfo[self.DATE] = dateobj

        # try one more heuristic
        if self.DATE not in metainfo and 'href' in metainfo:
            dateobj = utils.datestr_to_obj(metainfo['href'])
            if dateobj:
                metainfo[self.DATE] = dateobj

        if self.DATE not in metainfo and \
                self.PETITIONER not in metainfo:
            self.logger.info(u'No petitioner/date found: %s %s' % \
                              (metainfo, valueList))
        elif self.PETITIONER not in metainfo:
            self.logger.info(u'No petitioner found: %s %s' % \
                                 (metainfo, valueList))
        elif self.DATE not in metainfo:
            self.logger.info(u'No date found: %s %s' % \
                                 (metainfo, valueList))

        return metainfo
Example #38
    def get_meta_info(self, d, dateobj):
        metainfo = {"date": utils.date_to_xml(dateobj)}
        trs = d.findAll("tr")
        for tr in trs:
            tds = tr.findAll("td")

            i = 0
            tdtype = None
            for td in tds[:-1]:
                content = utils.get_tag_contents(td)

                if re.search("Case Number", content):
                    tdtype = "caseno"
                    break

                if re.search("Judge", content):
                    tdtype = "author"
                    break

                if re.search("Petitioner", content):
                    tdtype = "petitioner"
                    break

                if re.search("Respondent", content):
                    tdtype = "respondent"
                    break

                if re.search("Location", content):
                    tdtype = "location"
                    break

                i += 1
            if tdtype and i + 1 < len(tds):
                metainfo[tdtype] = utils.get_tag_contents(tds[i + 1])

        return metainfo
Example #39
    def get_meta_info(self, tr, dateobj):
        metainfo = {"date": utils.date_to_xml(dateobj)}
        tds = tr.findAll("td")
        for td in tds:
            content = utils.get_tag_contents(td)
            reobj = re.search(" vs\.? ", content, re.IGNORECASE)
            if reobj:
                metainfo["title"] = content
                metainfo["petitioner"] = content[: reobj.start()]
                metainfo["respondent"] = content[reobj.end() :]

            reobj = re.search("justice ", content, re.IGNORECASE)
            if reobj:
                metainfo["author"] = content[reobj.end() :]

        return metainfo
Example #40
    def result_page(self, webpage, relpath, dateobj, linkdict):
        newdls      = []

        if not webpage:
            return newdls 

        courtParser = utils.parse_webpage(webpage)

        if not courtParser:
            self.logger.error(u'Could not parse html of the result page for date %s' % dateobj)
            return newdls

        trs  = courtParser.findAll('tr')

        for tr in trs:
            link = tr.find('a')
 
            if link:
                title = utils.get_tag_contents(link)
                href  = link.get('href')
 
                if (not title) or (not href):
                    self.logger.warning(u'Could not process %s' % link)
                    continue

                if href in linkdict:
                    continue

                if not re.search('first|prev|next|last|acroread', title, \
                                 re.IGNORECASE):
                    linkdict[href] = 1
                    dl = self.handle_link(relpath, href, title, tr, dateobj)
                    if dl:
                        newdls.append(dl)

                elif title == 'Next':
                    self.logger.info(u'Following Next page %s' % href)
                    newlink = urllib.basejoin (self.baseurl, href)
                    webpage = self.download_url(newlink, \
                                            loadcookies = self.cookiefile.name)
               
                    newdls.extend(self.result_page(webpage, relpath, dateobj, \
                                                   linkdict))
                else:
                    self.logger.info(u'No action for %s' % href)
        return newdls