예제 #1
0
    def get_judgment(self, url, relpath, metainfo):
        """Fetch the judgment page at *url*, save it under rawdir and, when
        needed, write its metainfo tag file.

        Returns the relative path of the saved document, or None when no
        filename can be derived from the url or the download fails.
        """
        docname = utils.url_to_filename(url, False, ["yID", "nID", "ID"])
        if not docname:
            self.logger.warning(u"No filename for %s" % url)
            return

        rel = os.path.join(relpath, docname)
        rawfile = os.path.join(self.rawdir, rel)

        if not os.path.exists(rawfile):
            self.logger.info(u"Downloading %s" % url)
            page = self.download_url(url, loadcookies=self.cookiefile.name)
            if not page:
                self.logger.warning(u"Could not download %s" % url)
                return
            utils.save_file(rawfile, page)
            self.logger.info(u"Saved %s" % rawfile)
        else:
            self.logger.info(u"Already exists %s" % rawfile)

        # Write the tag file next to the raw document (refresh on updateMeta).
        if os.path.exists(rawfile):
            metafile = os.path.join(self.metadir, rel)
            if metainfo and (self.updateMeta or not os.path.exists(metafile)):
                utils.print_tag_file(metafile, metainfo)

        return rel
예제 #2
0
    def get_judgment(self, relpath, judgment):
        """Download the judgment PDF referenced by judgment['link'] and
        write its metainfo tag file.

        Returns the relative URL of the saved file, or None when the
        judgment id cannot be extracted or the download fails.
        """
        relurl = None
        # Raw string so \d reaches the regex engine as a digit class instead
        # of being treated as a (now-invalid) string escape.
        reobj = re.search(r'judgmentID=(?P<id>\d+)', judgment['link'])

        if not reobj:
            self.logger.warning(u'No judgment id in %s' % judgment['link'])
        else:
            judgmentId = reobj.groupdict()['id']
            relurl = os.path.join(relpath, judgmentId)
            filepath = os.path.join(self.rawdir, relurl)
            metapath = os.path.join(self.metadir, relurl)

            # Download only when the raw PDF is not already on disk.
            if not os.path.exists(filepath):
                pdfdoc = self.download_url(judgment['link'],
                                           loadcookies=self.cookiefile.name)
                if pdfdoc:
                    utils.save_file(filepath, pdfdoc)
                    self.logger.info(u'Saved %s' % relurl)
                else:
                    self.logger.info(u'Did not download %s' % judgment['link'])

            # Write metainfo alongside the raw file (refresh on updateMeta).
            if os.path.exists(filepath) and \
                    (self.updateMeta or not os.path.exists(metapath)):
                utils.print_tag_file(metapath, judgment['metainfo'])
                self.logger.info(u'Saved metainfo %s' % relurl)

            if not os.path.exists(filepath):
                relurl = None

        return relurl
예제 #3
0
    def save_meta_tags(self, metapath, judgedict, dateobj):
        """Extract title/petitioner/respondent/bench tags from *judgedict*
        and write them, plus the XML-formatted date, to the tag file at
        *metapath*."""
        tagdict = {}

        # 'in' instead of dict.has_key(): has_key() was removed in Python 3
        # and 'in' behaves identically on Python 2.
        if 'title' in judgedict:
            title = judgedict['title']
            tagdict['title'] = title

            # Split the cause title on the "vs" separator; raw string keeps
            # the regex escapes intact.
            reobj = re.search(r'( vs | vs\.)', title, re.IGNORECASE)
            if reobj:
                if reobj.start() > 1:
                    tagdict['petitioner'] = title[:reobj.start()]

                if reobj.end() + 1 < len(title):
                    tagdict['respondent'] = title[reobj.end() + 1:]

        if 'bench' in judgedict:
            bench = judgedict['bench'].split(',')
            if bench:
                # One 'name' entry per comma-separated judge.
                tagdict['bench'] = {'name': list(bench)}

        tagdict['date'] = utils.date_to_xml(dateobj)

        utils.print_tag_file(metapath, tagdict)
예제 #4
0
    def save_meta_tags(self, metapath, judgedict, dateobj):
        """Extract title/petitioner/respondent/bench tags from *judgedict*
        and write them, plus the XML-formatted date, to the tag file at
        *metapath*."""
        tagdict = {}

        # 'in' instead of dict.has_key(): has_key() was removed in Python 3
        # and 'in' behaves identically on Python 2.
        if 'title' in judgedict:
            title = judgedict['title']
            tagdict['title'] = title

            # Split the cause title on the "vs" separator; raw string keeps
            # the regex escapes intact.
            reobj = re.search(r'( vs | vs\.)', title, re.IGNORECASE)
            if reobj:
                if reobj.start() > 1:
                    tagdict['petitioner'] = title[:reobj.start()]

                if reobj.end() + 1 < len(title):
                    tagdict['respondent'] = title[reobj.end() + 1:]

        if 'bench' in judgedict:
            bench = judgedict['bench'].split(',')
            if bench:
                # One 'name' entry per comma-separated judge.
                tagdict['bench'] = {'name': list(bench)}

        tagdict['date'] = utils.date_to_xml(dateobj)

        utils.print_tag_file(metapath, tagdict)
예제 #5
0
    def handle_judgment_link(self, relpath, dateobj, tr):
        """Process one result row: download the judgment the last link in
        *tr* points to and write its metainfo tag file.

        Returns the relative path of the saved judgment, or None when no
        link/filename is available or the download fails.
        """
        links = tr.findAll('a')
        if not links:
            return None
        href = links[-1].get('href')

        metainfo = self.get_meta_info(tr, dateobj)
        # Build a filename from caseno, falling back to the party names.
        # 'in' instead of dict.has_key(): removed in Python 3, same
        # behaviour on Python 2.
        rel = ''
        if 'caseno' in metainfo:
            rel += metainfo['caseno']
        else:
            if 'petitioner' in metainfo:
                rel += metainfo['petitioner']
            if 'respondent' in metainfo:
                rel += metainfo['respondent']

        if not rel:
            return None

        # str.replace instead of the string.replace() helper (removed in
        # Python 3); '/' would otherwise create spurious subdirectories.
        rel = rel.replace('/', '-')
        tmprel = os.path.join(relpath, rel)
        filepath = os.path.join(self.rawdir, tmprel)

        if not os.path.exists(filepath):
            self.download_judgment(href, filepath)

        if os.path.exists(filepath):
            metapath = os.path.join(self.metadir, tmprel)
            if metainfo and (self.updateMeta or not os.path.exists(metapath)):
                utils.print_tag_file(metapath, metainfo)

            return tmprel
        else:
            return None
예제 #6
0
    def handle_judgment_link(self, relpath, dateobj, tr):
        """Process one result row: download the judgment the last link in
        *tr* points to and write its metainfo tag file.

        Returns the relative path of the saved judgment, or None when no
        link/filename is available or the download fails.
        """
        links = tr.findAll('a')
        if not links:
            return None
        href = links[-1].get('href')

        metainfo = self.get_meta_info(tr, dateobj)
        # Build a filename from caseno, falling back to the party names.
        # 'in' instead of dict.has_key(): removed in Python 3, same
        # behaviour on Python 2.
        rel = ''
        if 'caseno' in metainfo:
            rel += metainfo['caseno']
        else:
            if 'petitioner' in metainfo:
                rel += metainfo['petitioner']
            if 'respondent' in metainfo:
                rel += metainfo['respondent']

        if not rel:
            return None

        # str.replace instead of the string.replace() helper (removed in
        # Python 3); '/' would otherwise create spurious subdirectories.
        rel      = rel.replace('/', '-')
        tmprel   = os.path.join(relpath, rel)
        filepath = os.path.join(self.rawdir, tmprel)

        if not os.path.exists(filepath):
            self.download_judgment(href, filepath)

        if os.path.exists(filepath):
            metapath = os.path.join(self.metadir, tmprel)
            if metainfo and (self.updateMeta or not os.path.exists(metapath)):
                utils.print_tag_file(metapath, metainfo)

            return tmprel
        else:
            return None
예제 #7
0
    def get_judgment(self, url, relpath, metainfo):
        """Fetch and store the judgment page at *url*; optionally write its
        metainfo tag file.

        Returns the relative path of the saved page, or None when no
        filename can be derived or the download fails.
        """
        fname = utils.url_to_filename(url, False, ['yID', 'nID', 'ID'])
        if not fname:
            self.logger.warning(u'No filename for %s' % url)
            return

        rel = os.path.join(relpath, fname)
        rawfile = os.path.join(self.rawdir, rel)

        if not os.path.exists(rawfile):
            self.logger.info(u'Downloading %s' % url)
            page = self.download_url(url, loadcookies=self.cookiefile.name)
            if not page:
                self.logger.warning(u'Could not download %s' % url)
                return
            utils.save_file(rawfile, page)
            self.logger.info(u'Saved %s' % rawfile)
        else:
            self.logger.info(u'Already exists %s' % rawfile)

        # Tag file goes under metadir at the same relative path.
        if os.path.exists(rawfile):
            metafile = os.path.join(self.metadir, rel)
            if metainfo and (self.updateMeta or not os.path.exists(metafile)):
                utils.print_tag_file(metafile, metainfo)

        return rel
예제 #8
0
    def get_judgment(self, relpath, judgment):
        """Download the judgment PDF referenced by judgment['link'] and
        write its metainfo tag file.

        Returns the relative URL of the saved file, or None when the
        judgment id cannot be extracted or the download fails.
        """
        relurl = None
        # Raw string so \d reaches the regex engine as a digit class instead
        # of being treated as a (now-invalid) string escape.
        reobj = re.search(r'judgmentID=(?P<id>\d+)', judgment['link'])

        if not reobj:
            self.logger.warning(u'No judgment id in %s' %  judgment['link'])
        else:
            judgmentId = reobj.groupdict()['id']
            relurl = os.path.join(relpath, judgmentId)
            filepath = os.path.join(self.rawdir, relurl)
            metapath = os.path.join(self.metadir, relurl)

            # Download only when the raw PDF is not already on disk.
            if not os.path.exists(filepath):
                pdfdoc = self.download_url(judgment['link'],
                                           loadcookies=self.cookiefile.name)
                if pdfdoc:
                    utils.save_file(filepath, pdfdoc)
                    self.logger.info(u'Saved %s' % relurl)
                else:
                    self.logger.info(u'Did not download %s' % judgment['link'])

            # Write metainfo alongside the raw file (refresh on updateMeta).
            if os.path.exists(filepath) and \
                    (self.updateMeta or not os.path.exists(metapath)):
                utils.print_tag_file(metapath, judgment['metainfo'])
                self.logger.info(u'Saved metainfo %s' % relurl)

            if not os.path.exists(filepath):
                relurl = None

        return relurl
예제 #9
0
    def download_oneday(self, relpath, dateobj):
        """Fetch the date-range result page for *dateobj* and download every
        judgment it links to.

        Returns the list of relative URLs that were newly downloaded.
        """
        dateurl = urllib.basejoin(self.baseurl, '/hcjudge/date_output.php')
        # Same date in both halves of the range: a single-day query.
        postdata = [
            ('d1', dateobj.day), ('m1', dateobj.month), ('y1', dateobj.year),
            ('d2', dateobj.day), ('m2', dateobj.month), ('y2', dateobj.year),
            ('button', 'Submit'),
        ]

        resultpage = self.download_url(dateurl, postdata=postdata)
        if not resultpage:
            self.logger.warning(u'No webpage for %s date: %s' %
                                (dateurl, dateobj))
            return []

        parsed = utils.parse_webpage(resultpage)
        if not parsed:
            self.logger.error(u'HTML parsing failed for date: %s' % dateobj)
            return []

        newdls = []
        for link in parsed.findAll('a'):
            href = link.get('href')
            title = utils.get_tag_contents(link)
            if not (href and title):
                self.logger.warning(u'Could not process %s' % link)
                continue

            filename = href.split('/')[-1]
            url = urllib.basejoin(dateurl, href)
            self.logger.info(u'link: %s title: %s' % (href, title))

            relurl = os.path.join(relpath, filename)
            filepath = os.path.join(self.rawdir, relurl)
            metapath = os.path.join(self.metadir, relurl)

            if not os.path.exists(filepath):
                judgment = self.download_url(url)
                if judgment:
                    utils.save_file(filepath, judgment)
                    self.logger.info(u'Saved %s' % url)
                    newdls.append(relurl)
                else:
                    self.logger.warning(u'No webpage %s' % url)

            # Metainfo is (re)written whenever the raw file exists and the
            # tag file is missing or updateMeta is set.
            if os.path.exists(filepath) and \
                    (self.updateMeta or not os.path.exists(metapath)):
                metainfo = self.get_meta_info(title, dateobj)
                if metainfo:
                    utils.print_tag_file(metapath, metainfo)

        return newdls
예제 #10
0
    def download_order(self, relpath, dateobj, metainfo, onclick):
        """Parse the onclick handler of an order link, download the order
        document and write its metainfo tag file.

        Returns the relative URL of the saved file, or None when the
        onclick parameters/filename cannot be extracted or the download
        fails.
        """
        # Raw string: the pattern is full of \s/\d/\w escapes that must
        # reach the regex engine rather than be treated as string escapes.
        reobj = re.search(r'myfunViewDownLoad\s*\(\s*"(?P<ccin>\d+)"\s*,\s*"(?P<orderno>\d+)"\s*,\s*"(?P<flag>\w+)"\s*,\s*"(?P<casedetail>.+)"\s*,\s*"\w+"', onclick)
        if not reobj:
            self.logger.warning(u'Could not get parameters in onclick: %s' % onclick)
            return None

        groupdict  = reobj.groupdict()
        ccin       = groupdict['ccin']
        orderno    = groupdict['orderno']
        flag       = groupdict['flag']
        casedetail = groupdict['casedetail']

        metainfo['caseno'] = casedetail
        filename = self.get_filename(casedetail)

        if not filename:
            self.logger.warning(u'Could not get filename from %s' % casedetail)
            return None

        # str() rather than calling __str__() directly.
        datestr = str(dateobj)

        # Ensure the per-date raw and meta directories exist.
        utils.mk_dir(os.path.join(self.rawdir, self.name, datestr))
        utils.mk_dir(os.path.join(self.metadir, self.name, datestr))

        relurl   = os.path.join(relpath, datestr, filename)
        filepath = os.path.join(self.rawdir, relurl)
        metapath = os.path.join(self.metadir, relurl)

        if os.path.exists(filepath):
            self.logger.warning(u'Raw file already exists, skipping: %s ' % relurl)
        else:
            # Query the server expects, e.g.:
            # ccin_no=001016200801769&order_no=2&flag=v&casedetail=MISC.CIVIL+APPLICATION%2F1769%2F2008&download_token_value_id=1367853726545
            self.logger.info(u'Downloading %s' % relurl)
            postdata = [('ccin_no', ccin), ('order_no', orderno),
                        ('flag', flag), ('casedetail', casedetail),
                        ('download_token_value_id', int(time.time()))]

            webpage = self.download_url(self.orderurl,
                                        referer=self.caseurl,
                                        loadcookies=self.cookiefile.name,
                                        postdata=postdata)

            if webpage:
                self.logger.info(u'Saving %s' % filepath)
                utils.save_file(filepath, webpage)
            else:
                self.logger.warning(u'Could not download ccin: %s number: %s ' % (ccin, orderno))

        # Write metainfo whenever the raw file exists and the tag file is
        # missing or updateMeta is set.
        if os.path.exists(filepath) and metainfo and \
                (self.updateMeta or not os.path.exists(metapath)):
            self.logger.info(u'Metainfo: %s' % metainfo)
            utils.print_tag_file(metapath, metainfo)

        if os.path.exists(filepath):
            return relurl

        return None
예제 #11
0
파일: lobis.py 프로젝트: edudemy/judis-re
    def handle_judgment_link(self, relpath, tr, dateobj, href, title):
        """Download the judgment at *href*, write its metainfo tag file and
        return the relative path, or None when the raw file could not be
        saved."""
        # '/' in the title would create spurious subdirectories.
        relname = re.sub('/', '-', title)
        tmprel = os.path.join(relpath, relname)
        rawfile = os.path.join(self.rawdir, tmprel)

        if not os.path.exists(rawfile):
            self.get_judgment(href, rawfile)

        if not os.path.exists(rawfile):
            return None

        metainfo = self.parse_meta_info(tr, dateobj)
        metapath = os.path.join(self.metadir, tmprel)
        if metainfo and (self.updateMeta or not os.path.exists(metapath)):
            utils.print_tag_file(metapath, metainfo)
        return tmprel
예제 #12
0
    def handle_judgment_link(self, relpath, tr, dateobj, href, title):
        """Fetch the judgment behind *href*, write metainfo for it, and
        return its relative path (None when the raw file is missing)."""
        # Sanitise the title so it is usable as a single path component.
        safe_title = re.sub('/', '-', title)
        tmprel = os.path.join(relpath, safe_title)
        rawfile = os.path.join(self.rawdir, tmprel)

        if not os.path.exists(rawfile):
            self.get_judgment(href, rawfile)

        if not os.path.exists(rawfile):
            return None

        metapath = os.path.join(self.metadir, tmprel)
        metainfo = self.parse_meta_info(tr, dateobj)
        if metainfo and (self.updateMeta or not os.path.exists(metapath)):
            utils.print_tag_file(metapath, metainfo)
        return tmprel
예제 #13
0
    def get_judgment(self, relpath, url, filename, metainfo):
        """Download the judgment at *url* into rawdir/<relpath>/<filename>,
        record the source url in the metainfo tag file, and return the
        relative URL (None when the download fails)."""
        relurl = os.path.join(relpath, filename)
        rawfile = os.path.join(self.rawdir, relurl)
        metafile = os.path.join(self.metadir, relurl)

        if not os.path.exists(rawfile):
            page = self.download_url(url, loadcookies=self.cookiefile.name)
            if not page:
                self.logger.warning(u"Could not download judgment %s" % url)
                return None
            utils.save_file(rawfile, page)
            self.logger.info(u"Saved %s" % rawfile)

        if not os.path.exists(rawfile):
            return None

        if self.updateMeta or not os.path.exists(metafile):
            # Keep the source url with the rest of the metadata.
            metainfo["url"] = url
            utils.print_tag_file(metafile, metainfo)
        return relurl
예제 #14
0
    def get_judgment(self, relpath, url, filename, metainfo):
        """Download the judgment at *url*, store the raw page plus a
        metainfo tag file that records the source url, and return the
        relative URL; None when the download fails."""
        relurl = os.path.join(relpath, filename)
        rawfile = os.path.join(self.rawdir, relurl)
        metafile = os.path.join(self.metadir, relurl)

        if not os.path.exists(rawfile):
            page = self.download_url(url, loadcookies=self.cookiefile.name)
            if not page:
                self.logger.warning(u'Could not download judgment %s' % url)
                return None
            utils.save_file(rawfile, page)
            self.logger.info(u'Saved %s' % rawfile)

        if not os.path.exists(rawfile):
            return None

        if self.updateMeta or not os.path.exists(metafile):
            # Keep the source url with the rest of the metadata.
            metainfo['url'] = url
            utils.print_tag_file(metafile, metainfo)
        return relurl
예제 #15
0
    def download_orders_from_page(self, relpath, dateobj, webpage):
        """Download every order linked from *webpage* (a result page for
        *dateobj*), following 'Next' pagination recursively.

        Returns the list of relative URLs for orders whose raw file exists
        after processing.
        """
        newdls = []

        d = utils.parse_webpage(webpage)
        if not d:
            self.logger.error(u'HTML parsing failed for date: %s' % dateobj)
            return []

        for tr in d.findAll('tr'):
            # Find the "View Order" link in this row, if any.
            # Raw strings keep the \s regex escapes intact.
            href = None
            for link in tr.findAll('a'):
                title = utils.get_tag_contents(link)
                if re.search(r'view\s+order', title, re.IGNORECASE):
                    href = link.get('href')
                    break

            if not href:
                self.logger.warning(u'Could not process %s' % tr)
                continue

            filename = href.split('/')[-1]
            url = urllib.basejoin(self.dateurl, href)
            self.logger.info(u'link: %s' % href)

            relurl = os.path.join(relpath, filename)
            filepath = os.path.join(self.rawdir, relurl)
            metapath = os.path.join(self.metadir, relurl)

            if not os.path.exists(filepath):
                # Distinct name so the incoming `webpage` argument is not
                # clobbered inside the loop.
                orderdoc = self.download_url(url)
                if not orderdoc:
                    self.logger.warning(u'No webpage %s' % url)
                else:
                    utils.save_file(filepath, orderdoc)
                    self.logger.info(u'Saved %s' % url)

            if os.path.exists(filepath):
                newdls.append(relurl)
                if self.updateMeta or not os.path.exists(metapath):
                    metainfo = self.get_meta_info(tr, dateobj)
                    self.logger.info(u'relurl: %s metainfo: %s' %
                                     (relurl, metainfo))
                    if metainfo:
                        utils.print_tag_file(metapath, metainfo)

        # Follow the 'Next' link, if present, and accumulate its results.
        for link in d.findAll('a'):
            text = utils.get_tag_contents(link)
            href = link.get('href')
            if href and text and re.match(r'\s*next\s*$', text, re.IGNORECASE):
                url = urllib.basejoin(self.dateurl, href)
                nextpage = self.download_url(url)
                if nextpage:
                    self.logger.info(u'Recursing to the nextpage: %s' % url)
                    nextPageDls = self.download_orders_from_page(
                        relpath, dateobj, nextpage)
                    newdls.extend(nextPageDls)
                else:
                    self.logger.warning(
                        u'Could not download the next webpage: %s' % url)
        return newdls
예제 #16
0
파일: gujarat.py 프로젝트: edudemy/judis-re
    def download_oneday(self, relpath, dateobj):
        """Download all Gujarat HC orders listed for *dateobj*.

        Returns the list of relative URLs for newly downloaded orders.
        """
        newdls  = []

        pageurl = urllib.basejoin(self.baseurl, '/gujarathc/')

        datestr = utils.dateobj_to_str(dateobj, '-')
        dateurl = pageurl + 'orderdatewisedata.jsp?fdate=%s&tdate=%s' % \
                                (datestr, datestr)

        webpage = self.download_url(dateurl, referer=self.baseurl,
                                    loadcookies=self.cookiefile.name)

        if not webpage:
            self.logger.warning(u'No webpage for %s' % dateurl)
            return newdls

        # Neutralise window.open() calls so the parser sees plain links.
        # Raw string keeps the \( escape intact.
        webpage = re.sub(r'(?P<windowopen>window.open\([^)]+\))',
                         self.sanitize_windowopen, webpage)

        d = utils.parse_webpage(webpage)

        if not d:
            self.logger.error(u'Could not parse html of the result page for date %s' % dateobj)
            return newdls

        trs = d.findAll('tr')
        for tr in trs:
            link = tr.find('a')
            if not link:
                self.logger.info(u'No link in %s' % tr)
                continue

            href = link.get('onclick')
            if not href:
                self.logger.info(u'No href in %s' % tr)
                continue

            reobj = re.search(r"showoj.jsp?[^'\s]+", href)
            # BUG FIX: the original called reobj.span() unconditionally and
            # crashed with AttributeError whenever the pattern did not match.
            if not reobj:
                self.logger.info(u'No showoj link in %s' % href)
                continue

            (start, end) = reobj.span()

            pagerelurl = href[start:end]
            url = urllib.basejoin(pageurl, pagerelurl)

            filename = utils.url_to_filename(url, False,
                                             ['caseyr', 'caseno', 'casetype'])

            if not filename:
                self.logger.error(u'Could not get filename for %s' % url)
                continue
            relurl   = os.path.join(relpath, filename)
            filepath = os.path.join(self.rawdir, relurl)
            metapath = os.path.join(self.metadir, relurl)

            if not os.path.exists(filepath):
                self.logger.info(u'Downloading %s %s' % (url, filename))
                j = self.download_url(url, loadcookies=self.cookiefile.name)

                if not j:
                    self.logger.warning(u'No webpage: %s' % url)
                else:
                    self.logger.info(u'Saving %s' % filepath)
                    utils.save_file(filepath, j)
                    newdls.append(relurl)

            # Write metainfo whenever the raw file exists and the tag file
            # is missing or updateMeta is set.
            if os.path.exists(filepath) and \
                    (self.updateMeta or not os.path.exists(metapath)):
                metainfo = self.get_meta_info(link, tr, dateobj)
                if metainfo:
                    utils.print_tag_file(metapath, metainfo)

        return newdls
예제 #17
0
 def save_meta_tags(self, metapath, judgedict, dateobj):
     """Write *judgedict* (minus the href key) plus the XML-formatted date
     to the tag file at *metapath*."""
     tagdict = {}
     tagdict['date'] = utils.date_to_xml(dateobj)
     for key, value in judgedict.items():
         if key != self.HREF:
             tagdict[key] = value
     utils.print_tag_file(metapath, tagdict)
예제 #18
0
파일: itat.py 프로젝트: edudemy/judis-re
 def save_meta_tags(self, metapath, judgedict, dateobj):
     """Copy every tag from *judgedict* except the href entry, add the
     XML-formatted date, and write the result to *metapath*."""
     tagdict = {}
     tagdict['date'] = utils.date_to_xml(dateobj)
     for key, value in judgedict.items():
         if key != self.HREF:
             tagdict[key] = value
     utils.print_tag_file(metapath, tagdict)
예제 #19
0
 def save_meta_tags(self, metapath, debatedict, dateobj):
     """Copy every tag from *debatedict* except 'href', add the
     XML-formatted date, and write the result to *metapath*."""
     tagdict = {}
     tagdict['date'] = utils.date_to_xml(dateobj)
     for key, value in debatedict.items():
         if key != 'href':
             tagdict[key] = value
     utils.print_tag_file(metapath, tagdict)
예제 #20
0
    def download_orders_from_page(self, relpath, dateobj, webpage):
        """Download every order linked from *webpage* (a result page for
        *dateobj*), following 'Next' pagination recursively.

        Returns the list of relative URLs for orders whose raw file exists
        after processing.
        """
        newdls = []

        d = utils.parse_webpage(webpage)
        if not d:
            self.logger.error(u'HTML parsing failed for date: %s' %  dateobj)
            return []

        for tr in d.findAll('tr'):
            # Find the "View Order" link in this row, if any.
            # Raw strings keep the \s regex escapes intact.
            href = None
            for link in tr.findAll('a'):
                title = utils.get_tag_contents(link)
                if re.search(r'view\s+order', title, re.IGNORECASE):
                    href = link.get('href')
                    break

            if not href:
                self.logger.warning(u'Could not process %s' % tr)
                continue

            filename = href.split('/')[-1]
            url = urllib.basejoin(self.dateurl, href)
            self.logger.info(u'link: %s' % href)

            relurl = os.path.join(relpath, filename)
            filepath = os.path.join(self.rawdir, relurl)
            metapath = os.path.join(self.metadir, relurl)

            if not os.path.exists(filepath):
                # Distinct name so the incoming `webpage` argument is not
                # clobbered inside the loop.
                orderdoc = self.download_url(url)
                if not orderdoc:
                    self.logger.warning(u'No webpage %s' % url)
                else:
                    utils.save_file(filepath, orderdoc)
                    self.logger.info(u'Saved %s' % url)

            if os.path.exists(filepath):
                newdls.append(relurl)
                if self.updateMeta or not os.path.exists(metapath):
                    metainfo = self.get_meta_info(tr, dateobj)
                    self.logger.info(u'relurl: %s metainfo: %s' % (relurl, metainfo))
                    if metainfo:
                        utils.print_tag_file(metapath, metainfo)

        # Follow the 'Next' link, if present, and accumulate its results.
        for link in d.findAll('a'):
            text = utils.get_tag_contents(link)
            href = link.get('href')
            if href and text and re.match(r'\s*next\s*$', text, re.IGNORECASE):
                url = urllib.basejoin(self.dateurl, href)
                nextpage = self.download_url(url)
                if nextpage:
                    self.logger.info(u'Recursing to the nextpage: %s' % url)
                    nextPageDls = self.download_orders_from_page(relpath, dateobj, nextpage)
                    newdls.extend(nextPageDls)
                else:
                    self.logger.warning(u'Could not download the next webpage: %s' % url)
        return newdls
예제 #21
0
파일: bombay.py 프로젝트: hargup/judis-re
 def store_meta_tags(self, metapath, metainfo):
     # Thin wrapper: persist the metainfo tag dict to the file at metapath
     # via the shared utils helper.
     utils.print_tag_file(metapath, metainfo)