Example #1
    def parse_search_results(self, webpage, dateobj, curr_page):
        metainfos = []
        nextpage  = None

        d = utils.parse_webpage(webpage, self.parser)
        if not d:
            self.logger.warn('Unable to parse search result page for %s', dateobj)
            return metainfos, nextpage

        tables = d.find_all('table', {'id': self.result_table})

        if len(tables) != 1:
            self.logger.warn('Could not find the result table for %s', dateobj)
            return metainfos, nextpage
        
        order = None
        for tr in tables[0].find_all('tr'):
            if not order:
                order = self.get_column_order(tr)
                continue

            if nextpage == None:
                nextpage = self.find_next_page(tr, curr_page)
                if nextpage != None:
                    continue

            if tr.find('input') == None and tr.find('a') == None:
                continue

            self.process_result_row(tr, metainfos, dateobj, order)

        return metainfos, nextpage
Example #2
    def download_extraordinary(self, dls, relpath, dateobj):
        ex_url = urllib.basejoin(self.baseurl,
                                 self.extraordinary_url % dateobj.year)

        response = self.download_url(ex_url)
        if not response or not response.webpage:
            self.logger.warn(
                'Unable to download Extraordinary gazette for year %d',
                dateobj.year)
            return

        d = utils.parse_webpage(response.webpage, self.parser)
        if not d:
            self.logger.warn(
                'Unable to parse Extraordinary gazette list for year %d',
                dateobj.year)
            return

        if dateobj.year == 2010:
            minfos = self.parse_listing_webpage(ex_url, d, dateobj, None,
                                                'Extraordinary')
        else:
            minfos = self.parse_extraordinary_webpage(d, dateobj, ex_url)

        self.download_metainfos(minfos, dls, relpath)
Example #3
    def download_info_page(self, url):
        webpage = self.download_url(url)
        d = utils.parse_webpage(webpage)
        if not d:
            self.logger.error(u'Could not parse the date search page')
            return [], None

        links = d.findAll('a')
        infolist = []
        previousurl = None
        for link in links:
            href = link.get('href')

            if previousurl == None and href:
                anchortext = utils.get_tag_contents(link)
                if anchortext and re.search('Previous >>', anchortext):
                    previousurl = urllib.basejoin(url, href)

            if href:
                if re.match('judgements', href):
                    node = link
                    while node.name != 'tr':
                        node = node.parent
                    if node:
                        metainfo = self.get_meta_info(node)
                        metainfo['href'] = href
                        infolist.append(metainfo)
                        self.logger.debug('metainfo: %s' % metainfo)

        return infolist, previousurl
Example #4
    def process_result_page(self, relpath, dateobj, webpage):
        newdls = []
        d = utils.parse_webpage(webpage)
        if not d:
            self.logger.info(u'Could not parse result page for date %s' %
                             dateobj)
            return newdls

        trs = d.findAll('tr')
        for tr in trs:
            pagetype = self.page_type(tr)
            if pagetype == 'nextlink':
                nextlink = self.next_link(tr.findAll('a'))
                if nextlink:
                    self.logger.info(u'Going to the next page: %s' \
                                       % utils.get_tag_contents(nextlink))

                    rels = self.process_next_link(relpath, dateobj, nextlink)
                    newdls.extend(rels)
            elif pagetype == 'judgment':
                rel = self.handle_judgment_link(relpath, dateobj, tr)
                if rel:
                    newdls.append(rel)
            else:
                self.logger.info(u'Not processing %s' % tr)
        return newdls
Example #5
    def parse_html_page(self, htmlPage):
        result = {}
        docs   = []

        d = utils.parse_webpage(htmlPage)

        if d:
            trs = d.findAll('tr')
            for tr in trs:
                if tr.find('table'):
                    continue
                links = tr.findAll('a')
                href = None
                if len(links) > 1:
                    for a in links:
                        h = a.get(self.HREF)
                        if h and re.search('pdf$', h):
                            href = h 
                elif len(links) == 1 and links[0].get(self.HREF):
                    href = links[0].get(self.HREF)
                if href and (not re.search('/itat/upload/blank.htm$', href)):
                    metainfo = self.get_meta_info(tr)
                    if metainfo:
                        metainfo[self.HREF] = href
                        docs.append(metainfo)
        if docs:
            result[self.DOCS] = docs
       
        return result
Example #6
    def process_result_page(self, relpath, dateobj, webpage):
        newdls = []
        d = utils.parse_webpage(webpage)
        if not d:
            self.logger.info(u'Could not parse result page for date %s' % dateobj)
            return newdls

        trs = d.findAll('tr')
        for tr in trs:
            pagetype = self.page_type(tr)
            if pagetype == 'nextlink':
                nextlink =  self.next_link(tr.findAll('a'))
                if nextlink:
                    self.logger.info(u'Going to the next page: %s' \
                                       % utils.get_tag_contents(nextlink))

                    rels = self.process_next_link(relpath, dateobj, nextlink)
                    newdls.extend(rels)
            elif pagetype == 'judgment':
                rel = self.handle_judgment_link(relpath, dateobj, tr)
                if rel:
                    newdls.append(rel)
            else:
                self.logger.info(u'Not processing %s' % tr)
        return newdls
Example #7
    def download_oneday(self, relpath, dateobj):
        dls = []
        postdata = self.get_post_data(dateobj)
        response = self.download_url(self.searchurl, postdata=postdata)

        if not response or not response.webpage:
            self.logger.warn('Could not download search result for date %s', \
                              dateobj)
            return dls

        d = utils.parse_webpage(response.webpage, self.parser)
        if not d:
            self.logger.warn('Could not parse search result for date %s', \
                              dateobj)
            return dls

        minfos = self.get_metainfos(d, dateobj)
        for metainfo in minfos:
            if 'docid' not in metainfo:
                self.logger.warn('Ignoring metainfo: %s', metainfo)
                continue

            filename = metainfo.pop('docid')
            relurl = os.path.join(relpath, filename)
            gzurl = self.get_doc_url(filename)
            if self.save_gazette(relurl, gzurl, metainfo):
                dls.append(relurl)

        return dls
Example #8
    def download_oneday(self, relpath, dateobj):
        dls = []
        datestr = utils.dateobj_to_str(dateobj, '-')
        searchurl = self.searchurl % (datestr, datestr)
        response = self.download_url(searchurl)
        if not response or not response.webpage:
            self.logger.warn('Could not download search result for date %s', \
                              dateobj)
            return dls

        d = utils.parse_webpage(response.webpage, self.parser)
        if not d:
            self.logger.warn('Could not parse search result for date %s', \
                              dateobj)
            return dls

        minfos = self.parse_results(d, dateobj)
        for metainfo in minfos:
            if 'download' not in metainfo:
                self.logger.warn('No link. Ignoring metainfo: %s', metainfo)
                continue
            relurl = self.download_gazette(metainfo, searchurl, relpath)
            if relurl:
                dls.append(relurl)
        return dls
Example #9
    def download_info_page(self, url):
        webpage = self.download_url(url)
        d = utils.parse_webpage(webpage)
        if not d:
            self.logger.error(u'Could not parse the date search page')
            return [], None

        links = d.findAll('a')
        infolist = []
        previousurl = None
        for link in links:
            href = link.get('href')

            if previousurl == None and href:
                anchortext = utils.get_tag_contents(link)
                if anchortext and re.search('Previous >>', anchortext):
                    previousurl = urllib.basejoin(url, href)

            if href:
                if re.match('judgements', href):
                    node = link
                    while node.name != 'tr':
                        node = node.parent
                    if node:
                        metainfo = self.get_meta_info(node)
                        metainfo['href'] = href
                        infolist.append(metainfo)
                        self.logger.debug('metainfo: %s' % metainfo)

        return infolist, previousurl
Example #10
    def result_page(self, relpath, url, dateobj, linkdict):
        newdls = []
        webpage = self.download_url(url, loadcookies = self.cookiefile.name)

        d = utils.parse_webpage(webpage)

        if not d:
            self.logger.error(u'Could not parse html of the result page for date %s' % dateobj)
            return newdls

        for link in d.findAll('a'):
            href = link.get('href')
            title = utils.get_tag_contents(link)

            if (not href) or (not title) or linkdict.has_key(href):
                self.logger.warning(u'Could not process %s' % link)
                continue

            linkdict[href] = 1

            action = self.action_on_link(href, title)
            self.logger.info(u'Action %s on link %s title %s' %\
                                     (action, href, title))       
            newurl = urllib.basejoin(url, href)
            if action == 'judgmentlink':
                relurl = self.process_judgment_page(relpath, newurl, dateobj)
                if relurl:
                    newdls.append(relurl)
                else:
                    self.logger.warning(u'Judgment link not working %s' % newurl)
            elif action == 'recurse':
                newdls.extend(self.result_page(relpath, newurl, dateobj, 
                                               linkdict))
           
        return newdls
Example #11
    def parse_html_page(self, htmlPage):
        result = {}
        docs   = []

        d = utils.parse_webpage(htmlPage)

        if d:
            trs = d.findAll('tr')
            for tr in trs:
                if tr.find('table'):
                    continue
                links = tr.findAll('a')
                href = None
                if len(links) > 1:
                    for a in links:
                        h = a.get(self.HREF)
                        if h and re.search('pdf$', h):
                            href = h 
                elif len(links) == 1 and links[0].get(self.HREF):
                    href = links[0].get(self.HREF)
                if href and (not re.search('/itat/upload/blank.htm$', href)):
                    metainfo = self.get_meta_info(tr)
                    if metainfo:
                        metainfo[self.HREF] = href
                        docs.append(metainfo)
        if docs:
            result[self.DOCS] = docs
       
        return result
Example #12
    def handle_result_page(self, resultpage, relpath, dateobj):
        dls = []
        d = utils.parse_webpage(resultpage)
        if not d:
            self.logger.error(u'Could not parse result page %s' % dateobj)
            return dls

        # download judgments
        trs = d.findAll('tr')
        for tr in trs:
            links = tr.findAll('a')
            if len(links) == 1:
                relurl = self.dl_judgment(relpath, tr, links[0], dateobj)
                if relurl:
                    dls.append(relurl)
            else:
                self.logger.warning(u'No action for %s' % tr)

        # next page
        links = d.findAll('a')
        for link in links:
            href = link.get('href')
            t = utils.get_tag_contents(link)
            if href and t == 'Next':
                nexturl = urllib.basejoin(self.resulturl, href)  
                resultpage = self.download_url(nexturl, \
                                            loadcookies = self.cookiefile.name)
                if resultpage:
                    self.logger.info(u'Recursing  to %s' % nexturl)
                    dls.extend(self.handle_result_page(resultpage, relpath, \
                                                       dateobj))
        return dls  
Example #13
    def download_oneday(self, relpath, dateobj):
        dls = []

        webpage = self.download_url(self.dateurl, savecookies = \
                                                      self.cookiefile.name)
        d = utils.parse_webpage(webpage)
        if not d:
            self.logger.error(u'Could not parse date search page')
            return dls

        forms = d.findAll('form')
        action = None
        for form in forms:
            if form.get('name') == 'WebJudgmentDateSearchForm':
                action = form.get('action')
                break

        if not action:
            self.logger.error(u'Could not find date search form')
            return dls

        posturl = urllib.basejoin(self.baseurl, action)
        postdata = self.post_data(d, dateobj)

        webpage = self.download_url(posturl, postdata = postdata, \
                                    loadcookies = self.cookiefile.name)

        dls = self.dl_result_page(relpath, posturl, webpage, dateobj, {})

        return dls
Example #14
    def datequery_result(self, dateqryUrl, webpage, relpath, \
                         dateobj, currentpage):
        newdls = []

        d = utils.parse_webpage(webpage)

        if not d:
            self.logger.error(u'Could not parse html of the result page for date %s' % dateobj)
            return newdls

        stateval  = self.extract_state(d)
        if stateval != None and stateval != self.stateval:
            self.stateval = stateval
            self.logger.info(u'stateval changed')

        docs = self.extract_docs(d)
  
        for doc in docs:
            if doc.has_key('href'):
                href = doc['href']
                reobj = re.search('\d+$', href)
                if reobj:
                    filename = href[reobj.start():reobj.end()]
                    dlurl    = urllib.basejoin(dateqryUrl, href)
                    rel      = os.path.join (relpath, filename)
                    success  = self.download_debate(rel, dlurl, doc, dateobj)
                    if success:
                        newdls.append(rel)

        nextpage = self.get_next_page(d, dateqryUrl, currentpage)
        if nextpage:
            newdls.extend(self.datequery_result(dateqryUrl, nextpage, relpath, \
                                                dateobj, currentpage + 1))

        return newdls          
Example #15
    def result_page(self, relpath, url, dateobj, linkdict):
        newdls = []
        webpage = self.download_url(url, loadcookies = self.cookiefile.name)

        d = utils.parse_webpage(webpage)

        if not d:
            self.logger.error(u'Could not parse html of the result page for date %s' % dateobj)
            return newdls

        for link in d.findAll('a'):
            href = link.get('href')
            title = utils.get_tag_contents(link)

            if (not href) or (not title) or linkdict.has_key(href):
                self.logger.warning(u'Could not process %s' % link)
                continue

            linkdict[href] = 1

            action = self.action_on_link(href, title)
            self.logger.info(u'Action %s on link %s title %s' %\
                                     (action, href, title))       
            newurl = urllib.basejoin(url, href)
            if action == 'judgmentlink':
                relurl = self.process_judgment_page(relpath, newurl, dateobj)
                if relurl:
                    newdls.append(relurl)
                else:
                    self.logger.warning(u'Judgment link not working %s' % newurl)
            elif action == 'recurse':
                newdls.extend(self.result_page(relpath, newurl, dateobj, 
                                               linkdict))
           
        return newdls
Example #16
    def download_info_page(self, url):
        dls = []
        nextPage = None
        webpage = self.download_url(url)
        if webpage:
            d = utils.parse_webpage(webpage)
            if not d:
                self.logger.error(u'Could not parse the date search page')
                return [], None
            nextPage = self.get_next_page(d, url)
            maxtr = -1
            mainTable = None
            tables = d.findAll('table')
            for table in tables:
                numtrs = len(table.findAll('tr'))
                if numtrs > maxtr:
                    mainTable = table
                    maxtr = numtrs

            if mainTable:
                trs = mainTable.findAll('tr')
                for tr in trs:
                    metainfo = self.get_meta_info(tr)
                    if metainfo and metainfo.has_key(self.DATE):
                        self.logger.debug(u'metainfo: %s' % metainfo)
                        dls.append(metainfo)
            dls.sort(cmp = lambda x, y: cmp(x[self.DATE], y[self.DATE]), \
                     reverse= True)
        return dls, nextPage
Example #17
    def download_orders(self, relpath, ccin, dateobj, webpage):
        parsedDoc =  utils.parse_webpage(webpage)
        if not parsedDoc:
            self.logger.warning(u'Could not parse judgments list for doc: ccin %s date: %s' % (ccin, dateobj))
            return []
 
        trs = parsedDoc.findAll('tr')
        fieldOrder = self.get_order_of_fields(parsedDoc)

        newdls = [] 
        if 'view' in fieldOrder and 'caseno' in fieldOrder \
                and 'date' in fieldOrder:
            for tr in trs:
                if tr.find('th'):
                    continue

                relurl = self.process_order_tr(ccin, relpath, dateobj, \
                                               tr, fieldOrder)
                if relurl:
                    newdls.append(relurl)
                else:
                    self.logger.warning(u'Could not get judgment in tr: %s' % tr)
        else:
            self.logger.warning(u'Could not get field ordering in ccin %s date: %s' % (ccin, dateobj))
        return newdls
Example #18
    def download_info_page(self, url):
        webpage = self.download_url(url)
        dls = [] 
        if webpage:
            d = utils.parse_webpage(webpage)
            if not d:
                self.logger.error(u'Could not parse the date search page')
                return [], None

            maxtr = -1
            mainTable = None
            tables = d.findAll('table')
            for table in tables:
                numtrs = len(table.findAll('tr'))
                if numtrs > maxtr:
                    mainTable = table
                    maxtr = numtrs

            if mainTable:
                trs = mainTable.findAll('tr')
                for tr in trs:
                    metainfos = self.get_meta_info(tr, url)
                    if metainfos:
                        dls.extend(metainfos)
        return dls, None
Example #19
    def download_oneday(self, relpath, dateobj):
        dls = []

        webpage = self.download_url(self.dateurl, savecookies = \
                                                      self.cookiefile.name)
        d = utils.parse_webpage(webpage)
        if not d:
            self.logger.error(u'Could not parse date search page')
            return dls 

        forms = d.findAll('form')
        action = None
        for form in forms:
            if form.get('name') == 'WebJudgmentDateSearchForm':
                action = form.get('action')
                break

        if not action:
            self.logger.error(u'Could not find date search form')
            return dls

        posturl  = urllib.basejoin(self.baseurl, action)
        postdata = self.post_data(d, dateobj)         

        webpage = self.download_url(posturl, postdata = postdata, \
                                    loadcookies = self.cookiefile.name)

        dls = self.dl_result_page(relpath, posturl, webpage, dateobj, {})

        return dls
Example #20
    def process_judgment_page(self, relpath, url, dateobj):
        webpage = self.download_url(url, loadcookies = self.cookiefile.name)
        if not webpage:
            self.logger.warning(u'Could not download %s' % url)
            return None

        d = utils.parse_webpage(webpage)
        if not d:
            self.logger.warning(u'Could not parse %s' % url)
            return None

        metainfo = self.get_meta_info(d, dateobj)

        for link in d.findAll('a'):
            href = link.get('href')
            title = utils.get_tag_contents(link)

            if (not href) or (not title):
                self.logger.warning(u'Could not process %s' % link)
                continue

            action = self.action_on_link(href, title)
            newurl = urllib.basejoin(url, href)
            if action == 'save':
                self.logger.info(u'Downloading %s' % newurl)
                return self.get_judgment(relpath, newurl, title, metainfo)

        return None
Example #21
    def download_info_page(self, url):
        dls      = []
        nextPage = None
        webpage  = self.download_url(url)
        if webpage:
            d = utils.parse_webpage(webpage)
            if not d:
                self.logger.error(u'Could not parse the date search page')
                return [], None
            nextPage = self.get_next_page(d, url)
            maxtr = -1
            mainTable = None
            tables = d.findAll('table')
            for table in tables:
                numtrs = len(table.findAll('tr'))
                if numtrs > maxtr:
                    mainTable = table
                    maxtr = numtrs

            if mainTable:
                trs = mainTable.findAll('tr')
                for tr in trs:
                    metainfo = self.get_meta_info(tr)
                    if metainfo and metainfo.has_key(self.DATE):
                        self.logger.debug(u'metainfo: %s' % metainfo)
                        dls.append(metainfo)
            dls.sort(cmp = lambda x, y: cmp(x[self.DATE], y[self.DATE]), \
                     reverse= True)
        return dls, nextPage 
Example #22
    def download_info_page(self, url):
        webpage = self.download_url(url)
        dls = []
        if webpage:
            d = utils.parse_webpage(webpage)
            if not d:
                self.logger.error(u'Could not parse the date search page')
                return [], None

            maxtr = -1
            mainTable = None
            tables = d.findAll('table')
            for table in tables:
                numtrs = len(table.findAll('tr'))
                if numtrs > maxtr:
                    mainTable = table
                    maxtr = numtrs

            if mainTable:
                trs = mainTable.findAll('tr')
                for tr in trs:
                    metainfos = self.get_meta_info(tr, url)
                    if metainfos:
                        dls.extend(metainfos)
        return dls, None
Example #23
    def process_judgment_page(self, relpath, url, dateobj):
        webpage = self.download_url(url, loadcookies = self.cookiefile.name)
        if not webpage:
            self.logger.warning(u'Could not download %s' % url)
            return None

        d = utils.parse_webpage(webpage)
        if not d:
            self.logger.warning(u'Could not parse %s' % url)
            return None

        metainfo = self.get_meta_info(d, dateobj)

        for link in d.findAll('a'):
            href = link.get('href')
            title = utils.get_tag_contents(link)

            if (not href) or (not title):
                self.logger.warning(u'Could not process %s' % link)
                continue

            action = self.action_on_link(href, title)
            newurl = urllib.basejoin(url, href)
            if action == 'save':
                self.logger.info(u'Downloading %s' % newurl)
                return self.get_judgment(relpath, newurl, title, metainfo)

        return None
Example #24
    def download_oneday(self, relpath, dateobj):
        dateurl = urllib.basejoin(self.baseurl, '/hcjudge/date_output.php')
        postdata = [('d1', dateobj.day), ('m1', dateobj.month),  \
                    ('y1', dateobj.year), ('d2', dateobj.day),   \
                    ('m2', dateobj.month), ('y2', dateobj.year), \
                    ('button', 'Submit')]

        webpage = self.download_url(dateurl, postdata = postdata)

        if not webpage:
            self.logger.warning(u'No webpage for %s date: %s' % \
                                 (dateurl, dateobj))
            return []

        d = utils.parse_webpage(webpage)

        if not d:
            self.logger.error(u'HTML parsing failed for date: %s' %  dateobj)
            return []

        newdls = []

        for link in d.findAll('a'):
            href = link.get('href')
            title = utils.get_tag_contents(link)

            if (not href) or (not title):
                self.logger.warning(u'Could not process %s' % link)
                continue

            words = href.split('/')
            filename = words[-1]

            url = urllib.basejoin(dateurl, href)

            self.logger.info(u'link: %s title: %s' % (href, title))

            relurl = os.path.join (relpath, filename)
            filepath = os.path.join(self.rawdir, relurl)
            metapath = os.path.join(self.metadir, relurl)

            if not os.path.exists(filepath):
                webpage = self.download_url(url)

                if not webpage:
                    self.logger.warning(u'No webpage %s' % url)
                else:
                    utils.save_file(filepath, webpage)
                    self.logger.info(u'Saved %s' % url)
                    newdls.append(relurl)

            if os.path.exists(filepath) and \
                    (self.updateMeta or not os.path.exists(metapath)):
                metainfo = self.get_meta_info(title, dateobj)
                if metainfo:
                    utils.print_tag_file(metapath, metainfo)

        return newdls     
Example #25
    def get_stateval(self, url):
        webpage = self.download_url(url, \
                                    savecookies = self.cookiefile.name)

        d = utils.parse_webpage(webpage)
        if d:
            return self.extract_state(d)
        else:
            return None
Example #26
    def datequery_result(self, webpage, relpath, pagenum, dateobj):
        downloaded = []

        d = utils.parse_webpage(webpage)

        if not d:
            self.logger.error(u'Could not parse html of the result page for date %s' % dateobj)
            return downloaded 

        stateval  = self.extract_state(d)
        if stateval != None and stateval != self.stateval:
            self.stateval = stateval
            self.logger.info(u'stateval changed')

        linkdict = self.extract_links(d, pagenum)

        for link in linkdict['docs']:
            if (not link.has_key('title')) or (not link.has_key('href')):
                continue

            self.logger.info(u'Processing link: %s href: %s' % \
                              (link['title'], link['href']))

            filename = re.sub('/', '|', link['title'])
            filename = re.sub("'", ' ', filename)
            tmprel   = os.path.join (relpath, filename)
            rawpath  = os.path.join (self.rawdir, tmprel)
            metapath = os.path.join (self.metadir, tmprel)

            if not os.path.exists(rawpath):
                webpage = self.download_judgment(link)
                if webpage:
                    utils.save_file(rawpath, webpage)
                else:
                    self.logger.warning(u'Could not download %s' % \
                                         link['title'])

            if os.path.exists(rawpath) and not os.path.isdir(rawpath):
                if not os.path.exists(metapath) or self.updateMeta:
                    self.save_meta_tags(metapath, link, dateobj)
                downloaded.append(tmprel)
                  
        if linkdict.has_key('next'):
            link = linkdict['next']
            
            self.logger.info(u'Following page: %s href: %s' % \
                             (link['title'], link['href']))

            webpage = self.download_link(link)
            if webpage:
                nextdownloads = self.datequery_result(webpage, relpath, \
                                                      pagenum + 1, dateobj)
                downloaded.extend(nextdownloads)
            else:
                self.logger.warning(u'Could not download %s' % link['title'])

        return downloaded
Example #27
    def datequery_result(self, webpage, relpath, pagenum, dateobj):
        downloaded = []

        d = utils.parse_webpage(webpage)

        if not d:
            self.logger.error(u'Could not parse html of the result page for date %s' % dateobj)
            return downloaded 

        stateval  = self.extract_state(d)
        if stateval != None and stateval != self.stateval:
            self.stateval = stateval
            self.logger.info(u'stateval changed')

        linkdict = self.extract_links(d, pagenum)

        for link in linkdict['docs']:
            if (not link.has_key('title')) or (not link.has_key('href')):
                continue

            self.logger.info(u'Processing link: %s href: %s' % \
                              (link['title'], link['href']))

            filename = re.sub('/', '|', link['title'])
            filename = re.sub("'", ' ', filename)
            tmprel   = os.path.join (relpath, filename)
            rawpath  = os.path.join (self.rawdir, tmprel)
            metapath = os.path.join (self.metadir, tmprel)

            if not os.path.exists(rawpath):
                webpage = self.download_link(link)
                if webpage:
                    utils.save_file(rawpath, webpage)
                else:
                    self.logger.warning(u'Could not download %s' % \
                                         link['title'])

            if os.path.exists(rawpath) and not os.path.isdir(rawpath):
                if not os.path.exists(metapath) or self.updateMeta:
                    self.save_meta_tags(metapath, link, dateobj)
                downloaded.append(tmprel)
                  
        if linkdict.has_key('next'):
            link = linkdict['next']
            
            self.logger.info(u'Following page: %s href: %s' % \
                             (link['title'], link['href']))

            webpage = self.download_link(link)
            if webpage:
                nextdownloads = self.datequery_result(webpage, relpath, \
                                                      pagenum + 1, dateobj)
                downloaded.extend(nextdownloads)
            else:
                self.logger.warning(u'Could not download %s' % link['title'])

        return downloaded
Example #28
    def result_url(self, url, relpath, pagelist, dateobj):
        newdls = []
        webpage = self.download_url(url, loadcookies=self.cookiefile.name)
        if not webpage:
            self.logger.warning(u'Could not download %s' % url)
            return newdls

        webpage = re.sub('""', '"', webpage)
        startobj = re.search('<table', webpage)
        endobj = re.search('</table>', webpage)
        if not startobj or not endobj or startobj.start() >= endobj.end():
            self.logger.warning(u'No table found')
            return newdls

        d = utils.parse_webpage(webpage[startobj.start():endobj.end()])

        if not d:
            self.logger.error(
                u'Could not parse html of the result page for url %s' % url)
            return newdls

        trs = d.findAll('tr')
        for tr in trs:
            tds = tr.findAll('td')
            links = tr.findAll('a')
            metainfo = self.get_meta_info(tds, dateobj)

            for link in links:
                relurl = link.get('href')
                if not relurl:
                    self.logger.warning(u'No href in %s' % link)
                    continue

                action = self.action_on_url(relurl)
                self.logger.info(u'Action %s on %s' % (action, relurl))

                if action == 'ignore':
                    continue

                url = urllib.basejoin(self.courturl, relurl)

                if action == 'save':
                    rel = self.get_judgment(url, relpath, metainfo)
                    if rel:
                        newdls.append(rel)

                elif action == 'recurse':
                    page = self.find_page(url)
                    if page and (page not in pagelist):
                        self.logger.info(u'recursing %s' % url)
                        pagelist.append(page)
                        newdls.extend(self.result_url(url, relpath, pagelist, \
                                                      dateobj))
                    else:
                        self.logger.info(u'Not recursing %s' % url)

        return newdls
Example #29
    def result_url(self, url, relpath, pagelist, dateobj):
        newdls = []
        webpage = self.download_url(url, loadcookies = self.cookiefile.name)
        if not webpage:
            self.logger.warning(u'Could not download %s' % url)
            return newdls

        webpage = re.sub('""', '"', webpage)
        startobj = re.search('<table', webpage)
        endobj   = re.search('</table>', webpage)
        if not startobj or not endobj or startobj.start() >= endobj.end():
            self.logger.warning(u'No table found')
            return newdls
    
        d = utils.parse_webpage(webpage[startobj.start():endobj.end()])

        if not d:
            self.logger.error(u'Could not parse html of the result page for url %s' % url)
            return newdls

        trs = d.findAll('tr')
        for tr in trs:
            tds = tr.findAll('td')
            links = tr.findAll('a')
            metainfo = self.get_meta_info(tds, dateobj)

            for link in links:
                relurl = link.get('href')
                if not relurl:
                    self.logger.warning(u'No href in %s' % link)
                    continue

                action = self.action_on_url(relurl)
                self.logger.info(u'Action %s on %s' % (action, relurl))

                if action == 'ignore':
                    continue

                url = urllib.basejoin(self.courturl, relurl)

                if action == 'save':
                    rel = self.get_judgment(url, relpath, metainfo)
                    if rel:
                        newdls.append(rel)

                elif action == 'recurse':
                    page = self.find_page(url)
                    if page and (page not in pagelist):
                        self.logger.info(u'recursing %s' % url)
                        pagelist.append(page)
                        newdls.extend(self.result_url(url, relpath, pagelist, \
                                                      dateobj))
                    else:
                        self.logger.info(u'Not recursing %s' % url)

        return newdls    
Example #30
    def get_stateval(self, url):
        webpage = self.download_url(url, \
                                    savecookies = self.cookiefile.name)

        d = utils.parse_webpage(webpage)
        if d:
            return self.extract_state(d)
        else:
            self.logger.error(u'Could not parse the date webpage')
            return None
Example #31
    def download_captcha(self, search_url, webpage, cookiejar):
        d = utils.parse_webpage(webpage, self.parser)
        if d == None:
            return None

        imgs = d.find_all('img')
        for img in imgs:
            src = img.get('src')
            if src and src.find('CaptchaImage.axd') >= 0:
                captcha_url = urllib.basejoin(search_url, src)
                return self.download_url(captcha_url, loadcookies=cookiejar)
        return None
Example #32
    def get_search_form(self, webpage, dateobj):
        if webpage == None:
            self.logger.warn('Unable to download the starting search page for day: %s', dateobj)
            return None 

        d = utils.parse_webpage(webpage, self.parser)
        if d == None:
            self.logger.warn('Unable to parse the search page for day: %s', dateobj)
            return None

        search_form = self.find_search_form(d)
        return search_form
Example #33
    def result_page(self, relpath, webpage, dateobj, pagenum):
        newdls = []

        parsedobj = utils.parse_webpage(webpage)
        if not parsedobj:
            self.logger.error(u'Could not parse the result page')
            return newdls

        tables = parsedobj.findAll('table')
        rtable = None
        for table in tables:
            id = table.get('id')
            if id == 'ctl00_ContentPlaceHolder1_OrderGridView':
                rtable = table

        if not rtable:
            self.logger.error(u'Result table not found')
            return newdls

        postdata = self.get_post_data(parsedobj, dateobj)

        trs = rtable.findAll('tr')
        pageblock = False
        nextlink = None

        for tr in trs:
            p, n = utils.check_next_page(tr, pagenum)

            if p:
                pageblock = p
                nextlink = n
            else:
                relurl = self.process_judgment_row(tr, relpath, postdata, \
                                                   dateobj)
                if relurl:
                    newdls.append(relurl)

        # check if we need to recurse
        if pageblock:
            if nextlink:
                self.logger.info(u'Recursing after pagenum %d' % (pagenum + 1))
                self.download_url(self.cookieurl,
                                  savecookies=self.cookiefile.name)
                webpage = self.download_link(postdata, nextlink['href'])
                newdls.extend(self.result_page(relpath, webpage, \
                                               dateobj, pagenum + 1))
            else:
                self.logger.info(u'Last page %d. No more recursing' % pagenum)

        return newdls
Example #34
    def download_oneday(self, relpath, dateobj):
        self.download_url(self.cookieurl, savecookies=self.cookiefile.name)
        webpage = self.download_url(self.dateurl, loadcookies=self.cookiefile.name)
        parsedobj = utils.parse_webpage(webpage)
        if not parsedobj:
            self.logger.error(u"Could not parse the date search page")
            return []

        postdata = self.get_post_data(parsedobj, dateobj)

        webpage = self.download_url(
            self.dateurl, postdata=postdata, loadcookies=self.cookiefile.name, referer=self.dateurl
        )
        return self.result_page(relpath, webpage, dateobj, 1)
Example #35
    def result_page(self, webpage, relpath, dateobj):
        newdls = []

        if not webpage:
            return newdls

        d = utils.parse_webpage(webpage)

        if not d:
            self.logger.error(
                u'Could not parse html of the result page for date %s' %
                dateobj)
            return newdls

        trs = d.findAll('tr')

        for tr in trs:
            link = tr.find('a')

            if not link:
                continue

            href = link.get('href')
            title = utils.get_tag_contents(link)

            if (not href) or (not title):
                self.logger.info(u'Could not process %s' % link)
                continue

            if not re.match('\d+$', title) and not re.search(
                    'PREV|NEXT', title):
                self.logger.info(u'link: %s title: %s' % (href, title))
                rel = self.handle_judgment_link(relpath, tr, dateobj, href,
                                                title)
                if rel:
                    newdls.append(rel)

        if newdls:
            links = d.findAll('a')
            for link in links:
                href = link.get('href')
                title = utils.get_tag_contents(link)
                if title and href and re.match('NEXT', title):
                    self.logger.info(u'Following next page link: %s' % link)
                    webpage = self.download_url(urllib.basejoin(self.baseurl,href),\
                                                loadcookies = self.cookiefile.name)

                    newdls.extend(self.result_page(webpage, relpath, dateobj))
        return newdls
Example #36
    def result_page(self, relpath, webpage, dateobj, pagenum):
        newdls = []

        parsedobj = utils.parse_webpage(webpage)
        if not parsedobj:
            self.logger.error(u'Could not parse the result page')
            return newdls

        tables = parsedobj.findAll('table')
        rtable = None
        for table in tables:
            id = table.get('id')
            if id == 'ctl00_ContentPlaceHolder1_OrderGridView':
                rtable = table

        if not rtable:
            self.logger.error(u'Result table not found')
            return newdls

        postdata = self.get_post_data(parsedobj, dateobj)

        trs = rtable.findAll('tr')
        pageblock = False
        nextlink  = None

        for tr in trs:
            p, n = utils.check_next_page(tr, pagenum)

            if p:
                pageblock = p
                nextlink  = n
            else:
                relurl = self.process_judgment_row(tr, relpath, postdata, \
                                                   dateobj)
                if relurl:
                    newdls.append(relurl)

        # check if we need to recurse 
        if pageblock:
            if nextlink:
                self.logger.info(u'Recursing after pagenum %d' % (pagenum + 1))
                self.download_url(self.cookieurl, savecookies = self.cookiefile.name)
                webpage = self.download_link(postdata, nextlink['href'])
                newdls.extend(self.result_page(relpath, webpage, \
                                               dateobj, pagenum + 1))
            else:
                self.logger.info(u'Last page %d. No more recursing' % pagenum)

        return newdls
Example #37
    def download_oneday(self, relpath, dateobj):
        self.download_url(self.cookieurl, savecookies=self.cookiefile.name)
        webpage = self.download_url(self.dateurl, \
                                    loadcookies = self.cookiefile.name)
        parsedobj = utils.parse_webpage(webpage)
        if not parsedobj:
            self.logger.error(u'Could not parse the date search page')
            return []

        postdata = self.get_post_data(parsedobj, dateobj)

        webpage = self.download_url(self.dateurl, postdata = postdata, \
                                    loadcookies = self.cookiefile.name, \
                                    referer = self.dateurl)
        return self.result_page(relpath, webpage, dateobj, 1)
Example #38
    def datepage_metainfos(self, url, dateobj):
        minfos = []
        response = self.download_url(url)

        if not response or not response.webpage:
            self.logger.warn('Unable to download %s. Skipping', url)
            return minfos

        d = utils.parse_webpage(response.webpage, self.parser)
        if not d:
            self.logger.warn('Unable to parse %s. Skipping.', url)
            return minfos

        partnum = None
        dept    = None
        for td in d.find_all('td'):
            bgcolor = td.get('bgcolor')
            links   = td.find_all('a')
            if bgcolor == '#91BAE8' and len(links) == 0:
                partnum =  utils.get_tag_contents(td)
                partnum  = utils.remove_spaces(partnum)
                dept    = None
            elif len(links) > 0:
                reobj  = re.compile('^(strong|a)$')
                for x in td.find_all(reobj):
                    if x.name == 'strong':
                        dept = utils.get_tag_contents(x)
                        dept = utils.remove_spaces(dept)
                    elif x.name == 'a'  and partnum:
                        href  = x.get('href')
                        if not href.startswith('pdf'):
                            continue

                        title = utils.get_tag_contents(x)
                        title = utils.remove_spaces(title)

                        metainfo = utils.MetaInfo()
                        minfos.append(metainfo)

                        metainfo.set_title(title)
                        metainfo.set_date(dateobj)     
                        metainfo['partnum'] = partnum
                        if dept:
                            metainfo['department']    = dept
                        gzurl = urllib.basejoin(url, href)
                        metainfo['url'] = gzurl

        return minfos    
Example #39
    def parse_result_page(self, posturl, webpage, dateobj):
        judgments = []
        d = utils.parse_webpage(webpage)
        if not d:
            self.logger.error(u'Could not parse result page %s' % dateobj)
            return judgments

        # get judgments
        trs = d.findAll('tr')
        for tr in trs:
            judgment = {}
            metainfo = { 'date': utils.date_to_xml(dateobj)}

            links = tr.findAll('a')
            for link in links:
                href = link.get('href')
                if href and re.search('WebShowJudgment.do', href):
                    t = utils.get_tag_contents(link)
                    colon = t.find(':')
                    if colon != -1:
                        title = t[colon+1:]
                        title = title.strip()
                        metainfo['title'] = title
                        reobj = re.search(' vs\. ', title, re.IGNORECASE)
                        if reobj:
                            metainfo['petitioner'] = title[:reobj.start()]
                            metainfo['respondent'] = title[reobj.end():]
                if href and re.search('WebDownloadJudgmentDocument.do', href):
                    judgment['link'] = urllib.basejoin(posturl, href)
 
            if judgment:
                judgment['metainfo'] = metainfo
                judgments.append(judgment)
        
        # next link
        links = d.findAll('a')
        for link in links: 
            t = utils.get_tag_contents(link)          
            if re.search('Next', t):
                href = link.get('href')

                if href:
                    judgment = {'link': urllib.basejoin(posturl, href)}
                    judgment['next'] = True
                    judgments.append(judgment)
 
        return judgments
Example #40
    def parse_result_page(self, posturl, webpage, dateobj):
        judgments = []
        d = utils.parse_webpage(webpage)
        if not d:
            self.logger.error(u'Could not parse result page %s' % dateobj)
            return judgments

        # get judgments
        trs = d.findAll('tr')
        for tr in trs:
            judgment = {}
            metainfo = {'date': utils.date_to_xml(dateobj)}

            links = tr.findAll('a')
            for link in links:
                href = link.get('href')
                if href and re.search('WebShowJudgment.do', href):
                    t = utils.get_tag_contents(link)
                    colon = t.find(':')
                    if colon != -1:
                        title = t[colon + 1:]
                        title = title.strip()
                        metainfo['title'] = title
                        reobj = re.search(' vs\. ', title, re.IGNORECASE)
                        if reobj:
                            metainfo['petitioner'] = title[:reobj.start()]
                            metainfo['respondent'] = title[reobj.end():]
                if href and re.search('WebDownloadJudgmentDocument.do', href):
                    judgment['link'] = urllib.basejoin(posturl, href)

            if judgment:
                judgment['metainfo'] = metainfo
                judgments.append(judgment)

        # next link
        links = d.findAll('a')
        for link in links:
            t = utils.get_tag_contents(link)
            if re.search('Next', t):
                href = link.get('href')

                if href:
                    judgment = {'link': urllib.basejoin(posturl, href)}
                    judgment['next'] = True
                    judgments.append(judgment)

        return judgments
Example #41
    def parse_metainfos(self, webpage, year, fromdate, todate):
        minfos = []
        nextpage = None

        d = utils.parse_webpage(webpage, self.parser)
        if not d:
            self.logger.warn('Unable to parse results page for year %d', year)
            return minfos, nextpage

        for td in d.find_all('td'):
            link = td.find('a')
            if link == None:
                continue
            img = td.find('img')
            if img:
                title = img.get('title')
                if title == 'Next' and nextpage == None:
                    nextpage = link
                continue

            metainfo = self.get_metainfo(link, td)
            if metainfo:
                dateobj = metainfo.get_date()
                if dateobj and dateobj >= fromdate and dateobj <= todate:
                    minfos.append(metainfo)
                paras = td.find_all('p')

                if len(paras) >= 2:
                    p = paras[1]
                    txt = utils.get_tag_contents(p)
                    reobj = re.search(
                        'Department:\s*(?P<dept>.+)\s+Order\s+Nos:\s*(,Othres\s*:)?(?P<ordernum>.*)',
                        txt)
                    if reobj:
                        groupdict = reobj.groupdict()
                        ordernum = groupdict['ordernum'].strip()
                        metainfo['department'] = groupdict['dept'].strip()
                        if re.match('[\d+(,\s*)?]+$', ordernum):
                            metainfo['ordernum'] = ordernum

                if len(paras) >= 3:
                    p = paras[2]
                    txt = utils.get_tag_contents(p)
                    if txt:
                        metainfo.set_subject(txt)

        return minfos, nextpage
Example #42
    def download_oneday(self, relpath, dateobj):
        newdls  = []

        datestr = utils.dateobj_to_str(dateobj, '/')
        subrelpath = '/'.join(relpath.split('/')[:-1])

        postdata = [('hcjudgecode', ''), ('fromdate', datestr), \
                    ('todate', datestr), ('counter', '1')]

        webpage = self.download_url (self.pageurl, referer = self.baseurl, \
                                     loadcookies = self.cookiefile.name, \
                                     postdata = postdata)

        if not webpage:
            self.logger.warning(u'No webpage for %s' % self.pageurl)            
            return newdls

        d = utils.parse_webpage(webpage)
        if not d:
            self.logger.error(u'Could not parse html of the result page for date %s' % dateobj)
            return newdls

        trs = d.findAll('tr')
        for tr in trs:
            if tr.find('th'):
                continue

            onclick = tr.get('onclick')
            if not onclick:
                self.logger.info(u'No onclick in %s' % tr)
                continue

            reobj = re.search('\d+', onclick) 
            if not reobj:
                continue

            ccin = reobj.group(0)
            webpage = self.download_url (self.caseurl, referer = self.baseurl, \
                                         loadcookies = self.cookiefile.name, \
                                         postdata = [('ccin', ccin)])

            if not webpage:
                self.logger.error(u'Could not get case for %s on date %s' % (ccin, dateobj))
                continue
            newdls.extend(self.download_orders(subrelpath, ccin, dateobj, webpage))
        return newdls
Example #43
    def result_page(self, webpage, relpath, dateobj, linkdict):
        newdls      = []

        if not webpage:
            return newdls 

        courtParser = utils.parse_webpage(webpage)

        if not courtParser:
            self.logger.error(u'Could not parse html of the result page for date %s' % dateobj)
            return newdls

        trs  = courtParser.findAll('tr')

        for tr in trs:
            link = tr.find('a')
 
            if link:
                title = utils.get_tag_contents(link)
                href  = link.get('href')
 
                if (not title) or (not href):
                    self.logger.warning(u'Could not process %s' % link)
                    continue

                if linkdict.has_key(href):
                    continue

                if not re.search('first|prev|next|last|acroread', title, \
                                 re.IGNORECASE):
                    linkdict[href] = 1
                    dl = self.handle_link(relpath, href, title, tr, dateobj)
                    if dl:
                        newdls.append(dl)

                elif title == 'Next':
                    self.logger.info(u'Following Next page %s' % href)
                    newlink = urllib.basejoin (self.baseurl, href)
                    webpage = self.download_url(newlink, \
                                            loadcookies = self.cookiefile.name)
               
                    newdls.extend(self.result_page(webpage, relpath, dateobj, \
                                                   linkdict))
                else:
                    self.logger.info(u'No action for %s' % href)
        return newdls
Example #44
    def result_page(self, webpage, relpath, dateobj):
        newdls      = []

        if not webpage:
            return newdls

        d = utils.parse_webpage(webpage)

        if not d:
            self.logger.error(u'Could not parse html of the result page for date %s' % dateobj)
            return newdls

        trs = d.findAll('tr')

        for tr in trs:
            link  = tr.find('a')

            if not link:
                continue

            href  = link.get('href')
            title = utils.get_tag_contents(link)

            if (not href) or (not title):
                self.logger.info(u'Could not process %s' % link)
                continue

            if not re.match('\d+$', title) and not re.search('PREV|NEXT',title):
                self.logger.info(u'link: %s title: %s' % (href, title))
                rel = self.handle_judgment_link(relpath, tr, dateobj, href, title)
                if rel:
                    newdls.append(rel)

        if newdls:
            links  = d.findAll('a')
            for link in links:
                href  = link.get('href')
                title = utils.get_tag_contents(link)
                if title and href and re.match('NEXT', title):
                    self.logger.info(u'Following next page link: %s' % link)
                    webpage = self.download_url(urllib.basejoin(self.baseurl, href),\
                                                loadcookies = self.cookiefile.name)

                    newdls.extend(self.result_page(webpage, relpath, dateobj))
        return newdls
Example #45
    def download_info_page(self, url):
        infolist = []
        webpage = self.download_url(url)
        if webpage:
            d = utils.parse_webpage(webpage)
            if not d:
                self.logger.error(u'Could not parse the date search page')
                return [], None
            tables = d.findAll('table')
            for table in tables:
                if not table.find('table'):
                    trs = table.findAll('tr')
                    for tr in trs:
                        metainfo = self.get_meta_info(tr)
                        if metainfo:
                            self.logger.debug('metainfo: %s' % metainfo)
                            infolist.append(metainfo)
        return infolist, None
Example #47
0
    def get_search_results(self, search_url, dateobj, cookiejar):
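        # Fetch the search page and keep solving the captcha form until a
        # non-form result page comes back, restarting with a fresh cookie jar
        # on every failed attempt.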
        response = self.download_url(search_url, savecookies = cookiejar, \
                                     loadcookies=cookiejar)

        while response and response.webpage:
            response = self.submit_captcha_form(search_url, response.webpage, \
                                                cookiejar, dateobj)

            if not response or not response.webpage:
                break
            d = utils.parse_webpage(response.webpage, self.parser)
            if d and not self.is_form_webpage(d):
                break
            else:
                self.logger.warn('Failed to solve captcha. Retrying.')
                cookiejar = CookieJar()
                response = self.download_url(search_url, savecookies=cookiejar)

        return response
Example #48
0
    def download_oneday(self, relpath, dateobj):
        dls = []
        response = self.download_url(self.baseurl)

        if not response or not response.webpage:
            self.logger.error('Unable to download the webpage for %s', dateobj)
            return dls

        d = utils.parse_webpage(response.webpage, self.parser)
        if not d:
            self.logger.error('Unable to parse the webpage for %s', dateobj)
            return dls

        categories = self.find_categories(d)
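        # Note: only the first category listed on the page is downloaded; the
        # loop exits after a single iteration.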
        for category in categories:
            dls.extend(self.download_onecat(relpath, dateobj, category))
            break

        return dls
Example #49
0
    def download_oneday(self, relpath, dateobj):
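        # POST the date query, locate the table whose header row defines the
        # column order, and save a gazette for every row that carries a link.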
        dls = []
        postdata = self.get_post_data(dateobj)
        response = self.download_url(self.baseurl, postdata=postdata)
        if not response or not response.webpage:
            self.logger.warn('Unable to get result page for date %s', dateobj)
            return dls

        d = utils.parse_webpage(response.webpage, self.parser)
        if not d:
            self.logger.warn('Unable to parse result page for date %s',
                             dateobj)
            return dls

        result_table = None

        for table in d.find_all('table'):
            for tr in table.find_all('tr'):
                order = self.find_field_order(tr)
                if order:
                    result_table = table
                    break
            # Stop scanning further tables once the result table is found so
            # that 'order' keeps the column order of that table.
            if result_table != None:
                break

        if result_table == None:
            self.logger.warn('Unable to find the result table for %s', dateobj)
            return dls

        minfos = []
        for tr in result_table.find_all('tr'):
            if tr.find('a') == None:
                continue
            metainfo = self.process_row(tr, order, dateobj)
            if metainfo:
                minfos.append(metainfo)

        for metainfo in minfos:
            href = metainfo.pop('href')
            url = urllib.basejoin(self.baseurl, href)
            relurl = os.path.join(relpath, metainfo['gznum'])
            if self.save_gazette(relurl, url, metainfo):
                dls.append(relurl)

        return dls
Example #50
0
    def download_orders_from_page(self, relpath, dateobj, webpage):
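        # For every row with a 'View Order' link, download and save the order
        # document plus its metadata, then follow the 'next' page link
        # recursively.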
        newdls = []

        d = utils.parse_webpage(webpage)

        if not d:
            self.logger.error(u'HTML parsing failed for date: %s' %  dateobj)
            return []

        for tr in d.findAll('tr'):
            href = None
            for link in tr.findAll('a'):
                title = utils.get_tag_contents(link)
                if re.search(r'view\s+order', title, re.IGNORECASE):
                    href = link.get('href')
                    break
                
            if (not href):
                self.logger.warning(u'Could not process %s' % tr)
                continue

            words = href.split('/')
            filename = words[-1]

            url = urllib.basejoin(self.dateurl, href)

            self.logger.info(u'link: %s' % href)

            relurl = os.path.join (relpath, filename)
            filepath = os.path.join(self.rawdir, relurl)
            metapath = os.path.join(self.metadir, relurl)

            if not os.path.exists(filepath):
                webpage = self.download_url(url)

                if not webpage:
                    self.logger.warning(u'No webpage %s' % url)
                else:
                    utils.save_file(filepath, webpage)
                    self.logger.info(u'Saved %s' % url)

            if os.path.exists(filepath):
                newdls.append(relurl)
                if self.updateMeta or not os.path.exists(metapath):
                    metainfo = self.get_meta_info(tr, dateobj)
                    self.logger.info(u'relurl: %s metainfo: %s' % (relurl, metainfo))
                    if metainfo:
                        utils.print_tag_file(metapath, metainfo)

        for link in d.findAll('a'):
            text = utils.get_tag_contents(link)
            href = link.get('href')
            if href and text and re.match(r'\s*next\s*$', text, re.IGNORECASE):
                url = urllib.basejoin(self.dateurl, href)
                webpage = self.download_url(url)
                if webpage:
                    self.logger.info(u'Recursing to the nextpage: %s' % url)
                    nextPageDls = self.download_orders_from_page(relpath, dateobj, webpage)
                    newdls.extend(nextPageDls)  
                else:
                    self.logger.warning(u'Could not download the next webpage: %s' % url)
        return newdls     
Example #51
0
    def download_oneday(self, relpath, dateobj):
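        # Build the date-wise order listing URL, extract the showoj.jsp link
        # hidden in each row's onclick handler, and save every order along
        # with its metadata.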
        newdls  = []

        pageurl = urllib.basejoin(self.baseurl, '/gujarathc/')

        datestr = utils.dateobj_to_str(dateobj, '-')
        dateurl = pageurl + 'orderdatewisedata.jsp?fdate=%s&tdate=%s' % \
                                (datestr, datestr)

        webpage = self.download_url (dateurl, referer = self.baseurl, \
                                     loadcookies = self.cookiefile.name)

        if not webpage:
            self.logger.warning(u'No webpage for %s' % dateurl)            
            return newdls

        webpage = re.sub('(?P<windowopen>window.open\([^)]+\))', \
                         self.sanitize_windowopen, webpage)

        d = utils.parse_webpage(webpage)

        if not d:
            self.logger.error(u'Could not parse html of the result page for date %s' % dateobj)
            return newdls

        trs = d.findAll('tr')
        for tr in trs:
            link = tr.find('a')
            if not link:
                self.logger.info(u'No link in %s' % tr)
                continue

            href = link.get('onclick')
            if not href:
                self.logger.info(u'No href in %s' % tr)
                continue

            reobj = re.search(r"showoj.jsp?[^'\s]+", href)
            if not reobj:
                self.logger.info(u'No judgment url in onclick: %s' % href)
                continue

            (start, end) = reobj.span()

            pagerelurl = href[start:end]
            url = urllib.basejoin(pageurl, pagerelurl)

            filename = utils.url_to_filename(url, False, ['caseyr', 'caseno', \
                                                          'casetype'])

            if not filename:
                self.logger.error(u'Could not get filename for %s' % url)
                continue
            relurl   = os.path.join(relpath, filename)
            filepath = os.path.join(self.rawdir, relurl)
            metapath = os.path.join(self.metadir, relurl)

            if not os.path.exists(filepath):
                self.logger.info(u'Downloading %s %s' % (url, filename))
                j = self.download_url(url, loadcookies = self.cookiefile.name)
                 
                if not j:
                    self.logger.warning(u'No webpage: %s' % url)
                else:
                    self.logger.info(u'Saving %s' % filepath)
                    utils.save_file(filepath, j)
                    newdls.append(relurl)
           
            if os.path.exists(filepath) and \
                    (self.updateMeta or not os.path.exists(metapath)):
                metainfo = self.get_meta_info(link, tr, dateobj)
                if metainfo:
                    utils.print_tag_file(metapath, metainfo)

        return newdls