Example #1
    def extract_links(self, prsdobj, pagenum):
        linkdict = {'docs': []}

        trs = prsdobj.findAll('tr')
        for tr in trs:
            # Each row is either part of the pager block or a judgment row.
            pageblock, nextlink = utils.check_next_page(tr, pagenum)
            if nextlink:
                linkdict['next'] = nextlink
            elif not pageblock:
                link = self.get_judgment_info(tr)
                if link:
                    linkdict['docs'].append(link)

        return linkdict
Example #2
    def result_page(self, relpath, webpage, dateobj, pagenum):
        newdls = []

        parsedobj = utils.parse_webpage(webpage)
        if not parsedobj:
            self.logger.error(u'Could not parse the result page')
            return newdls

        # Locate the ASP.NET grid that holds the search results.
        tables = parsedobj.findAll('table')
        rtable = None
        for table in tables:
            table_id = table.get('id')
            if table_id == 'ctl00_ContentPlaceHolder1_OrderGridView':
                rtable = table
                break

        if not rtable:
            self.logger.error(u'Result table not found')
            return newdls

        postdata = self.get_post_data(parsedobj, dateobj)

        trs = rtable.findAll('tr')
        pageblock = False
        nextlink = None

        for tr in trs:
            # Separate the pager row from ordinary judgment rows.
            p, n = utils.check_next_page(tr, pagenum)

            if p:
                pageblock = p
                nextlink = n
            else:
                relurl = self.process_judgment_row(tr, relpath, postdata,
                                                   dateobj)
                if relurl:
                    newdls.append(relurl)

        # Check whether we need to recurse into the next results page.
        if pageblock:
            if nextlink:
                self.logger.info(u'Recursing to page %d' % (pagenum + 1))
                self.download_url(self.cookieurl,
                                  savecookies=self.cookiefile.name)
                webpage = self.download_link(postdata, nextlink['href'])
                newdls.extend(self.result_page(relpath, webpage,
                                               dateobj, pagenum + 1))
            else:
                self.logger.info(u'Last page %d. No more recursing' % pagenum)

        return newdls
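
Both examples above delegate row classification to utils.check_next_page, which is not shown on this page. The following is a minimal sketch of what such a helper could look like, assuming BeautifulSoup 4 rows; the pager heuristic and the dict-shaped nextlink are assumptions made for illustration, not the project's actual implementation.

import re

def check_next_page(tr, pagenum):
    # SKETCH, not the real utils.check_next_page: classify a table row.
    # Returns (pageblock, nextlink) where pageblock says whether this row
    # is the pager block, and nextlink points at the page after pagenum.
    pageblock = False
    nextlink = None

    links = tr.findAll('a')
    # Heuristic: treat a row as the pager block when every anchor in it
    # is a bare page number ("1", "2", ...).
    if links and all(re.match(r'\d+$', a.get_text().strip()) for a in links):
        pageblock = True
        for a in links:
            if a.get_text().strip() == str(pagenum + 1):
                # Mirror the examples above, which read nextlink['href'].
                nextlink = {'href': a.get('href')}
                break

    return pageblock, nextlink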
Example #3
    def extract_links(self, prsdobj, pagenum):
        linkdict = {'docs': []}

        # Find the results grid by its class attribute, compared here as a
        # plain string.
        tables = prsdobj.findAll('table')
        grid = None
        for table in tables:
            className = table.get('class')
            if className == 'Grid':
                grid = table
                break
        if grid:
            trs = grid.findAll('tr')
            for tr in trs:
                link = self.get_judgment_info(tr)
                if link and 'title' in link:
                    linkdict['docs'].append(link)
                else:
                    pageblock, nextlink = utils.check_next_page(tr, pagenum)
                    if nextlink:
                        linkdict['next'] = nextlink
                    else:
                        self.logger.debug(u'Ignoring tr: %s' % tr)

        return linkdict
Example #4
    def extract_links(self, prsdobj, pagenum):
        linkdict = {'docs': []}

        # Find the results grid by its class attribute; here the attribute
        # is a list of class names (BeautifulSoup 4), hence className[0].
        tables = prsdobj.findAll('table')
        grid = None
        for table in tables:
            className = table.get('class')
            if className and className[0] == 'Grid':
                grid = table
                break
        if grid:
            trs = grid.findAll('tr')
            for tr in trs:
                link = self.get_judgment_info(tr)
                if link and 'title' in link:
                    linkdict['docs'].append(link)
                else:
                    pageblock, nextlink = utils.check_next_page(tr, pagenum)
                    if nextlink:
                        linkdict['next'] = nextlink
                    else:
                        self.logger.debug(u'Ignoring tr: %s' % tr)

        return linkdict
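
For context, here is a hypothetical driver loop that consumes the linkdict returned by extract_links, following the 'next' entry page by page. walk_pages and the use of None as postdata are illustrative stand-ins, not the project's actual API; only extract_links, download_link, and utils.parse_webpage appear in the examples above.

import utils  # assumed to be the same helper module the examples import

def walk_pages(scraper, webpage, pagenum=1):
    # HYPOTHETICAL driver: collect judgment links across all result pages.
    docs = []
    while webpage:
        prsdobj = utils.parse_webpage(webpage)
        if not prsdobj:
            break
        linkdict = scraper.extract_links(prsdobj, pagenum)
        docs.extend(linkdict['docs'])
        nextlink = linkdict.get('next')
        if not nextlink:
            break
        # download_link appears in Example #2; passing None for postdata
        # is an assumption made for this sketch.
        webpage = scraper.download_link(None, nextlink['href'])
        pagenum += 1
    return docs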