示例#1
0
    def get_meta_info(self, tr):
        """Collect metadata for one table row.

        The row's first ``<a>`` supplies ``'href'``; the text of the
        ``<td>`` cells supplies petitioner/respondent, case number and
        date, stored under ``self.PETITIONER``, ``self.RESPONDENT``,
        ``self.CASENO`` and ``self.DATE``.

        Args:
            tr: a row element exposing ``findAll`` and ``find``.

        Returns:
            dict: the metadata found; empty when the row has no ``<a>``.
        """
        tds = tr.findAll('td')
        metainfo = {}
        link = tr.find('a')
        if not link:
            # A row without any link is not processed further.
            return metainfo

        href = link.get('href')
        if href:
            metainfo['href'] = href

        valueList = [utils.get_tag_contents(td) for td in tds]

        # Positions are 1-based: only the 2nd or 3rd cell is tried for the
        # party names; once a petitioner is known, every later non-empty
        # cell is tried as a date.
        for i, raw in enumerate(valueList, 1):
            if not raw:
                continue
            value = raw.strip()
            if i in (2, 3) and self.PETITIONER not in metainfo:
                pet, res = utils.get_petitioner_respondent(value)
                if pet:
                    metainfo[self.PETITIONER] = pet
                    # NOTE(review): the case number is the unstripped text
                    # of this same cell (valueList[i - 1]) -- confirm that
                    # this is intended rather than the previous cell.
                    metainfo[self.CASENO] = raw
                if res:
                    metainfo[self.RESPONDENT] = res
            elif self.PETITIONER in metainfo:
                dateobj = utils.datestr_to_obj(value)
                if dateobj:
                    metainfo[self.DATE] = dateobj

        # One more heuristic: try to parse a date out of the link target.
        if self.DATE not in metainfo and 'href' in metainfo:
            dateobj = utils.datestr_to_obj(metainfo['href'])
            if dateobj:
                metainfo[self.DATE] = dateobj

        has_date = self.DATE in metainfo
        has_petitioner = self.PETITIONER in metainfo
        if not has_date and not has_petitioner:
            self.logger.info(u'No petitioner/date found: %s %s' % \
                              (metainfo, valueList))
        elif not has_petitioner:
            self.logger.info(u'No petitioner found: %s %s' % \
                                 (metainfo, valueList))
        elif not has_date:
            self.logger.info(u'No date found: %s %s' % \
                                 (metainfo, valueList))

        return metainfo
示例#2
0
文件: tdsat.py 项目: edudemy/judis-re
    def get_meta_info(self, tr):
        """Collect metadata for one table row.

        The row's first ``<a>`` supplies ``'href'``; the text of the
        ``<td>`` cells supplies petitioner/respondent, case number and
        date, stored under ``self.PETITIONER``, ``self.RESPONDENT``,
        ``self.CASENO`` and ``self.DATE``.

        Args:
            tr: a row element exposing ``findAll`` and ``find``.

        Returns:
            dict: the metadata found; empty when the row has no ``<a>``.
        """
        tds = tr.findAll('td')
        metainfo = {}
        link = tr.find('a')
        if not link:
            # A row without any link is not processed further.
            return metainfo

        href = link.get('href')
        if href:
            metainfo['href'] = href

        valueList = [utils.get_tag_contents(td) for td in tds]

        # Positions are 1-based: only the 2nd or 3rd cell is tried for the
        # party names; once a petitioner is known, every later non-empty
        # cell is tried as a date.
        for i, raw in enumerate(valueList, 1):
            if not raw:
                continue
            value = raw.strip()
            if i in (2, 3) and self.PETITIONER not in metainfo:
                pet, res = utils.get_petitioner_respondent(value)
                if pet:
                    metainfo[self.PETITIONER] = pet
                    # NOTE(review): the case number is the unstripped text
                    # of this same cell (valueList[i-1]) -- confirm that
                    # this is intended rather than the previous cell.
                    metainfo[self.CASENO] = raw
                if res:
                    metainfo[self.RESPONDENT] = res
            elif self.PETITIONER in metainfo:
                dateobj = utils.datestr_to_obj(value)
                if dateobj:
                    metainfo[self.DATE] = dateobj

        # One more heuristic: try to parse a date out of the link target.
        if self.DATE not in metainfo and 'href' in metainfo:
            dateobj = utils.datestr_to_obj(metainfo['href'])
            if dateobj:
                metainfo[self.DATE] = dateobj

        has_date = self.DATE in metainfo
        has_petitioner = self.PETITIONER in metainfo
        if not has_date and not has_petitioner:
            self.logger.info(u'No petitioner/date found: %s %s' % \
                              (metainfo, valueList))
        elif not has_petitioner:
            self.logger.info(u'No petitioner found: %s %s' % \
                                 (metainfo, valueList))
        elif not has_date:
            self.logger.info(u'No date found: %s %s' % \
                                 (metainfo, valueList))

        return metainfo
示例#3
0
    def get_meta_info(self, tr):
        """Build a metadata dict from the ``<td>`` cells of one row.

        Args:
            tr: a row element exposing ``findAll``.

        Returns:
            dict: may hold ``'caseno'``, ``'petitioner'``, ``'respondent'``
            and ``self.DATE``, depending on what the cells yield.
        """
        cells = tr.findAll('td')
        final_idx = len(cells) - 1
        info = {}

        # 'pos' only advances for cells that produced content, so an empty
        # cell does not consume a position.
        pos = 0
        for cell in cells:
            text = utils.get_tag_contents(cell)
            if not text:
                continue

            if pos == 1:
                # Collapse every run of whitespace into a single space.
                info['caseno'] = u' '.join(text.split())
            elif pos == 2:
                pet, res = utils.get_petitioner_respondent(text)
                if pet:
                    info['petitioner'] = pet
                else:
                    self.logger.info(u'Petitioner not found in %s' % text)
                if res:
                    info['respondent'] = res
            elif pos == final_idx:
                parsed = utils.datestr_to_obj(text)
                if parsed:
                    info[self.DATE] = parsed
                else:
                    self.logger.info(u'No date in %s' % (text))
            pos += 1
        return info
示例#4
0
    def get_meta_info(self, tr):
        """Collect metadata for one table row that links to a document.

        Args:
            tr: a row element exposing ``findAll``.

        Returns:
            dict: ``'href'`` plus whichever of ``self.CASENO``,
            ``self.PETITIONER``, ``self.RESPONDENT`` and ``self.DATE``
            could be read from the cells; an empty dict when no ``<a>`` in
            the row has an ``href``.
        """
        metainfo = {}
        tds = tr.findAll('td')

        # Keep the first link in the row that actually has an href.
        for link in tr.findAll('a'):
            href = link.get('href')
            if href:
                metainfo['href'] = href
                break
        if 'href' not in metainfo:
            return {}

        # 'i' counts only non-empty cells: first is the case number, second
        # the parties, third the date.
        i = 0
        for td in tds:
            value = utils.get_tag_contents(td)
            if not value:
                continue
            if i == 0:
                metainfo[self.CASENO] = value
            elif i == 1:
                pet, res = utils.get_petitioner_respondent(value)
                if pet:
                    metainfo[self.PETITIONER] = pet
                if res:
                    metainfo[self.RESPONDENT] = res
            elif i == 2:
                dateobj = utils.datestr_to_obj(value)
                if dateobj:
                    metainfo[self.DATE] = dateobj
            i += 1
        return metainfo
示例#5
0
 def get_meta_info(self, tr):
     """Collect metadata for one table row that links to a document.

     Args:
         tr: a row element exposing ``findAll``.

     Returns:
         dict: ``'href'`` plus whichever of ``self.CASENO``,
         ``self.PETITIONER``, ``self.RESPONDENT`` and ``self.DATE``
         could be read from the cells; an empty dict when no ``<a>`` in
         the row has an ``href``.
     """
     metainfo = {}
     tds = tr.findAll('td')

     # Keep the first link in the row that actually has an href.
     for link in tr.findAll('a'):
         href = link.get('href')
         if href:
             metainfo['href'] = href
             break
     if 'href' not in metainfo:
         return {}

     # 'i' counts only non-empty cells: first is the case number, second
     # the parties, third the date.
     i = 0
     for td in tds:
         value = utils.get_tag_contents(td)
         if not value:
             continue
         if i == 0:
             metainfo[self.CASENO] = value
         elif i == 1:
             pet, res = utils.get_petitioner_respondent(value)
             if pet:
                 metainfo[self.PETITIONER] = pet
             if res:
                 metainfo[self.RESPONDENT] = res
         elif i == 2:
             dateobj = utils.datestr_to_obj(value)
             if dateobj:
                 metainfo[self.DATE] = dateobj
         i += 1
     return metainfo
示例#6
0
文件: aptel.py 项目: hargup/judis-re
    def get_meta_info(self, tr):
        """Build a metadata dict from the ``<td>`` cells of one row.

        Args:
            tr: a row element exposing ``findAll``.

        Returns:
            dict: may hold ``'caseno'``, ``'petitioner'``, ``'respondent'``
            and ``self.DATE``, depending on what the cells yield.
        """
        columns = tr.findAll('td')
        last_position = len(columns) - 1
        result = {}

        # The position counter is bumped only after a cell with content, so
        # empty cells are not counted.
        position = 0
        for column in columns:
            body = utils.get_tag_contents(column)
            if not body:
                continue

            if position == 1:
                # Normalise whitespace runs to single spaces.
                result['caseno'] = u' '.join(body.split())
            elif position == 2:
                parties = utils.get_petitioner_respondent(body)
                petitioner, respondent = parties
                if petitioner:
                    result['petitioner'] = petitioner
                else:
                    self.logger.info(u'Petitioner not found in %s' % body)
                if respondent:
                    result['respondent'] = respondent
            elif position == last_position:
                when = utils.datestr_to_obj(body)
                if when:
                    result[self.DATE] = when
                else:
                    self.logger.info(u'No date in %s' % (body))
            position += 1
        return result
示例#7
0
    def process_order_tr(self, ccin, relpath, dateobj, tr, fieldOrder):
        """Process one order row and hand it to ``self.download_order``.

        Args:
            ccin: value stored as-is under ``'ccin'`` in the metadata.
            relpath: passed through to ``self.download_order``.
            dateobj: only used in warning messages.
            tr: a row element exposing ``findAll``.
            fieldOrder: mapping of field name to cell index; ``'view'`` and
                ``'date'`` are required, ``'bench'`` and ``'judgment'`` are
                optional.

        Returns:
            Whatever ``self.download_order`` returns, or ``None`` when the
            row lacks the needed cells, a parseable date, or an ``onclick``
            attribute on the view cell.
        """
        cells = tr.findAll('td')
        ncells = len(cells)
        view_pos = fieldOrder['view']
        date_pos = fieldOrder['date']
        if not (view_pos < ncells and date_pos < ncells):
            self.logger.warning(u'Could not get date or view in tr: %s' % tr)
            return None

        view_cell = cells[view_pos]
        date_cell = cells[date_pos]

        date_text = utils.get_tag_contents(date_cell)
        if not date_text:
            self.logger.warning(u'Date: %s Could not get date in %s' % (dateobj, tr))
            return None

        parsed = utils.datestr_to_obj(date_text)
        if not parsed:
            self.logger.warning(u'Date: %s Could not get date in %s tr: %s' % (dateobj, date_text, tr))
            return None

        order_date = parsed.date()
        metainfo = {'date': utils.date_to_xml(order_date), 'ccin': ccin}

        # Bench: keep, for each 'JUSTICE ' marker, the text that follows it
        # up to the end of the cell.
        if 'bench' in fieldOrder and fieldOrder['bench'] < ncells:
            bench_text = utils.get_tag_contents(cells[fieldOrder['bench']])
            if bench_text:
                names = [bench_text[hit.end():]
                         for hit in re.finditer('JUSTICE ', bench_text)]
                if names:
                    metainfo['bench'] = {'name': names}

        # Judgment: the cell text is stored unchanged.
        if 'judgment' in fieldOrder and fieldOrder['judgment'] < ncells:
            judgment_text = utils.get_tag_contents(cells[fieldOrder['judgment']])
            if judgment_text:
                metainfo['judgment'] = judgment_text

        onclick = view_cell.get('onclick')
        if not onclick:
            self.logger.warning(u'No onclick attribute in viewTd: %s' % view_cell)
            return None
        return self.download_order(relpath, order_date, metainfo, onclick)
示例#8
0
文件: cci.py 项目: hargup/judis-re
    def get_meta_info(self, tr, baseurl):
        """Build one metadata dict per link found in a table row.

        Args:
            tr: a row element exposing ``findAll``.
            baseurl: base URL against which each ``href`` is resolved.

        Returns:
            list: one dict per ``<a>`` with an ``href``; each is a copy of
            the row's metadata plus ``'href'`` and ``'url'``. Empty when the
            cells yield no metadata or the row has no usable link.
        """
        metainfo = {}
        # Cell positions are 1-based and advance for empty cells too.
        for i, td in enumerate(tr.findAll('td'), 1):
            value = utils.get_tag_contents(td)
            if not value:
                continue
            if i == 1:
                metainfo[self.CASENO] = value
            elif i == 2:
                pet, res = utils.get_petitioner_respondent(value)
                # Fall back to the whole cell text when no petitioner
                # could be split out.
                metainfo[self.PETITIONER] = pet if pet else value

                if res:
                    metainfo[self.RESPONDENT] = res
            elif i in (3, 4):
                dateobj = utils.datestr_to_obj(value)
                if dateobj:
                    metainfo[self.DATE] = dateobj

        if self.DATE not in metainfo:
            self.logger.info(u'No date found %s' % metainfo)

        ms = []
        if metainfo:
            self.logger.debug(u'metainfo: %s' % metainfo)
            for link in tr.findAll('a'):
                href = link.get('href')
                if href:
                    m = metainfo.copy()
                    m['href'] = href
                    # NOTE(review): urllib.basejoin exists only on Python 2;
                    # kept because the rest of the file targets Python 2.
                    m['url'] = urllib.basejoin(baseurl, href)
                    ms.append(m)
        return ms
示例#9
0
    def get_meta_info(self, tr, baseurl):
        """Build one metadata dict per link found in a table row.

        Args:
            tr: a row element exposing ``findAll``.
            baseurl: base URL against which each ``href`` is resolved.

        Returns:
            list: one dict per ``<a>`` with an ``href``; each is a copy of
            the row's metadata plus ``'href'`` and ``'url'``. Empty when the
            cells yield no metadata or the row has no usable link.
        """
        metainfo = {}
        # Cell positions are 1-based and advance for empty cells too.
        for i, td in enumerate(tr.findAll('td'), 1):
            value = utils.get_tag_contents(td)
            if not value:
                continue
            if i == 1:
                metainfo[self.CASENO] = value
            elif i == 2:
                pet, res = utils.get_petitioner_respondent(value)
                # Fall back to the whole cell text when no petitioner
                # could be split out.
                metainfo[self.PETITIONER] = pet if pet else value

                if res:
                    metainfo[self.RESPONDENT] = res
            elif i in (3, 4):
                dateobj = utils.datestr_to_obj(value)
                if dateobj:
                    metainfo[self.DATE] = dateobj

        if self.DATE not in metainfo:
            self.logger.info(u'No date found %s' % metainfo)

        ms = []
        if metainfo:
            self.logger.debug(u'metainfo: %s' % metainfo)
            for link in tr.findAll('a'):
                href = link.get('href')
                if href:
                    m = metainfo.copy()
                    m['href'] = href
                    # NOTE(review): urllib.basejoin exists only on Python 2;
                    # kept because the rest of the file targets Python 2.
                    m['url'] = urllib.basejoin(baseurl, href)
                    ms.append(m)
        return ms