示例#1
0
    def parsebbsDomDetail(self, dom_block_str, config):
        """Extract at most ten row records from one BBS DOM block.

        The block string is re-parsed with ``CustomizedSoup`` and matched
        against ``config['dom_row_pattern']`` through a ``Scraper``.  Each
        match is extracted, handed to ``self.fixitem(value, config)`` and has
        its ``'title'`` entry passed through ``unescape``.

        Args:
            dom_block_str: markup of the block to parse.
            config: subscriptable settings object; ``'dom_row_pattern'`` is
                read here, ``'locate'`` is read only when logging a failure,
                and the whole object is forwarded to ``fixitem``.

        Returns:
            A list of the extracted rows in match order, capped at ten.

        Raises:
            Exception: any failure is logged and then re-raised unchanged.
        """
        max_rows = 10
        parsed_result = []
        try:
            dom_row_pattern = config['dom_row_pattern']
            # The block string is turned back into a DOM here.  The overall
            # string -> dom -> block dom -> block string -> block dom ->
            # row dom -> row string round trip is wasteful and should be
            # revised.
            doc = CustomizedSoup(dom_block_str)
            scraper = Scraper(dom_row_pattern)

            for item in scraper.match(doc):
                value = scraper.extract(item)
                # fixitem's return value is ignored, so whatever it does must
                # happen through `value` itself.
                # NOTE(review): presumably this is where links get prefixed
                # with config['root'] -- confirm against fixitem.
                self.fixitem(value, config)
                value['title'] = unescape(value['title'])

                parsed_result.append(value)
                # Stop as soon as the cap is reached.
                if len(parsed_result) >= max_rows:
                    break
        except Exception:
            logging.error("failed to parse bbs in Domdetail ;schoolname= %s",
                          config['locate'])
            raise
        # Hand the collected rows back; previously they were built and then
        # discarded, so callers always received None.
        return parsed_result
示例#2
0
 def __init__(self, pattern):
     """Parse *pattern* and keep the first entry of its ``contents``.

     The pattern is run through ``CustomizedSoup``; the first element of the
     resulting object's ``contents`` is stored as ``self.pattern``.
     """
     parsed_pattern = CustomizedSoup(pattern)
     first_node = parsed_pattern.contents[0]
     self.pattern = first_node