def parsebbsDomDetail(self, dom_block_str, config):
    """Extract row records from a block of BBS markup.

    The block string is parsed with ``CustomizedSoup``, matched against
    ``config['dom_row_pattern']`` via a ``Scraper``, and every match is
    extracted, passed through ``self.fixitem`` and has its ``'title'``
    entry run through ``unescape``.  At most the first 10 matches are
    processed.

    Args:
        dom_block_str: Markup of the block that contains the rows.
        config: Mapping that supplies ``'dom_row_pattern'`` (the row
            pattern handed to ``Scraper``) and ``'locate'`` (used only in
            the error log message).

    Returns:
        A list with the extracted value of each processed row, in match
        order.

    Raises:
        Exception: Any error raised while parsing is logged and then
            re-raised unchanged.
    """
    max_rows = 10  # only the first rows of a block are kept
    parsed_result = []
    try:
        dom_row_pattern = config['dom_row_pattern']
        # TODO: the block string is turned back into a DOM here; the chain
        # string -> dom -> block dom -> block string -> block dom -> row dom
        # -> row string is roundabout and should be revised.
        doc = CustomizedSoup(dom_block_str)
        scraper = Scraper(dom_row_pattern)  # scrapes the individual rows
        for item in scraper.match(doc):
            value = scraper.extract(item)
            self.fixitem(value, config)
            value['title'] = unescape(value['title'])  # safe title
            parsed_result.append(value)
            if len(parsed_result) >= max_rows:
                break
    except Exception:
        logging.error("failed to parse bbs in Domdetail ;schoolname= %s", config['locate'])
        raise
    # The rows were previously collected but never handed back to the caller.
    return parsed_result
def __init__(self, pattern):
    """Parse *pattern* and keep its first top-level node as ``self.pattern``.

    Args:
        pattern: Pattern source handed to ``CustomizedSoup``.
    """
    parsed_pattern = CustomizedSoup(pattern)
    first_node = parsed_pattern.contents[0]
    self.pattern = first_node