def do_parse(ruthless): try: html = deepcopy(self.html) for i in utils.tags(html, 'script', 'style'): i.drop_tree() for i in utils.tags(html, 'body'): i.set('id', 'readabilityBody') if ruthless: html = utils.remove_unlikely_candidates(html) html = utils.transform_misused_divs_into_paragraphs(html) candidates = utils.score_paragraphs(html) # first try to get an article article_node = utils.get_article_element(html) if article_node: best_candidate = article_node else: best_candidate = select_best_candidate(candidates) if best_candidate: # TODO: there was some logic here about retrying if the article wasn't long enough return utils.sanitize(utils.get_article(candidates, best_candidate), candidates) else: return None except StandardError, e: log.exception('error getting summary: ') raise Unparseable(str(e)), None, sys.exc_info()[2]
def clean_bads(self):
    """Drop form widgets, unsuitable images and non-http links from ``self.doc``.

    Two passes are made over the document:

    1. Every ``form``, ``iframe``, ``textarea`` and ``input`` element other
       than ``self.doc`` itself is dropped.
    2. Every ``img`` and ``a`` element is inspected. An image is dropped
       when its ``src`` does not start with ``http://``, contains
       ``'themes'``, has a file type outside jpg/jpeg/png/gif/bmp, or --
       when both dimensions are known -- is considered too small. A link
       is dropped when its ``href`` does not start with ``http://``.
    """
    form_like = ('form', 'iframe', 'textarea', 'input')
    for element in tags(self.doc, *form_like):
        if element != self.doc:
            self.drop(element)

    image_types = ('jpg', 'jpeg', 'png', 'gif', 'bmp')
    for element in tags(self.doc, 'img', 'a'):
        if element.tag == 'img':
            # presumably to_int yields None for a missing/unparseable
            # attribute -- verify against its definition.
            img_width = to_int(element.get('width'))
            img_height = to_int(element.get('height'))
            source = element.get('src', '')

            # NOTE(review): only the literal 'http://' prefix is accepted,
            # so 'https://' sources are dropped too -- confirm intended.
            if not source.startswith('http://') or 'themes' in source:
                unwanted = True
            elif (url2filetype(source) or '').lower() not in image_types:
                unwanted = True
            elif img_width is None or img_height is None:
                # Size is only judged when both dimensions are known.
                unwanted = False
            else:
                unwanted = (
                    img_width < 200 and img_height < 160
                    or img_width < 160
                    or img_height < 40
                )

            if unwanted:
                self.drop(element)
        elif element.tag == 'a' \
                and not element.get('href', '').startswith('http://'):
            self.drop(element)
def main():
    """Render the site's main page from the 'main.html' template.

    The tag names returned by ``utils.tags()`` are resolved through
    ``db.get_tags`` and the resulting ids are used to query the deadlines,
    upcoming and recent listings. The template receives ``tags=None``
    when no tag names are present.
    """
    # What tags are we using to filter results?
    selected = utils.tags()
    tag_ids = [tag_id for (tag_id, _name) in db.get_tags(selected)]

    # The template gets None rather than an empty collection.
    shown_tags = selected if len(selected) != 0 else None

    # Render from the 'main' template.
    return flask.render_template(
        'main.html',
        tags=shown_tags,
        deadlines=db.deadlines(tag_ids),
        upcoming=db.upcoming(tag_ids),
        recent=db.recent(tag_ids),
        utils=utils,
    )
def summary(self):
    """Return the cleaned output for ``self.doc``, caching it on ``self.output``.

    Returns:
        The cached ``self.output`` if a previous call already set it;
        ``''`` when ``self.doc`` is ``None`` (nothing is cached in that
        case); otherwise the value of ``self.clean()`` after the document
        has been pruned in place.
    """
    # Serve the cached result when one exists.
    if hasattr(self, 'output'):
        return self.output
    if self.doc is None:
        return ''

    # Remove form-like elements and tables, but never the root itself.
    for node in tags(self.doc, 'form', 'iframe', 'textarea', 'table', 'input'):
        if node != self.doc:
            node.drop_tree()

    # Prefer the 'data-original' / 'original' attributes as the real image
    # source ('original' wins when both are set), then discard images whose
    # final src points into /static/ or contains '.gif'.
    static_or_gif = re.compile(r'\/static\/|\.gif')
    for img in self.doc.xpath('.//img'):
        if img.get('data-original'):
            img.set('src', img.get('data-original'))
        if img.get('original'):
            img.set('src', img.get('original'))
        if static_or_gif.search(img.get('src', '')):
            self.drop(img)

    # Drop links (or, for links with children, the matching children)
    # whose text matches the "click" pattern.
    # NOTE(review): nodes are dropped while self.doc.iter('a') is still
    # being consumed -- confirm self.drop() does not cut the walk short.
    click = re.compile(u'点击|>>')
    for node in self.doc.iter('a'):
        if not node.getchildren():
            if click.search(node.text_content()):
                self.drop(node)
        else:
            for child in node.getchildren():
                if click.search(child.text or ''):
                    self.drop(child)

    # Trim leading children. The meaning of is_need_drop's results is
    # inferred from how they are used here ('img' -> hold the node back,
    # False -> stop trimming, anything else -> drop) -- verify against its
    # definition. Held-back images are dropped only once a later sibling
    # is dropped; images directly before the first kept node survive.
    imgs = []
    for child in self.doc.getchildren():
        res = self.is_need_drop(child, not imgs)
        if res == 'img':
            imgs.append(child)
            continue
        elif res == False:
            break
        self.drop(child)
        for img in imgs:
            self.drop(img)
        imgs = []

    # NOTE: a mirrored pass over the trailing children and a duplicate-text
    # filter based on self.texts used to be sketched here as commented-out
    # code; neither was active.

    self.output = self.clean()
    return self.output
def clean_tags(self):
    """Drop every form, iframe, textarea and input element from ``self.doc``.

    Elements are located with ``tags()`` and removed through
    ``self.drop()``; ``self.doc`` itself is never dropped.
    """
    form_like = ('form', 'iframe', 'textarea', 'input')
    for element in tags(self.doc, *form_like):
        if element != self.doc:
            self.drop(element)
def prefs():
    """Render the preferences page from the 'prefs.html' template.

    The template receives ``all_tags`` (the result of ``db.get_tags()``
    called without arguments) and ``current_tags`` (the result of
    ``utils.tags()``).
    """
    every_tag = db.get_tags()
    selected = utils.tags()
    return flask.render_template(
        'prefs.html',
        all_tags=every_tag,
        current_tags=selected,
    )