예제 #1
0
        def do_parse(ruthless):
            try:
                html = deepcopy(self.html)
                for i in utils.tags(html, 'script', 'style'):
                    i.drop_tree()
                for i in utils.tags(html, 'body'):
                    i.set('id', 'readabilityBody')
                if ruthless:
                    html = utils.remove_unlikely_candidates(html)
                html = utils.transform_misused_divs_into_paragraphs(html)

                candidates = utils.score_paragraphs(html)

                # first try to get an article
                article_node = utils.get_article_element(html)
                if article_node:
                    best_candidate = article_node
                else:
                    best_candidate = select_best_candidate(candidates)

                if best_candidate:
                    # TODO: there was some logic here about retrying if the article wasn't long enough
                    return utils.sanitize(utils.get_article(candidates, best_candidate), candidates)
                else:
                    return None
            except StandardError, e:
                log.exception('error getting summary: ')
                raise Unparseable(str(e)), None, sys.exc_info()[2]
예제 #2
0
파일: clean.py 프로젝트: dotajin/haoku-open
	def clean_bads(self):
		"""Drop form-like nodes, unwanted images and non-http links.

		Every form/iframe/textarea/input node yielded by ``tags`` (except
		``self.doc`` itself) is dropped.  An ``img`` is dropped when its
		``src`` does not start with ``http://``, contains ``themes``, has a
		file type outside jpg/jpeg/png/gif/bmp, or when both dimensions are
		known and fall under the size limits below.  An ``a`` is dropped when
		its ``href`` does not start with ``http://``.
		"""
		for element in tags(self.doc, 'form', 'iframe', 'textarea', 'input'):
			if element != self.doc:
				self.drop(element)

		image_types = 'jpg|jpeg|png|gif|bmp'.split('|')
		for element in tags(self.doc, 'img', 'a'):
			if element.tag == 'img':
				width = to_int(element.get('width'))
				height = to_int(element.get('height'))
				src = element.get('src', '')

				# Checks run in this order so later ones are skipped as soon
				# as an earlier one already rejects the image.
				if not src.startswith('http://'):
					unwanted = True
				elif 'themes' in src:
					unwanted = True
				elif (url2filetype(src) or '').lower() not in image_types:
					unwanted = True
				elif width is None or height is None:
					# Size is only judged when both dimensions are known.
					unwanted = False
				else:
					unwanted = (width < 200 and height < 160) \
						or width < 160 or height < 40

				if unwanted:
					self.drop(element)
			elif element.tag == 'a' and not element.get('href', '').startswith('http://'):
				self.drop(element)
예제 #3
0
파일: clean.py 프로젝트: dotajin/haoku-open
    def clean_bads(self):
        """Drop form-like nodes, unwanted images and non-http links.

        Form/iframe/textarea/input nodes yielded by ``tags`` are dropped
        unless the node is ``self.doc`` itself.  Images are then judged by
        ``is_bad_image`` below, and anchors are dropped when their ``href``
        does not start with ``http://``.
        """
        for element in tags(self.doc, 'form', 'iframe', 'textarea', 'input'):
            if element != self.doc:
                self.drop(element)

        allowed_types = 'jpg|jpeg|png|gif|bmp'.split('|')

        def is_bad_image(img):
            # Reject on source first; size is only judged when both the
            # width and the height are known.
            width = to_int(img.get('width'))
            height = to_int(img.get('height'))
            src = img.get('src', '')
            if not src.startswith('http://') or 'themes' in src:
                return True
            if (url2filetype(src) or '').lower() not in allowed_types:
                return True
            if width is None or height is None:
                return False
            return (width < 200 and height < 160) or width < 160 or height < 40

        for element in tags(self.doc, 'img', 'a'):
            if element.tag == 'img':
                if is_bad_image(element):
                    self.drop(element)
            elif element.tag == 'a':
                href = element.get('href', '')
                if not href.startswith('http://'):
                    self.drop(element)
예제 #4
0
def main():
	""" Render the 'main' template, filtered by the tags from utils.tags(). """

	# Tag names used to filter results, and the ids db.get_tags() pairs with them.
	tag_names = utils.tags()
	current_tags = [ tag_id for (tag_id, _name) in db.get_tags(tag_names) ]

	# The template is handed None instead of an empty collection of names.
	tag_names = tag_names if len(tag_names) != 0 else None

	# Query in the same order the template arguments are listed.
	deadlines = db.deadlines(current_tags)
	upcoming = db.upcoming(current_tags)
	recent = db.recent(current_tags)

	return flask.render_template('main.html',
			tags = tag_names,
			deadlines = deadlines,
			upcoming = upcoming,
			recent = recent,
			utils = utils)
예제 #5
0
    def summary(self):
        """Prune ``self.doc`` and return the cleaned result.

        The result is cached on ``self.output``; later calls return the
        cached value.  Returns ``''`` (without caching) when ``self.doc`` is
        ``None``.

        Steps, in order:
          1. drop form/iframe/textarea/table/input nodes (never ``self.doc``);
          2. point each ``img`` at its ``data-original`` / ``original``
             attribute when present, then drop images whose ``src`` contains
             ``/static/`` or ``.gif``;
          3. drop anchors (or anchor children) whose text matches the
             "click" markers;
          4. walk the children of ``self.doc`` from the start, dropping
             those ``self.is_need_drop`` rejects, until it returns ``False``;
          5. store and return ``self.clean()``.
        """
        if hasattr(self, 'output'):
            return self.output

        if self.doc is None:
            return ''

        for node in tags(self.doc, 'form', 'iframe', 'textarea', 'table',
                         'input'):
            if node != self.doc:
                node.drop_tree()

        # Raw string: '\/' and '\.' are regex escapes, not string escapes.
        bad_src = re.compile(r'\/static\/|\.gif')
        for img in self.doc.xpath('.//img'):
            # Attributes are applied in this order, so 'original' wins when
            # both are present.
            if img.get('data-original'):
                img.set('src', img.get('data-original'))
            if img.get('original'):
                img.set('src', img.get('original'))
            if bad_src.search(img.get('src', '')):
                self.drop(img)

        click = re.compile(u'点击|>>')
        # Snapshot the anchors before dropping anything: the result of
        # iterating a tree lazily while its structure changes is undefined.
        for node in list(self.doc.iter('a')):
            children = node.getchildren()
            if not children:
                if click.search(node.text_content()):
                    self.drop(node)
            else:
                for child in children:
                    if click.search(child.text or ''):
                        self.drop(child)

        # Images are held back until the next non-image child is judged: they
        # are dropped together with it, or kept if the scan stops there.
        imgs = []
        for child in self.doc.getchildren():
            res = self.is_need_drop(child, not imgs)
            if res == 'img':
                imgs.append(child)
                continue
            elif res == False:
                break

            self.drop(child)
            for img in imgs:
                self.drop(img)
            imgs = []

        self.output = self.clean()
        return self.output
예제 #6
0
	def clean_tags(self):
		"""Call ``self.drop`` on each form/iframe/textarea/input node.

		Nodes come from ``tags(self.doc, ...)``; ``self.doc`` itself is
		never dropped.
		"""
		# Lazy filter: each node is dropped before the next one is fetched.
		droppable = (
			found
			for found in tags(self.doc, 'form', 'iframe', 'textarea', 'input')
			if found != self.doc
		)
		for found in droppable:
			self.drop(found)
예제 #7
0
	def clean_tags(self):
		"""Drop the form, iframe, textarea and input nodes of ``self.doc``.

		The nodes are whatever ``tags`` yields for those tag names;
		``self.doc`` itself is skipped.
		"""
		unwanted_tags = ('form', 'iframe', 'textarea', 'input')
		for element in tags(self.doc, *unwanted_tags):
			if element != self.doc:
				self.drop(element)
예제 #8
0
	def summary(self):
		"""Prune ``self.doc`` and return the cleaned result.

		The result is cached on ``self.output``; later calls return the
		cached value.  Returns ``''`` (without caching) when ``self.doc`` is
		``None``.

		Steps, in order:
		  1. drop form/iframe/textarea/table/input nodes (never ``self.doc``);
		  2. point each ``img`` at its ``data-original`` / ``original``
		     attribute when present, then drop images whose ``src`` contains
		     ``/static/`` or ``.gif``;
		  3. drop anchors (or anchor children) whose text matches the
		     "click" markers;
		  4. walk the children of ``self.doc`` from the start, dropping
		     those ``self.is_need_drop`` rejects, until it returns ``False``;
		  5. store and return ``self.clean()``.
		"""
		if hasattr(self, 'output'):
			return self.output

		if self.doc is None:
			return ''

		for node in tags(self.doc, 'form', 'iframe', 'textarea', 'table', 'input'):
			if node != self.doc:
				node.drop_tree()

		# Raw string: '\/' and '\.' are regex escapes, not string escapes.
		bad_src = re.compile(r'\/static\/|\.gif')
		for img in self.doc.xpath('.//img'):
			# Attributes are applied in this order, so 'original' wins when
			# both are present.
			if img.get('data-original'):
				img.set('src', img.get('data-original'))
			if img.get('original'):
				img.set('src', img.get('original'))
			if bad_src.search(img.get('src', '')):
				self.drop(img)

		click = re.compile(u'点击|>>')
		# Snapshot the anchors before dropping anything: the result of
		# iterating a tree lazily while its structure changes is undefined.
		for node in list(self.doc.iter('a')):
			children = node.getchildren()
			if not children:
				if click.search(node.text_content()):
					self.drop(node)
			else:
				for child in children:
					if click.search(child.text or ''):
						self.drop(child)

		# Images are held back until the next non-image child is judged: they
		# are dropped together with it, or kept if the scan stops there.
		imgs = []
		for child in self.doc.getchildren():
			res = self.is_need_drop(child, not imgs)
			if res == 'img':
				imgs.append(child)
				continue
			elif res == False:
				break

			self.drop(child)
			for img in imgs:
				self.drop(img)
			imgs = []

		self.output = self.clean()
		return self.output
예제 #9
0
def prefs():
	""" Render the 'prefs.html' template that asks the user for preferences. """
	# Fetched in the same order the template arguments are passed.
	all_tags = db.get_tags()
	current_tags = utils.tags()
	return flask.render_template('prefs.html',
			all_tags = all_tags,
			current_tags = current_tags
		)