Example #1
    def sort(self, pages):
        # Order candidate page URLs by the page number that follows the
        # shared prefix. Requires a module-level `import re`; url2filetype
        # is a project helper.
        file_type = url2filetype(self.url) or ''

        # Strip the extension, and also a trailing '_N'/'-N' page marker if
        # the URL has one, to recover the prefix shared by every page.
        prefix = self.url[:-len(file_type) - 1]
        if re.match(r'.*[_\-]\d$', prefix):
            prefix = self.url[:-len(file_type) - 3]
        prefix_len = len(prefix)

        res = {}
        for url in pages:
            num = re.search(r'\d+', url[prefix_len:-len(file_type) - 1])
            res[url] = int(num.group(0)) if num else 0

        return [x[0] for x in sorted(res.items(), key=lambda x: x[1])]
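The snippet depends on the project helper url2filetype, which is not shown here. A minimal standalone sketch of the same logic, assuming url2filetype simply returns the extension after the last dot (a hypothetical stand-in, not the project's actual implementation):

import re

def url2filetype(url):
    # Hypothetical stand-in: extension after the last dot, or None.
    m = re.search(r'\.(\w+)$', url.split('?')[0].split('#')[0])
    return m.group(1) if m else None

def sort_pages(base_url, pages):
    # Same algorithm as sort() above, without the class context.
    file_type = url2filetype(base_url) or ''
    prefix = base_url[:-len(file_type) - 1]
    if re.match(r'.*[_\-]\d$', prefix):
        prefix = base_url[:-len(file_type) - 3]
    prefix_len = len(prefix)
    res = {}
    for url in pages:
        num = re.search(r'\d+', url[prefix_len:-len(file_type) - 1])
        res[url] = int(num.group(0)) if num else 0
    return [x[0] for x in sorted(res.items(), key=lambda x: x[1])]

pages = ['http://example.com/post_3.html',
         'http://example.com/post.html',
         'http://example.com/post_2.html']
print(sort_pages('http://example.com/post.html', pages))
# ['http://example.com/post.html', 'http://example.com/post_2.html',
#  'http://example.com/post_3.html']

Pages without a trailing number default to 0, which places the unnumbered first page ahead of _2, _3, and so on.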
Example #2
File: clean.py Project: dotajin/haoku-open
    def clean_bads(self):
        # Drop interactive elements that never belong in extracted content.
        for node in tags(self.doc, 'form', 'iframe', 'textarea', 'input'):
            if node != self.doc:
                self.drop(node)

        image_exts = ['jpg', 'jpeg', 'png', 'gif', 'bmp']
        for node in tags(self.doc, 'img', 'a'):
            if node.tag == 'img':
                width = to_int(node.get('width'))
                height = to_int(node.get('height'))
                src = node.get('src', '')
                # Drop images that are not absolute http URLs, look like
                # theme assets, have a non-image extension, or are too small
                # to be article illustrations.
                too_small = (width is not None and height is not None
                             and ((width < 200 and height < 160)
                                  or width < 160 or height < 40))
                if (not src.startswith('http://')
                        or 'themes' in src
                        or (url2filetype(src) or '').lower() not in image_exts
                        or too_small):
                    self.drop(node)
            elif node.tag == 'a' and not node.get('href', '').startswith('http://'):
                # Links must also be absolute http URLs to survive.
                self.drop(node)
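clean_bads leans on two helpers, tags and to_int, that this file does not show. A minimal sketch of plausible implementations, assuming tags iterates the named elements of the tree and to_int pulls an integer out of a width/height attribute (both are assumptions about haoku-open's helpers, not its actual code):

import re
from xml.etree import ElementTree as ET

def tags(doc, *names):
    # Assumed behavior: yield every descendant whose tag is one of names.
    for name in names:
        for node in doc.iter(name):
            yield node

def to_int(value):
    # Assumed behavior: first integer in '200', '200px', etc.; None if absent.
    if value is None:
        return None
    m = re.search(r'\d+', value)
    return int(m.group(0)) if m else None

doc = ET.fromstring('<div><img width="120px" /><form /></div>')
print([n.tag for n in tags(doc, 'img', 'form')])  # ['img', 'form']
print(to_int(doc.find('img').get('width')))       # 120

The `node != self.doc` guard in clean_bads protects against dropping the root element when the document itself happens to be one of the targeted tags.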
Example #3
    def get_pages(self):
        # Collect candidate pagination URLs that share this article's prefix.
        file_type = url2filetype(self.url)
        if not file_type:
            return []
        pages = {self.url}
        prefix = self.url[:-len(file_type) - 1]
        if re.match(r'.*[_\-]\d$', prefix):
            prefix = self.url[:-len(file_type) - 3]
        prefix_len = len(prefix)

        for node in self.doc.iter('a'):
            href = node.get('href').strip() if node.get('href') else None
            if href and len(href) > prefix_len + 2 \
                    and href[:prefix_len] == prefix:
                # Normalize away fragments and query strings before adding.
                href = href.split('#')[0].split('?')[0]
                pages.add(href)

        if len(pages) > 1:
            return list(pages)
        return []
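The prefix computation is easiest to see on a concrete URL. A short trace, using a hypothetical paginated article URL:

import re

url = 'http://example.com/news/story_2.html'  # hypothetical example
file_type = 'html'                            # what url2filetype would return

prefix = url[:-len(file_type) - 1]            # 'http://example.com/news/story_2'
if re.match(r'.*[_\-]\d$', prefix):
    # A trailing '_2' marks this as page 2, so cut the marker off too.
    prefix = url[:-len(file_type) - 3]
print(prefix)                                 # 'http://example.com/news/story'

Note that the regex only recognizes a single-digit marker preceded by '_' or '-': a URL ending in _10.html keeps the whole _10 in its prefix, so its sibling pages would not match. Candidate links then only need to be longer than the prefix plus two characters and share it verbatim.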
Example #4
    def parse(self):
        # Same prefix scan as get_pages, but reads from self.article and
        # wraps the result for the extractor pipeline as {'urls': [...]}.
        file_type = url2filetype(self.article.url)
        if not file_type:
            return {'urls': []}
        pages = {self.article.url}
        prefix = self.article.url[:-len(file_type) - 1]
        if re.match(r'.*[_\-]\d$', prefix):
            prefix = self.article.url[:-len(file_type) - 3]
        prefix_len = len(prefix)

        for node in self.article.doc.iter('a'):
            href = node.get('href').strip() if node.get('href') else None
            if href and len(href) > prefix_len + 2 \
                    and href[:prefix_len] == prefix:
                href = href.split('#')[0].split('?')[0]
                pages.add(href)

        pages = list(pages)
        if len(pages) == 1:
            pages = []
        return {'urls': pages}
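parse always returns a dict with a 'urls' key, so callers never have to special-case an article with no pagination. A self-contained sketch of how a caller might drive it, with Article, PagesParser, and url2filetype as hypothetical stand-in names for illustration:

import re
from xml.etree import ElementTree as ET

def url2filetype(url):
    # Hypothetical stand-in, as in the earlier sketch.
    m = re.search(r'\.(\w+)$', url)
    return m.group(1) if m else None

class Article:
    # Hypothetical stand-in carrying the two fields parse() reads.
    def __init__(self, url, html_text):
        self.url = url
        self.doc = ET.fromstring(html_text)

class PagesParser:
    # Hypothetical host class; parse() is the method shown above.
    def __init__(self, article):
        self.article = article

    def parse(self):
        file_type = url2filetype(self.article.url)
        if not file_type:
            return {'urls': []}
        pages = {self.article.url}
        prefix = self.article.url[:-len(file_type) - 1]
        if re.match(r'.*[_\-]\d$', prefix):
            prefix = self.article.url[:-len(file_type) - 3]
        prefix_len = len(prefix)
        for node in self.article.doc.iter('a'):
            href = node.get('href').strip() if node.get('href') else None
            if href and len(href) > prefix_len + 2 and href[:prefix_len] == prefix:
                pages.add(href.split('#')[0].split('?')[0])
        pages = list(pages)
        if len(pages) == 1:
            pages = []
        return {'urls': pages}

html_text = ('<div>'
             '<a href="http://e.com/story_2.html">2</a>'
             '<a href="http://e.com/story_3.html">3</a>'
             '</div>')
print(PagesParser(Article('http://e.com/story.html', html_text)).parse())
# {'urls': [...the three story URLs, in set order...]}

The single-page case collapses to {'urls': []}, matching the empty-list return of get_pages.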