Python html2markup示例，reader.utils.html2markup Python示例

示例#1

0

显示文件

文件： builtin.py 项目： duvholt/Hacker-News-Reader

    def userpage(self, username):
        """Scrape the profile page of ``username`` and save it as a ``UserInfo`` row.

        The profile rows are read from a nested table of the fetched page; when
        ``self.is_event`` is true for the table at index 1, the table at index 2
        is used instead. Rows 1-4 of that table supply the creation time, karma,
        average and "about" text respectively.

        Raises:
            CouldNotParse: if the page does not have the expected table layout.
        """
        soup = fetch.userpage(username=username)
        try:
            tables = soup.html.body.table.find_all('table')
            i = 1
            if self.is_event(tables[1]):
                i += 1
            userdata = tables[i].find_all('tr')
        except (AttributeError, IndexError):
            # AttributeError: a tag on the path is missing, so the lookup hit None.
            # IndexError: the page has fewer nested tables than expected.
            raise CouldNotParse('Couldn\'t get userdata for ' + username)
        created = utils.parse_time(userdata[1].find_all('td')[1].decode_contents())
        try:
            avg = Decimal(userdata[3].find_all('td')[1].decode_contents())
        except InvalidOperation:
            # The cell did not contain a valid decimal number; fall back to 0.
            avg = 0
        about_cell = userdata[4].find_all('td')[1]
        # If user is logged in there will be an editable textarea instead of just text
        if about_cell.textarea:
            about = about_cell.textarea.decode_contents()
        else:
            about = utils.html2markup(about_cell.decode_contents())

        UserInfo(
            username=username,
            created=created,
            karma=int(userdata[2].find_all('td')[1].decode_contents(), 10),
            avg=avg,
            about=about,
            cache=timezone.now()
        ).save()

示例#2

0

显示文件

文件： firebase.py 项目： duvholt/Hacker-News-Reader

 def traverse_comments(self, comment, parent_object=None):
     """Save ``comment`` and, recursively, every comment listed under its 'kids'.

     Args:
         comment: item dict. 'id', 'by', 'text' and 'time' are read for every
             non-deleted comment; 'deleted', 'dead' and 'kids' are optional.
             'parent' is read only when no ``parent_object`` is given and
             ``self.story`` is falsy.
         parent_object: object stored as the parent of the saved comment.

     Returns:
         The number of kid ids walked beneath ``comment``. A deleted comment
         returns 0 and its kids are not visited.
     """
     if not parent_object and not self.story:
         parent_object = self.parent(comment['parent'])
     if 'deleted' in comment:
         # Nothing to store, so do not even build a model instance.
         return 0
     hn_comment = models.HNComments()
     if 'dead' in comment:
         hn_comment.dead = comment['dead']
     hn_comment.id = comment['id']
     hn_comment.username = comment['by']
     hn_comment.text = utils.html2markup(comment['text'])
     hn_comment.story_id = self.story_id
     hn_comment.parent = parent_object
     hn_comment.time = self.dateformat(comment['time'])
     hn_comment.cache = timezone.now()
     hn_comment.save()
     count = 0
     for comment_id in comment.get('kids', ()):
         # One for the child itself, plus everything found below it.
         count += 1
         comment_child = self.fetch.comments(comment_id)
         count += self.traverse_comments(comment_child, hn_comment)
     return count

示例#3

0

显示文件

文件： firebase.py 项目： duvholt/Hacker-News-Reader

 def traverse_comments(self, comment, parent_object=None):
     """Store ``comment`` and walk the comments referenced by its 'kids' list.

     Args:
         comment: item dict. 'id', 'by', 'text' and 'time' are read for every
             non-deleted comment; 'deleted', 'dead' and 'kids' are optional.
             'parent' is read only when no ``parent_object`` is given and
             ``self.story`` is falsy.
         parent_object: object stored as the parent of the saved comment.

     Returns:
         The number of kid ids walked beneath ``comment``. A deleted comment
         returns 0 and its kids are not visited.
     """
     if not parent_object and not self.story:
         parent_object = self.parent(comment['parent'])
     if 'deleted' in comment:
         # Deleted comments are skipped before any model instance is created.
         return 0
     hn_comment = models.HNComments()
     if 'dead' in comment:
         hn_comment.dead = comment['dead']
     hn_comment.id = comment['id']
     hn_comment.username = comment['by']
     hn_comment.text = utils.html2markup(comment['text'])
     hn_comment.story_id = self.story_id
     hn_comment.parent = parent_object
     hn_comment.time = self.dateformat(comment['time'])
     hn_comment.cache = timezone.now()
     hn_comment.save()
     count = 0
     for comment_id in comment.get('kids', ()):
         # Count the child, then add whatever its own subtree reports.
         count += 1
         comment_child = self.fetch.comments(comment_id)
         count += self.traverse_comments(comment_child, hn_comment)
     return count

示例#4

0

显示文件

    def userpage(self, username):
        """Parse the fetched profile page of ``username`` into a saved ``UserInfo``.

        The rows come from a nested table of the page: index 1 normally, index 2
        when ``self.is_event`` is true for the table at index 1. Rows 1-4 hold
        the creation time, karma, average and "about" text respectively.

        Raises:
            CouldNotParse: if the page does not have the expected table layout.
        """
        soup = fetch.userpage(username=username)
        try:
            tables = soup.html.body.table.find_all('table')
            i = 1
            if self.is_event(tables[1]):
                i += 1
            userdata = tables[i].find_all('tr')
        except (AttributeError, IndexError):
            # AttributeError: a tag on the path is missing, so the lookup hit None.
            # IndexError: the page has fewer nested tables than expected.
            raise CouldNotParse('Couldn\'t get userdata for ' + username)
        created = utils.parse_time(
            userdata[1].find_all('td')[1].decode_contents())
        try:
            avg = Decimal(userdata[3].find_all('td')[1].decode_contents())
        except InvalidOperation:
            # The cell did not contain a valid decimal number; fall back to 0.
            avg = 0
        about_cell = userdata[4].find_all('td')[1]
        # If user is logged in there will be an editable textarea instead of just text
        if about_cell.textarea:
            about = about_cell.textarea.decode_contents()
        else:
            about = utils.html2markup(about_cell.decode_contents())

        UserInfo(username=username,
                 created=created,
                 karma=int(userdata[2].find_all('td')[1].decode_contents(),
                           10),
                 avg=avg,
                 about=about,
                 cache=timezone.now()).save()

示例#5

0

显示文件

文件： algolia.py 项目： duvholt/Hacker-News-Reader

 def poll_info(self, polls):
     """Save one ``models.Poll`` row for each entry of ``polls``.

     Each entry is read as a mapping with the keys 'id', 'created_at_i',
     'text', 'points' and 'parent_id'.
     """
     for entry in polls:
         record = models.Poll(id=entry['id'])
         # Timestamp and label go through the same helpers used elsewhere.
         record.time = self.dateformat(entry['created_at_i'])
         record.name = utils.html2markup(entry['text'])
         record.score = entry['points']
         record.story_id = entry['parent_id']
         record.save()

示例#6

0

显示文件

文件： firebase.py 项目： duvholt/Hacker-News-Reader

 def poll_info(self, polls):
     """Fetch every element of ``polls`` and save it as a ``models.Poll`` row.

     Each element is passed to ``self.fetch.comments``; the returned mapping
     is read for the keys 'id', 'time', 'text', 'score' and 'parent'.
     """
     for poll_ref in polls:
         item = self.fetch.comments(poll_ref)
         record = models.Poll(id=item['id'])
         record.time = self.dateformat(item['time'])
         record.name = utils.html2markup(item['text'])
         record.score = item['score']
         record.story_id = item['parent']
         record.save()

示例#7

0

显示文件

文件： firebase.py 项目： duvholt/Hacker-News-Reader

 def poll_info(self, polls):
     """Resolve each element of ``polls`` and store it as a ``models.Poll``.

     ``self.fetch.comments`` is called once per element; the mapping it
     returns supplies 'id', 'time', 'text', 'score' and 'parent'.
     """
     for reference in polls:
         fetched = self.fetch.comments(reference)
         option = models.Poll(id=fetched['id'])
         option.time = self.dateformat(fetched['time'])
         option.name = utils.html2markup(fetched['text'])
         option.score = fetched['score']
         option.story_id = fetched['parent']
         option.save()

示例#8

0

显示文件

文件： firebase.py 项目： duvholt/Hacker-News-Reader

 def story_info(self, story):
     """Build ``self.story`` (a ``models.Stories``) from the ``story`` mapping.

     'title', 'by', 'time' and 'score' are required keys. 'text' and 'url'
     are optional: without text the selfpost fields are left untouched, and
     without a url an empty string is stored. The object is not saved here.
     """
     self.story = models.Stories()
     self.story.id = self.story_id
     self.story.cache = timezone.now()
     self.story.title = story['title']
     # Use .get() so a story without a 'text' key is handled the same way as
     # one without 'url', instead of raising KeyError.
     text = story.get('text')
     if text:
         self.story.selfpost = True
         self.story.selfpost_text = utils.html2markup(text)
     self.story.username = story['by']
     self.story.url = story.get('url', "")
     self.story.time = self.dateformat(story['time'])
     self.story.score = story['score']

示例#9

0

显示文件

文件： firebase.py 项目： duvholt/Hacker-News-Reader

 def story_info(self, story):
     """Fill a fresh ``models.Stories`` from ``story`` and keep it on ``self.story``.

     Required keys: 'title', 'by', 'time', 'score'. Optional keys: 'text'
     (when present and non-empty the story is flagged as a selfpost) and
     'url' (an empty string is stored when missing). Nothing is saved here.
     """
     self.story = models.Stories()
     self.story.id = self.story_id
     self.story.cache = timezone.now()
     self.story.title = story['title']
     # A missing 'text' key must not raise KeyError; 'url' is already treated
     # as optional below, so both are looked up with .get().
     text = story.get('text')
     if text:
         self.story.selfpost = True
         self.story.selfpost_text = utils.html2markup(text)
     self.story.username = story['by']
     self.story.url = story.get('url', "")
     self.story.time = self.dateformat(story['time'])
     self.story.score = story['score']

示例#10

0

显示文件

文件： algolia.py 项目： duvholt/Hacker-News-Reader

 def story_info(self, story):
     """Create ``self.story`` as a ``models.Stories`` filled from ``story``.

     Reads the keys 'title', 'text', 'author', 'url', 'created_at' and
     'points'. A ``None`` url is stored as an empty string. Nothing is saved.
     """
     record = models.Stories()
     self.story = record
     record.id = self.story_id
     record.cache = timezone.now()
     record.title = story['title']
     if story['text']:
         # Only stories that carry text are flagged as selfposts.
         record.selfpost = True
         record.selfpost_text = utils.html2markup(story['text'])
     record.username = story['author']
     link = story['url']
     if link is None:
         link = ""
     record.url = link
     record.time = self.dateformat(story['created_at'])
     record.score = story['points']

示例#11

0

显示文件

文件： firebase.py 项目： duvholt/Hacker-News-Reader

 def userpage(self, username):
     """Fetch ``username`` via ``self.fetch`` and save it as a ``models.UserInfo``.

     Returns without saving anything when the fetch result is falsy. 'id',
     'created' and 'karma' are required keys of the result; 'about' is
     optional and stored as ``None`` when missing or empty.
     """
     userpage = self.fetch.userpage(username)
     if not userpage:
         # Nothing came back for this username (presumably the user doesn't exist).
         return
     user = models.UserInfo()
     user.username = userpage['id']
     user.created = self.dateformat(userpage['created'])
     user.karma = userpage['karma']
     # user.avg = userpage['avg']
     # .get() keeps a profile without an 'about' key from raising KeyError.
     about = userpage.get('about')
     user.about = utils.html2markup(about) if about else None
     user.cache = timezone.now()
     user.save()

示例#12

0

显示文件

文件： firebase.py 项目： duvholt/Hacker-News-Reader

 def userpage(self, username):
     """Store the fetched profile of ``username`` as a ``models.UserInfo`` row.

     A falsy fetch result ends the call early with nothing saved. The keys
     'id', 'created' and 'karma' are required; 'about' may be absent or
     empty, in which case ``None`` is stored.
     """
     userpage = self.fetch.userpage(username)
     if not userpage:
         # Nothing came back for this username (presumably the user doesn't exist).
         return
     user = models.UserInfo()
     user.username = userpage['id']
     user.created = self.dateformat(userpage['created'])
     user.karma = userpage['karma']
     # user.avg = userpage['avg']
     # Look 'about' up with .get() so an absent key does not raise KeyError.
     about = userpage.get('about')
     user.about = utils.html2markup(about) if about else None
     user.cache = timezone.now()
     user.save()

示例#13

0

显示文件

文件： algolia.py 项目： duvholt/Hacker-News-Reader

 def userpage(self, username):
     """Fetch ``username`` through ``self.fetch`` and save it as a ``models.UserInfo``.

     Raises:
         utils.ShowAlert: when the response carries a 'message' key (its
             value becomes the alert text) or a 'status' key.
     """
     info = self.fetch.userpage(username)
     record = models.UserInfo()
     if 'message' in info:
         raise utils.ShowAlert(info['message'])
     if 'status' in info:
         raise utils.ShowAlert('Failed to retrieve user information')
     record.username = info['username']
     record.created = self.dateformat(info['created_at'])
     record.karma = info['karma']
     record.avg = info['avg']
     # An empty or missing description is stored as None rather than converted.
     record.about = utils.html2markup(info['about']) if info['about'] else None
     record.cache = timezone.now()
     record.save()

示例#14

0

显示文件

def userpage(username):
    """Scrape the profile page of ``username`` and save it as a ``UserInfo`` row.

    The rows are taken from the nested table at index 1 of the fetched page;
    rows 1-4 supply the creation time, karma, average and "about" text.

    Raises:
        CouldNotParse: if the page does not have the expected table layout.
    """
    soup = Fetch.userpage(username=username)
    try:
        userdata = soup.html.body.table.find_all('table')[1].find_all('tr')
    except (AttributeError, IndexError):
        # AttributeError: a tag on the path is missing, so the lookup hit None.
        # IndexError: the page has fewer nested tables than expected.
        raise CouldNotParse('Couldn\'t get userdata for ' + username)
    created = utils.parse_time(userdata[1].find_all('td')[1].decode_contents())
    try:
        avg = Decimal(userdata[3].find_all('td')[1].decode_contents())
    except InvalidOperation:
        # The cell did not contain a valid decimal number; fall back to 0.
        avg = 0
    UserInfo(username=username,
             created=created,
             karma=int(userdata[2].find_all('td')[1].decode_contents(), 10),
             avg=avg,
             about=utils.html2markup(
                 userdata[4].find_all('td')[1].decode_contents()),
             cache=timezone.now()).save()

示例#15

0

显示文件

文件： hnparse.py 项目： manpreetnarang/Hacker-News-Reader

def userpage(username):
	"""Scrape the profile page of ``username`` and save it as a ``UserInfo`` row.

	The rows are taken from the nested table at index 1 of the fetched page;
	rows 1-4 supply the creation time, karma, average and "about" text.

	Raises:
		CouldNotParse: if the page does not have the expected table layout.
	"""
	soup = Fetch.userpage(username=username)
	try:
		userdata = soup.html.body.table.find_all('table')[1].find_all('tr')
	except (AttributeError, IndexError):
		# AttributeError: a tag on the path is missing, so the lookup hit None.
		# IndexError: the page has fewer nested tables than expected.
		raise CouldNotParse('Couldn\'t get userdata for ' + username)
	created = utils.parse_time(userdata[1].find_all('td')[1].decode_contents())
	try:
		avg = Decimal(userdata[3].find_all('td')[1].decode_contents())
	except InvalidOperation:
		# The cell did not contain a valid decimal number; fall back to 0.
		avg = 0
	UserInfo(
		username=username,
		created=created,
		karma=int(userdata[2].find_all('td')[1].decode_contents(), 10),
		avg=avg,
		about=utils.html2markup(userdata[4].find_all('td')[1].decode_contents()),
		cache=timezone.now()
	).save()

示例#16

0

显示文件

文件： algolia.py 项目： duvholt/Hacker-News-Reader

 def traverse_comments(self, comment, parent_object=None):
     """Save ``comment`` and, recursively, every entry of its 'children' list.

     Args:
         comment: mapping with 'id', 'author', 'text', 'created_at' and
             'children'. 'parent_id' is read only when no ``parent_object``
             is given and ``self.story`` is falsy.
         parent_object: object stored as the parent of the saved comment.

     Returns:
         The number of children walked beneath ``comment``. A comment
         without an 'author' key returns 0 and is not stored.
     """
     if not parent_object and not self.story:
         parent_object = self.parent(comment['parent_id'])
     if 'author' not in comment:
         # Dead comment with no info
         return 0
     hn_comment = models.HNComments()
     hn_comment.id = comment['id']
     hn_comment.username = comment['author']
     hn_comment.text = utils.html2markup(comment['text'])
     hn_comment.story_id = self.story_id
     hn_comment.parent = parent_object
     hn_comment.time = self.dateformat(comment['created_at'])
     hn_comment.cache = timezone.now()
     hn_comment.save()
     count = 0
     for comment_child in comment['children']:
         # One for the child itself, plus everything found below it.
         count += 1
         count += self.traverse_comments(comment_child, hn_comment)
     return count

示例#17

0

显示文件

def traverse_comment(comment_soup, parent_object, story_id, perma=False):
    """Parse one comment table, save it as ``HNComments`` and recurse into replies.

    Args:
        comment_soup: parsed element whose ``.tr`` contains the
            ``<td class="default">`` cell holding the comment.
        parent_object: object stored as the comment's parent (may be None).
            When ``perma`` is true it is replaced by a database lookup.
        story_id: value stored as the comment's ``story_id``. When ``perma``
            is true it may be replaced by a value found in the database.
        perma: True for a permalinked comment; the parent id is then read
            from the third link of the cell and replies are not traversed.

    Raises:
        CouldNotParse: when the cell has no second link to read the id from.
    """
    comment = HNComments()
    # Comment <td> container shortcut
    td_default = comment_soup.tr.find('td', {'class': 'default'})
    # Retrieving comment id from the permalink
    try:
        comment.id = int(
            re.search(r'item\?id=(\d+)$',
                      td_default.find_all('a')[1]['href']).group(1), 10)
    except IndexError:
        raise CouldNotParse('Comment is dead')
    comment.username = td_default.find('a').find(text=True)
    # Get html contents of the comment excluding <span> and <font>
    comment.text = utils.html2markup(
        td_default.find('span', {
            'class': 'comment'
        }).font.decode_contents())
    hex_color = td_default.find('span', {'class': 'comment'}).font['color']
    # All colors are in the format of #XYXYXY, meaning that they are all grayscale.
    # Get percent by grabbing the red part of the color (#XY)
    comment.hiddenpercent = int(
        re.search(r'^#(\w{2})', hex_color).group(1), 16) / 2.5
    comment.hiddencolor = hex_color
    # The text node after the first link is turned into a time by appending ' ago'
    comment.time = utils.parse_time(td_default.find('a').next_sibling + ' ago')
    # parsedatetime doesn't have any built in support for DST
    if time.localtime().tm_isdst == 1:
        comment.time = comment.time + datetime.timedelta(hours=-1)
    # Some extra trickery for permalinked comments
    if perma:
        parent_id = int(
            re.search(r'item\?id=(\d+)$',
                      td_default.find_all('a')[2]['href']).group(1), 10)
        try:
            # Checking if the parent object is in the db
            parent_object = HNComments.objects.get(pk=parent_id)
            story_id = parent_object.story_id
        except HNComments.DoesNotExist:
            parent_object = None
            # story_id is at this moment actually comment id of the parent object.
            # Trying to correct this by checking for the actual story_id in the db
            try:
                story_id = HNComments.objects.get(pk=story_id).story_id
            except HNComments.DoesNotExist:
                # Oops, looks like we'll just store a fake one for now
                pass
    comment.story_id = story_id
    comment.cache = timezone.now()
    comment.parent = parent_object
    # parent_id is only bound when perma is true; the leading check guards that
    if perma and not parent_object and parent_id:
        # Forcing comment to be updated next time, since it doesn't have proper values
        cache = timezone.now() - datetime.timedelta(days=1)
        parent_object = HNComments(id=parent_id,
                                   username='',
                                   parent=None,
                                   cache=cache)
        parent_object.save()
        comment.parent = parent_object
    comment.save()
    HNCommentsCache(id=comment.id, time=timezone.now()).save()

    # Traversing over child comments:
    # Since comments aren't actually children in the HTML we will have to parse all the siblings
    # and check if they have +1 indent indicating that they are a child.
    # However if a following comment has the same indent value it is not a child and neither a sub child
    # meaning that all child comments have been parsed.
    if not perma:
        # Indent level is the spacer image width divided by 40
        indenting = int(
            td_default.previous_sibling.previous_sibling.img['width'], 10) / 40
        for sibling_soup in comment_soup.parent.parent.find_next_siblings(
                'tr'):
            sibling_table = sibling_soup.table
            # Comment pages with a "More" link at the bottom will have two extra trs without a table
            if sibling_table:
                sibling_td_default = sibling_table.tr.find(
                    'td', {'class': 'default'})
                sibling_indenting = int(
                    sibling_td_default.previous_sibling.previous_sibling.
                    img['width'], 10) / 40
                if sibling_indenting == indenting + 1:
                    try:
                        traverse_comment(sibling_table, comment, story_id)
                    except CouldNotParse:
                        # Skip a reply that cannot be parsed and keep going
                        continue
                if sibling_indenting == indenting:
                    break
            elif sibling_soup.find('td', {'class': 'title'}):
                # TODO Add support for loading more comments
                continue

示例#18

0

显示文件

def comments(commentid, cache_minutes=20):
    """Fetch the page for ``commentid`` and sync its story and comments to the db.

    Args:
        commentid: id of the story or permalinked comment to fetch.
        cache_minutes: age in minutes after which an already stored
            permalinked comment is parsed again.

    Returns:
        None. Returns early when a permalinked comment is not in the
        database and cannot be parsed.

    Raises:
        CouldNotParse: when the expected table cannot be reached on the page.
        utils.ShowAlert: when ``story_info`` raises ``CouldNotParse``.
    """
    # Taken before any parsing so rows not rewritten in this run can be found later
    start_time = timezone.now()
    soup = Fetch.comments(commentid=commentid)
    try:
        story_soup = soup.html.body.table.find_all('table')[1].find('tr')
    except AttributeError:
        # Story does not exist
        raise CouldNotParse('Story not found: ' + str(commentid))
    # A following row with a "subtext" cell is treated as the marker of a story page
    if story_soup.findNext('tr').find('td', {'class': 'subtext'}):
        # Updating story info
        try:
            story = story_info(story_soup)
        except CouldNotParse:
            raise utils.ShowAlert('Story or comment deleted')
        parent_object = None
        permalink = False
        story_id = commentid
    else:
        # For permalinked comments
        try:
            # If comment already is in db get the info
            parent_object = HNComments.objects.get(id=commentid)
            if parent_object.cache + datetime.timedelta(
                    minutes=cache_minutes) < timezone.now():
                try:
                    traverse_comment(story_soup.parent,
                                     parent_object.parent,
                                     parent_object.story_id,
                                     perma=True)
                except CouldNotParse:
                    # Keep the stored row when the refresh cannot be parsed
                    pass
                parent_object = HNComments.objects.get(id=commentid)
        except HNComments.DoesNotExist:
            # Since the comment doesn't exist we have to improvise with the data a bit
            # Story id is not provided for permalinked comments, but parent id is
            # Story id will therefore temporarily be set to the comment id
            try:
                traverse_comment(story_soup.parent,
                                 None,
                                 commentid,
                                 perma=True)
            except CouldNotParse:
                return
            parent_object = HNComments.objects.get(id=commentid)
        story_id = parent_object.story_id
        permalink = True
        story = None
    poll = False
    if story:
        poll_table = story_soup.parent.find('table')
        if poll_table:
            poll = True
            poll_update(story.id, poll_table)
            story.poll = True
        selfpost_info = story_soup.parent.find_all('tr',
                                                   {'style': 'height:2px'})
        if selfpost_info:
            # The selfpost text is read from the second cell of the row after the spacer row
            story.selfpost_text = utils.html2markup(
                selfpost_info[0].next_sibling.find_all(
                    'td')[1].decode_contents())
        else:
            story.selfpost_text = ''
        story.save()
    if story or permalink:
        # Updating cache
        HNCommentsCache(id=commentid, time=timezone.now()).save()
        # If there is a poll there will be an extra table before comments
        i = 2
        if poll:
            i += 1
        # Traversing all top comments
        comments_soup = soup.html.body.table.find_all('table')[i].find_all(
            'table')
        for comment_soup in comments_soup:
            td_default = comment_soup.tr.find('td', {'class': 'default'})
            # Converting indent to a more readable format (0, 1, 2...)
            indenting = int(
                td_default.previous_sibling.previous_sibling.img['width'],
                10) / 40
            if indenting == 0:
                try:
                    traverse_comment(comment_soup, parent_object, story_id)
                except CouldNotParse:
                    continue
        # Rows with story_id == commentid whose cache predates this run are flagged dead
        # (presumably because they were not saved again above)
        HNComments.objects.filter(cache__lt=start_time,
                                  story_id=commentid).update(dead=True)

示例#19

0

显示文件

文件： hnparse.py 项目： manpreetnarang/Hacker-News-Reader

def traverse_comment(comment_soup, parent_object, story_id, perma=False):
	"""Parse one comment table, save it as ``HNComments`` and recurse into replies.

	Args:
		comment_soup: parsed element whose ``.tr`` contains the
			``<td class="default">`` cell holding the comment.
		parent_object: object stored as the comment's parent (may be None).
			When ``perma`` is true it is replaced by a database lookup.
		story_id: value stored as the comment's ``story_id``. When ``perma``
			is true it may be replaced by a value found in the database.
		perma: True for a permalinked comment; the parent id is then read
			from the third link of the cell and replies are not traversed.

	Raises:
		CouldNotParse: when the cell has no second link to read the id from.
	"""
	comment = HNComments()
	# Comment <td> container shortcut
	td_default = comment_soup.tr.find('td', {'class': 'default'})
	# Retrieving comment id from the permalink
	try:
		comment.id = int(re.search(r'item\?id=(\d+)$', td_default.find_all('a')[1]['href']).group(1), 10)
	except IndexError:
		raise CouldNotParse('Comment is dead')
	comment.username = td_default.find('a').find(text=True)
	# Get html contents of the comment excluding <span> and <font>
	comment.text = utils.html2markup(td_default.find('span', {'class': 'comment'}).font.decode_contents())
	hex_color = td_default.find('span', {'class': 'comment'}).font['color']
	# All colors are in the format of #XYXYXY, meaning that they are all grayscale.
	# Get percent by grabbing the red part of the color (#XY)
	comment.hiddenpercent = int(re.search(r'^#(\w{2})', hex_color).group(1), 16) / 2.5
	comment.hiddencolor = hex_color
	# The text node after the first link is turned into a time by appending ' ago'
	comment.time = utils.parse_time(td_default.find('a').next_sibling + ' ago')
	# parsedatetime doesn't have any built in support for DST
	if time.localtime().tm_isdst == 1:
		comment.time = comment.time + datetime.timedelta(hours=-1)
	# Some extra trickery for permalinked comments
	if perma:
		parent_id = int(re.search(r'item\?id=(\d+)$', td_default.find_all('a')[2]['href']).group(1), 10)
		try:
			# Checking if the parent object is in the db
			parent_object = HNComments.objects.get(pk=parent_id)
			story_id = parent_object.story_id
		except HNComments.DoesNotExist:
			parent_object = None
			# story_id is at this moment actually comment id of the parent object.
			# Trying to correct this by checking for the actual story_id in the db
			try:
				story_id = HNComments.objects.get(pk=story_id).story_id
			except HNComments.DoesNotExist:
				# Oops, looks like we'll just store a fake one for now
				pass
	comment.story_id = story_id
	comment.cache = timezone.now()
	comment.parent = parent_object
	# parent_id is only bound when perma is true; the leading check guards that
	if perma and not parent_object and parent_id:
		# Forcing comment to be updated next time, since it doesn't have proper values
		cache = timezone.now() - datetime.timedelta(days=1)
		parent_object = HNComments(id=parent_id, username='', parent=None, cache=cache)
		parent_object.save()
		comment.parent = parent_object
	comment.save()
	HNCommentsCache(id=comment.id, time=timezone.now()).save()

	# Traversing over child comments:
	# Since comments aren't actually children in the HTML we will have to parse all the siblings
	# and check if they have +1 indent indicating that they are a child.
	# However if a following comment has the same indent value it is not a child and neither a sub child
	# meaning that all child comments have been parsed.
	if not perma:
		# Indent level is the spacer image width divided by 40
		indenting = int(td_default.previous_sibling.previous_sibling.img['width'], 10) / 40
		for sibling_soup in comment_soup.parent.parent.find_next_siblings('tr'):
			sibling_table = sibling_soup.table
			# Comment pages with a "More" link at the bottom will have two extra trs without a table
			if sibling_table:
				sibling_td_default = sibling_table.tr.find('td', {'class': 'default'})
				sibling_indenting = int(sibling_td_default.previous_sibling.previous_sibling.img['width'], 10) / 40
				if sibling_indenting == indenting + 1:
					try:
						traverse_comment(sibling_table, comment, story_id)
					except CouldNotParse:
						# Skip a reply that cannot be parsed and keep going
						continue
				if sibling_indenting == indenting:
					break
			elif sibling_soup.find('td', {'class': 'title'}):
				# TODO Add support for loading more comments
				continue

示例#20

0

显示文件

文件： hnparse.py 项目： manpreetnarang/Hacker-News-Reader

def comments(commentid, cache_minutes=20):
	"""Fetch the page for ``commentid`` and sync its story and comments to the db.

	Args:
		commentid: id of the story or permalinked comment to fetch.
		cache_minutes: age in minutes after which an already stored
			permalinked comment is parsed again.

	Returns:
		None. Returns early when a permalinked comment is not in the
		database and cannot be parsed.

	Raises:
		CouldNotParse: when the expected table cannot be reached on the page.
		utils.ShowAlert: when ``story_info`` raises ``CouldNotParse``.
	"""
	# Taken before any parsing so rows not rewritten in this run can be found later
	start_time = timezone.now()
	soup = Fetch.comments(commentid=commentid)
	try:
		story_soup = soup.html.body.table.find_all('table')[1].find('tr')
	except AttributeError:
		# Story does not exist
		raise CouldNotParse('Story not found: ' + str(commentid))
	# A following row with a "subtext" cell is treated as the marker of a story page
	if story_soup.findNext('tr').find('td', {'class': 'subtext'}):
		# Updating story info
		try:
			story = story_info(story_soup)
		except CouldNotParse:
			raise utils.ShowAlert('Story or comment deleted')
		parent_object = None
		permalink = False
		story_id = commentid
	else:
		# For permalinked comments
		try:
			# If comment already is in db get the info
			parent_object = HNComments.objects.get(id=commentid)
			if parent_object.cache + datetime.timedelta(minutes=cache_minutes) < timezone.now():
				try:
					traverse_comment(story_soup.parent, parent_object.parent, parent_object.story_id, perma=True)
				except CouldNotParse:
					# Keep the stored row when the refresh cannot be parsed
					pass
				parent_object = HNComments.objects.get(id=commentid)
		except HNComments.DoesNotExist:
			# Since the comment doesn't exist we have to improvise with the data a bit
			# Story id is not provided for permalinked comments, but parent id is
			# Story id will therefore temporarily be set to the comment id
			try:
				traverse_comment(story_soup.parent, None, commentid, perma=True)
			except CouldNotParse:
				return
			parent_object = HNComments.objects.get(id=commentid)
		story_id = parent_object.story_id
		permalink = True
		story = None
	poll = False
	if story:
		poll_table = story_soup.parent.find('table')
		if poll_table:
			poll = True
			poll_update(story.id, poll_table)
			story.poll = True
		selfpost_info = story_soup.parent.find_all('tr', {'style': 'height:2px'})
		if selfpost_info:
			# The selfpost text is read from the second cell of the row after the spacer row
			story.selfpost_text = utils.html2markup(selfpost_info[0].next_sibling.find_all('td')[1].decode_contents())
		else:
			story.selfpost_text = ''
		story.save()
	if story or permalink:
		# Updating cache
		HNCommentsCache(id=commentid, time=timezone.now()).save()
		# If there is a poll there will be an extra table before comments
		i = 2
		if poll:
			i += 1
		# Traversing all top comments
		comments_soup = soup.html.body.table.find_all('table')[i].find_all('table')
		for comment_soup in comments_soup:
			td_default = comment_soup.tr.find('td', {'class': 'default'})
			# Converting indent to a more readable format (0, 1, 2...)
			indenting = int(td_default.previous_sibling.previous_sibling.img['width'], 10) / 40
			if indenting == 0:
				try:
					traverse_comment(comment_soup, parent_object, story_id)
				except CouldNotParse:
					continue
		# Rows with story_id == commentid whose cache predates this run are flagged dead
		# (presumably because they were not saved again above)
		HNComments.objects.filter(cache__lt=start_time, story_id=commentid).update(dead=True)