def userpage(self, username):
    """Scrape a user's profile page and store it as a ``UserInfo`` row.

    The profile table is normally the nested table at index 1; when
    ``self.is_event`` is truthy for that table, index 2 is used instead.
    The second ``<td>`` of rows 1-4 supplies, in order, the creation time,
    karma, average and "about" text.

    :param username: name of the user whose page is fetched.
    :raises CouldNotParse: if the expected tables, rows or cells are
        missing from the fetched page.
    """
    soup = fetch.userpage(username=username)
    try:
        tables = soup.html.body.table.find_all('table')
        index = 2 if self.is_event(tables[1]) else 1
        rows = tables[index].find_all('tr')
        # Resolve every cell up front so a truncated page is reported as a
        # parse failure instead of leaking a bare IndexError.
        created_td, karma_td, avg_td, about_td = [
            rows[n].find_all('td')[1] for n in (1, 2, 3, 4)
        ]
    except (AttributeError, IndexError):
        raise CouldNotParse('Couldn\'t get userdata for ' + username)

    created = utils.parse_time(created_td.decode_contents())
    try:
        avg = Decimal(avg_td.decode_contents())
    except InvalidOperation:
        # The cell does not hold a decimal number; fall back to zero.
        avg = 0

    # If user is logged in there will be an editable textarea instead of
    # just text
    if about_td.textarea:
        about = about_td.textarea.decode_contents()
    else:
        about = utils.html2markup(about_td.decode_contents())

    UserInfo(
        username=username,
        created=created,
        karma=int(karma_td.decode_contents(), 10),
        avg=avg,
        about=about,
        cache=timezone.now()
    ).save()
def traverse_comments(self, comment, parent_object=None):
    """Store *comment* and, recursively, every comment listed below it.

    :param comment: mapping describing one comment. Keys read here are
        ``parent``, ``id``, ``by``, ``text``, ``time`` and, when present,
        ``deleted``, ``dead`` and ``kids``.
    :param parent_object: the stored parent ``HNComments`` row. When it is
        falsy and ``self.story`` is falsy too, the parent is resolved via
        ``self.parent(comment['parent'])``.
    :returns: 0 for a deleted comment; otherwise the number of ids walked
        below this comment (each id in ``kids`` counts as one, plus the
        count returned for its own subtree).
    """
    if not parent_object and not self.story:
        parent_object = self.parent(comment['parent'])

    # Deleted comments are skipped entirely, so bail out before building
    # a model instance that would never be saved.
    if 'deleted' in comment:
        return 0

    hn_comment = models.HNComments()
    if 'dead' in comment:
        hn_comment.dead = comment['dead']
    hn_comment.id = comment['id']
    hn_comment.username = comment['by']
    hn_comment.text = utils.html2markup(comment['text'])
    hn_comment.story_id = self.story_id
    hn_comment.parent = parent_object
    hn_comment.time = self.dateformat(comment['time'])
    hn_comment.cache = timezone.now()
    hn_comment.save()

    count = 0
    if 'kids' in comment:
        for comment_id in comment['kids']:
            # Children are only referenced by id; fetch each one.
            comment_child = self.fetch.comments(comment_id)
            count += 1 + self.traverse_comments(comment_child, hn_comment)
    return count
def userpage(self, username):
    """Scrape a user's profile page and store it as a ``UserInfo`` row.

    The profile table is normally the nested table at index 1; when
    ``self.is_event`` is truthy for that table, index 2 is used instead.
    The second ``<td>`` of rows 1-4 supplies, in order, the creation time,
    karma, average and "about" text.

    :param username: name of the user whose page is fetched.
    :raises CouldNotParse: if the expected tables, rows or cells are
        missing from the fetched page.
    """
    soup = fetch.userpage(username=username)
    try:
        tables = soup.html.body.table.find_all('table')
        index = 1
        if self.is_event(tables[1]):
            index += 1
        rows = tables[index].find_all('tr')
        # Look up all four value cells here so that a short or malformed
        # table surfaces as CouldNotParse rather than a bare IndexError.
        created_td = rows[1].find_all('td')[1]
        karma_td = rows[2].find_all('td')[1]
        avg_td = rows[3].find_all('td')[1]
        about_td = rows[4].find_all('td')[1]
    except (AttributeError, IndexError):
        raise CouldNotParse('Couldn\'t get userdata for ' + username)

    created = utils.parse_time(created_td.decode_contents())
    try:
        avg = Decimal(avg_td.decode_contents())
    except InvalidOperation:
        # The cell does not hold a decimal number; fall back to zero.
        avg = 0

    # If user is logged in there will be an editable textarea instead of
    # just text
    if about_td.textarea:
        about = about_td.textarea.decode_contents()
    else:
        about = utils.html2markup(about_td.decode_contents())

    UserInfo(username=username,
             created=created,
             karma=int(karma_td.decode_contents(), 10),
             avg=avg,
             about=about,
             cache=timezone.now()).save()
def poll_info(self, polls):
    """Save one ``models.Poll`` row for every entry in *polls*.

    :param polls: iterable of mappings; each must provide ``id``,
        ``created_at_i``, ``text``, ``points`` and ``parent_id``.
    """
    for entry in polls:
        created = self.dateformat(entry['created_at_i'])
        label = utils.html2markup(entry['text'])

        record = models.Poll(id=entry['id'])
        record.time = created
        record.name = label
        record.score = entry['points']
        record.story_id = entry['parent_id']
        record.save()
def poll_info(self, polls):
    """Fetch every poll part named in *polls* and save it as a ``models.Poll``.

    :param polls: iterable of values passed one by one to
        ``self.fetch.comments``; each fetched item must provide ``id``,
        ``time``, ``text``, ``score`` and ``parent``.
    """
    for part_ref in polls:
        item = self.fetch.comments(part_ref)

        record = models.Poll(id=item['id'])
        record.time = self.dateformat(item['time'])
        record.name = utils.html2markup(item['text'])
        record.score = item['score']
        record.story_id = item['parent']
        record.save()
def story_info(self, story):
    """Populate ``self.story`` with a new ``models.Stories`` built from *story*.

    The instance is only filled in here; this method does not save it.

    :param story: mapping with ``title``, ``by``, ``time`` and ``score``;
        ``text`` and ``url`` are optional.
    """
    self.story = models.Stories()
    self.story.id = self.story_id
    self.story.cache = timezone.now()
    self.story.title = story['title']

    # A missing "text" key is treated like an empty one, matching the way a
    # missing "url" is already tolerated below.
    text = story.get('text')
    if text:
        self.story.selfpost = True
        self.story.selfpost_text = utils.html2markup(text)

    self.story.username = story['by']
    self.story.url = story.get('url', "")
    self.story.time = self.dateformat(story['time'])
    self.story.score = story['score']
def story_info(self, story):
    """Populate ``self.story`` with a new ``models.Stories`` built from *story*.

    The instance is only filled in here; this method does not save it.

    :param story: mapping providing ``title``, ``text``, ``author``,
        ``url``, ``created_at`` and ``points``.
    """
    # Bind to self.story immediately so the attribute is set even if a
    # later lookup fails.
    record = self.story = models.Stories()
    record.id = self.story_id
    record.cache = timezone.now()
    record.title = story['title']

    body = story['text']
    if body:
        record.selfpost = True
        record.selfpost_text = utils.html2markup(body)

    record.username = story['author']

    # A None url is stored as an empty string.
    link = story['url']
    if link is None:
        link = ""
    record.url = link

    record.time = self.dateformat(story['created_at'])
    record.score = story['points']
def userpage(self, username):
    """Fetch a user's profile and store it as a ``models.UserInfo`` row.

    Nothing is stored when ``self.fetch.userpage`` returns a falsy result.
    ``avg`` is not populated by this method.

    :param username: name passed to ``self.fetch.userpage``.
    """
    profile = self.fetch.userpage(username)
    if not profile:
        # Nothing came back -- presumably the user doesn't exist.
        return

    user = models.UserInfo()
    user.username = profile['id']
    user.created = self.dateformat(profile['created'])
    user.karma = profile['karma']

    # "about" may be absent or empty; store None in both cases instead of
    # failing on a missing key.
    about = profile.get('about')
    user.about = utils.html2markup(about) if about else None

    user.cache = timezone.now()
    user.save()
def userpage(self, username):
    """Fetch a user's profile and store it as a ``models.UserInfo`` row.

    :param username: name passed to ``self.fetch.userpage``.
    :raises utils.ShowAlert: with the response's ``message`` when that key
        is present, or with a generic failure text when ``status`` is
        present.
    """
    data = self.fetch.userpage(username)
    info = models.UserInfo()

    # Responses carrying "message" or "status" are treated as failures.
    if 'message' in data:
        raise utils.ShowAlert(data['message'])
    if 'status' in data:
        raise utils.ShowAlert('Failed to retrieve user information')

    info.username = data['username']
    info.created = self.dateformat(data['created_at'])
    info.karma = data['karma']
    info.avg = data['avg']

    about_html = data['about']
    info.about = utils.html2markup(about_html) if about_html else None

    info.cache = timezone.now()
    info.save()
def userpage(username):
    """Scrape a user's profile page and store it as a ``UserInfo`` row.

    The profile table is the nested table at index 1. The second ``<td>``
    of rows 1-4 supplies, in order, the creation time, karma, average and
    "about" text.

    :param username: name of the user whose page is fetched.
    :raises CouldNotParse: if the expected tables, rows or cells are
        missing from the fetched page.
    """
    soup = Fetch.userpage(username=username)
    try:
        rows = soup.html.body.table.find_all('table')[1].find_all('tr')
        # Resolve every cell up front so a truncated page is reported as a
        # parse failure instead of leaking a bare IndexError.
        created_td, karma_td, avg_td, about_td = [
            rows[n].find_all('td')[1] for n in (1, 2, 3, 4)
        ]
    except (AttributeError, IndexError):
        raise CouldNotParse('Couldn\'t get userdata for ' + username)

    created = utils.parse_time(created_td.decode_contents())
    try:
        avg = Decimal(avg_td.decode_contents())
    except InvalidOperation:
        # The cell does not hold a decimal number; fall back to zero.
        avg = 0

    UserInfo(username=username,
             created=created,
             karma=int(karma_td.decode_contents(), 10),
             avg=avg,
             about=utils.html2markup(about_td.decode_contents()),
             cache=timezone.now()).save()
def userpage(username):
    """Scrape a user's profile page and store it as a ``UserInfo`` row.

    The profile table is the nested table at index 1. The second ``<td>``
    of rows 1-4 supplies, in order, the creation time, karma, average and
    "about" text.

    :param username: name of the user whose page is fetched.
    :raises CouldNotParse: if the expected tables, rows or cells are
        missing from the fetched page.
    """
    soup = Fetch.userpage(username=username)
    try:
        rows = soup.html.body.table.find_all('table')[1].find_all('tr')
        # Look up all four value cells here so that a short or malformed
        # table surfaces as CouldNotParse rather than a bare IndexError.
        created_td = rows[1].find_all('td')[1]
        karma_td = rows[2].find_all('td')[1]
        avg_td = rows[3].find_all('td')[1]
        about_td = rows[4].find_all('td')[1]
    except (AttributeError, IndexError):
        raise CouldNotParse('Couldn\'t get userdata for ' + username)

    created = utils.parse_time(created_td.decode_contents())
    try:
        avg = Decimal(avg_td.decode_contents())
    except InvalidOperation:
        # The cell does not hold a decimal number; fall back to zero.
        avg = 0

    UserInfo(
        username=username,
        created=created,
        karma=int(karma_td.decode_contents(), 10),
        avg=avg,
        about=utils.html2markup(about_td.decode_contents()),
        cache=timezone.now()
    ).save()
def traverse_comments(self, comment, parent_object=None):
    """Store *comment* and, recursively, every comment nested in it.

    :param comment: mapping describing one comment. Keys read here are
        ``parent_id``, ``author``, ``id``, ``text``, ``created_at`` and
        ``children`` (an iterable of mappings of the same form).
    :param parent_object: the stored parent ``HNComments`` row. When it is
        falsy and ``self.story`` is falsy too, the parent is resolved via
        ``self.parent(comment['parent_id'])``.
    :returns: 0 when the comment has no ``author`` key; otherwise the
        number of comments walked below this one (each child counts as
        one, plus the count returned for its own subtree).
    """
    if not parent_object and not self.story:
        parent_object = self.parent(comment['parent_id'])

    if 'author' not in comment:
        # Dead comment with no info
        return 0

    hn_comment = models.HNComments()
    hn_comment.id = comment['id']
    hn_comment.username = comment['author']
    hn_comment.text = utils.html2markup(comment['text'])
    hn_comment.story_id = self.story_id
    hn_comment.parent = parent_object
    hn_comment.time = self.dateformat(comment['created_at'])
    hn_comment.cache = timezone.now()
    hn_comment.save()

    count = 0
    for comment_child in comment['children']:
        count += 1 + self.traverse_comments(comment_child, hn_comment)
    return count
def traverse_comment(comment_soup, parent_object, story_id, perma=False): comment = HNComments() # Comment <td> container shortcut td_default = comment_soup.tr.find('td', {'class': 'default'}) # Retrieving comment id from the permalink try: comment.id = int( re.search(r'item\?id=(\d+)$', td_default.find_all('a')[1]['href']).group(1), 10) except IndexError: raise CouldNotParse('Comment is dead') comment.username = td_default.find('a').find(text=True) # Get html contents of the comment excluding <span> and <font> comment.text = utils.html2markup( td_default.find('span', { 'class': 'comment' }).font.decode_contents()) hex_color = td_default.find('span', {'class': 'comment'}).font['color'] # All colors are in the format of #XYXYXY, meaning that they are all grayscale. # Get percent by grabbing the red part of the color (#XY) comment.hiddenpercent = int( re.search(r'^#(\w{2})', hex_color).group(1), 16) / 2.5 comment.hiddencolor = hex_color comment.time = utils.parse_time(td_default.find('a').next_sibling + ' ago') # parsedatetime doesn't have any built in support for DST if time.localtime().tm_isdst == 1: comment.time = comment.time + datetime.timedelta(hours=-1) # Some extra trickery for permalinked comments if perma: parent_id = int( re.search(r'item\?id=(\d+)$', td_default.find_all('a')[2]['href']).group(1), 10) try: # Checking if the parent object is in the db parent_object = HNComments.objects.get(pk=parent_id) story_id = parent_object.story_id except HNComments.DoesNotExist: parent_object = None # story_id is at this moment actually comment id of the parent object. 
# Trying to correct this by checking for actualy story_id in the db try: story_id = HNComments.objects.get(pk=story_id).story_id except HNComments.DoesNotExist: # Oops, looks like we'll just store a fake one for now pass comment.story_id = story_id comment.cache = timezone.now() comment.parent = parent_object if perma and not parent_object and parent_id: # Forcing comment to be updated next time, since it doesn't have proper values cache = timezone.now() - datetime.timedelta(days=1) parent_object = HNComments(id=parent_id, username='', parent=None, cache=cache) parent_object.save() comment.parent = parent_object comment.save() HNCommentsCache(id=comment.id, time=timezone.now()).save() # Traversing over child comments: # Since comments aren't actually children in the HTML we will have to parse all the siblings # and check if they have +1 indent indicating that they are a child. # However if a following comment has the same indent value it is not a child and neither a sub child # meaning that all child comments have been parsed. if not perma: indenting = int( td_default.previous_sibling.previous_sibling.img['width'], 10) / 40 for sibling_soup in comment_soup.parent.parent.find_next_siblings( 'tr'): sibling_table = sibling_soup.table # Comment pages with a "More" link at the bottom will have two extra trs without a table if sibling_table: sibling_td_default = sibling_table.tr.find( 'td', {'class': 'default'}) sibling_indenting = int( sibling_td_default.previous_sibling.previous_sibling. img['width'], 10) / 40 if sibling_indenting == indenting + 1: try: traverse_comment(sibling_table, comment, story_id) except CouldNotParse: continue if sibling_indenting == indenting: break elif sibling_soup.find('td', {'class': 'title'}): # TODO Add support for loading more comments continue
def comments(commentid, cache_minutes=20):
    """Fetch the page for ``commentid`` and store its story and comments.

    The page is treated as a story page when the row after the first row of
    the second nested table contains a ``td.subtext`` cell; otherwise it is
    handled as a permalinked comment. Top-level comments (indent 0) are
    parsed with ``traverse_comment``. Finally, comments stored with
    ``story_id == commentid`` whose cache timestamp predates this call are
    flagged dead.

    NOTE(review): the block nesting below was reconstructed from
    whitespace-collapsed source -- confirm against version control.

    :param commentid: id of the story or comment page to fetch.
    :param cache_minutes: age in minutes after which an already stored
        permalinked comment is parsed again.
    :returns: ``None``; returns early when a permalinked comment is not in
        the database and cannot be parsed.
    :raises CouldNotParse: when the expected table structure is missing.
    :raises utils.ShowAlert: when ``story_info`` cannot parse the story.
    """
    # Taken before any row is written so stale rows can be detected at the end.
    start_time = timezone.now()
    soup = Fetch.comments(commentid=commentid)
    try:
        story_soup = soup.html.body.table.find_all('table')[1].find('tr')
    except AttributeError:
        # Story does not exist
        raise CouldNotParse('Story not found: ' + str(commentid))
    if story_soup.findNext('tr').find('td', {'class': 'subtext'}):
        # Updating story info
        try:
            story = story_info(story_soup)
        except CouldNotParse:
            raise utils.ShowAlert('Story or comment deleted')
        parent_object = None
        permalink = False
        story_id = commentid
    else:
        # For permalinked comments
        try:
            # If comment already is in db get the info
            parent_object = HNComments.objects.get(id=commentid)
            if parent_object.cache + datetime.timedelta(
                    minutes=cache_minutes) < timezone.now():
                try:
                    traverse_comment(story_soup.parent,
                                     parent_object.parent,
                                     parent_object.story_id,
                                     perma=True)
                except CouldNotParse:
                    pass
                # Re-read the row after the refresh attempt.
                parent_object = HNComments.objects.get(id=commentid)
        except HNComments.DoesNotExist:
            # Since the comment doesn't exist we have to improvise with the data a bit
            # Story id is not provided for permalinked comments, but parent id is
            # Story id will therefore temporarily be set to the comment id
            try:
                traverse_comment(story_soup.parent,
                                 None,
                                 commentid,
                                 perma=True)
            except CouldNotParse:
                return
            parent_object = HNComments.objects.get(id=commentid)
        story_id = parent_object.story_id
        permalink = True
        story = None
    poll = False
    if story:
        # A nested table in the story block is treated as a poll.
        poll_table = story_soup.parent.find('table')
        if poll_table:
            poll = True
            poll_update(story.id, poll_table)
            story.poll = True
        # A 2px-high spacer row precedes the self-post text when there is one.
        selfpost_info = story_soup.parent.find_all('tr',
                                                   {'style': 'height:2px'})
        if selfpost_info:
            story.selfpost_text = utils.html2markup(
                selfpost_info[0].next_sibling.find_all(
                    'td')[1].decode_contents())
        else:
            story.selfpost_text = ''
        story.save()
    if story or permalink:
        # Updating cache
        HNCommentsCache(id=commentid, time=timezone.now()).save()
    # If there is a poll there will be an extra table before comments
    i = 2
    if poll:
        i += 1
    # Traversing all top comments
    comments_soup = soup.html.body.table.find_all('table')[i].find_all(
        'table')
    for comment_soup in comments_soup:
        td_default = comment_soup.tr.find('td', {'class': 'default'})
        # Converting indent to a more readable format (0, 1, 2...)
        indenting = int(
            td_default.previous_sibling.previous_sibling.img['width'],
            10) / 40
        # Only top-level comments start a traversal here; traverse_comment
        # itself recurses for the non-permalink case.
        if indenting == 0:
            try:
                traverse_comment(comment_soup, parent_object, story_id)
            except CouldNotParse:
                continue
    # Rows not refreshed during this run still carry an older cache time.
    HNComments.objects.filter(cache__lt=start_time,
                              story_id=commentid).update(dead=True)
def traverse_comment(comment_soup, parent_object, story_id, perma=False): comment = HNComments() # Comment <td> container shortcut td_default = comment_soup.tr.find('td', {'class': 'default'}) # Retrieving comment id from the permalink try: comment.id = int(re.search(r'item\?id=(\d+)$', td_default.find_all('a')[1]['href']).group(1), 10) except IndexError: raise CouldNotParse('Comment is dead') comment.username = td_default.find('a').find(text=True) # Get html contents of the comment excluding <span> and <font> comment.text = utils.html2markup(td_default.find('span', {'class': 'comment'}).font.decode_contents()) hex_color = td_default.find('span', {'class': 'comment'}).font['color'] # All colors are in the format of #XYXYXY, meaning that they are all grayscale. # Get percent by grabbing the red part of the color (#XY) comment.hiddenpercent = int(re.search(r'^#(\w{2})', hex_color).group(1), 16) / 2.5 comment.hiddencolor = hex_color comment.time = utils.parse_time(td_default.find('a').next_sibling + ' ago') # parsedatetime doesn't have any built in support for DST if time.localtime().tm_isdst == 1: comment.time = comment.time + datetime.timedelta(hours=-1) # Some extra trickery for permalinked comments if perma: parent_id = int(re.search(r'item\?id=(\d+)$', td_default.find_all('a')[2]['href']).group(1), 10) try: # Checking if the parent object is in the db parent_object = HNComments.objects.get(pk=parent_id) story_id = parent_object.story_id except HNComments.DoesNotExist: parent_object = None # story_id is at this moment actually comment id of the parent object. 
# Trying to correct this by checking for actualy story_id in the db try: story_id = HNComments.objects.get(pk=story_id).story_id except HNComments.DoesNotExist: # Oops, looks like we'll just store a fake one for now pass comment.story_id = story_id comment.cache = timezone.now() comment.parent = parent_object if perma and not parent_object and parent_id: # Forcing comment to be updated next time, since it doesn't have proper values cache = timezone.now() - datetime.timedelta(days=1) parent_object = HNComments(id=parent_id, username='', parent=None, cache=cache) parent_object.save() comment.parent = parent_object comment.save() HNCommentsCache(id=comment.id, time=timezone.now()).save() # Traversing over child comments: # Since comments aren't actually children in the HTML we will have to parse all the siblings # and check if they have +1 indent indicating that they are a child. # However if a following comment has the same indent value it is not a child and neither a sub child # meaning that all child comments have been parsed. if not perma: indenting = int(td_default.previous_sibling.previous_sibling.img['width'], 10) / 40 for sibling_soup in comment_soup.parent.parent.find_next_siblings('tr'): sibling_table = sibling_soup.table # Comment pages with a "More" link at the bottom will have two extra trs without a table if sibling_table: sibling_td_default = sibling_table.tr.find('td', {'class': 'default'}) sibling_indenting = int(sibling_td_default.previous_sibling.previous_sibling.img['width'], 10) / 40 if sibling_indenting == indenting + 1: try: traverse_comment(sibling_table, comment, story_id) except CouldNotParse: continue if sibling_indenting == indenting: break elif sibling_soup.find('td', {'class': 'title'}): # TODO Add support for loading more comments continue
def comments(commentid, cache_minutes=20):
    """Fetch the page for ``commentid`` and store its story and comments.

    The page is treated as a story page when the row after the first row of
    the second nested table contains a ``td.subtext`` cell; otherwise it is
    handled as a permalinked comment. Top-level comments (indent 0) are
    parsed with ``traverse_comment``. Finally, comments stored with
    ``story_id == commentid`` whose cache timestamp predates this call are
    flagged dead.

    NOTE(review): the block nesting below was reconstructed from
    whitespace-collapsed source -- confirm against version control.

    :param commentid: id of the story or comment page to fetch.
    :param cache_minutes: age in minutes after which an already stored
        permalinked comment is parsed again.
    :returns: ``None``; returns early when a permalinked comment is not in
        the database and cannot be parsed.
    :raises CouldNotParse: when the expected table structure is missing.
    :raises utils.ShowAlert: when ``story_info`` cannot parse the story.
    """
    # Taken before any row is written so stale rows can be detected at the end.
    start_time = timezone.now()
    soup = Fetch.comments(commentid=commentid)
    try:
        story_soup = soup.html.body.table.find_all('table')[1].find('tr')
    except AttributeError:
        # Story does not exist
        raise CouldNotParse('Story not found: ' + str(commentid))
    if story_soup.findNext('tr').find('td', {'class': 'subtext'}):
        # Updating story info
        try:
            story = story_info(story_soup)
        except CouldNotParse:
            raise utils.ShowAlert('Story or comment deleted')
        parent_object = None
        permalink = False
        story_id = commentid
    else:
        # For permalinked comments
        try:
            # If comment already is in db get the info
            parent_object = HNComments.objects.get(id=commentid)
            if parent_object.cache + datetime.timedelta(minutes=cache_minutes) < timezone.now():
                try:
                    traverse_comment(story_soup.parent, parent_object.parent, parent_object.story_id, perma=True)
                except CouldNotParse:
                    pass
                # Re-read the row after the refresh attempt.
                parent_object = HNComments.objects.get(id=commentid)
        except HNComments.DoesNotExist:
            # Since the comment doesn't exist we have to improvise with the data a bit
            # Story id is not provided for permalinked comments, but parent id is
            # Story id will therefore temporarily be set to the comment id
            try:
                traverse_comment(story_soup.parent, None, commentid, perma=True)
            except CouldNotParse:
                return
            parent_object = HNComments.objects.get(id=commentid)
        story_id = parent_object.story_id
        permalink = True
        story = None
    poll = False
    if story:
        # A nested table in the story block is treated as a poll.
        poll_table = story_soup.parent.find('table')
        if poll_table:
            poll = True
            poll_update(story.id, poll_table)
            story.poll = True
        # A 2px-high spacer row precedes the self-post text when there is one.
        selfpost_info = story_soup.parent.find_all('tr', {'style': 'height:2px'})
        if selfpost_info:
            story.selfpost_text = utils.html2markup(selfpost_info[0].next_sibling.find_all('td')[1].decode_contents())
        else:
            story.selfpost_text = ''
        story.save()
    if story or permalink:
        # Updating cache
        HNCommentsCache(id=commentid, time=timezone.now()).save()
    # If there is a poll there will be an extra table before comments
    i = 2
    if poll:
        i += 1
    # Traversing all top comments
    comments_soup = soup.html.body.table.find_all('table')[i].find_all('table')
    for comment_soup in comments_soup:
        td_default = comment_soup.tr.find('td', {'class': 'default'})
        # Converting indent to a more readable format (0, 1, 2...)
        indenting = int(td_default.previous_sibling.previous_sibling.img['width'], 10) / 40
        # Only top-level comments start a traversal here; traverse_comment
        # itself recurses for the non-permalink case.
        if indenting == 0:
            try:
                traverse_comment(comment_soup, parent_object, story_id)
            except CouldNotParse:
                continue
    # Rows not refreshed during this run still carry an older cache time.
    HNComments.objects.filter(cache__lt=start_time, story_id=commentid).update(dead=True)