def story_info(self, story_soup): if not story_soup.find('td'): raise CouldNotParse title = story_soup('td', {'class': 'title'})[-1] subtext = story_soup.find_next('tr').find('td', {'class': 'subtext'}) # Dead post if not subtext.find_all('a'): raise CouldNotParse story = Stories() if title.next == ' [dead] ': story.dead = True story.url = '' else: story.url = unquote(title.find('a')['href']) story.title = title.find('a').contents[0] # Check for domain class if title.find('span', {'class': 'comhead'}): story.selfpost = False else: # No domain provided, must be a selfpost story.selfpost = True story.url = '' story.score = int( re.search(r'(\d+) points?', unicode(subtext.find("span"))).group(1)) story.username = subtext.find('a').find(text=True) try: story.comments = int( re.search(r'(\d+) comments?', unicode(subtext.find_all("a")[1])).group(1)) except AttributeError: # Comments are not always shown (old submissions or ones with 0 comments) story.comments = 0 # Unfortunalely HN doesn't show any form timestamp other than "x hours" # meaning that the time scraped is only approximately correct. story.time = utils.parse_time( subtext.find_all("a")[1].previous_sibling + ' ago') # parsedatetime doesn't have any built in support for DST if time.localtime().tm_isdst: story.time = story.time + datetime.timedelta(hours=-1) story.id = int( re.search('item\?id=(\d+)$', subtext.find_all('a')[1]['href']).group(1)) story.cache = timezone.now() username = get_request().session.get('username', None) if username: # Adding vote auth to session userdata = get_request().session.setdefault('userdata', {}).setdefault( username, {}) try: auth_code = re.search(r'&auth=([a-z0-9]+)&whence', story_soup.a['href']).group(1) except (TypeError, AttributeError): auth_code = None userdata.setdefault('votes', {})[str(story.id)] = auth_code get_request().session.modified = True return story
def story_info(self, story_soup): if not story_soup.find('td'): raise CouldNotParse title = story_soup('td', {'class': 'title'})[-1] subtext = story_soup.find_next('tr').find('td', {'class': 'subtext'}) # Dead post if not subtext.find_all('a'): raise CouldNotParse story = Stories() if title.next == ' [dead] ': story.dead = True story.url = '' else: story.url = unquote(title.find('a')['href']) story.title = title.find('a').contents[0] # Check for domain class if title.find('span', {'class': 'comhead'}): story.selfpost = False else: # No domain provided, must be a selfpost story.selfpost = True story.url = '' story.score = int(re.search(r'(\d+) points?', unicode(subtext.find("span"))).group(1)) story.username = subtext.find('a').find(text=True) try: story.comments = int(re.search(r'(\d+) comments?', unicode(subtext.find_all("a")[1])).group(1)) except AttributeError: # Comments are not always shown (old submissions or ones with 0 comments) story.comments = 0 # Unfortunalely HN doesn't show any form timestamp other than "x hours" # meaning that the time scraped is only approximately correct. story.time = utils.parse_time(subtext.find_all("a")[1].previous_sibling + ' ago') # parsedatetime doesn't have any built in support for DST if time.localtime().tm_isdst: story.time = story.time + datetime.timedelta(hours=-1) story.id = int(re.search('item\?id=(\d+)$', subtext.find_all('a')[1]['href']).group(1)) story.cache = timezone.now() username = get_request().session.get('username', None) if username: # Adding vote auth to session userdata = get_request().session.setdefault('userdata', {}).setdefault(username, {}) try: auth_code = re.search(r'&auth=([a-z0-9]+)&whence', story_soup.a['href']).group(1) except (TypeError, AttributeError): auth_code = None userdata.setdefault('votes', {})[str(story.id)] = auth_code get_request().session.modified = True return story
def traverse_comment(self, comment_soup, parent_object, story_id, perma=False): comment = HNComments() # Comment <td> container shortcut td_default = comment_soup.tr.find('td', {'class': 'default'}) # Retrieving comment id from the permalink try: comment.id = int(re.search(r'item\?id=(\d+)$', td_default.find_all('a')[1]['href']).group(1), 10) except IndexError: raise CouldNotParse('Comment is dead') comment.username = td_default.find('a').find(text=True) # Get html contents of the comment excluding <span> and <font> if td_default.find('span', {'class': 'dead'}): comment.dead = True comment.text = utils.html2markup(td_default.find('span', {'class': 'comment'}).span.decode_contents()) hex_color = '#000000' else: comment.dead = False # TODO: BS4 doesn't handle <i> split over paragraphs. # Therefore there is a bug that will only add italics on the first paragraph comment.text = utils.html2markup(td_default.find('span', {'class': 'comment'}).find('font').decode_contents()) hex_color = td_default.find('span', {'class': 'comment'}).font['color'] # All colors are in the format of #XYXYXY, meaning that they are all grayscale. # Get percent by grabbing the red part of the color (#XY) comment.hiddenpercent = int(re.search(r'^#(\w{2})', hex_color).group(1), 16) / 2.5 comment.hiddencolor = hex_color comment.time = utils.parse_time(td_default.find('a').next_sibling + ' ago') # parsedatetime doesn't have any built in support for DST if time.localtime().tm_isdst == 1: comment.time = comment.time + datetime.timedelta(hours=-1) # Some extra trickery for permalinked comments if perma: parent_id = int(re.search(r'item\?id=(\d+)$', td_default.find_all('a')[2]['href']).group(1), 10) try: # Checking if the parent object is in the db parent_object = HNComments.objects.get(pk=parent_id) story_id = parent_object.story_id except HNComments.DoesNotExist: # Forcing comment to be updated next time, since it doesn't have proper values cache = timezone.now() - datetime.timedelta(days=1) parent_object = HNComments(id=parent_id, username='', parent=None, cache=cache) parent_object.save() # story_id is at this moment actually comment id of the parent object. # Trying to correct this by checking for actualy story_id in the db try: story_id = HNComments.objects.get(pk=story_id).story_id except HNComments.DoesNotExist: # Oops, looks like we'll just store a fake one for now pass comment.story_id = story_id comment.cache = timezone.now() comment.parent = parent_object comment.save() # HNCommentsCache(id=comment.id, time=timezone.now()).save() username = get_request().session.get('username', None) if username: # Adding vote auth to session userdata = get_request().session.setdefault('userdata', {}).setdefault(username, {}) try: auth_code = re.search(r'&auth=([a-z0-9]+)&whence', comment_soup.find_all('td', {'valign': 'top'})[0].a['href']).group(1) except (TypeError, AttributeError): auth_code = None userdata.setdefault('votes', {})[str(comment.id)] = auth_code get_request().session.modified = True # Traversing over child comments: # Since comments aren't actually children in the HTML we will have to parse all the siblings # and check if they have +1 indent indicating that they are a child. # However if a following comment has the same indent value it is not a child and neither a sub child # meaning that all child comments have been parsed. if not perma: indenting = int(td_default.previous_sibling.previous_sibling.img['width'], 10) / 40 for sibling_soup in comment_soup.parent.parent.find_next_siblings('tr'): sibling_table = sibling_soup.table # Comment pages with a "More" link at the bottom will have two extra trs without a table if sibling_table: sibling_td_default = sibling_table.tr.find('td', {'class': 'default'}) sibling_indenting = int(sibling_td_default.previous_sibling.previous_sibling.img['width'], 10) / 40 if sibling_indenting == indenting + 1: try: self.traverse_comment(sibling_table, comment, story_id) except CouldNotParse: continue if sibling_indenting == indenting: break elif sibling_soup.find('td', {'class': 'title'}): # TODO Add support for loading more comments continue
def traverse_comment(self, comment_soup, parent_object, story_id, perma=False): comment = HNComments() # Comment <td> container shortcut td_default = comment_soup.tr.find('td', {'class': 'default'}) # Retrieving comment id from the permalink try: comment.id = int( re.search(r'item\?id=(\d+)$', td_default.find_all('a')[1]['href']).group(1), 10) except IndexError: raise CouldNotParse('Comment is dead') comment.username = td_default.find('a').find(text=True) # Get html contents of the comment excluding <span> and <font> if td_default.find('span', {'class': 'dead'}): comment.dead = True comment.text = utils.html2markup( td_default.find('span', { 'class': 'comment' }).span.decode_contents()) hex_color = '#000000' else: comment.dead = False # TODO: BS4 doesn't handle <i> split over paragraphs. # Therefore there is a bug that will only add italics on the first paragraph comment.text = utils.html2markup( td_default.find('span', { 'class': 'comment' }).find('font').decode_contents()) hex_color = td_default.find('span', { 'class': 'comment' }).font['color'] # All colors are in the format of #XYXYXY, meaning that they are all grayscale. # Get percent by grabbing the red part of the color (#XY) comment.hiddenpercent = int( re.search(r'^#(\w{2})', hex_color).group(1), 16) / 2.5 comment.hiddencolor = hex_color comment.time = utils.parse_time( td_default.find('a').next_sibling + ' ago') # parsedatetime doesn't have any built in support for DST if time.localtime().tm_isdst == 1: comment.time = comment.time + datetime.timedelta(hours=-1) # Some extra trickery for permalinked comments if perma: parent_id = int( re.search(r'item\?id=(\d+)$', td_default.find_all('a')[2]['href']).group(1), 10) try: # Checking if the parent object is in the db parent_object = HNComments.objects.get(pk=parent_id) story_id = parent_object.story_id except HNComments.DoesNotExist: # Forcing comment to be updated next time, since it doesn't have proper values cache = timezone.now() - datetime.timedelta(days=1) parent_object = HNComments(id=parent_id, username='', parent=None, cache=cache) parent_object.save() # story_id is at this moment actually comment id of the parent object. # Trying to correct this by checking for actualy story_id in the db try: story_id = HNComments.objects.get(pk=story_id).story_id except HNComments.DoesNotExist: # Oops, looks like we'll just store a fake one for now pass comment.story_id = story_id comment.cache = timezone.now() comment.parent = parent_object comment.save() # HNCommentsCache(id=comment.id, time=timezone.now()).save() username = get_request().session.get('username', None) if username: # Adding vote auth to session userdata = get_request().session.setdefault('userdata', {}).setdefault( username, {}) try: auth_code = re.search( r'&auth=([a-z0-9]+)&whence', comment_soup.find_all( 'td', {'valign': 'top'})[0].a['href']).group(1) except (TypeError, AttributeError): auth_code = None userdata.setdefault('votes', {})[str(comment.id)] = auth_code get_request().session.modified = True # Traversing over child comments: # Since comments aren't actually children in the HTML we will have to parse all the siblings # and check if they have +1 indent indicating that they are a child. # However if a following comment has the same indent value it is not a child and neither a sub child # meaning that all child comments have been parsed. if not perma: indenting = int( td_default.previous_sibling.previous_sibling.img['width'], 10) / 40 for sibling_soup in comment_soup.parent.parent.find_next_siblings( 'tr'): sibling_table = sibling_soup.table # Comment pages with a "More" link at the bottom will have two extra trs without a table if sibling_table: sibling_td_default = sibling_table.tr.find( 'td', {'class': 'default'}) sibling_indenting = int( sibling_td_default.previous_sibling.previous_sibling. img['width'], 10) / 40 if sibling_indenting == indenting + 1: try: self.traverse_comment(sibling_table, comment, story_id) except CouldNotParse: continue if sibling_indenting == indenting: break elif sibling_soup.find('td', {'class': 'title'}): # TODO Add support for loading more comments continue