def get_book_pages(book_id):
    """Look up the page count shown on a Douban book page.

    The subject page for ``book_id`` is fetched through ``get_soup_by_url``
    and the serialized ``<div id="info">`` element is searched for the
    page-count label followed by a run of digits.

    Returns the matched digits as a string when the label is found, and the
    integer ``0`` when it is not (including when the info div is absent).
    """
    page_url = "http://book.douban.com/subject/%s/" % book_id
    info_div = get_soup_by_url(page_url).find("div", {"id": "info"})

    # The pattern is matched against the element's markup, not its text,
    # which is why it contains the closing </span> and the opening <br.
    match = re.search(r">页数:</span> (\d+)<br", str(info_div))
    if not match:
        return 0
    return match.group(1)
def save_note(url, date=None):
    """Scrape a highlight page and store it as either a Word or a Note.

    The page at ``url`` is fetched through ``get_soup_by_url``.  The
    highlighted text comes from ``<div class="highlightText">``, an optional
    remark from ``<div class="note">`` (with the ``"Note:"`` and ``"@zzrt"``
    markers removed), and the optional book title and author from the
    ``title`` / ``author`` spans inside ``<div class="cover">``.  Titles
    containing ``"Personal Document"`` are treated as having no book.

    A highlight that contains no space, starts with a character found in
    ``ASCII_CHARS`` and is at most 64 characters long is stored as a ``Word``
    (unless a ``Word`` with the same text already exists).  Anything else is
    stored as a ``Note``.

    Args:
        url: Address of the highlight page; also stored on the saved record.
        date: Value for ``Note.added``; defaults to the current local time.

    Returns:
        None.  Nothing is saved when both the highlight and the remark are
        empty.

    Raises:
        AttributeError: If the page has no ``highlightText`` div.
    """
    def joined_text(tag):
        # Concatenate every text node found below ``tag``.
        return "".join(tag.findAll(text=True))

    soup = get_soup_by_url(url)
    text = joined_text(soup.find("div", {"class": "highlightText"})).strip()

    remark = ""
    note_tag = soup.find("div", {"class": "note"})
    if note_tag:
        remark = (joined_text(note_tag)
                  .replace("Note:", "")
                  .replace("@zzrt", "")
                  .strip())

    # Book and author are optional.  A missing cover div is handled the same
    # way as a missing title/author span instead of raising AttributeError.
    book = ""
    author = ""
    cover_tag = soup.find("div", {"class": "cover"})
    if cover_tag:
        title_tag = cover_tag.find("span", {"class": "title"})
        if title_tag:
            book = joined_text(title_tag).strip()
            if "Personal Document" in book:
                book = ""
        author_tag = cover_tag.find("span", {"class": "author"})
        if author_tag:
            author = joined_text(author_tag).replace(" by ", "").strip()

    if not text and not remark:
        # Nothing was highlighted and nothing was written: nothing to store.
        return

    # ``bool(text)`` must come first: indexing an empty highlight with
    # ``text[0]`` would raise IndexError.
    is_single_word = (
        bool(text)
        and " " not in text
        and text[0] in ASCII_CHARS
        and len(text) <= 64
    )

    if is_single_word:
        if not Word.objects.filter(word=text).exists():
            Word.objects.create(url=url, word=text)
        return

    note = Note()
    note.url = url
    note.text = text
    note.added = date or datetime.datetime.now()
    # Optional fields are only assigned when present so the model's own
    # defaults apply otherwise.
    if remark:
        note.remark = remark
    if book:
        note.book = book
    if author:
        note.author = author
    note.save()
def fetch(self):
    """Fetch ``self.url`` and rebuild ``self.articles`` from its title cells.

    Every ``<td class="title">`` inside the first ``<table>`` of the page is
    inspected.  A cell yields an article when its first link has an ``href``,
    has at most one child node and a resolvable text, is not the "More"
    pagination link, scores at least ``self.POINTS_MIN_LIMIT`` points and
    does not point at a bare site home page.  Each article is a dict with
    ``url``, ``title`` and ``points`` keys.

    If the page cannot be fetched the failure is logged and
    ``self.articles`` is left untouched.
    """
    def is_home_page(url):
        # True when nothing but a host remains once the scheme's "//" and
        # any leading/trailing slashes are removed.
        return '/' not in url.replace('//', '').strip('/')

    try:
        soup = get_soup_by_url(self.url)
    except:
        # NOTE(review): the bare except reports every failure as a timeout.
        # It is kept because the exception type raised on timeout is not
        # visible from here -- narrow it once that type is confirmed.
        logger.info("Time out when fetching HackerNews.")
        return

    # Reset articles before fetching
    self.articles = []

    tags = soup.find("table").find_all("td", {"class": "title"})
    for tag in tags:
        tag_a = tag.find('a')
        if (not tag_a) or \
                ('href' not in tag_a.attrs) or \
                (len(tag_a.contents) > 1):
            continue

        title = tag_a.string
        if title is None:
            # Empty anchor or nested markup: there is no text to use as a
            # title, and calling .lower() on None would raise.
            continue

        href = tag_a['href']
        if title.lower() == "more" and '/' not in href:
            # Pagination link, not an article.
            continue

        try:
            points = int(tag.parent.nextSibling.find('span').string.split(' ')[0])
        except (AttributeError, ValueError):
            # The tuple is required: "except AttributeError, ValueError"
            # only catches AttributeError, so a non-numeric score would
            # otherwise escape from int().
            points = 0

        if 'http' not in href:
            # Relative link: resolve it against the site root.
            href = "http://news.ycombinator.com/" + href

        if href and points >= self.POINTS_MIN_LIMIT and (not is_home_page(href)):
            self.articles.append({
                'url': href,
                'title': title,
                'points': points,
            })