def get_resources_and_page():
    """Crawl every unprocessed Rootport row and harvest its resources.

    For each root page with ``status=0``:
      * read the pagination block and persist the page count (``page_num``),
        then crawl the sub-pages via ``get_sub_page_resources``;
      * store every magnet link not yet seen by the ``BF_RESOURCES`` filter
        as a new ``Resources`` row;
      * feed "information" links to ``get_keyworld``;
      * mark the row done (``status=True``).

    Rows whose fetch fails (or returns an empty body) are left at status=0
    so a later run can retry them.
    """
    # Request headers never vary per row — build them once, outside the loop.
    headers = {
        'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)',
        'Referer': 'http://www.zhihu.com/articles',
    }
    for rp in Rootport.objects.filter(status=0).select_related("link"):
        try:
            request = urllib2.Request(url=rp.link, headers=headers)
            response = urllib2.urlopen(request)
            content = response.read()
        except Exception:
            # Was a bare ``except: pass``, which also swallowed
            # KeyboardInterrupt/SystemExit. Skip this row and retry later.
            continue
        if not content:
            continue
        soup = BeautifulSoup(content)
        # Pagination: the href of the second-to-last <a> inside the
        # "pagination" div is the total page count (a bare integer).
        page_num_div = soup.find_all("div", class_="pagination")
        if page_num_div:
            page_nums = BeautifulSoup(str(page_num_div[0])).find_all("a")
            count = int(page_nums[-2].get('href'))
            rp.page_num = count
            rp.save()
            get_sub_page_resources(link=rp.link, num=count)
        # Magnet links found directly on the root page.
        for result in soup.find_all(href=re.compile("magnet")):
            link = result.get('href')
            title = result.get('title')
            # BF_RESOURCES.add() returns False only for links not seen before.
            if BF_RESOURCES.add(link) is False:
                Resources.objects.create(title=title, link=link)
        # "information" links feed the keyword extractor.
        get_keyworld(soup.find_all(href=re.compile("information")))
        rp.status = True
        rp.save()
def get_sub_page_resources(link=None, num=None):
    """Fetch pages ``<link>1`` .. ``<link><num>`` and store new magnet links.

    Parameters:
        link: base URL the page index is appended to.
        num:  number of sub-pages to crawl (inclusive).

    Side effects: creates a ``Resources`` row for every magnet href the
    ``BF_RESOURCES`` bloom filter has not seen, and passes "information"
    links to ``get_keyworld``. Failed or empty fetches are skipped silently.
    """
    # Guard: the original defaults (None) crashed with TypeError inside
    # range(1, num + 1); treat missing arguments as "nothing to do".
    if not link or not num:
        return
    # Headers are loop-invariant — build once.
    headers = {
        'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)',
        'Referer': 'http://www.zhihu.com/articles',
    }
    for i in range(1, num + 1):
        try:
            request = urllib2.Request(
                url="{link}{i}".format(link=link, i=i), headers=headers)
            response = urllib2.urlopen(request)
            content = response.read()
        except Exception:
            # Was a bare ``except: pass`` (also caught KeyboardInterrupt).
            continue
        if not content:
            continue
        # NOTE(review): percent-quoting raw HTML before parsing looks
        # suspect (it escapes the markup itself) — preserved as-is, confirm
        # against the sites being scraped before changing.
        soup = BeautifulSoup(urllib.quote(content))
        for result in soup.find_all(href=re.compile("magnet")):
            sublink = result.get('href')
            title = result.get('title')
            # add() returns False only for previously unseen links.
            if BF_RESOURCES.add(sublink) is False:
                Resources.objects.create(title=title, link=sublink)
        get_keyworld(soup.find_all(href=re.compile("information")))
def get_resources_and_page():
    """Process all pending (status=0) Rootport rows.

    Each root page is fetched once; from its HTML we extract:
      * the page count from the "pagination" div (saved to ``rp.page_num``
        and used to drive ``get_sub_page_resources``),
      * magnet hrefs, stored as ``Resources`` rows when the
        ``BF_RESOURCES`` filter reports them unseen (``add`` -> ``False``),
      * "information" hrefs, handed to ``get_keyworld``.

    The row is flagged done (``status=True``) only after a successful,
    non-empty fetch, so failures are retried on the next run.
    """
    # Loop-invariant request headers — hoisted out of the loop.
    headers = {
        'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)',
        'Referer': 'http://www.zhihu.com/articles',
    }
    for rp in Rootport.objects.filter(status=0).select_related("link"):
        try:
            content = urllib2.urlopen(
                urllib2.Request(url=rp.link, headers=headers)).read()
        except Exception:
            # Narrowed from a bare ``except: pass`` that silently hid
            # every error, including KeyboardInterrupt/SystemExit.
            continue
        if not content:
            continue
        soup = BeautifulSoup(content)
        pagination = soup.find_all("div", class_="pagination")
        if pagination:
            # The second-to-last anchor's href holds the page count as an int.
            anchors = BeautifulSoup(str(pagination[0])).find_all("a")
            count = int(anchors[-2].get('href'))
            rp.page_num = count
            rp.save()
            get_sub_page_resources(link=rp.link, num=count)
        for result in soup.find_all(href=re.compile("magnet")):
            link = result.get('href')
            title = result.get('title')
            # Only persist links the bloom filter has not seen yet.
            if BF_RESOURCES.add(link) is False:
                Resources.objects.create(title=title, link=link)
        get_keyworld(soup.find_all(href=re.compile("information")))
        rp.status = True
        rp.save()
def get_sub_page_resources(link=None, num=None):
    """Crawl the numbered sub-pages of *link* (1..num) for magnet links.

    Parameters:
        link: base URL; the sub-page index is concatenated onto it.
        num:  highest sub-page index to fetch.

    New magnet links (per the ``BF_RESOURCES`` filter) become ``Resources``
    rows; "information" links go to ``get_keyworld``. Fetch errors and empty
    bodies skip that page.
    """
    # Guard against the None defaults: range(1, None + 1) raised TypeError.
    if not link or not num:
        return
    # Headers do not change between pages — build them a single time.
    headers = {
        'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)',
        'Referer': 'http://www.zhihu.com/articles',
    }
    for page_index in range(1, num + 1):
        page_url = "{link}{i}".format(link=link, i=page_index)
        try:
            content = urllib2.urlopen(
                urllib2.Request(url=page_url, headers=headers)).read()
        except Exception:
            # Replaces a bare ``except: pass`` that masked every failure.
            continue
        if not content:
            continue
        # NOTE(review): urllib.quote() on raw HTML escapes the markup itself;
        # kept byte-for-byte to preserve behavior — verify it is intentional.
        soup = BeautifulSoup(urllib.quote(content))
        for result in soup.find_all(href=re.compile("magnet")):
            sublink = result.get('href')
            title = result.get('title')
            # BF_RESOURCES.add() -> False means the link is new.
            if BF_RESOURCES.add(sublink) is False:
                Resources.objects.create(title=title, link=sublink)
        get_keyworld(soup.find_all(href=re.compile("information")))