def crawpages(urlp, selector, start=1, limit=-1):
    """Walk a paginated URL pattern (containing '%d') and collect the links
    matched by 'selector', stopping at a repeated page, an empty result or
    the optional page limit."""
    if urlp.find('%d') == -1:
        return
    soup = ''
    links = []
    index = start
    while True:
        page = urlp % index
        print '-->', page
        new_soup = select(BeautifulSoup(urlopen(page).read()), selector)
        # Stop when nothing matches or the page repeats the previous one.
        if new_soup is None or len(new_soup) == 0 or new_soup == soup:
            print 'repeat or none'
            print 'End at %s' % index
            break
        if limit > 0 and limit + 1 <= index:
            print 'limit ', limit
            print 'End at %s' % index
            break
        print getlisttext(new_soup)
        if new_soup[0].name == 'a':
            # The selector matched anchors: collect their absolute URLs.
            for link in new_soup:
                if 'href' in dict(link.attrs):
                    url = urljoin(page, link['href'])
                    #if url.find("'")!=-1: continue why?
                    url = url.split('#')[0]  # remove location portion
                    if url[0:4] == 'http':
                        links.append(url)
        else:
            # The selector matched content, not anchors: record the page itself.
            links.append(page)
        soup = new_soup
        index += 1
    return links
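# Usage sketch for crawpages() (illustrative only): crawl the first three
# pages of a hypothetical paginated archive and print the links matched by
# 'a.post-article'. The URL pattern and selector are assumptions, not values
# taken from the project's real configuration.
def demo_crawpages():
    post_links = crawpages('http://example.com/archive/page/%d',
                           'a.post-article', limit=3)
    for link in post_links:
        print link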
def get_page_links(url):
    soup = Soup(load_page(url))
    print 'open %s %s' % (soup.title.string.encode('utf-8'), url)
    links = select(soup, 'a.post-article')
    #links = soup.findAll('a', href=re.compile(r'^http://duoduovision.diandian.com/post/\d{4}-\d{2}-\d{2}/'))
    #return list(set([nomalize_url(a['href']) for a in links]))
    return [nomalize_url(a['href']) for a in links]
def save_pagenates(leaf, limit=-1):
    """Crawl a paginated url; the page URLs mostly share the same pattern."""
    url_param = leaf.url
    index = 1
    url = url_param % (index)
    filename = os.path.join(leaf.dir, "%s.html" % (index))
    # Skip pages that are already recorded in the log.
    while file_exist("%s\t%s" % (filename, url)):
        print '%s is already downloaded.' % (url)
        index += 1
        url = url_param % (index)
        filename = os.path.join(leaf.dir, "%s.html" % (index))
    page = urlopen(url).read().decode(encode, 'ignore')
    soup = select(BeautifulSoup(page), config[leaf.level]['selector'])
    if soup == None or len(soup) == 0:
        print 'break 1'
        return
    else:
        print 'page: ', index, getlisttext(soup)
        try:
            open(filename, 'w').write(page.encode(encode, 'ignore'))
        except IOError as err:
            print 'error: ', str(err), filename
        open(logfile, 'a').write(filename + '\t' + url + '\r\n')
    index += 1
    url = url_param % (index)
    filename = os.path.join(leaf.dir, "%s.html" % (index))
    while True:
        while file_exist("%s\t%s" % (filename, url)):
            print '%s is already downloaded.' % (url)
            index += 1
            url = url_param % (index)
            filename = os.path.join(leaf.dir, "%s.html" % (index))
        tmp_page = urlopen(url).read().decode(encode, 'ignore')
        tmp_soup = select(BeautifulSoup(tmp_page), config[leaf.level]['selector'])
        print tmp_soup
        print tmp_page == page, tmp_soup == soup
        # Stop when the page repeats or the selector no longer matches anything.
        if tmp_page == page or tmp_soup == soup or tmp_soup == None or len(tmp_soup) == 0:
            print 'break 1'
            break
        else:
            print 'page: ', index, getlisttext(tmp_soup[0])
            try:
                open(filename, 'w').write(tmp_page.encode(encode, 'ignore'))
            except IOError as err:
                print 'error: ', str(err), filename
            open(logfile, 'a').write(filename + '\t' + url + '\r\n')
            page = tmp_page
            soup = tmp_soup
            index += 1
            if limit > 0 and limit < index:
                break
            url = url_param % (index)
            filename = os.path.join(leaf.dir, "%s.html" % (index))
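# Usage sketch for save_pagenates(): it expects a node carrying a paginated
# URL pattern (with a '%d' slot), a target directory and a level, plus the
# module-level 'config', 'encode', 'logfile' and file_exist() used above.
# The object below is a hypothetical stand-in, not the project's real leaf class.
class _DemoLeaf(object):
    url = 'http://example.com/album/page-%d.html'   # assumed URL pattern
    dir = 'albums/demo'
    level = 0

# save_pagenates(_DemoLeaf(), limit=5)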
def create_children(map):
    if map.level >= len(config) - 1:
        return
    current_config = config[map.level]
    # Build the soup either from the node's url or from the soup handed down
    # by the parent 'map'.
    if map.url != None:
        soup = BeautifulSoup(urlopen(map.url).read().decode(encode, 'ignore'))
    elif map.soup != None:
        soup = map.soup
    # Apply this level's 'selector' to get the HTML fragments ('soups').
    soups = select(soup, current_config['selector'])
    # use keywords as a filter
    if 'keywords' in current_config:
        soups = keyword_filter(soups, current_config['keywords'])
    children = []
    for i in range(len(soups)):
        # If the element is an anchor, record its url and leave soup as None;
        # otherwise record the element itself as soup and leave url as None.
        if soups[i].name == 'a':
            # print soups[i]['href'], soups[i].string
            url = soups[i]['href']
            # use the content as the title
            title = soups[i].string
            # If 'process' is in the config, use it to transform the url.
            if 'process' in current_config:
                url = current_config['process'](url)
            soup = None
        else:
            # For non-anchor elements the title comes from the config: a
            # 'css:' prefix means take the text of the matching sub-element,
            # otherwise use the config value directly.
            if current_config['title'].startswith('css:'):
                title_css = current_config['title'].split(':', 1)[1]
                title = select(soups[i], title_css)[0].string
            else:
                title = current_config['title']
            url = None
            soup = soups[i]
        # If 'collapse' is True, do not create a new directory named after
        # the title.
        if 'collapse' not in current_config or current_config['collapse'] == False:
            if map.dir is None or map.dir == '':
                dir = title
            else:
                dir = os.path.join(map.dir, title)
        else:
            dir = map.dir
        children.append(LinkMap(
            parent=map,
            url=url,
            soup=soup,
            seletor=config[map.level + 1]['selector'],
            title=title,
            level=map.level + 1,
            dir=utils.formatname(dir)))
    # When the selector matched no sub-links, fall back to the single link
    # found by 'default_selector'.
    if len(children) == 0 and 'default_selector' in current_config.keys():
        soups = select(soup, current_config['default_selector'])
        url = soups[0]['href']
        if 'process' in current_config:
            url = current_config['process'](url)
        if 'collapse' not in current_config or current_config['collapse'] == False:
            # The loop above never ran, so use the fallback link's text as
            # the title for the directory name.
            title = soups[0].string
            dir = os.path.join(map.dir, title)
        else:
            dir = map.dir
        children.append(LinkMap(
            parent=map,
            url=url,
            seletor=config[map.level + 1]['selector'],
            title=soups[0].string,
            level=map.level + 1,
            dir=utils.formatname(dir)))
    map.children = children
    for child in map.children:
        create_children(child)
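# A sketch of the per-level 'config' that create_children() reads, inferred
# from the keys used above ('selector', optional 'title', 'keywords',
# 'process', 'collapse', 'default_selector'). The selectors, titles and URL
# below are placeholders, not the project's real configuration.
config_example = [
    {'selector': 'div.category a'},                     # level 0: anchors, href becomes the child url
    {'selector': 'div.post', 'title': 'css:h2.title'},  # level 1: non-anchor blocks, title taken via CSS
    {'selector': 'div.content img'},                    # level 2: selector handed to the leaf children
]

# Building the tree would then start from a root LinkMap (keyword arguments
# assumed from the calls above, including the 'seletor' spelling):
# root = LinkMap(parent=None, url='http://example.com/', soup=None,
#                seletor=config_example[0]['selector'], title='root',
#                level=0, dir='')
# create_children(root)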
def get_img_links(url):
    soup = Soup(load_page(url))
    imgs = select(soup, 'a.post-meidaurl img')
    return [img['src'] for img in imgs]
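# Putting the two helpers together (illustrative): list every post linked
# from an index page, then collect the image URLs found on each post.
# The index URL is whatever the caller supplies; load_page(), Soup(),
# select() and nomalize_url() are the project's own helpers.
def demo_collect_images(index_url):
    image_urls = []
    for post_url in get_page_links(index_url):
        image_urls.extend(get_img_links(post_url))
    return image_urls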