def main(url, data, obj):
    """Rewrite the links of a page so they carry extra query parameters.

    The function:
      1. fetches the page at ``url`` and collects all of its links;
      2. appends to each link the parameters from ``data`` that the link
         does not already contain;
      3. writes the page content, with the rewritten ``href`` attributes,
         to the file ``obj``.

    Args:
        url: Address of the page to fetch.
        data: Comma-separated parameters (e.g. ``'a=1,b=2'``); they are
            joined with ``&`` when appended to a link.
        obj: Path of the output file; it is opened in ``'w'`` mode and
            therefore overwritten.

    Any exception is caught and reported on stdout rather than propagated.
    """
    print('====BEGIN======')
    try:
        # NOTE(review): Fetch is defined outside this block; get_content()
        # presumably returns a BeautifulSoup object whose renderContents()
        # yields the page markup as a string - confirm.
        fetcher = Fetch(url=url, from_encoding='GBK')
        content = fetcher.get_content().renderContents()

        # Split once; the candidate list is never mutated afterwards.
        candidates = data.split(',')

        for link in fetcher.get_all_link():
            # Keep only the parameters the link does not already contain
            # (substring test). A new list is built instead of removing
            # items from the list being iterated, which skipped elements.
            # Empty pieces (empty ``data``, stray commas) drop out here too.
            missing = [param for param in candidates if param not in link]
            if not missing:
                # Nothing to append: leave the link as it is instead of
                # producing a dangling '?'.
                continue

            # A link that already has a query string is extended with '&'.
            separator = '&' if '?' in link else '?'
            newlink = '%s%s%s' % (link, separator, '&'.join(missing))

            # Swap the old href for the new one in the page markup.
            content = content.replace('href="%s"' % link,
                                      'href="%s"' % newlink)

        # The context manager closes the file even if write() raises.
        with open(obj, 'w') as obj_file:
            obj_file.write(content)
        print('====OVER=======')
    except Exception as e:
        print('an exception occur:%s' % str(e))
def main(url, data, obj):
    """Rewrite the links of a page so they carry extra query parameters.

    The function:
      1. fetches the page at ``url`` and collects all of its links;
      2. appends to each link the parameters from ``data`` that the link
         does not already contain;
      3. writes the page content, with the rewritten ``href`` attributes,
         to the file ``obj``.

    Args:
        url: Address of the page to fetch.
        data: Comma-separated parameters (e.g. ``'a=1,b=2'``); they are
            joined with ``&`` when appended to a link.
        obj: Path of the output file; it is opened in ``'w'`` mode and
            therefore overwritten.

    Any exception is caught and reported on stdout rather than propagated.
    """
    print('====BEGIN======')
    try:
        # NOTE(review): Fetch is defined outside this block; the page is
        # requested with from_encoding='GBK' - confirm that matches the
        # target site.
        fetcher = Fetch(url=url, from_encoding='GBK')
        content = fetcher.get_content().renderContents()

        # Split once; the candidate list is never mutated afterwards.
        candidates = data.split(',')

        for link in fetcher.get_all_link():
            # Keep only the parameters the link does not already contain
            # (substring test). Building a new list avoids removing items
            # from the list being iterated, which skipped elements.
            # Empty pieces (empty ``data``, stray commas) drop out here too.
            missing = [param for param in candidates if param not in link]
            if not missing:
                # Nothing to append: leave the link untouched rather than
                # emitting a dangling '?'.
                continue

            # A link that already has a query string is extended with '&'.
            separator = '&' if '?' in link else '?'
            newlink = '%s%s%s' % (link, separator, '&'.join(missing))

            # Swap the old href for the new one in the page markup.
            content = content.replace('href="%s"' % link,
                                      'href="%s"' % newlink)

        # The context manager closes the file even if write() raises.
        with open(obj, 'w') as obj_file:
            obj_file.write(content)
        print('====OVER=======')
    except Exception as e:
        print('an exception occur:%s' % str(e))
def do_work(argv):
    """Crawl one page: record its resources and queue its unseen links.

    Args:
        argv: Object with a ``get`` method providing the keys ``'url'``
            (page to fetch), ``'workmanager'`` (object exposing
            ``add_job``) and ``'max_size'`` (bound compared against
            ``len(resourcelist)``).

    ``resourcelist``, ``urllist``, ``logger`` and ``Fetch`` are resolved
    outside this function. New resources are appended to ``resourcelist``
    and logged; each link not yet in ``urllist`` is appended to it and
    scheduled on the work manager as another ``do_work`` job.
    """
    target = argv.get('url')
    manager = argv.get('workmanager')
    limit = argv.get('max_size')
    page = Fetch(target)

    # NOTE(review): the '>' comparison lets resourcelist reach
    # max_size + 1 entries before the loops stop - confirm this is intended.
    for item in page.get_all_resource():
        if len(resourcelist) > limit:
            break
        if item in resourcelist:
            continue
        resourcelist.append(item)
        logger.get_logger.info(item)

    for link in page.get_all_link():
        # Link scheduling is also bounded by the number of resources found.
        if len(resourcelist) > limit:
            break
        if link in urllist:
            continue
        urllist.append(link)
        manager.add_job(do_work, workmanager=manager, url=link,
                        max_size=limit)