def __init__(self):
    arg_parser = ArgumentParser()
    # The --text-start definition and parse_args() call are assumed: an
    # unparsed ArgumentParser has no text_start attribute to read.
    arg_parser.add_argument('--text-start', dest='text_start', type=int, default=0)
    args = arg_parser.parse_args()
    self.count = args.text_start
    self.urls = url_manager.UrlManager()
    self.downloader = html_downloader.HtmlDownloader()
    self.parser = html_parser.HtmlParser()
    self.outputer = outputer.HtmlOutputer()
def __init__(self, argvs):
    # self.url_manager = url_manager.UrlManager("./new_urls.txt", "./old_urls.txt", "./bad_urls.txt")
    self.html_downloader = html_downloader.HtmlDownloader()
    self.qa_list_parser = sqa_url_parser.QAListParser()
    self.qa_parser = sqa_url_parser.QAParser()
    # self.qa_list_saver = html_saver.HtmlSaver("./test.txt")
    self.qa_saver = html_saver.HtmlSaver("./actor_qa.txt-" + str(argvs[3]))
def __init__(self, tp, seed):
    self.tp = tp
    self.seed = seed
    self.visited = set()
    self.html_outputer = html_outputer.HtmlOutputer()
    self.html_parser = html_parser.HtmlParser()
    self.html_downloader = html_downloader.HtmlDownloader()
def get_details(self, url, app):
    html_content = html_downloader.HtmlDownloader().download(url)
    soup = BeautifulSoup(html_content, 'html.parser', from_encoding='utf-8')
    # App metadata block: label links plus "key: value" <p> rows.
    appabout = soup.find('div', class_='appabout')
    lables = appabout.find('em').find_all('a')
    for i in lables:
        app.lable.append(i.get_text())
    ppp = appabout.find_all('p')
    for i in range(0, len(ppp)):
        text = ppp[i].find('em').get_text()
        if text == "大小:":  # size
            app.size = self.getcontent(ppp[i].get_text())
        elif text == '类别:':  # category
            app.apptype = self.getcontent(ppp[i].get_text())
        elif text == '版本:':  # version
            app.version = self.getcontent(ppp[i].get_text())
        elif text == '浏览次数:':  # view count
            app.hot = int(self.getcontent(ppp[i].get_text()))
        elif text == '页面最后更新时间:':  # last page update time
            app.lastupdatetime = self.getcontent(ppp[i].get_text())
    # Description: strip tabs and carriage returns, keep non-empty lines.
    appinfo = soup.find('div', class_='appinfo')
    description = appinfo.find('div', id='desc').get_text()
    description = description.replace('\t', '').replace('\r', '')
    lis = description.split('\n')
    for i in lis:
        if len(i) > 0:
            app.desc.append(i)
    # Screenshot gallery.
    imgsli = appinfo.find('div', id='showcase').find(
        'div', class_='scrollbar').find('ul').find_all('li')
    for li in imgsli:
        imgpath = li.find('img')['src']
        app.screenshot.append(imgpath)
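# The getcontent helper called above is not part of this snippet. A minimal
# sketch, assuming each <p> reads as "label: value" with either a fullwidth or
# an ASCII colon:
def getcontent(self, text):
    for sep in ('：', ':'):
        if sep in text:
            # Keep everything after the first colon as the field value.
            return text.split(sep, 1)[1].strip()
    return text.strip()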
def __init__(self):
    super(SpiderMain, self).__init__()
    self.ts_url = 'http://127.0.0.1:8000/api/lol/save_news'
    self.urls = url_manager.UrlManager()
    self.downloader = html_downloader.HtmlDownloader()
    self.parser = html_parser.HtmlParser()
    self.ts = ts_data.TsData(self.ts_url)
def __init__(self):
    super(SpiderMain, self).__init__()
    self.urls = url_manager.UrlManager()
    self.downloader = html_downloader.HtmlDownloader()
    self.parser = html_parser.HtmlParser()
    self.outputer = html_outputer.HtmlOutputer()
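# A minimal sketch of the craw() loop these four components are typically
# wired into; the method names on urls follow the interface documented in a
# later snippet, while collect_data and output_html on the outputer are
# assumptions:
def craw(self, root_url):
    self.urls.add_new_url(root_url)
    while self.urls.has_new_url():
        new_url = self.urls.get_new_url()
        html_cont = self.downloader.download(new_url)
        new_urls, new_data = self.parser.parse(new_url, html_cont)
        self.urls.add_new_urls(new_urls)
        self.outputer.collect_data(new_data)
    self.outputer.output_html()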
def __init__(self):
    self.urls = url_manager.UrlManager()
    self.downloader = html_downloader.HtmlDownloader()
    self.parser = html_parser.HtmlParser()
    self.outputer = html_outputer.HtmlOutputer()
    # self.targetKeywords = ['B-box', 'beatbox', 'bbox', 'Beatbox']
    self.targetKeywords = ['三峡']  # "Three Gorges"
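# A minimal sketch of how targetKeywords might be applied when deciding
# whether a parsed page is relevant; the data['title'] field is an assumption:
def is_target(self, data):
    title = data.get('title', '')
    return any(keyword in title for keyword in self.targetKeywords)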
def __init__(self):
    self.url_manager = url_manager.UrlManager()
    self.downloader = html_downloader.HtmlDownloader()
    self.google_fetcher = html_google_fetcher.HtmlGoogleParser()
    self.pr_calculator = page_rank_util.PRCalculator()
    self.url_parser = html_parser.HtmlParser()
    self.outputer = html_outputer.HtmlOutputer()
def __init__(self, count, urls):
    threading.Thread.__init__(self)
    self.count = count
    self.urls = urls
    self.downloader = html_downloader.HtmlDownloader()
    self.parser = html_parser.HtmlParser()
    self.outputer = html_outputer.HtmlOutputer()
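# A minimal usage sketch for the threaded variant above, assuming the class is
# named SpiderThread, all workers share one UrlManager instance, and count
# serves as a worker id:
shared_urls = url_manager.UrlManager()
workers = [SpiderThread(count=i, urls=shared_urls) for i in range(4)]
for worker in workers:
    worker.start()
for worker in workers:
    worker.join()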
def __init__(self, xing, ming):
    self.downloader = html_downloader.HtmlDownloader()
    self.parser = html_parser.HtmlParser()
    self.outputer = html_outputer.HtmlOutputer()
    self.xing = xing  # surname
    self.ming = ming  # given name
    self.param = {
        'origin': 'searchauthorlookup',
        'src': 'al',
        'edit': '',
        'poppUp': '',
        'basicTab': '',
        'affiliationTab': '',
        'advancedTab': '',
        'st1': xing,
        'st2': ming,
        'institute': '',
        '_exactSearch': 'on',
        'orcidId': '',
        # '_authSubject': 'on' was repeated once per commented-out subject
        # filter below; duplicate keys in a dict literal collapse to one entry,
        # so a single entry is kept.
        # 'authSubject': 'LFSC',
        # 'authSubject': 'HLSC',
        # 'authSubject': 'PHSC',
        # 'authSubject': 'SOSC',
        '_authSubject': 'on',
        's': 'AUTH--LAST--NAME({0}) AND AUTH--FIRST({1})'.format(ming, xing),
        'sdt': 'al',
        'sot': 'al',
        # 'searchId': sid,
        # 'sid': sid,
    }
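# A minimal sketch of sending self.param to an author-search endpoint with
# requests; the URL below and the parse() signature are assumptions, not taken
# from this snippet:
import requests

def search(self):
    url = 'https://www.scopus.com/results/authorNamesList.uri'  # assumed endpoint
    resp = requests.get(url, params=self.param, timeout=10)
    return self.parser.parse(url, resp.text)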
def __init__(self):
    # Load configuration
    cf = ConfigParser.ConfigParser()
    cf.read("config.conf")
    self.projectid = cf.get("start", "project_id")
    self.root_url = cf.get("start", "root_url")
    self.number = cf.get("start", "number")
    # Open the database connection and look up the project
    db = mysqldbhand()
    db.dbconnect()
    db.init_tables(self.projectid)
    project = db.FindAll('project', '*', where='id= %s' % (self.projectid))
    project_field = db.FindAll('project_field', '*', where='pid= %s' % (self.projectid))
    self.tablename = project[0][2] + '_content'
    # URL manager
    self.urls = url_manager.UrlManager(self.tablename)
    # Downloader
    self.downloader = html_downloader.HtmlDownloader()
    # Page parser
    self.parse = html_parser.HtmlParser(self.tablename, project, project_field)
    # Output stage that writes into the database
    self.outputer = html_outputer.HtmlOutputer(self.tablename)
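# A config.conf that would satisfy the reads above; the section and key names
# come from the code, the values are illustrative:
#
# [start]
# project_id = 1
# root_url = http://example.com/start
# number = 100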
def __init__(self, info):
    self.datas = []
    self.urls = info['urls']
    self.common = info['common']
    self.pageNum = info['pageNum']
    self.downloader = html_downloader.HtmlDownloader()
    self.parser = html_parser.HtmlParser()
def __init__(self):
    self.maxcount = 1000  # maximum number of items to crawl
    self.urls = url_manager.UrlManager()
    self.downloader = html_downloader.HtmlDownloader()
    self.parser = html_parser.HtmlParser()
    self.pdf = pdf_download.PdfDownload()
    self.pic = download.PicDowload()
def __init__(self, ty, cik):
    self.ty = ty  # ty is the type of the filing
    self.cik = cik
    self.baseaddress = "https://www.sec.gov"
    # a downloader used to download html
    self.downloader = html_downloader.HtmlDownloader()
    # the first page of the filing list
    self.filingListUrl = (self.baseaddress
                          + "/cgi-bin/browse-edgar?action=getcompany"
                          + "&CIK=" + self.cik
                          + "&type=" + self.ty
                          + "&dateb=&owner=exclude&count=100")
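# A minimal usage sketch, assuming the class is named EdgarSpider and the
# downloader exposes download(url) as in the other snippets; the CIK below is
# Apple's, for illustration:
spider = EdgarSpider(ty="10-K", cik="0000320193")
listing_html = spider.downloader.download(spider.filingListUrl)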
def __init__(self):
    self.downloader = html_downloader.HtmlDownloader()
    self.parser = html_parser.HtmlParser()
    self.headers = {
        'content-type': 'application/json;charset=utf8',
        'Connection': 'close'
    }
    self.base_url = 'http://120.78.132.250:8084/admin_api'
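# A minimal sketch of pushing scraped data to the admin API with the headers
# above; the /news path and the shape of item are assumptions:
import json
import requests

def push(self, item):
    url = self.base_url + '/news'
    resp = requests.post(url, data=json.dumps(item), headers=self.headers, timeout=10)
    return resp.status_code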
def __init__(self):
    # Initialize the objects craw() depends on: the URL manager, page
    # downloader, page parser, and outputer.
    self.urls = url_manager.UrlManager()
    self.downloader = html_downloader.HtmlDownloader()
    self.parser = html_parser.HtmlParser()
    self.outputer = html_outputer.HtmlOutputer()
def __init__(self): """初始化 建立四个模块的实例 """ self.urls = url_manager.UrlManager() self.downloader = html_downloader.HtmlDownloader() self.parser = html_parser.HtmlParser() self.outputer = html_outputer.HtmlOutputer()
def __init__(self):
    self.urls = url_manager.UrlManager()
    self.downloader = html_downloader.HtmlDownloader()
    self.parser = html_parser.HtmlParser()
    self.outputer = html_outputer.HtmlOutputer()
    self.download_list = [
        'tv', 'korean_tv', 'china_tv', 'hongkong_tv', 'cartoon', 'jilu'
    ]
def __init__(self):
    self.urls = url_manager.UrlManager()
    self.downloader = html_downloader.HtmlDownloader()
    self.parser = html_parser.HtmlParser()
    self.output = html_output.HtmlOutput()
    self.headers = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.100 Safari/537.36"
    }
def __init__(self, root_url):
    self.url = root_url
    self.urls = url_manager.UrlManager()
    self.downloader = html_downloader.HtmlDownloader()
    self.parser = html_parser.HtmlParser()
    self.outputer = html_output.HtmlOutputer()
def __init__(self):
    self.urlManager = url_manager.UrlManager()
    self.parser = html_parser.HtmlParser()
    self.downloader = html_downloader.HtmlDownloader()
    self.collector = data_collector.DataCollector()
    self.lock = threading.Lock()  # thread lock
    self.local_crawed = threading.local()  # per-thread storage so each worker keeps its own data
    self.count = 0  # global count of crawled pages
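# A minimal sketch of how the lock above would guard the shared page counter
# in a worker loop; the crawl_one name, the parse() signature, and the
# DataCollector.collect method are assumptions:
def crawl_one(self, url):
    html = self.downloader.download(url)
    new_urls, data = self.parser.parse(url, html)
    self.collector.collect(data)
    with self.lock:
        self.count += 1  # only one thread mutates the shared count at a time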
def __init__(self, isuse, connection):
    self.config = config
    self.connection = connection
    self.urls = url_manager.UrlManager(connection, isuse)
    self.downloader = html_downloader.HtmlDownloader()
    self.parser = html_parser.HtmlParse()
    self.outputer = html_outputer.HtmlOutputer()
    self.imgdownloader = img_downloader.ImgDownloader()
def __init__(self):
    # URL manager
    self.urls = url_manager.UrlManager()
    # HTML downloader
    self.downloader = html_downloader.HtmlDownloader()
    # HTML parser
    self.parser = html_parser.HtmlParser()
    # HTML outputer
    self.outputer = html_outputer.HtmlOutputer()
def __init__(self):
    # urls manages the crawl frontier
    self.urls = url_manager.UrlManager()
    # downloader fetches pages
    self.downloader = html_downloader.HtmlDownloader()
    # parser extracts links and data
    self.parser = html_parser.HtmlParser()
    # outputer writes the processed data out to an HTML page
    self.outputer = html_outputer.HtmlOutputer()
def __init__(self):
    # Url manager
    self.urls = url_manager.UrlManager()
    # Url downloader
    self.downloader = html_downloader.HtmlDownloader()
    # Url parser
    self.parser = html_parser.HtmlParser()
    # Url output device
    self.outputer = html_outputer.HtmlOutputer()
def __init__(self):
    super(SpiderMain, self).__init__()
    self.urls = url_manager.UrlManager()
    self.downloader = html_downloader.HtmlDownloader()
    self.parser = html_parser.HtmlParser()
    self.outputer = html_outputer.HtmlOutputer()
    self.uploader = data_uploader.DataUploader()
    self.localDataManager = local_data_manager.LocalDataManager()
    self.pdfDealer = fang_main.PDF_Data_Dealer(self.localDataManager)
def __init__(self):
    # URL manager
    self.urls = url_manager.UrlManager()
    # Page downloader
    self.downloader = html_downloader.HtmlDownloader()
    # Page parser
    self.parser = html_parser.HtmlParser()
    # Data outputer
    self.output = html_outputer.HtmlOutput()
def __init__(self):
    # URL manager
    self.urls = url_manager.UrlManager()
    # Downloader
    self.downloader = html_downloader.HtmlDownloader()
    # Parser
    self.parser = html_parser.HtmlParse()
    # Outputer
    self.outputer = html_outputer.HtmlOutputer()
def __init__(self):
    # URL manager: keeps two sets, one for crawled URLs and one for URLs still
    # to crawl, and provides four methods: has_new_url, get_new_url,
    # add_new_url, add_new_urls.
    self.urls = url_manager.UrlManager()
    # Downloader: provides one method, download(url), which returns the page
    # content as a string.
    self.downloader = html_downloader.HtmlDownloader()
    # HTML page parser: provides one method, parse(new_url, html_cont), which
    # returns the URLs and data extracted from the page.
    self.parser = html_parser.HtmlParser()
    # Outputer
    self.outputer = html_outputer.HtmlOutputer()
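# A minimal UrlManager sketch matching the two-set, four-method interface
# described in the comments above; the real implementation is not shown in
# this snippet:
class UrlManager(object):
    def __init__(self):
        self.new_urls = set()  # URLs still to crawl
        self.old_urls = set()  # URLs already crawled

    def has_new_url(self):
        return len(self.new_urls) > 0

    def get_new_url(self):
        url = self.new_urls.pop()
        self.old_urls.add(url)
        return url

    def add_new_url(self, url):
        if url is not None and url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        if urls:
            for url in urls:
                self.add_new_url(url)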
def __init__(self):
    self.urls = url_manager.UrlManager()  # URL manager
    self.downloder = html_downloader.HtmlDownloader()  # page downloader
    self.parser = html_parser.HtmlParser()  # 51JOB page parser
    self.dataanalyse = data_analyse.DataAnalyse()  # data analysis
    self.datapicture = data_picture.DataPicture()  # data visualization
    self.datapicturepie = data_picture_pie.ShowJodSalary()  # pie-chart visualization
    self.yy = yuyin.YuYin()  # voice announcements
    # self.delect = del_huancun.DelHuancun()  # clear cached voice data
    self.user_agent = user_agent.Random_user_agent()  # random request header