Exemplo n.º 1
0
 def __init__(self):
     """Wire up the spider's collaborators and the page counter."""
     # NOTE(review): `args` is an ArgumentParser *instance*, not the result of
     # parse_args(); reading `args.text_start` from it looks wrong — confirm
     # whether a parse_args() call (and a matching add_argument) is missing.
     args = ArgumentParser()
     self.count = args.text_start
     self.urls = url_manager.UrlManager()
     self.downloader = html_downloader.HtmlDownloader()
     self.parser = html_parser.HtmlParser()
     self.outputer = outputer.HtmlOutputer()
Exemplo n.º 2
0
 def __init__(self, argvs):
     """Build the actor-Q&A crawling pipeline.

     argvs[3] selects the shard suffix appended to the output file name.
     """
     self.html_downloader = html_downloader.HtmlDownloader()
     self.qa_parser = sqa_url_parser.QAParser()
     self.qa_list_parser = sqa_url_parser.QAListParser()
     # One saver per shard; the suffix comes from the 4th CLI argument.
     self.qa_saver = html_saver.HtmlSaver("./actor_qa.txt-" + str(argvs[3]))
Exemplo n.º 3
0
 def __init__(self, tp, seed):
     """Remember the crawl type and seed URL; create spider components."""
     self.tp = tp              # crawl type selector
     self.seed = seed          # starting URL
     self.visited = set()      # URLs already processed
     self.html_outputer = html_outputer.HtmlOutputer()
     self.html_parser = html_parser.HtmlParser()
     self.html_downloader = html_downloader.HtmlDownloader()
Exemplo n.º 4
0
 def get_details(self, url, app):
     """Download an app detail page and populate *app* in place.

     Parameters:
         url: detail-page URL to fetch.
         app: record object whose ``lable`` (existing attribute name, kept
             for compatibility), ``size``, ``apptype``, ``version``, ``hot``,
             ``lastupdatetime``, ``desc`` and ``screenshot`` fields are
             filled from the page.
     """
     html_content = html_downloader.HtmlDownloader().download(url)
     soup = BeautifulSoup(html_content,
                          'html.parser',
                          from_encoding='utf-8')
     appabout = soup.find('div', class_='appabout')
     # Tag labels live in <em><a>…</a></em> inside the "appabout" box.
     for anchor in appabout.find('em').find_all('a'):
         app.lable.append(anchor.get_text())
     # Map the page's field label to the attribute it populates.
     # (Labels are Chinese: size, category, version, view count, last update.)
     field_map = {
         "大小:": 'size',
         '类别:': 'apptype',
         '版本:': 'version',
         '浏览次数:': 'hot',
         '页面最后更新时间:': 'lastupdatetime',
     }
     # Iterate the <p> tags directly instead of range(len(...)), and read
     # each paragraph's text only once instead of once per elif branch.
     for para in appabout.find_all('p'):
         label = para.find('em').get_text()
         attr = field_map.get(label)
         if attr is None:
             continue  # unrecognised field: ignored, as before
         value = self.getcontent(para.get_text())
         if attr == 'hot':
             value = int(value)  # view count is stored as an int
         setattr(app, attr, value)
     appinfo = soup.find('div', class_='appinfo')
     description = appinfo.find('div', id='desc').get_text()
     description = description.replace('\t', '').replace('\r', '')
     # Keep non-empty description lines.
     for line in description.split('\n'):
         if line:
             app.desc.append(line)
     # Screenshots: <div id="showcase"> > div.scrollbar > ul > li > img[src]
     imgsli = appinfo.find('div', id='showcase').find(
         'div', class_='scrollbar').find('ul').find_all('li')
     for li in imgsli:
         app.screenshot.append(li.find('img')['src'])
Exemplo n.º 5
0
 def __init__(self):
     """Spider that posts parsed LoL news to a local REST endpoint."""
     super(SpiderMain, self).__init__()
     # Endpoint that receives the scraped articles.
     self.ts_url = 'http://127.0.0.1:8000/api/lol/save_news'
     self.urls = url_manager.UrlManager()
     self.downloader = html_downloader.HtmlDownloader()
     self.parser = html_parser.HtmlParser()
     # Uploader bound to the endpoint above.
     self.ts = ts_data.TsData(self.ts_url)
Exemplo n.º 6
0
    def __init__(self):
        """Assemble the four standard crawler components."""
        super(SpiderMain, self).__init__()
        self.urls = url_manager.UrlManager()                # URL frontier
        self.downloader = html_downloader.HtmlDownloader()  # page fetcher
        self.parser = html_parser.HtmlParser()              # HTML -> data/links
        self.outputer = html_outputer.HtmlOutputer()        # result writer
Exemplo n.º 7
0
 def __init__(self):
     """Crawler that keeps only pages matching target keywords."""
     self.urls = url_manager.UrlManager()
     self.downloader = html_downloader.HtmlDownloader()
     self.parser = html_parser.HtmlParser()
     self.outputer = html_outputer.HtmlOutputer()
     # Pages are filtered against these keywords ('三峡' = Three Gorges).
     self.targetKeywords = ['三峡']
 def __init__(self):
     """PageRank crawler: fetch, parse, rank and output pages."""
     self.url_manager = url_manager.UrlManager()
     self.downloader = html_downloader.HtmlDownloader()
     # Fetches result pages via Google.
     self.google_fetcher = html_google_fetcher.HtmlGoogleParser()
     # Computes PageRank scores for the crawled link graph.
     self.pr_calculator = page_rank_util.PRCalculator()
     self.url_parser = html_parser.HtmlParser()
     self.outputer = html_outputer.HtmlOutputer()
Exemplo n.º 9
0
 def __init__(self, count, urls):
     """Worker thread that crawls pages from a shared URL source.

     count: worker identifier / page budget for this thread.
     urls:  shared URL manager used by all workers.
     """
     threading.Thread.__init__(self)
     self.count = count
     self.urls = urls
     self.downloader = html_downloader.HtmlDownloader()
     self.parser = html_parser.HtmlParser()
     self.outputer = html_outputer.HtmlOutputer()
Exemplo n.º 10
0
 def __init__(self, xing, ming):
     """Set up spider components and the author-search form payload.

     Parameters:
         xing: family name (surname) used in the author lookup.
         ming: given name used in the author lookup.
     """
     self.downloader = html_downloader.HtmlDownloader()
     self.parser = html_parser.HtmlParser()
     self.outputer = html_outputer.HtmlOutputer()
     self.xing = xing
     self.ming = ming
     # Query payload for the author search.
     # FIX: the original literal repeated the key '_authSubject': 'on' four
     # times; duplicate keys in a dict literal silently collapse to the last
     # entry, so a single entry is exactly equivalent.  The commented-out
     # 'authSubject' / 'searchId' / 'sid' alternatives were dead code and
     # have been removed.
     self.param = {
         'origin': 'searchauthorlookup',
         'src': 'al',
         'edit': '',
         'poppUp': '',      # spelling matches the remote form field name
         'basicTab': '',
         'affiliationTab': '',
         'advancedTab': '',
         'st1': xing,
         'st2': ming,
         'institute': '',
         '_exactSearch': 'on',
         'orcidId': '',
         '_authSubject': 'on',
         's': 'AUTH--LAST--NAME({0}) AND AUTH--FIRST({1})'.format(ming, xing),
         'sdt': 'al',
         'sot': 'al',
     }
Exemplo n.º 11
0
    def __init__(self):
        """Load crawler settings from config.conf, open the project database,
        and build the URL-manager / downloader / parser / outputer pipeline.
        """
        # Read the [start] section of the configuration file.
        cf = ConfigParser.ConfigParser()
        cf.read("config.conf")
        self.projectid = '%s' % cf.get("start", "project_id")
        self.root_url = '%s' % cf.get("start", "root_url")
        self.number = '%s' % cf.get("start", "number")

        # Open the database connection and ensure per-project tables exist.
        db = mysqldbhand()
        db.dbconnect()
        db.init_tables(self.projectid)
        # NOTE(review): WHERE clauses are built with %-interpolation — safe
        # only because projectid comes from the local config file; a
        # parameterized query would still be preferable.
        project = db.FindAll('project', '*', where='id= %s' % (self.projectid))
        project_field = db.FindAll('project_field',
                                   '*',
                                   where='pid= %s' % (self.projectid))
        self.tablename = project[0][2] + '_content'
        # URL manager keyed by the per-project content table.
        self.urls = url_manager.UrlManager(self.tablename)
        # Page downloader.
        self.downloader = html_downloader.HtmlDownloader()
        # Page parser, configured with the project schema rows loaded above.
        self.parse = html_parser.HtmlParser(self.tablename, project,
                                            project_field)
        # Output/persistence component for the content table.
        self.outputer = html_outputer.HtmlOutputer(self.tablename)
Exemplo n.º 12
0
 def __init__(self, info):
     """Configure the crawl from an info dict.

     info must provide the keys 'urls', 'common' and 'pageNum'.
     """
     self.datas = []                # accumulated results
     self.urls = info['urls']
     self.common = info['common']
     self.pageNum = info['pageNum']
     self.downloader = html_downloader.HtmlDownloader()
     self.parser = html_parser.HtmlParser()
Exemplo n.º 13
0
 def __init__(self):
     """Crawler with a fetch cap that also saves PDFs and pictures."""
     self.maxcount = 1000  # hard cap on the number of items to crawl
     self.urls = url_manager.UrlManager()
     self.downloader = html_downloader.HtmlDownloader()
     self.parser = html_parser.HtmlParser()
     self.pdf = pdf_download.PdfDownload()   # PDF saver
     self.pic = download.PicDowload()        # picture saver
Exemplo n.º 14
0
 def __init__(self, ty, cik):
     """EDGAR filing crawler for one company and filing type.

     ty:  the filing type to look up.
     cik: the company's SEC CIK number.
     """
     self.ty = ty
     self.cik = cik
     self.baseaddress = "https://www.sec.gov"
     # Downloader used to fetch the HTML pages.
     self.downloader = html_downloader.HtmlDownloader()
     # First page of the filing list (up to 100 entries per page).
     self.filingListUrl = (self.baseaddress
                           + "/cgi-bin/browse-edgar?action=getcompany&CIK="
                           + self.cik + "&type=" + self.ty
                           + "&dateb=&owner=exclude&count=100")
Exemplo n.º 15
0
 def __init__(self):
     """Client state for the admin API: downloader, parser, request defaults."""
     self.downloader = html_downloader.HtmlDownloader()
     self.parser = html_parser.HtmlParser()
     # Default request headers: JSON payloads, no keep-alive.
     self.headers = {
         'content-type': 'application/json;charset=utf8',
         'Connection': 'close',
     }
     # Base URL of the admin API.
     self.base_url = 'http://120.78.132.250:8084/admin_api'
Exemplo n.º 16
0
    def __init__(self):
        """Create the objects craw() relies on: the URL manager, the page
        downloader, the page parser and the outputer."""
        self.urls = url_manager.UrlManager()
        self.downloader = html_downloader.HtmlDownloader()
        self.parser = html_parser.HtmlParser()
        self.outputer = html_outputer.HtmlOutputer()
Exemplo n.º 17
0
 def __init__(self):
     """Instantiate the four crawler modules."""
     self.urls = url_manager.UrlManager()                # URL bookkeeping
     self.downloader = html_downloader.HtmlDownloader()  # fetches pages
     self.parser = html_parser.HtmlParser()              # extracts data/links
     self.outputer = html_outputer.HtmlOutputer()        # writes results
 def __init__(self):
     """Video-site crawler restricted to a fixed set of categories."""
     self.urls = url_manager.UrlManager()
     self.downloader = html_downloader.HtmlDownloader()
     self.parser = html_parser.HtmlParser()
     self.outputer = html_outputer.HtmlOutputer()
     # Category slugs to crawl (TV, Korean/Chinese/HK dramas, cartoons,
     # documentaries).
     self.download_list = [
         'tv', 'korean_tv', 'china_tv', 'hongkong_tv', 'cartoon', 'jilu'
     ]
Exemplo n.º 19
0
 def __init__(self):
     """Crawler components plus the default request headers."""
     self.urls = url_manager.UrlManager()
     self.downloader = html_downloader.HtmlDownloader()
     self.parser = html_parser.HtmlParser()
     self.output = html_output.HtmlOutput()
     # NOTE(review): the header key uses an underscore ("User_Agent") while
     # the real HTTP header is "User-Agent" — kept as-is to preserve the
     # original behavior; confirm against the downloader's usage.
     self.headers = {
         "User_Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.100 Safari/537.36"
     }
Exemplo n.º 20
0
    def __init__(self, root_url):
        """Crawler rooted at *root_url*."""
        self.url = root_url  # starting point of the crawl
        self.urls = url_manager.UrlManager()
        self.downloader = html_downloader.HtmlDownloader()
        self.parser = html_parser.HtmlParser()
        self.outputer = html_output.HtmlOutputer()
Exemplo n.º 21
0
 def __init__(self):
     """Multi-threaded crawler state: components, a lock and counters."""
     self.urlManager = url_manager.UrlManager()
     self.parser = html_parser.HtmlParser()
     self.downloader = html_downloader.HtmlDownloader()
     self.collector = data_collector.DataCollector()
     # Lock guarding state shared between worker threads.
     self.lock = threading.Lock()
     # Thread-local storage so each worker keeps its own crawl data.
     self.local_crawed = threading.local()
     # Global count of crawled pages.
     self.count = 0
Exemplo n.º 22
0
 def __init__(self, isuse, connection):
     """DB-backed crawler; *isuse* and *connection* feed the URL manager."""
     # NOTE(review): `config` is read from module scope, not a parameter —
     # confirm it is defined at import time.
     self.config = config
     self.connection = connection
     self.urls = url_manager.UrlManager(connection, isuse)
     self.downloader = html_downloader.HtmlDownloader()
     self.parser = html_parser.HtmlParse()
     self.outputer = html_outputer.HtmlOutputer()
     self.imgdownloader = img_downloader.ImgDownloader()  # image fetcher
Exemplo n.º 23
0
 def __init__(self):
     """Standard four-part crawler wiring."""
     self.urls = url_manager.UrlManager()                # URL manager
     self.downloader = html_downloader.HtmlDownloader()  # HTML downloader
     self.parser = html_parser.HtmlParser()              # HTML parser
     self.outputer = html_outputer.HtmlOutputer()        # HTML outputer
Exemplo n.º 24
0
 def __init__(self):
     """Wire the crawl pipeline together."""
     # `urls` tracks which URLs are pending vs. already crawled.
     self.urls = url_manager.UrlManager()
     # `downloader` fetches raw HTML.
     self.downloader = html_downloader.HtmlDownloader()
     # `parser` turns HTML into data and follow-up links.
     self.parser = html_parser.HtmlParser()
     # `outputer` renders the collected data to an HTML page.
     self.outputer = html_outputer.HtmlOutputer()
Exemplo n.º 25
0
 def __init__(self):
     """Create the URL manager, downloader, parser and output device."""
     self.urls = url_manager.UrlManager()
     self.downloader = html_downloader.HtmlDownloader()
     self.parser = html_parser.HtmlParser()
     self.outputer = html_outputer.HtmlOutputer()
Exemplo n.º 26
0
 def __init__(self):
     """Spider that also uploads results and processes local PDF data."""
     super(SpiderMain, self).__init__()
     self.urls = url_manager.UrlManager()
     self.downloader = html_downloader.HtmlDownloader()
     self.parser = html_parser.HtmlParser()
     self.outputer = html_outputer.HtmlOutputer()
     self.uploader = data_uploader.DataUploader()
     # The local data store is created first; the PDF dealer consumes it.
     self.localDataManager = local_data_manager.LocalDataManager()
     self.pdfDealer = fang_main.PDF_Data_Dealer(self.localDataManager)
Exemplo n.º 27
0
 def __init__(self):
     """Acquire the URL manager, page downloader, page parser and outputer."""
     self.urls = url_manager.UrlManager()
     self.downloader = html_downloader.HtmlDownloader()
     self.parser = html_parser.HtmlParser()
     self.output = html_outputer.HtmlOutput()
Exemplo n.º 28
0
 def __init__(self):
     """Standard crawler wiring (the parser class here is HtmlParse)."""
     self.urls = url_manager.UrlManager()                # URL manager
     self.downloader = html_downloader.HtmlDownloader()  # downloader
     self.parser = html_parser.HtmlParse()               # parser
     self.outputer = html_outputer.HtmlOutputer()        # outputer
Exemplo n.º 29
0
 def __init__(self):
     """Build the crawl pipeline.

     The URL manager keeps two sets (crawled / pending) and offers
     has_new_url, get_new_url, add_new_url and add_new_urls.  The
     downloader exposes download(url) -> str; the parser exposes
     parse(new_url, html_cont) -> (urls, data); the outputer collects
     and writes the parsed data.
     """
     self.urls = url_manager.UrlManager()
     self.downloader = html_downloader.HtmlDownloader()
     self.parser = html_parser.HtmlParser()
     self.outputer = html_outputer.HtmlOutputer()
Exemplo n.º 30
0
 def __init__(self):
     """51job crawler with analysis, visualisation and voice reporting."""
     self.urls = url_manager.UrlManager()                  # URL manager
     # NOTE(review): the attribute name "downloder" (sic) is kept — renaming
     # it would break any caller that references it.
     self.downloder = html_downloader.HtmlDownloader()     # page downloader
     self.parser = html_parser.HtmlParser()                # 51job page parser
     self.dataanalyse = data_analyse.DataAnalyse()         # data analysis
     self.datapicture = data_picture.DataPicture()         # chart output
     self.datapicturepie = data_picture_pie.ShowJodSalary()  # pie charts
     self.yy = yuyin.YuYin()                               # voice announcer
     self.user_agent = user_agent.Random_user_agent()      # random UA header