Example #1
    def start_requests(self):
        print 'Start requests'
        new_urls = []
        # Collect the ids that have already been crawled so they can be skipped.
        all_finished_id = list(self.doraemon.getAllHasSet(self.finished_ids))
        file_path = '/home/dev/Data/rsyncData/test/feng_receptor.csv'
        items = self.file.readFromCSV(file_path)
        items.pop(0)  # drop the CSV header row

        for item in items:
            key = item[0]
            if key not in all_finished_id:
                name = key.strip()
                url = item[1]
                new_urls.append([url, name])

        if len(new_urls) == 0:
            print 'No more urls.'
            return

        request = BrowserRequest()
        request.start_chrome(new_urls,
                             2,
                             self.log_path,
                             None,
                             callback=self.parse)
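The loop above assumes `self.file.readFromCSV` returns the file as a list of rows (header first, hence the `pop(0)`), with the id in column 0 and the URL in column 1. A minimal stand-in under that assumption might look like the sketch below; `read_from_csv` is an illustrative name, not the project's actual helper.

    import csv

    def read_from_csv(file_path):
        # Hypothetical stand-in for self.file.readFromCSV: return every row,
        # including the header, as a list of column values (the Python 2 csv
        # module wants the file opened in binary mode).
        with open(file_path, 'rb') as f:
            return list(csv.reader(f))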
Example #2
    def start(self):
        # Bail out unless the scheduler says this spider may run now.
        if self.doraemon.isSpiderReadyToRun() is False:
            message4 = 'It is not ready to run spider: {0}'.format(self.name)
            print message4
            return
        message5 = 'Start {0} requests'.format(self.name)
        self.file.logger(self.log_path, message5)
        print message5
        message6 = 'Start requests: {0} '.format(self.name)
        self.file.logger(self.log_path, message6)
        print message6
        # Read the urls that still need crawling.
        new_url_titles = self.doraemon.readNewUrls(self.doraemon.bf_content, self.url_path)
        if len(new_url_titles) == 0:
            # Release the concurrency slot before giving up.
            self.doraemon.recoveryConcurrency(self.concurrency_file_spider, self.max_concurrency_spider)
            message7 = 'No new url for {0}'.format(self.name)
            self.file.logger(self.log_path, message7)
            print message7
            return
        request = BrowserRequest()
        content = request.start_chrome(new_url_titles,
                                       self.content_timeout,
                                       self.max_pool_size,
                                       self.log_path,
                                       None,
                                       callback=self.parse)
        # Release the concurrency slot and free what the crawl allocated.
        self.doraemon.recoveryConcurrency(self.concurrency_file_spider, self.max_concurrency_spider)
        message8 = 'End requests for {0}'.format(str(len(content)))
        self.file.logger(self.log_path, message8)
        print message8
        del content, new_url_titles, request
        gc.collect()
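`recoveryConcurrency` has to run on both the empty-url path and after the crawl, so it is written twice. A hedged variant (same call names as above, logging omitted for brevity) could fold both call sites into one try/finally, which also covers the case where `start_chrome` raises:

    def start(self):
        new_url_titles = self.doraemon.readNewUrls(self.doraemon.bf_content, self.url_path)
        try:
            if len(new_url_titles) == 0:
                return  # nothing to crawl
            request = BrowserRequest()
            request.start_chrome(new_url_titles, self.content_timeout,
                                 self.max_pool_size, self.log_path,
                                 None, callback=self.parse)
        finally:
            # Runs on the early return, on success, and on any exception.
            self.doraemon.recoveryConcurrency(self.concurrency_file_spider,
                                              self.max_concurrency_spider)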
Example #3
    def start_requests(self):
        print 'Start requests'
        new_urls = []
        all_finished_id = list(self.doraemon.getAllHasSet(self.finished_ids))
        txt_path = '/home/dev/Data/rsyncData/gongzhonghao_test.txt'
        gonzhonghao = self.file.readFromTxt(txt_path)
        keys = gonzhonghao.split('\n')

        for key in keys:
            key = key.strip()
            if not key:
                continue  # skip blank lines left over from the split
            if key not in all_finished_id:
                tmp_url = "https://chuansongme.com/account/{0}".format(key)
                new_urls.append([tmp_url, key])

        if len(new_urls) == 0:
            print 'No more urls.'
            return

        request = BrowserRequest()
        request.start_chrome(new_urls, 2, self.log_path, None, callback=self.parse)
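`readFromTxt` returns the whole file as one string, so splitting on '\n' can yield blank or duplicate keys (a trailing newline alone produces an empty entry). A small helper that normalizes the list might look like this; `load_keys` is an illustrative name, not part of the project:

    def load_keys(text):
        # Hypothetical helper: stripped, non-empty, de-duplicated keys,
        # preserving first-seen order.
        seen = set()
        keys = []
        for line in text.split('\n'):
            key = line.strip()
            if key and key not in seen:
                seen.add(key)
                keys.append(key)
        return keys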
Example #4
    def start(self, isdebug=False):
        # Respect the scheduler unless running in debug mode.
        if self.doraemon.isCamelReadyToRun(
                self.settings) is False and isdebug is False:
            message5 = 'It is not ready to run for {0}'.format(self.name)
            print message5
            return
        message6 = 'Start {0} requests'.format(self.name)
        self.file.logger(self.log_path, message6)
        print message6

        new_urls = []
        content = self.file.readFromTxt(self.urls)
        url_list = content.split('\n')

        # Keep only non-empty lines; each entry is a [url, title] pair
        # with an empty title.
        for url in url_list:
            if self.doraemon.isEmpty(url) is False:
                new_urls.append([url, ''])

        if len(new_urls) == 0:
            print 'No url.'
            return
        request = BrowserRequest()
        # `content` is reused here for the crawl results.
        content = request.start_chrome(new_urls,
                                       self.url_timeout,
                                       self.max_pool_size,
                                       self.log_path,
                                       None,
                                       callback=self.parse)
        # Release the concurrency slot and free the crawl's allocations.
        self.doraemon.recoveryConcurrency(self.concurrency_file,
                                          self.max_concurrency)
        message7 = 'End for {0} requests of {1}.'.format(
            str(len(content)), self.name)
        self.file.logger(self.log_path, message7)
        print message7

        del new_urls, content, url_list, request
        gc.collect()
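This variant filters lines through `self.doraemon.isEmpty` rather than an inline check. The exact semantics of that helper are not shown here; a plausible stand-in, treating None and whitespace-only strings as empty, would be:

    def is_empty(value):
        # Hypothetical stand-in for doraemon.isEmpty.
        return value is None or value.strip() == ''

Note also that the 'No url.' early return does not release the concurrency slot the way the empty-url path in Example #2 does; whether that is intentional is not clear from the snippet.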
Example #5
    def start_requests(self):
        print 'Start requests'
        new_urls = []
        all_finished_id = list(self.doraemon.getAllHasSet(self.finished_ids))
        txt_path = '/home/dev/Data/rsyncData/test/xueqiu.txt'
        gonzhonghao = self.file.readFromTxt(txt_path)
        keys = gonzhonghao.split('\n')

        for key in keys:
            name = key.strip()
            if not name:
                continue  # skip blank lines left over from the split
            if key not in all_finished_id:
                tmp_url = "https://xueqiu.com/k?q={0}".format(name)
                new_urls.append([tmp_url, name])

        if len(new_urls) == 0:
            print 'No more urls.'
            return

        request = BrowserRequest()
        request.start_chrome(new_urls,
                             5,
                             self.log_path,
                             None,
                             callback=self.parse)
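The key is interpolated into the query string as-is. If keys can contain spaces or non-ASCII characters (likely for account names), they should be percent-encoded first. A hedged sketch, assuming the keys are UTF-8 byte strings under Python 2 and using an illustrative helper name:

    import urllib

    def build_search_url(name):
        # Percent-encode the key so reserved and non-ASCII characters
        # survive in the query string.
        return 'https://xueqiu.com/k?q={0}'.format(urllib.quote(name))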
Example #6
    def start_requests(self):
        print 'Start requests'
        new_urls = []
        all_finished_id = list(self.doraemon.getAllHasSet(self.finished_ids))
        txt_path = '/home/dev/Data/rsyncData/test/woshipm_receptor.txt'
        gonzhonghao = self.file.readFromTxt(txt_path)
        keys = gonzhonghao.split('\n')

        for key in keys:
            key = key.strip()
            if not key:
                continue  # skip blank lines left over from the split
            if key not in all_finished_id:
                tmp_url = "http://www.woshipm.com/search-posts?k={0}".format(key)
                new_urls.append([tmp_url, key])
            else:
                # Already crawled: refresh its entry in the finished set.
                print 'Finished or no data for {0}'.format(key)
                self.doraemon.hashSet(self.finished_ids, key, key)

        if len(new_urls) == 0:
            print 'No more urls.'
            return

        request = BrowserRequest()
        request.start_chrome(new_urls, 2, self.log_path, None, callback=self.parse)
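This loop differs from the next example only in the URL template. A hedged refactor of the shared loop, with illustrative names (the else-branch bookkeeping is omitted here), could be:

    def build_new_urls(keys, finished_ids, url_template):
        # Keep keys that are not yet finished and pair each with its URL.
        # finished_ids can be any container supporting `in`; a set is fastest.
        new_urls = []
        for key in keys:
            key = key.strip()
            if not key:
                continue  # blank line from the split
            if key not in finished_ids:
                new_urls.append([url_template.format(key), key])
        return new_urls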
Example #7
    def start_requests(self):
        print 'Start requests'
        new_urls = []
        all_finished_id = list(self.doraemon.getAllHasSet(self.finished_ids))
        txt_path = '/home/dev/Data/rsyncData/test/feng_receptor.txt'
        gonzhonghao = self.file.readFromTxt(txt_path)
        keys = gonzhonghao.split('\n')

        for key in keys:
            key = key.strip()
            if not key:
                continue  # skip blank lines left over from the split
            if key not in all_finished_id:
                tmp_url = "https://so.v.ifeng.com/websearch/ifeng-search-server/sub/websearch?k={0}&page=1&distinct=1&n=10&hl=1&os=ios&gv=6.2.5&uid=70b6a1d8f6c64618bf9dfa092fc4e34c&callback=getData".format(key)
                new_urls.append([tmp_url, key])
            else:
                # Already crawled: refresh its entry in the finished set.
                print 'Finished or no data for {0}'.format(key)
                self.doraemon.hashSet(self.finished_ids, key, key)

        if len(new_urls) == 0:
            print 'No more urls.'
            return

        request = BrowserRequest()
        request.start_chrome(new_urls, 5, self.log_path, None, callback=self.parse)
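The ifeng URL ends in `callback=getData`, so the endpoint returns JSONP rather than plain JSON; the `parse` callback presumably strips the wrapper before decoding. A minimal sketch of that step (the response format is inferred from the URL, not confirmed by the snippet):

    import json
    import re

    def strip_jsonp(text):
        # Unwrap "getData({...})" to plain JSON and decode it.
        match = re.match(r'^\s*\w+\((.*)\)\s*;?\s*$', text, re.S)
        return json.loads(match.group(1)) if match else None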