Example 1
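A receptor that reads (title, url) rows from a CSV file, skips rows already recorded in a finished-IDs hash set, opens each remaining URL in Chrome through BrowserRequest, and stores the extracted publish time in MongoDB. The snippet assumes lxml is imported; Settings, FileIOMiddleware, Doraemon, and BrowserRequest are project-local helpers, so the module paths sketched below are hypothetical:

from lxml import etree

from .settings import Settings          # hypothetical paths: these four are
from .file_io import FileIOMiddleware   # project-local helpers whose real
from .doraemon import Doraemon          # modules are not shown in this
from .browser import BrowserRequest     # listing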
class FengReceptorContent(object):
    def __init__(self):
        self.settings = Settings()
        self.getSettings()
        self.file = FileIOMiddleware()
        self.doraemon = Doraemon()
        self.doraemon.createFilePath(self.work_path_prd2)

    def getSettings(self):
        self.work_path_prd2 = "/home/dev/Data/rsyncData/test/"
        self.mongo = "feng_receptor_content"
        self.finished_ids = "feng_receptor_content"
        self.log_path = "/home/dev/Data/rsyncData/test/"

    def parse(self, response):
        current_url = response['response'].current_url.encode('gbk')
        print 'Start to parse: {0}'.format(current_url)
        html = etree.HTML(response['response'].page_source)
        key = response['request_title'].strip()
        href_contents = html.xpath(".//*[contains(@class, 'newLine-4rktaWav')]")
        if len(href_contents) == 0:
            print 'No data for: {0}'.format(key)
            return
        texts = href_contents[0].xpath(
            ".//*[contains(@class, 'time-RyJJYUOX')]/text()")
        time_source = ''.join(texts).strip()
        self.doraemon.hashSet(self.finished_ids, current_url, current_url)
        data = {'id': key, 'url': current_url, 'date': time_source}
        print 'Start to store mongo {0}'.format(data['url'])
        self.doraemon.storeMongodb(self.mongo, data)
        print 'Finished for {0}'.format(key)

    def start_requests(self):
        print 'Start requests'
        new_urls = []
        all_finished_id = list(self.doraemon.getAllHasSet(self.finished_ids))
        file_path = '/home/dev/Data/rsyncData/test/feng_receptor.csv'
        items = self.file.readFromCSV(file_path)
        items.pop(0)  # drop the CSV header row

        for item in items:
            name = item[0].strip()
            if name and name not in all_finished_id:
                url = item[1]
                new_urls.append([url, name])

        if len(new_urls) == 0:
            print 'No more urls.'
            return

        request = BrowserRequest()
        request.start_chrome(new_urls,
                             2,
                             self.log_path,
                             None,
                             callback=self.parse)
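All the receptors in this listing are driven the same way; a minimal usage sketch, assuming the dependencies above are importable:

receptor = FengReceptorContent()
receptor.start_requests()  # opens Chrome on each pending URL; parse() runs as the callback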
Example 2
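The same skeleton pointed at chuansongme.com: account names are read one per line from a text file and turned into account-page URLs, and parse() only checks that the page rendered a result list before recording the account as finished.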
class ChuansongmeReceptor(object):

    def __init__(self):
        self.settings = Settings()
        self.getSettings()
        self.file = FileIOMiddleware()
        self.doraemon = Doraemon()
        self.doraemon.createFilePath(self.work_path_prd2)

    def getSettings(self):
        self.work_path_prd2 = "/home/dev/Data/rsyncData/"
        self.mongo = "gongzhonghao_test"
        self.finished_ids = "gongzhonghao_test"
        self.log_path = "/home/dev/Data/rsyncData/"

    def parse(self, response):
        current_url = response['response'].current_url.encode('gbk')
        print 'Start to parse: {0}'.format(current_url)
        html = etree.HTML(response['response'].page_source)
        key = response['request_title'].strip()
        # Search the whole document; a path relative to the <html> root's
        # direct children would miss items nested deeper in the page.
        href_item = html.xpath(".//*[contains(@class, 'pagedlist_item')]")
        if len(href_item) == 0:
            print 'No data for: {0}'.format(key)
            return
        self.doraemon.hashSet(self.finished_ids, key, key)
        data = {
            'id': key,
            'url': current_url
        }
        print 'Start to store mongo {0}'.format(data['url'])
        self.doraemon.storeMongodb(self.mongo, data)
        print 'Finished for {0}'.format(key)

    def start_requests(self):
        print 'Start requests'
        new_urls = []
        all_finished_id = list(self.doraemon.getAllHasSet(self.finished_ids))
        txt_path = '/home/dev/Data/rsyncData/gongzhonghao_test.txt'
        gongzhonghao = self.file.readFromTxt(txt_path)
        keys = gongzhonghao.split('\n')

        for key in keys:
            key = key.strip()
            if key and key not in all_finished_id:
                tmp_url = "https://chuansongme.com/account/{0}".format(key)
                new_urls.append([tmp_url, key])

        if len(new_urls) == 0:
            print 'No more urls.'
            return

        request = BrowserRequest()
        request.start_chrome(new_urls, 2, self.log_path, None, callback=self.parse)
Example 3
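A search-driven receptor for xueqiu.com: each name is submitted to the site search, and a profile URL is stored only when a user card's title matches the name exactly.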
class XueqiuReceptor(object):
    def __init__(self):
        self.settings = Settings()
        self.getSettings()
        self.file = FileIOMiddleware()
        self.doraemon = Doraemon()
        self.doraemon.createFilePath(self.work_path_prd2)

    def getSettings(self):
        self.work_path_prd2 = "/home/dev/Data/rsyncData/test/"
        self.mongo = "xueqiu_test"
        self.finished_ids = "xueqiu_test"
        self.log_path = "/home/dev/Data/rsyncData/test/"

    def parse(self, response):
        current_url = response['response'].current_url.encode('gbk')
        print 'Start to parse: {0}'.format(current_url)
        html = etree.HTML(response['response'].page_source)
        key = response['request_title'].strip()
        href_contents = html.xpath(
            ".//*[contains(@class, 'search__user__card__content')]")
        if len(href_contents) == 0:
            print 'No data for: {0}'.format(key)
            return
        for item in href_contents:
            href = item.xpath(".//*[contains(@class, 'user-name')]/@href")
            title_content = item.xpath(
                ".//*[contains(@class, 'user-name')]//span/text()")
            title = "".join(title_content).strip()
            if len(href) > 0 and title == key:
                url = "https://xueqiu.com/u{0}".format(href[0])
                self.doraemon.hashSet(self.finished_ids, url, url)
                data = {'id': key, 'url': url}
                print 'Start to store mongo {0}'.format(data['url'])
                self.doraemon.storeMongodb(self.mongo, data)
                print 'Finished for {0}'.format(key)

    def start_requests(self):
        print 'Start requests'
        new_urls = []
        all_finished_id = list(self.doraemon.getAllHasSet(self.finished_ids))
        txt_path = '/home/dev/Data/rsyncData/test/xueqiu.txt'
        raw_keys = self.file.readFromTxt(txt_path)
        keys = raw_keys.split('\n')

        for key in keys:
            name = key.strip()
            if name and name not in all_finished_id:
                tmp_url = "https://xueqiu.com/k?q={0}".format(name)
                new_urls.append([tmp_url, name])

        if len(new_urls) == 0:
            print 'No more urls.'
            return

        request = BrowserRequest()
        request.start_chrome(new_urls,
                             5,
                             self.log_path,
                             None,
                             callback=self.parse)
Example 4
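A search-driven receptor for woshipm.com that additionally validates candidate hrefs against an author-profile pattern (/u/ followed by digits) before storing them. This snippet also assumes the standard-library re module is imported.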
class WoshipmReceptor(object):

    def __init__(self):
        self.settings = Settings()
        self.getSettings()
        self.file = FileIOMiddleware()
        self.doraemon = Doraemon()
        self.doraemon.createFilePath(self.work_path_prd2)

    def getSettings(self):
        self.work_path_prd2 = "/home/dev/Data/rsyncData/test/"
        self.mongo = "whoispm_receptor"
        self.finished_ids = "woshipm_receptor"
        self.log_path = "/home/dev/Data/rsyncData/test/"
        # Author-profile paths look like /u/12345; require at least one digit.
        self.regx = re.compile(r"/u/[0-9]+")

    def parse(self, response):
        current_url = response['response'].current_url.encode('gbk')
        print 'Start to parse: {0}'.format(current_url)
        html = etree.HTML(response['response'].page_source)
        key = response['request_title'].strip()
        # Search the whole document: the root's direct children are only
        # <head> and <body>, so "./a" would never match an anchor.
        href_contents = html.xpath(".//a")
        if len(href_contents) == 0:
            print 'No data for: {0}'.format(key)
            return
        for item in href_contents:
            href = item.xpath("@href")
            title_content = item.xpath(".//text()")
            title = "".join(title_content).strip()
            if len(href) > 0 and title == key:
                isValidUrl = self.regx.match(href[0])
                if isValidUrl is None:
                    print 'Invalid url for not match: {0}'.format(href[0])
                    continue
                url = "http://www.woshipm.com{0}".format(href[0])
                self.doraemon.hashSet(self.finished_ids, url, url)
                data = {
                    'id': key,
                    'url': url
                }
                print 'Start to store mongo {0}'.format(data['url'])
                self.doraemon.storeMongodb(self.mongo, data)
                print 'Finished for {0}'.format(key)

    def start_requests(self):
        print 'Start requests'
        new_urls = []
        all_finished_id = list(self.doraemon.getAllHasSet(self.finished_ids))
        txt_path = '/home/dev/Data/rsyncData/test/woshipm_receptor.txt'
        raw_keys = self.file.readFromTxt(txt_path)
        keys = raw_keys.split('\n')

        for key in keys:
            key = key.strip()
            if not key:
                continue
            if key not in all_finished_id:
                tmp_url = "http://www.woshipm.com/search-posts?k={0}".format(key)
                new_urls.append([tmp_url, key])
            else:
                print 'Finished or no data for {0}'.format(key)
                self.doraemon.hashSet(self.finished_ids, key, key)

        if len(new_urls) == 0:
            print 'No more urls.'
            return

        request = BrowserRequest()
        request.start_chrome(new_urls, 2, self.log_path, None, callback=self.parse)
Example 5
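A receptor that queries ifeng.com's search API directly. The endpoint returns JSONP wrapped in a getData(...) callback, which parse() unwraps before decoding; this snippet additionally assumes the standard-library time and json modules are imported.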
class FengReceptor(object):

    def __init__(self):
        self.settings = Settings()
        self.getSettings()
        self.file = FileIOMiddleware()
        self.doraemon = Doraemon()
        self.doraemon.createFilePath(self.work_path_prd2)

    def getSettings(self):
        self.work_path_prd2 = "/home/dev/Data/rsyncData/test/"
        self.mongo = "feng_receptor"
        self.finished_ids = "feng_receptor"
        self.log_path = "/home/dev/Data/rsyncData/test/"

    def parse(self, response):
        time.sleep(1)
        current_url = response['response'].current_url.encode('gbk')
        print 'Start to parse: {0}'.format(current_url)
        key = response['request_title'].strip()
        # The response is JSONP rendered by Chrome as a plain-text page.
        # Strip the getData( ... ) wrapper and the 21 trailing wrapper
        # characters, then decode the JSON payload instead of eval-ing
        # remote data (json.loads also handles null natively).
        raw = response['response'].page_source.encode('utf-8')
        payload = raw[raw.find('(') + 1:-21]
        dics = json.loads(payload)
        if len(dics['items']) == 0:
            print 'No data for: {0}'.format(key)
            self.doraemon.hashSet(self.finished_ids, key, key)
            return
        for item in dics['items']:
            # Strip the <em> highlight markup and escaped slashes that the
            # search API injects into matching names.
            name = item['name'].replace('<', '').replace('em>', '').replace('\\/', '')
            author_id = item['id']
            if len(author_id) > 0 and name == key:
                url = "https://feng.ifeng.com/author/{0}".format(author_id)
                self.doraemon.hashSet(self.finished_ids, key, key)
                data = {
                    'id': key,
                    'url': url
                }
                print 'Start to store mongo {0}'.format(data['url'])
                self.doraemon.storeMongodb(self.mongo, data)
                print 'Finished for {0}'.format(key)

    def start_requests(self):
        print 'Start requests'
        new_urls = []
        all_finished_id = list(self.doraemon.getAllHasSet(self.finished_ids))
        txt_path = '/home/dev/Data/rsyncData/test/feng_receptor.txt'
        raw_keys = self.file.readFromTxt(txt_path)
        keys = raw_keys.split('\n')

        for key in keys:
            key = key.strip()
            if not key:
                continue
            if key not in all_finished_id:
                tmp_url = "https://so.v.ifeng.com/websearch/ifeng-search-server/sub/websearch?k={0}&page=1&distinct=1&n=10&hl=1&os=ios&gv=6.2.5&uid=70b6a1d8f6c64618bf9dfa092fc4e34c&callback=getData".format(key)
                new_urls.append([tmp_url, key])
            else:
                print 'Finished or no data for {0}'.format(key)
                self.doraemon.hashSet(self.finished_ids, key, key)

        if len(new_urls) == 0:
            print 'No more urls.'
            return

        request = BrowserRequest()
        request.start_chrome(new_urls, 5, self.log_path, None, callback=self.parse)
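All five classes share the same __init__/getSettings/start_requests skeleton; a minimal sketch of a common base they could inherit from (the BaseReceptor name is illustrative, not part of the original project):

class BaseReceptor(object):
    def __init__(self):
        self.settings = Settings()
        self.getSettings()  # subclasses set work paths, mongo and finished-ids names
        self.file = FileIOMiddleware()
        self.doraemon = Doraemon()
        self.doraemon.createFilePath(self.work_path_prd2)

Each receptor would then only define getSettings(), parse(), and the URL-building loop in start_requests().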