    def get_items(self, path, depth=0):
        if path and path[-1] == '/':
            path = path[:-1]
        if self.remote_crawl_depth == -1 or depth <= self.remote_crawl_depth:

            item, subitems = self.get_remote_item(path)

            if item is None:
                logger.warn(':: Skipping -> %s. No remote data.' % path)
                return

            if item.startswith('ERROR'):
                logger.error(
                    "Could not get item '%s' from remote. Got %s." %
                    (path, item))
                return

            try:
                item = json.loads(item)
            except json.JSONDecodeError:
                logger.error(
                    "Could not decode item from path '%s' as JSON." % path)
                return
            logger.info(':: Crawling %s' % item['_path'])

            # item['_path'] is relative to the domain root; we need it
            # relative to the Plone root
            remote_url = self.remote_url
            _, _, remote_path, _, _, _ = urlparse.urlparse(remote_url)
            item['_path'] = item['_path'][len(remote_path):]
            if item['_path'].startswith('/'):
                item['_path'] = item['_path'][1:]

            if item['_type'] == "Plone Site":
                pass
            else:
                yield item

            if subitems.startswith('ERROR'):
                logger.error(
                    "Could not get subitems for '%s'. Got %s." %
                    (path, subitems))
                return

            for subitem_id in json.loads(subitems):
                subitem_path = path + '/' + subitem_id

                if subitem_path[len(self.remote_path):]\
                        in self.remote_skip_path:
                    logger.info(':: Skipping -> ' + subitem_path)
                    continue

                for subitem in self.get_items(subitem_path, depth + 1):
                    yield subitem
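This variant strips the remote site's URL path prefix from each item's _path before yielding it, so paths end up relative to the Plone root. A standalone sketch of that slicing, written with the Python 3 urllib.parse spelling (the URL below is invented for illustration; the urlparse.urlparse call above is the same function under its older module name):

import urllib.parse

remote_url = 'http://example.com:8080/Plone'  # hypothetical remote site
_, _, remote_path, _, _, _ = urllib.parse.urlparse(remote_url)
# remote_path == '/Plone'

item_path = '/Plone/news/item-1'            # path as reported by the remote
item_path = item_path[len(remote_path):]    # -> '/news/item-1'
if item_path.startswith('/'):
    item_path = item_path[1:]               # -> 'news/item-1'
print(item_path)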
Example #2
    def get_items(self, path, depth=0):
        if path and path[-1] == '/':
            path = path[:-1]

        if self.remote_crawl_depth == -1 or depth <= self.remote_crawl_depth:

            item, subitems = self.get_remote_item(path)

            if item is None:
                logger.warn(':: Skipping -> %s. No remote data.' % path)
                return

            if item.startswith('ERROR'):
                logger.error("Could not get item '%s' from remote. Got %s." % (path, item))
                return

            item = simplejson.loads(item)
            logger.info(':: Crawling %s' % item['_path'])

            if self.local_path:
                item['_path'] = self.local_path + item['_path'][len(self.remote_path):]
            # item['_path'] is relative to the domain root; we need it
            # relative to the Plone root
#            remote_url = self.remote_url
#            _,_,remote_path,_,_,_ = urlparse.urlparse(remote_url)
#            item['_path'] = item['_path'][len(remote_path):]
#            if item['_path'].startswith('/'):
#                item['_path'] = item['_path'][1:]

            if item['_type'] == "Plone Site":
                pass
            else:
                yield item

            if subitems.startswith('ERROR'):
                logger.error("Could not get subitems for '%s'. Got %s." % (path, subitems))
                return

            for subitem_id in simplejson.loads(subitems):
                subitem_path = path + '/' + subitem_id

                if subitem_path[len(self.remote_path):] in self.remote_skip_path:
                    logger.info(':: Skipping -> ' + subitem_path)
                    continue

                if self.remote_catalog_query:
                    if subitem_path not in self.remote_ok_path:
                        logger.info(':: Skipping (2) -> ' + subitem_path)
                        continue

                for subitem in self.get_items(subitem_path, depth + 1):
                    yield subitem
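The local_path remapping in this variant is plain prefix surgery on _path: the remote prefix is cut off and the local one glued on. The same logic as a standalone function (the function name and the paths are made up for illustration):

def remap_path(item_path, remote_path, local_path):
    # Swap the remote prefix for the local one, e.g.
    # '/old-site/news/a' -> '/new-site/news/a'.
    return local_path + item_path[len(remote_path):]

assert remap_path('/old-site/news/a', '/old-site', '/new-site') == '/new-site/news/a'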
Example #3
    def get_items(self, path, depth=0):
        if path and path[-1] == '/':
            path = path[:-1]
        if self.remote_crawl_depth == -1 or depth <= self.remote_crawl_depth:

            item, subitems = self.get_remote_item(path)

            if item is None:
                logger.warn(':: Skipping -> %s. No remote data.' % path)
                return

            if item.startswith(b'ERROR'):
                logger.error("Could not get item '%s' from remote. Got %s." %
                             (path, item))
                return

            try:
                item = json.loads(item)
            except json.JSONDecodeError:
                logger.error("Could not decode item from path '%s' as JSON." %
                             path)
                return
            logger.info(':: Crawling %s' % item['_path'])

            # item['_path'] is relative to the domain root; we need it
            # relative to the Plone root
            remote_url = self.remote_url
            _, _, remote_path, _, _, _ = urllib.parse.urlparse(remote_url)
            item['_path'] = item['_path'][len(remote_path):]
            if item['_path'].startswith('/'):
                item['_path'] = item['_path'][1:]

            if item['_type'] == "Plone Site":
                pass
            else:
                yield item

            if subitems.startswith(b'ERROR'):
                logger.error("Could not get subitems for '%s'. Got %s." %
                             (path, subitems))
                return

            for subitem_id in json.loads(subitems):
                subitem_path = path + '/' + subitem_id

                if subitem_path[len(self.remote_path):]\
                        in self.remote_skip_path:
                    logger.info(':: Skipping -> ' + subitem_path)
                    continue

                for subitem in self.get_items(subitem_path, depth + 1):
                    yield subitem
    def __iter__(self):
        for item in self.previous:
            yield item
        offset = int(self.options.get("offset", "0"))
        limit = int(self.options.get("limit", "0"))
        counter = 0
        if hasattr(self.transmogrifier, "jsonmigrator_offset"):
            # Truncate results when live importing: inject the
            # parameters below into the transmogrifier object if you
            # are doing an interactive import of small chunks at the
            # Python prompt.
            print("total results: %d" % len(self.item_paths))
            hard_limit = (self.transmogrifier.jsonmigrator_offset +
                          self.transmogrifier.jsonmigrator_limit)
            self.item_paths = self.item_paths[
                self.transmogrifier.jsonmigrator_offset:hard_limit]
            logger.warn("Migrating %d items from position %s" %
                            (self.transmogrifier.jsonmigrator_limit,
                             self.transmogrifier.jsonmigrator_offset))

        for path in self.item_paths:
            skip = False
            if not counter % 100:
                print(counter)
            counter += 1
            if counter < offset:
                logger.debug("Skipping item n.# %d at %s" % (counter, path))
                continue
            if limit and counter > (offset + limit):
                logger.debug("Post skipping item n.# %d at %s" % (counter, path))
                continue
            for skip_path in self.remote_skip_paths:
                if path.startswith(skip_path):
                    skip = True
            if not skip:
                item = self.get_remote_item(path)
                if item:
                    item['_path'] = item['_path'][self.site_path_length:]
                    yield item
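The offset/limit handling in __iter__ supports chunked interactive imports: set the two attributes checked with hasattr() on the transmogrifier object before running the pipeline. A rough sketch of such a session, assuming the collective.transmogrifier calling convention ('portal' and the pipeline id are placeholders):

from collective.transmogrifier.transmogrifier import Transmogrifier

transmogrifier = Transmogrifier(portal)    # 'portal' from a debug prompt
transmogrifier.jsonmigrator_offset = 200   # start at item #200
transmogrifier.jsonmigrator_limit = 100    # import 100 items, then stop
transmogrifier('my_import_pipeline')       # hypothetical pipeline id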