Example #1
    def import_review_history(self, content, wf_id, review_history, **kw):
        """Change the workflow state of an object
        @param content: Content obj which state will be changed
        @param review_history: Review history of the object
        @param wf_id: workflow name
        @param kw: overrides for same-named keys of the state mapping
        @return: None
        """

        portal_workflow = api.get_tool('portal_workflow')

        # Find the workflow definition with the given id among the
        # workflows registered for this content type
        for wf_def in portal_workflow.getWorkflowsFor(content):
            if wf_id == wf_def.getId():
                break
        else:
            # Bail out instead of falling through with an unrelated
            # (or undefined) wf_def
            logger.error("%s: Cannot find workflow id %s" % (content, wf_id))
            return

        for rh in sorted(review_history, key=lambda k: k['time']):
            if not self.review_history_imported(content, rh, wf_def):
                portal_workflow.setStatusOf(wf_id, content,
                                            self.to_review_history_format(rh))

        wf_def.updateRoleMappingsFor(content)
        return
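
For context, a hedged usage sketch follows. The `importer` instance, workflow id and history entry are illustrative assumptions, not part of the original code; `to_review_history_format` above is expected to normalize each entry.

    # Illustrative only: assumes a SENAITE/Plone environment in which
    # `importer` provides import_review_history and `sample` is a content
    # object bound to the named workflow.
    from DateTime import DateTime

    review_history = [{
        "action": "submit",                # transition id (assumed)
        "actor": "admin",                  # user who performed it
        "review_state": "to_be_verified",  # resulting state (assumed)
        "comments": "",
        "time": DateTime(),                # when the transition happened
    }]
    importer.import_review_history(sample, "bika_ar_workflow", review_history)
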
Example #2
 def _parents_fetched(self, item):
     """
     If data was fetched with portal type filter, this method will be used
     to fill the missing parents for fetched objects.
     :return: True if ALL parents are fetched
     """
     # Never fetch parents of unnecessary objects
     if not utils.has_valid_portal_type(item):
         return False
     parent_path = item.get("parent_path")
     # Skip if the parent is the portal object
     if self.is_portal_path(parent_path):
         return True
     # Skip if already exists
     if self.sh.find_unique(REMOTE_PATH, parent_path):
         return True
     logger.debug("Inserting missing parent: {}".format(parent_path))
     parent = self.get_first_item(item.get("parent_url"))
     if not parent:
         logger.error("Cannot fetch parent info: {} ".format(parent_path))
         return False
     par_dict = utils.get_soup_format(parent)
     self.sh.insert(par_dict)
     # Recursively import grandparents too
     return self._parents_fetched(parent)
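
The recursion is easier to follow without the Plone machinery. A minimal, self-contained sketch of the same parent-chain walk (all names invented for illustration):

    def parents_fetched(path, store, fetch, portal_path="/senaite"):
        """Walk up the parent chain, inserting missing ancestors into
        `store`; return True once the chain reaches the portal root."""
        parent = "/".join(path.split("/")[:-1])
        if parent in ("", portal_path):   # reached the portal: all fetched
            return True
        if parent in store:               # ancestor already known: done
            return True
        info = fetch(parent)              # stand-in for the remote lookup
        if info is None:
            return False                  # upstream knows no such parent
        store[parent] = info
        return parents_fetched(parent, store, fetch, portal_path)

    def fake_fetch(path):                 # fake remote that always answers
        return {"path": path}

    store = {}
    print(parents_fetched("/senaite/clients/client-1/sample-1", store, fake_fetch))
    # True; store now holds the two missing ancestors
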
Example #3
    def _handle_obj(self, row, handle_dependencies=True):
        """
        With the given dictionary:
            1. Creates the object's slug
            2. Creates and updates the object's dependencies (which means
               this _handle_obj function is called recursively for any
               dependency that has not been updated yet)
            3. Updates the object

        :param row: A row dictionary from the souper
        :type row: dict
        """
        r_uid = row.get(REMOTE_UID)
        try:
            if row.get("updated", "0") == "1":
                return True
            self._queue.append(r_uid)
            obj = self._do_obj_creation(row)
            if obj is None:
                logger.error('Object creation failed: {}'.format(row))
                return
            obj_data = self.get_json(r_uid, complete=True, workflow=True)
            if handle_dependencies:
                self._create_dependencies(obj, obj_data)
            self._update_object_with_data(obj, obj_data)
            self._set_object_permission(obj)
            self.sh.mark_update(r_uid)
            self._queue.remove(r_uid)
        except Exception as e:
            self._queue.remove(r_uid)
            logger.error('Failed to handle {} : {} '.format(row, str(e)))
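
The `_queue` guard above is what keeps circular dependencies from recursing forever. A self-contained sketch of the pattern, with invented names:

    def handle(uid, records, queue):
        if records[uid].get("updated") == "1":
            return True                    # already processed
        queue.append(uid)                  # mark as "in progress"
        try:
            for dep in records[uid].get("deps", []):
                if dep not in queue:       # skip deps already in flight
                    handle(dep, records, queue)
            records[uid]["updated"] = "1"  # the real update would go here
        finally:
            queue.remove(uid)              # always release the guard
        return True

    records = {"a": {"deps": ["b"]}, "b": {"deps": ["a"]}}  # a cycle
    handle("a", records, [])               # terminates despite the cycle
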
Example #4
 def mark_update(self, remote_uid):
     """
     Marks the record with the given remote_uid as updated.
     """
     recs = [r for r in self.soup.query(Eq(REMOTE_UID, remote_uid))]
     if not recs:
         logger.error("Could not find any record with remote_uid: '{}'"
                      .format(remote_uid))
         return False
     recs[0].attrs[UPDATED] = "1"
     self.soup.reindex([recs[0]])
     return True
Example #5
    def _import_data(self):
        """
        For each UID from the fetched data, creates and updates objects
        step by step.
        :return:
        """
        logger.info("*** IMPORT DATA STARTED: {} ***".format(self.domain_name))

        self.sh = SoupHandler(self.domain_name)
        self.uids_to_reindex = []
        storage = self.get_storage()
        ordered_uids = storage["ordered_uids"]
        total_object_count = len(ordered_uids)
        start_time = datetime.now()

        for item_index, r_uid in enumerate(ordered_uids):
            row = self.sh.find_unique(REMOTE_UID, r_uid)
            logger.debug("Handling: {} ".format(row[REMOTE_PATH]))
            self._handle_obj(row)

            # Handling the object may have created or updated several
            # related objects (dependencies). Reindex them now.
            self.uids_to_reindex = list(set(self.uids_to_reindex))
            for uid in self.uids_to_reindex:
                # It is possible that the object has a method (not a Field
                # in its Schema) which is used as an index and it fails.
                # TODO: Make sure reindexing won't fail!
                try:
                    obj = api.get_object_by_uid(uid)
                    obj.reindexObject()
                except Exception as e:
                    rec = self.sh.find_unique(LOCAL_UID, uid)
                    logger.error("Error while reindexing {} - {}".format(
                        rec, e))
            self._non_commited_objects += len(self.uids_to_reindex)
            self.uids_to_reindex = []

            # Commit the transaction if necessary
            if self._non_commited_objects > COMMIT_INTERVAL:
                transaction.commit()
                logger.info("Committed: {} / {} ".format(
                    self._non_commited_objects, total_object_count))
                self._non_commited_objects = 0

            # Log progress every 50 imported objects
            utils.log_process(task_name="Data Import",
                              started=start_time,
                              processed=item_index + 1,
                              total=total_object_count,
                              frequency=50)
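
The commit batching is the part worth keeping in mind: work is flushed every COMMIT_INTERVAL objects rather than per object. A self-contained sketch, where `do_import` and `commit` are stand-ins for the real work and for `transaction.commit()`:

    COMMIT_INTERVAL = 1000                 # assumed value, for illustration

    def import_all(uids, do_import, commit):
        pending = 0
        for uid in uids:
            do_import(uid)
            pending += 1
            if pending > COMMIT_INTERVAL:  # flush in batches, not per object
                commit()
                pending = 0
        commit()                           # flush the final partial batch
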
Example #6
 def update_by_remote_path(self, remote_path, **kwargs):
     """
     Update the row matched by the remote path column.
     :param remote_path: remote path of the record
     :param kwargs: columns and their values to be updated.
     """
     recs = [r for r in self.soup.query(Eq(REMOTE_PATH, remote_path))]
     if not recs:
         logger.error("Could not find any record with path: '{}'"
                      .format(REMOTE_PATH))
         return False
     for k, v in kwargs.iteritems():
         recs[0].attrs[k] = v
     self.soup.reindex([recs[0]])
     return True
Example #7
 def update_by_remote_uid(self, remote_uid, **kwargs):
     """
     Update the row by remote_uid column.
     :param remote_uid: UID of the object in the source
     :param kwargs: columns and their values to be updated.
     """
     recs = [r for r in self.soup.query(Eq(REMOTE_UID, remote_uid))]
     if not recs:
         logger.error("Could not find any record with remote_uid: '{}'"
                      .format(remote_uid))
         return False
     for k, v in kwargs.iteritems():
         recs[0].attrs[k] = v
     self.soup.reindex([recs[0]])
     return True
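
A hedged usage sketch for these soup helpers; the domain name, UID and path values below are invented placeholders:

    sh = SoupHandler("remote_lims")        # hypothetical domain name
    sh.update_by_remote_uid("uid-0001", local_path="/senaite/clients/c1")
    sh.update_by_remote_path("/remote/clients/c1", updated="0")
    sh.mark_update("uid-0001")             # shorthand for updated="1" (Example #4)
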
Example #8
 def get_json(self, url_or_endpoint, **kw):
     """Fetch the given url or endpoint and return a parsed JSON object
     """
     api_url = self.get_api_url(url_or_endpoint, **kw)
     logger.info("get_json::url={}".format(api_url))
     try:
         response = self.session.get(api_url)
     except Exception as e:
         message = "Could not connect to {} Please check.".format(api_url)
         logger.error(e)
         self.add_status_message(message, "error")
         return {}
     status = response.status_code
     if status != 200:
         message = "GET for {} ({}) returned Status Code {}. Please check.".format(
             url_or_endpoint, api_url, status)
         self.add_status_message(message, "warning")
         return {}
     return response.json()
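
The same guard pattern works outside of Plone. A self-contained variant using a plain requests session (the URL is an illustrative placeholder):

    import requests

    def get_json(url, session=None):
        session = session or requests.Session()
        try:
            response = session.get(url, timeout=30)
        except requests.RequestException as e:
            print("Could not connect to {}: {}".format(url, e))
            return {}
        if response.status_code != 200:
            print("GET {} returned Status Code {}".format(url, response.status_code))
            return {}
        return response.json()

    data = get_json("https://example.com/@@API/senaite/v1/search?limit=1")
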
Example #9
    def reindex_updated_objects(self):
        """
        Reindexes updated objects.
        """
        total = len(self.uids_to_reindex)
        logger.info(
            'Reindexing {} objects which were updated...'.format(total))
        indexed = 0
        for uid in self.uids_to_reindex:
            obj = api.get_object_by_uid(uid[0], None)
            if obj is None:
                logger.error("Object not found: {} ".format(uid[1]))
                continue
            obj.reindexObject()
            indexed += 1
            if indexed % 100 == 0:
                logger.info('{} objects reindexed, {} remaining'.format(
                    indexed, total - indexed))

        logger.info('Reindexing finished...')
Example #10
    def _create_object_slug(self, container, data, *args, **kwargs):
        """Create an content object slug for the given data
        """
        id = data.get("id")
        remote_path = data.get("remote_path")
        portal_type = data.get("portal_type")
        types_tool = api.get_tool("portal_types")
        fti = types_tool.getTypeInfo(portal_type)
        if not fti:
            self.skipped.append(remote_path)
            logger.error("Type Info not found for {}".format(portal_type))
            return None
        logger.debug("Creating {} with ID {} in parent path {}".format(
            portal_type, id, api.get_path(container)))

        if fti.product:
            obj = _createObjectByType(portal_type, container, id)
        else:
            # new style factory
            factory = getUtility(IFactory, fti.factory)
            obj = factory(id, *args, **kwargs)
            if hasattr(obj, '_setPortalTypeName'):
                obj._setPortalTypeName(fti.getId())
            # notifies ObjectWillBeAddedEvent, ObjectAddedEvent and
            # ContainerModifiedEvent
            container._setObject(id, obj)
            # Fetch the object again by its current id, as an event
            # handler might already have renamed it
            obj = container._getOb(obj.getId())

        # Be sure that Creation Flag is Cleared.
        if obj.checkCreationFlag():
            obj.unmarkCreationFlag()

        return obj
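
The "new style factory" branch looks up an IFactory utility by name and calls it. A minimal standalone sketch; the registration here is illustrative (normally done via ZCML), not taken from the original:

    from zope.component import getUtility, provideUtility
    from zope.component.factory import Factory
    from zope.component.interfaces import IFactory

    class Document(object):
        def __init__(self, id):
            self.id = id

    # Register a factory utility under a dotted name (ZCML would normally
    # do this with a <utility/> directive)
    provideUtility(Factory(Document), IFactory, name="example.document")

    factory = getUtility(IFactory, "example.document")
    obj = factory("front-page")           # same call shape as above
    print(obj.id)                         # front-page
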
Example #11
    def update_object_with_data(self, obj, data, domain):
        """Update an existing object with data
        """

        # get the storage and UID map
        storage = self.get_storage(domain=domain)
        uidmap = storage["uidmap"]
        # Proxy Fields must be set after their dependency objects have been
        # set. Thus, we store all the ProxyFields and set them at the end
        proxy_fields = []

        for fieldname, field in api.get_fields(obj).items():

            fm = IFieldManager(field)
            value = data.get(fieldname)

            # handle JSON data reference fields
            if isinstance(value, dict) and value.get("uid"):
                # dereference the referenced object
                value = self.dereference_object(value.get("uid"), uidmap)
            elif isinstance(value, (list, tuple)):
                for item in value:
                    # If the value is a list of JSON dicts describing
                    # objects, swap each remote UID for its local
                    # counterpart so the Field Managers can resolve it.
                    if isinstance(item, dict):
                        for k, v in item.iteritems():
                            if 'uid' in k:
                                local_uid = uidmap.get(v)
                                item[k] = local_uid

            # handle file fields
            if field.type in ("file", "image", "blob"):
                if data.get(fieldname) is not None:
                    fileinfo = data.get(fieldname)
                    url = fileinfo.get("download")
                    filename = fileinfo.get("filename")
                    data["filename"] = filename
                    response = requests.get(url)
                    value = response.content

            # Leave the Proxy Fields for later
            if isinstance(fm, ProxyFieldManager):
                proxy_fields.append({
                    'field_name': fieldname,
                    'fm': fm,
                    'value': value
                })
                continue

            logger.info("Setting value={} on field={} of object={}".format(
                repr(value), fieldname, api.get_id(obj)))
            try:
                fm.set(obj, value)
            except Exception:
                logger.error("Could not set field '{}' with value '{}'".format(
                    fieldname, value))

        # All reference fields are set. We can set the proxy fields now.
        for pf in proxy_fields:
            field_name = pf.get("field_name")
            fm = pf.get("fm")
            value = pf.get("value")
            logger.info("Setting value={} on field={} of object={}".format(
                repr(value), field_name, api.get_id(obj)))
            try:
                fm.set(obj, value)
            except Exception:
                logger.error("Could not set field '{}' with value '{}'".format(
                    field_name, value))

        # Set the workflow states
        wf_info = data.get("workflow_info", [])
        for wf_dict in wf_info:
            wf_id = wf_dict.get("workflow")
            review_history = wf_dict.get("review_history")
            self.import_review_history(obj, wf_id, review_history)

        # finally reindex the object
        self.uids_to_reindex.append([api.get_uid(obj), repr(obj)])
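
The two-pass handling of ProxyFields is the core idea here: fields whose setters depend on other fields are queued and applied only after everything else. A self-contained sketch with invented names:

    def apply_fields(obj, values, is_deferred):
        deferred = []
        for name, value in values.items():
            if is_deferred(name):          # e.g. a proxy/computed field
                deferred.append((name, value))
                continue
            setattr(obj, name, value)      # first pass: plain fields
        for name, value in deferred:       # second pass: dependent fields
            setattr(obj, name, value)
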
Example #12
    def _fetch_data(self, window=1000, overlap=10):
        """Fetch data from the uid catalog in the source URL
        :param window: number of elements to be retrieved with each query to
                       the catalog
        :type window: int
        :param overlap: overlap between windows
        :type overlap: int
        :return:
        """
        logger.info("*** FETCHING DATA: {} ***".format(self.domain_name))
        start_time = datetime.now()
        storage = self.get_storage()
        storage["ordered_uids"] = []
        ordered_uids = storage["ordered_uids"]
        self.sh = SoupHandler(self.domain_name)
        # Dummy query to get overall number of items in the specified catalog
        query = {
            "url_or_endpoint": "search",
            "catalog": 'uid_catalog',
            "limit": 1
        }
        if self.full_sync_types:
            types = list()
            types.extend(self.full_sync_types + self.prefixable_types +
                         self.update_only_types + self.read_only_types)
            query["portal_type"] = types
        cd = self.get_json(**query)
        # Knowing the catalog length compute the number of pages we will need
        # with the desired window size and overlap
        effective_window = window - overlap
        # If the JSON response carries an error message, or we got no
        # response at all, the 'count' key is missing.
        if not cd.get("count", None):
            error_message = "Error message: {}".format(
                cd.get('message', None) or '')
            logger.error(
                "A query to the JSON API returned an error. {}".format(
                    error_message))
            return

        number_of_pages = (cd["count"] / effective_window) + 1
        # Retrieve data from catalog in batches with size equal to window,
        # format it and insert it into the import soup
        for current_page in xrange(number_of_pages):
            # Advance by the effective window so that consecutive
            # batches overlap by `overlap` items
            start_from = current_page * effective_window
            query["limit"] = window
            query["b_start"] = start_from
            items = self.get_items_with_retry(**query)
            if not items:
                logger.error("Could not get items from {} to {}".format(
                    start_from, start_from + window))
                continue
            for item in items:
                # skip object or extract the required data for the import
                if not self.is_item_allowed(item):
                    continue
                data_dict = utils.get_soup_format(item)
                self.sh.insert(data_dict)
                ordered_uids.insert(0, data_dict[REMOTE_UID])
                if not self._parents_fetched(item):
                    logger.warning(
                        "Some parents are missing: {} ".format(item))

            utils.log_process(task_name="Pages fetched",
                              started=start_time,
                              processed=current_page + 1,
                              total=number_of_pages)

        logger.info("*** FETCHING DATA FINISHED: {} ***".format(
            self.domain_name))

        transaction.commit()
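
The paging arithmetic deserves a closer look: batches of size `window` advance by `window - overlap`, so consecutive batches share `overlap` items (a guard against catalog contents shifting between queries). A self-contained sketch:

    def batch_starts(count, window=1000, overlap=10):
        effective_window = window - overlap
        pages = (count // effective_window) + 1
        return [page * effective_window for page in range(pages)]

    print(batch_starts(2500))             # [0, 990, 1980]; each batch fetches 1000
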
Example #13
    def _create_dependencies(self, obj, data):
        """
        Creates and updates objects' dependencies if they are not in the queue.
        Dependencies are found as UIDs in object data.
        :param obj: an object to get dependencies created
        :param data: object data
        """

        dependencies = []

        for fieldname, field in api.get_fields(obj).items():

            if fieldname in self.fields_to_skip:
                continue

            value = data.get(fieldname)

            if isinstance(value, dict) and value.get("uid"):
                dependencies.append(value.get("uid"))
            elif isinstance(value, (list, tuple)):
                for item in value:
                    if isinstance(item, dict):
                        for k, v in item.iteritems():
                            if 'uid' in k:
                                dependencies.append(v)

        logger.debug("Dependencies of {} are : {} ".format(
            repr(obj), dependencies))
        dependencies = list(set(dependencies))
        for r_uid in dependencies:
            dep_row = self.sh.find_unique(REMOTE_UID, r_uid)
            if dep_row is None:
                # If dependency doesn't exist in fetched data table,
                # just try to create its object for the first time
                dep_item = self.get_json(r_uid)
                if not dep_item:
                    logger.error(
                        "Remote UID not found in fetched data: {}".format(
                            r_uid))
                    continue
                if not utils.has_valid_portal_type(dep_item):
                    logger.error(
                        "Skipping dependency with unknown portal type:"
                        " {}".format(dep_item))
                    continue
                data_dict = utils.get_soup_format(dep_item)
                rec_id = self.sh.insert(data_dict)
                dep_row = self.sh.get_record_by_id(rec_id, as_dict=True)
                if self._parents_fetched(dep_item):
                    self._handle_obj(dep_row, handle_dependencies=False)
                continue

            # If Dependency is being processed, skip it.
            if r_uid in self._queue:
                continue

            # Handle the dependency only if it has not been updated yet
            if dep_row.get("updated") == "0":
                self._handle_obj(dep_row)
            # Reindex dependency just in case it has a field that uses
            # BackReference of this object.
            else:
                self.uids_to_reindex.append(dep_row.get(LOCAL_UID))

        return True
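
The UID harvesting at the top of the method can be isolated. A self-contained sketch that walks a field mapping and collects every value stored under a key containing "uid" (sample data invented):

    def collect_uids(data):
        uids = set()
        for value in data.values():
            if isinstance(value, dict) and value.get("uid"):
                uids.add(value["uid"])
            elif isinstance(value, (list, tuple)):
                for item in value:
                    if isinstance(item, dict):
                        for k, v in item.items():
                            if "uid" in k:
                                uids.add(v)
        return uids

    sample = {"Client": {"uid": "u1"}, "Analyses": [{"service_uid": "u2"}]}
    print(sorted(collect_uids(sample)))   # ['u1', 'u2']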