def remove(self, document_id, namespace, timestamp):
        """Remove a document from Elasticsearch."""
        index, doc_type = self._index_and_mapping(namespace)

        if doc_type in self.routing and 'parentField' in self.routing[doc_type]:
            # We can't use delete() directly here and have to do a full search first.
            # This is due to the fact that Elasticsearch needs the parent ID to
            # know where to route the delete request. We might not have the parent
            # ID available in our remove request though.
            document = self._search_doc_by_id(index, doc_type, document_id)
            if document is None:
                LOG.error('Could not find document with ID "%s" in Elasticsearch to apply remove', u(document_id))
                return
            parent_id = self._get_parent_id(doc_type, document)
            self.elastic.delete(index=index, doc_type=doc_type,
                                id=u(document_id), parent=parent_id,
                                refresh=(self.auto_commit_interval == 0))
        else:
            self.elastic.delete(index=index, doc_type=doc_type,
                                id=u(document_id),
                                refresh=(self.auto_commit_interval == 0))

        self.elastic.delete(index=self.meta_index_name, doc_type=self.meta_type,
                            id=u(document_id),
                            refresh=(self.auto_commit_interval == 0))
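The snippet above leans on helpers that are not shown. A minimal sketch of _get_parent_id, assuming self.routing maps each mapping type to its configured parentField (as the check at the top suggests):

def _get_parent_id(self, doc_type, document):
    """Return the parent ID of the document, or None if it has no parent.

    Sketch only: assumes self.routing[doc_type]['parentField'] names the
    field that stores the parent ID, per the routing check above.
    """
    parent_field = self.routing[doc_type].get('parentField')
    if parent_field is None:
        return None
    # Hits returned by Elasticsearch wrap the document in '_source';
    # documents coming straight from MongoDB do not.
    source = document.get('_source', document)
    return source.get(parent_field)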
Example #2
        def check_update(update_spec):
            updated = self.conn.test.command(
                SON([('findAndModify', 'test'),
                     ('query', {"a": 0}),
                     ('update', update_spec),
                     ('new', True)]))['value']

            # Stringify _id to match what will be retrieved from Solr
            updated[u('_id')] = u(updated['_id'])
            # Flatten the MongoDB document to match Solr
            updated = docman._clean_doc(updated, 'dummy.namespace', 0)
            # Allow some time for update to propagate
            time.sleep(3)
            replicated = list(self._search("a:0"))[0]

            # Remove add'l fields until these are stored in a separate Solr core
            updated.pop('_ts')
            replicated.pop('_ts')
            updated.pop('ns')
            replicated.pop('ns')

            # Remove field added by Solr
            replicated.pop("_version_")

            self.assertEqual(replicated, updated)
Example #3
        def check_update(update_spec):
            updated = self.conn.test.command(
                SON([('findAndModify', 'test'), ('query', {
                    "a": 0
                }), ('update', update_spec), ('new', True)]))['value']

            # Stringify _id to match what will be retrieved from Solr
            updated[u('_id')] = u(updated['_id'])
            # Flatten the MongoDB document to match Solr
            updated = docman._clean_doc(updated, 'dummy.namespace', 0)
            # Remove add'l fields until these are stored in a separate Solr core
            updated.pop('_ts')
            updated.pop('ns')

            def update_worked():
                replicated = list(self._search("a:0"))[0]
                # Remove add'l fields until these are stored in a separate
                # Solr core
                replicated.pop('_ts')
                replicated.pop('ns')
                # Remove field added by Solr
                replicated.pop("_version_")

                return replicated == updated

            # Allow some time for update to propagate
            assert_soon(update_worked)
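assert_soon is a test utility from the surrounding suite; a plausible sketch, assuming it simply polls the predicate until a timeout (the real helper's signature may differ):

import time

def assert_soon(predicate, timeout=30, interval=0.5):
    """Poll predicate until it returns True or timeout seconds elapse."""
    start = time.time()
    while time.time() - start < timeout:
        if predicate():
            return
        time.sleep(interval)
    raise AssertionError("condition was not met within %d seconds" % timeout)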
    def remove(self, document_id, namespace, timestamp):
        """Remove a document from Elasticsearch."""
        index, doc_type = self._index_and_mapping(namespace)

        parent_args = {}
        if self._is_child_type(index, doc_type):
            # We can't use delete() directly here and have to do a full search
            # first. This is due to the fact that Elasticsearch needs the
            # parent ID to know where to route the delete request. We do
            # not have the parent ID available in our remove request though.
            document = self._search_doc_by_id(index, doc_type, document_id)
            if document is None:
                LOG.error('Could not find document with ID "%s" in '
                          'Elasticsearch to apply remove', u(document_id))
                return

            parent_id = self._get_parent_id_from_elastic(document)
            if parent_id is not None:
                parent_args['parent'] = parent_id

        self.elastic.delete(index=index, doc_type=doc_type, id=u(document_id),
                            refresh=(self.auto_commit_interval == 0),
                            **parent_args)

        self.elastic.delete(index=self.meta_index_name, doc_type=self.meta_type,
                            id=u(document_id),
                            refresh=(self.auto_commit_interval == 0))
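A minimal sketch of the _is_child_type check used above; judging by the routing-based variant of remove() earlier in this section, it presumably reduces to a lookup in the routing configuration (the index argument may be unused):

def _is_child_type(self, index, doc_type):
    """Return True if doc_type is configured as a child mapping type."""
    # Assumption: mirrors the routing check from the first remove() example.
    return doc_type in self.routing and 'parentField' in self.routing[doc_type]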
Example #5
 def remove(self, doc):
     """Remove a document from Elasticsearch."""
     self.elastic.delete(index=doc['ns'], doc_type=self.doc_type,
                         id=u(doc["_id"]),
                         refresh=(self.auto_commit_interval == 0))
     self.elastic.delete(index=self.meta_index_name, doc_type=self.meta_type,
                         id=u(doc["_id"]),
                         refresh=(self.auto_commit_interval == 0))
Example #6
 def remove(self, document_id, namespace, timestamp):
     """Remove a document from Elasticsearch."""
     index, doc_type = self._index_and_mapping(namespace)
     self.elastic.delete(index=index, doc_type=doc_type,
                         id=u(document_id),
                         refresh=(self.auto_commit_interval == 0))
     self.elastic.delete(index=self.meta_index_name, doc_type=self.meta_type,
                         id=u(document_id),
                         refresh=(self.auto_commit_interval == 0))
 def remove(self, doc):
     """Remove a document from Elasticsearch."""
     self.elastic.delete(index=doc['ns'],
                         doc_type=self.doc_type,
                         id=u(doc["_id"]),
                         refresh=(self.auto_commit_interval == 0))
     self.elastic.delete(index=self.meta_index_name,
                         doc_type=self.meta_type,
                         id=u(doc["_id"]),
                         refresh=(self.auto_commit_interval == 0))
Example #9
    def remove(self, document_id, namespace, timestamp):
        """Removes documents from Solr

        The input is a python dictionary that represents a mongo document.
        """
        if document_id:
            self.solr.delete(id=u(document_id),
                             commit=(self.auto_commit_interval == 0))
            self.solr.delete(q=u("_id:" + document_id + "_*"),
                             commit=(self.auto_commit_interval == 0))
        else:
            raise errors.OperationFailed(
                "Cannot delete Solr document: the given id (%s) is not valid" %
                str(document_id))
Example #10
 def build_nodes_query(self, doc_type, document, id):
     self.doc_types.append(doc_type)
     parameters = {'_id': id}
     if self.is_dict(self.metadata):
         parameters.update(self.metadata)
     for key in document.keys():
         if self.is_reference(key):
             self.build_node_with_reference(doc_type, key, id,
                                            document[key])
             continue
         if self.is_objectid(document[key]):
             parameters.update({key: u(document[key])})
             self.build_node_with_reference(doc_type, key, id,
                                            document[key])
             continue
         #TODO: handle arrays of ObjectIds
         if document[key] is None:
             continue
         elif self.is_dict(document[key]):
             self.build_relationships_query(doc_type, key, id, id)
             self.build_nodes_query(key, document[key], id)
         elif self.is_json_array(document[key]):
             for json in self.format_params(document[key]):
                 json_key = key + str(document[key].index(json))
                 self.build_relationships_query(doc_type, json_key, id, id)
                 self.build_nodes_query(json_key, json, id)
         elif self.is_multimensional_array(document[key]):
             parameters.update(self.flatenned_property(key, document[key]))
         else:
             parameters.update({key: self.format_params(document[key])})
     query = "CREATE (c:Document:`{doc_type}` {{parameters}})".format(
         doc_type=doc_type)
     self.query_nodes.update({query: {"parameters": parameters}})
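build_nodes_query relies on several small type checks on the builder. Plausible sketches assuming the obvious semantics; the real NodesAndRelationshipsBuilder may implement them differently:

from bson import ObjectId

def is_dict(self, value):
    return isinstance(value, dict)

def is_objectid(self, value):
    return isinstance(value, ObjectId)

def is_json_array(self, value):
    # Assumed: a list of sub-documents, each becoming its own node.
    return isinstance(value, list) and all(isinstance(v, dict) for v in value)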
 def bulk_upsert(self, docs, namespace, timestamp):
     """Insert multiple documents into Neo4j.

     The maximum chunk size is 1000: transaction blocks won't have more
     than 1000 statements.
     """
     metadata = {"_ts": timestamp}
     tx = self.graph.cypher.begin()
     for doc in docs:
         index, doc_type = self._index_and_mapping(namespace)
         doc_id = u(doc.pop("uid"))
         doc = self._formatter.format_document(doc)
         builder = NodesAndRelationshipsBuilder(doc, doc_type, doc_id, metadata)
         self.apply_id_constraint(builder.doc_types)
         for statement in builder.query_nodes.keys():
             tx.append(statement, builder.query_nodes[statement])
         # Add Cypher statements from the builder's cypher list.
         for query in builder.cypher_list:
             tx.append(query)
         for relationship, params in builder.relationships_query:
             tx.append(relationship, params)
         for statement in builder.statements_with_params:
             for key in statement.keys():
                 tx.append(key, statement[key])
     try:
         tx.commit()
     except Exception as e:
         LOG.error('%s', e)
    def iterate_chunks():
        more_chunks = True

        while more_chunks:
            tx = self.graph.cypher.begin()
            metadata = { "_ts": timestamp }
            for i in range(self.chunk_size):
                try:
                    doc = next(docs)
                    index, doc_type = self._index_and_mapping(namespace)
                    doc_id = u(doc.pop("_id"))
                    doc = self._formatter.format_document(doc)
                    builder = NodesAndRelationshipsBuilder(doc, doc_type, doc_id, metadata)
                    self.apply_id_constraint(builder.doc_types)
                    for statement in builder.query_nodes.keys():
                        tx.append(statement, builder.query_nodes[statement])
                    for relationship in builder.relationships_query.keys():
                        tx.append(relationship, builder.relationships_query[relationship])
                except StopIteration:
                    more_chunks = False
                    if i > 0:
                        yield tx
                    break
            if more_chunks:
                yield tx
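A sketch of how the iterate_chunks generator above would be driven, assuming the caller simply commits each chunked transaction in turn:

for tx in iterate_chunks():
    try:
        tx.commit()
    except Exception as e:
        LOG.error('%s', e)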
        def docs_to_upsert():
            for doc in docs:
                # Remove metadata and redundant _id
                # index, doc_type = self._index_and_mapping(namespace)
                index = "catalog"
                doc_type = "variant"
                doc_id = u(doc.pop("_id"))
                # Remove parent field
                # parent_id = self._get_parent_id_from_mongodb(index, doc_type,
                #                                              doc)
                document_action = {
                    "_index": index,
                    "_type": doc_type,
                    "_id": doc_id,
                    "_source": self._formatter.format_document(doc)
                }
                document_meta = {
                    "_index": self.meta_index_name,
                    "_type": self.meta_type,
                    "_id": doc_id,
                    "_source": {
                        "ns": namespace,
                        "_ts": timestamp
                    }
                }

                # if parent_id is not None:
                #     document_action["_parent"] = parent_id

                yield document_action
                yield document_meta
    def upsert(self, doc, namespace, timestamp, update_spec=None):
        """Insert a document into Elasticsearch."""
        index, doc_type = self._index_and_mapping(namespace)
        # No need to duplicate '_id' in source document
        doc_id = u(doc.pop("_id"))
        metadata = {
            'ns': namespace,
            '_ts': timestamp
        }

        # Index the source document, using lowercase namespace as index name.
        action = {
            '_op_type': 'index',
            '_index': index,
            '_type': doc_type,
            '_id': doc_id,
            '_source': self._formatter.format_document(doc)
        }
        # Index document metadata with original namespace (mixed upper/lower).
        meta_action = {
            '_op_type': 'index',
            '_index': self.meta_index_name,
            '_type': self.meta_type,
            '_id': doc_id,
            '_source': bson.json_util.dumps(metadata)
        }

        self.index(action, meta_action, doc, update_spec)

        # Leave _id, since it's part of the original document
        doc['_id'] = doc_id
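The self.index(...) call above hands both actions to a bulk buffer rather than writing immediately. A rough sketch of what it is assumed to do, modeled on the buffered update() examples later in this section; BulkBuffer and its methods are assumptions:

def index(self, action, meta_action, doc_source=None, update_spec=None):
    """Buffer an action pair and flush once the buffer is large enough."""
    with self.lock:
        # add_upsert is an assumed BulkBuffer method.
        self.BulkBuffer.add_upsert(action, meta_action, doc_source, update_spec)
    # Hypothetical flush threshold; commit() would drain the buffer via
    # the elasticsearch bulk helpers.
    if self.BulkBuffer.count() >= self.chunk_size:
        self.commit()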
Example #16
    def update(self, document_id, update_spec, namespace, timestamp):
        """Apply updates given in update_spec to the document whose id
        matches that of doc.
        """
        self.commit()

        # Generate the custom document_id.
        index, doc_type = self._index_and_mapping(namespace)
        if doc_type == "facility_variant":
            if document_id:
                result = self.elastic.search(
                    index="catalog", doc_type="variant",
                    body={
                        "query": {
                            "match": {"facility_variant_id": u(document_id)}
                        }
                    })
                if result['hits']['total'] == 1:
                    document = result['hits']['hits'][0]
                    elasticDoc = document.get('_source')
                    if elasticDoc:
                        if "$set" in update_spec:
                            updatedValues = update_spec['$set']
                            for item in updatedValues:
                                if item in elasticDoc:
                                    elasticDoc[str(item)] = updatedValues[item]
                        else:
                            elasticDoc['status'] = update_spec['status']
                            elasticDoc['comment'] = update_spec['comment']
                            elasticDoc['reason'] = update_spec['reason']
                            elasticDoc['is_available'] = update_spec['is_available']
                            elasticDoc['mrp'] = update_spec['mrp']
                            elasticDoc['selling_price'] = update_spec['selling_price']
                            elasticDoc['discount'] = update_spec['discount']
                        elasticDoc['_id'] = document['_id']
                        elasticDoc['is_direct_update'] = True
                        self.upsert(elasticDoc, namespace, timestamp)
        else:
            if "_id" in update_spec:
                self.upsert(update_spec, namespace, timestamp)
            else:
                variantDoc = m_variant.find_one({"_id": document_id})
                if variantDoc and "$set" in update_spec:
                    updatedValues = update_spec['$set']
                    for item in updatedValues:
                        if str(item) == "reason":
                            variantDoc['variant_reason'] = updatedValues[item]
                        else:
                            variantDoc[str(item)] = updatedValues[item]
                    variantDoc['variant_id'] = str(document_id)
                # Skip the upsert if the variant could not be found.
                if variantDoc:
                    self.upsert(variantDoc, namespace, timestamp)
Example #17
 def docs_to_upsert():
     doc = None
     for doc in docs:
         # Remove metadata and redundant _id
         index, doc_type = self._index_and_mapping(namespace)
         doc_id = u(doc.pop("_id"))
         document_action = {
             '_index': index,
             '_type': doc_type,
             '_id': doc_id,
             'pipeline': 'geoip',
             '_source': self._formatter.format_document(doc)
         }
         document_meta = {
             '_index': self.meta_index_name,
             '_type': self.meta_type,
             '_id': doc_id,
             'pipeline': 'geoip',
             '_source': {
                 'ns': namespace,
                 '_ts': timestamp
             }
         }
         yield document_action
         yield document_meta
     if doc is None:
         raise errors.EmptyDocsError(
             "Cannot upsert an empty sequence of "
             "documents into Elastic Search")
Example #18
    def update(self, document_id, update_spec, namespace, timestamp):
        """Apply updates given in update_spec to the document whose id
        matches that of doc.
        """

        index, doc_type = self._index_and_mapping(namespace)
        with self.lock:
            # Check if document source is stored in local buffer
            document = self.BulkBuffer.get_from_sources(
                index, doc_type, u(document_id))
        if document:
            # Document source collected from local buffer
            # Perform apply_update on it and then it will be
            # ready for committing to Elasticsearch
            updated = self.apply_update(document, update_spec)
            # _id is immutable in MongoDB, so won't have changed in update
            updated['_id'] = document_id
            self.upsert(updated, namespace, timestamp)
        else:
            # Document source needs to be retrieved from Elasticsearch
            # before performing update. Pass update_spec to upsert function
            updated = {"_id": document_id}
            self.upsert(updated, namespace, timestamp, update_spec)
        # upsert() strips metadata, so only _id + fields in _source still here
        return updated
    def upsert(self, doc, namespace, timestamp):
        """Insert a document into Elasticsearch."""

        index, doc_type = self._index_and_mapping(namespace)
        # No need to duplicate '_id' in source document
        doc_id = u(doc.pop("_id"))
        metadata = {"ns": namespace, "_ts": timestamp}

        parent_id = self._get_parent_id_from_mongodb(index, doc_type, doc)
        parent_args = {}
        if parent_id is not None:
            parent_args['parent'] = parent_id

        elasticDocs = elastic_doc(doc)
        for elasticDoc in elasticDocs:
            doc = elasticDoc
            # Index the source document, using lowercase namespace as index name.
            self.elastic.index(index=index,
                               doc_type=doc_type,
                               id=doc_id,
                               body=self._formatter.format_document(doc),
                               refresh=(self.auto_commit_interval == 0),
                               **parent_args)

            # Index document metadata with original namespace (mixed upper/lower).
            self.elastic.index(index=self.meta_index_name,
                               doc_type=self.meta_type,
                               body=bson.json_util.dumps(metadata),
                               id=doc_id,
                               refresh=(self.auto_commit_interval == 0))
            # Leave _id, since it's part of the original document
            doc['_id'] = doc_id
Example #20
    def remove(self, document_id, namespace, timestamp):
        """Removes documents from Solr

        The input is a python dictionary that represents a mongo document.
        """
        self.solr.delete(id=u(document_id),
                         commit=(self.auto_commit_interval == 0))
    def upsert(self, doc, namespace, timestamp):
        """Inserts a document into Neo4j."""
        index, doc_type = self._index_and_mapping(namespace)
        doc_id = u(doc.pop("uid"))
        metadata = { "_ts": timestamp }
        doc = self._formatter.format_document(doc)
        builder = NodesAndRelationshipsBuilder(doc, doc_type, doc_id, metadata)
        self.apply_id_constraint(builder.doc_types)
        tx = self.graph.cypher.begin()
        for statement in builder.query_nodes.keys():
            tx.append(statement, builder.query_nodes[statement])
        # Add Cypher statements from the builder's cypher list.
        for query in builder.cypher_list:
            tx.append(query)
        for relationship, params in builder.relationships_query:
            tx.append(relationship, params)
        for statement in builder.statements_with_params:
            for key in statement.keys():
                tx.append(key, statement[key])
        commit_result = None
        try:
            commit_result = tx.commit()
            print(commit_result)
        except Exception as e:
            LOG.error('%s', e)

        if commit_result:
            nodeids_list = self._get_nodeids(commit_result)
            self.create_geospatial_indices(nodeids_list)
 def _upsert(self, cursor, doc, namespace, timestamp):
     doc_id = compat.u(doc["_id"])
     log.debug("Upsert %s into %s", doc_id, namespace)
     cursor.execute(
         u"""INSERT INTO "{table}" ("{id}", _ts, document) """
         u"""VALUES (%(id)s, %(ts)s, %(doc)s) """
         u"""ON CONFLICT ("{id}") """
         u"""DO UPDATE SET (_ts, document) = (%(ts)s, %(doc)s);""".format(
             table=namespace, id=self.unique_key),
         {"id": doc_id, "ts": timestamp,
          "doc": psycopg2.extras.Json(self._formatter.format_document(doc))})
Example #24
    def update(self, document_id, update_spec, namespace, timestamp):
        """Apply updates given in update_spec to the document whose id
        matches that of doc.

        """
        # Commit outstanding changes so that the document to be updated is the
        # same version to which the changes apply.
        self.commit()
        # Need to escape special characters in the document_id.
        document_id = ''.join(map(
            lambda c: '\\' + c if c in ESCAPE_CHARACTERS else c,
            u(document_id)
        ))

        query = "%s:%s" % (self.unique_key, document_id)
        results = self.solr.search(query)
        if not len(results):
            # Document may not be retrievable yet
            self.commit()
            results = self.solr.search(query)
        # Results is an iterable containing only 1 result
        for doc in results:
            # Remove metadata previously stored by Mongo Connector.
            doc.pop('ns')
            doc.pop('_ts')
            updated = self.apply_update(doc, update_spec)
            # A _version_ of 0 will always apply the update
            updated['_version_'] = 0
            self.upsert(updated, namespace, timestamp)
            return updated
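The ESCAPE_CHARACTERS set referenced above covers Lucene/Solr query syntax characters. A plausible definition (the exact set in the original module may differ):

# Characters with special meaning in Lucene query syntax; each one is
# backslash-escaped before being embedded in the id query above.
ESCAPE_CHARACTERS = set('+-&|!(){}[]^"~*?:\\/')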
    def update(self, document_id, update_spec, namespace, timestamp):
        """Apply updates given in update_spec to the document whose id
        matches that of doc.
        """
        self.commit()
        index, doc_type = self._index_and_mapping(namespace)

        if doc_type in self.routing and 'parentField' in self.routing[doc_type]:
            # We can't use get() here and have to do a full search instead.
            # This is due to the fact that Elasticsearch needs the parent ID to
            # know where to route the get request. We might not have the parent
            # ID available in our update request though.
            document = self._search_doc_by_id(index, doc_type, document_id)
            if document is None:
                LOG.error('Could not find document with ID "%s" in Elasticsearch to apply update', u(document_id))
                return None
        else:
            document = self.elastic.get(index=index, doc_type=doc_type,
                                        id=u(document_id))

        updated = self.apply_update(document['_source'], update_spec)
        # _id is immutable in MongoDB, so won't have changed in update
        updated['_id'] = document['_id']
        if '_parent' in document:
            updated['_parent'] = document['_parent']
        self.upsert(updated, namespace, timestamp)
        # upsert() strips metadata, so only _id + fields in _source still here
        return updated
Example #27
    def upsert(self, doc, namespace, timestamp):
        """Update or insert a document into Solr

        This method should call whatever add/insert/update method exists for
        the backend engine and add the document in there. The input will
        always be one mongo document, represented as a Python dictionary.
        """
        logging.debug("before insert the raw doc is :(%s)" % str(doc))
        docs = self._clean_doc(doc, namespace, timestamp)
        logging.debug("before insert the processed doc is :(%s)" % str(doc))
        if docs is None:
            return None
        if not isinstance(docs, list):
            docs = [docs]
        docid = doc.get("_id")
        #self.remove(docid, namespace, timestamp)
        #delete the child node about this file, TODO
        # if docid :
        #     logging.info("remove solr document which id is %s _* ,timestamp is %s" % (str(docid), str(timestamp)))
        #     self.solr.delete(q=u("_id:"+docid+"_*"),
        #                      commit=(self.auto_commit_interval == 0))
        # else:
        #     raise errors.OperationFailed("delete solr document error for the id(%s) is not valid" % str(docid));
        try:
            if self.auto_commit_interval is not None:
                self.solr.add(docs,
                              commit=(self.auto_commit_interval == 0),
                              commitWithin=u(self.auto_commit_interval))
            else:
                self.solr.add(docs, commit=False)
            logging.debug("insert into solr docs:(%s)" % str(docs))
        except UnicodeDecodeError:
            logging.exception(
                "Unable to process processed document for UnicodeDecodeError, %r "
                % str(docs))
 def docs_to_upsert():
     doc = None
     for doc in docs:
         # Remove metadata and redundant _id
         index, doc_type = self._index_and_mapping(namespace)
         doc_id = u(doc.pop("_id"))
         document_action = {
             "_index": index,
             "_type": doc_type,
             "_id": doc_id,
             "_source": self._formatter.format_document(doc)
         }
         document_meta = {
             "_index": self.meta_index_name,
             "_type": self.meta_type,
             "_id": doc_id,
             "_source": {
                 "ns": namespace,
                 "_ts": timestamp
             }
         }
         yield document_action
         yield document_meta
     if doc is None:
         raise errors.EmptyDocsError(
             "Cannot upsert an empty sequence of "
             "documents into Elastic Search")
    def upsert(self, doc, namespace, timestamp):
        """Insert a document into Elasticsearch."""
        index, doc_type = self._index_and_mapping(namespace)
        # No need to duplicate '_id' in source document
        doc_id = u(doc.pop("_id"))
        metadata = {
            "ns": namespace,
            "_ts": timestamp
        }

        parent_id = self._get_parent_id(doc_type, doc)
        # Index the source document, using lowercase namespace as index name.
        if parent_id is None:
            self.elastic.index(index=index, doc_type=doc_type,
                               body=self._formatter.format_document(doc), id=doc_id,
                               refresh=(self.auto_commit_interval == 0))
        else:
            self.elastic.index(index=index, doc_type=doc_type,
                               body=self._formatter.format_document(doc), id=doc_id,
                               parent=parent_id, refresh=(self.auto_commit_interval == 0))

        # Index document metadata with original namespace (mixed upper/lower).
        self.elastic.index(index=self.meta_index_name, doc_type=self.meta_type,
                           body=bson.json_util.dumps(metadata), id=doc_id,
                           refresh=(self.auto_commit_interval == 0))
        # Leave _id, since it's part of the original document
        doc['_id'] = doc_id
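For reference, the routing structure consulted by the parent/child examples would look roughly like this; the type and field names here are illustrative:

routing = {
    # Child mapping type -> the field in the child document holding the
    # _id of its parent. Only child types need an entry (assumed shape).
    "comment": {"parentField": "post_id"},
}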
Example #32
    def update(self, document_id, update_spec, namespace, timestamp):
        """Apply updates given in update_spec to the document whose id
        matches that of doc.

        """
        # Commit outstanding changes so that the document to be updated is the
        # same version to which the changes apply.
        self.commit()
        # Need to escape special characters in the document_id.
        document_id = ''.join(
            map(lambda c: '\\' + c if c in ESCAPE_CHARACTERS else c,
                u(document_id)))

        query = "%s:%s" % (self.unique_key, document_id)
        results = self.solr.search(query)
        if not len(results):
            # Document may not be retrievable yet
            self.commit()
            results = self.solr.search(query)
        # Results is an iterable containing only 1 result
        for doc in results:
            # Remove metadata previously stored by Mongo Connector.
            if 'ns' in doc:
                doc.pop('ns')
            if '_ts' in doc:
                doc.pop('_ts')
            updated = self.apply_update(doc, update_spec)
            # A _version_ of 0 will always apply the update
            updated['_version_'] = 0
            self.upsert(updated, namespace, timestamp)
            return updated
Example #33
 def docs_to_upsert():
     doc = None
     for doc in docs:
         # Remove metadata and redundant _id
         index, doc_type = self._index_and_mapping(namespace)
         doc_id = u(doc.pop("_id"))
         document_action = {
             "_index": index,
             "_type": doc_type,
             "_id": doc_id,
             "_source": self._formatter.format_document(doc)
         }
         document_meta = {
             "_index": self.meta_index_name,
             "_type": self.meta_type,
             "_id": doc_id,
             "_source": {
                 "ns": index,
                 "_ts": timestamp
             }
         }
         yield document_action
         yield document_meta
     if doc is None:
         raise errors.EmptyDocsError(
             "Cannot upsert an empty sequence of "
             "documents into Elastic Search")
Example #34
 def remove(self, document_id, namespace, timestamp):
     """Removes a document from Neo4j."""
     doc_id = u(document_id)
     statement = "CALL json.delete({doc_id})"
     params_dict = {"doc_id": doc_id}
     tx = self.graph.cypher.begin()
     tx.append(statement, params_dict)
     tx.commit()
  def build_node_with_objectid_reference(self, root_type, key, doc_id, document_key):
    if document_key is None:
      return

    parameters = {'_id': u(document_key)}
    statement = "MERGE (d:Document {{_id: {{parameters}}._id}})"
    self.query_nodes.update({statement: {"parameters": parameters}})
    self.build_relationships_query(root_type, 'Document', doc_id, document_key) #FIXME: missing doc_type
Example #36
 def upsert(self, doc, namespace, timestamp):
     """Inserts a document into Neo4j."""
     doc_id = u(doc.pop("_id"))
     statement = "CALL json.upsert({doc_id},{doc})"
     params_dict = {"doc_id": doc_id, "doc": dumps(doc)}
     tx = self.graph.cypher.begin()
     tx.append(statement, params_dict)
     tx.commit()
Example #37
    def _clean_doc(self, doc, namespace, timestamp):
        """Reformats the given document before insertion into Solr.

        This method reformats the document in the following ways:
          - removes extraneous fields that aren't defined in schema.xml
          - unwinds arrays in order to find and later flatten sub-documents
          - flattens the document so that there are no sub-documents, and every
            value is associated with its dot-separated path of keys
          - inserts namespace and timestamp metadata into the document in order
            to handle rollbacks

        An example:
          {"a": 2,
           "b": {
             "c": {
               "d": 5
             }
           },
           "e": [6, 7, 8]
          }

        becomes:
          {"a": 2, "b.c.d": 5, "e.0": 6, "e.1": 7, "e.2": 8}

        """

        # Translate the _id field to whatever unique key we're using.
        # _id may not exist in the doc, if we retrieved it from Solr
        # as part of update.
        if '_id' in doc:
            doc[self.unique_key] = u(doc.pop("_id"))

        # Update namespace and timestamp metadata
        if 'ns' in doc or '_ts' in doc:
            raise errors.OperationFailed(
                'Need to set "ns" and "_ts" fields, but these fields already '
                'exist in the document %r!' % doc)
        doc['ns'] = namespace
        doc['_ts'] = timestamp

        # Solr cannot index fields within sub-documents, so the document
        # would normally be flattened with the dot-separated path to each
        # value as its key; the formatter call is disabled here and the
        # document is passed through unchanged.
        # flat_doc = self._formatter.format_document(doc)
        flat_doc = doc
        # Only include fields that are explicitly provided in the
        # schema or match one of the dynamic field patterns, if
        # we were able to retrieve the schema
        if len(self.field_list) + len(self._dynamic_field_regexes) > 0:

            def include_field(field):
                return field in self.field_list or any(
                    regex.match(field)
                    for regex in self._dynamic_field_regexes)

            return dict(
                (k, v) for k, v in flat_doc.items() if include_field(k))
        return flat_doc
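A usage sketch for _clean_doc, echoing the docstring's example; doc_manager is a hypothetical instance, and with the formatter disabled as above the nested values pass through unflattened:

doc = {"_id": "abc", "a": 2, "b": {"c": {"d": 5}}, "e": [6, 7, 8]}
cleaned = doc_manager._clean_doc(doc, "test.coll", 1234567890)
# cleaned now carries the unique key plus the 'ns' and '_ts' metadata,
# filtered down to fields permitted by the Solr schema.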
 def remove(self, doc):
     """Removes the document from the doc dict.
     """
     doc_id = doc["_id"]
     try:
         del self.doc_dict[doc_id]
         self.removed_dict[doc_id] = {"_id": doc_id, "ns": doc["ns"], "_ts": doc["_ts"]}
     except KeyError:
         raise OperationFailed("Document does not exist: %s" % u(doc))
Example #39
	def upsert(self, doc, namespace, timestamp):
		print("In upsert")
		flat_doc = self._clean_doc(doc, namespace, timestamp)
		print(flat_doc)
		docs = self.reformat(flat_doc)
		if self.auto_commit_interval is not None:
			self.solr.add(docs, commit=(self.auto_commit_interval == 0), commitWithin=u(self.auto_commit_interval))
		else:
			self.solr.add(docs, commit=False)
 def remove(self, document_id, namespace, timestamp):
     """Removes a document from Neo4j."""
     doc_id = u(document_id)
     index, doc_type = self._index_and_mapping(namespace)
     params_dict = {"doc_id": doc_id}
     tx = self.graph.cypher.begin()
     statement = "MATCH (d:Document) WHERE d._id={doc_id} OPTIONAL MATCH (d)-[r]-() DELETE d, r"
     tx.append(statement, params_dict)
     tx.commit()
Example #41
 def remove(self, document_id, namespace, timestamp):
   """Removes a document from Neo4j."""
   doc_id = u(document_id)
   index, doc_type = self._index_and_mapping(namespace)
   params_dict = {"doc_id": doc_id}
   tx = self.graph.begin()
   statement = "MATCH (d:Document) WHERE d._id={doc_id} OPTIONAL MATCH (d)-[r]-() DELETE d, r"
   tx.run(statement, params_dict)
   tx.commit()
 def remove(self, document_id, namespace, timestamp):
     """Removes the document from the doc dict.
     """
     try:
         entry = self.doc_dict[document_id]
         entry.doc = None
         entry.update(namespace, timestamp)
     except KeyError:
         raise OperationFailed("Document does not exist: %s" %
                               u(document_id))
Example #44
    def remove(self, document_id, namespace, timestamp):
        """Remove a document from Elasticsearch."""
        index, doc_type = self._index_and_mapping(namespace)

        action = {
            '_op_type': 'delete',
            '_index': index,
            '_type': doc_type,
            '_id': u(document_id)
        }

        meta_action = {
            '_op_type': 'delete',
            '_index': self.meta_index_name,
            '_type': self.meta_type,
            '_id': u(document_id)
        }

        self.index(action, meta_action)
Example #46
 def update(self, document_id, update_spec, namespace, timestamp):
   doc_id = u(document_id)
   tx = self.graph.begin()
   index, doc_type = self._index_and_mapping(namespace)
   updater = NodesAndRelationshipsUpdater()
   updater.run_update(update_spec, doc_id, doc_type.replace("-","_"))
   for statement in updater.statements_with_params:
     for key in statement.keys():
       tx.run(key, statement[key])
   tx.commit()
 def update(self, document_id, update_spec, namespace, timestamp):
     doc_id = u(document_id)
     tx = self.graph.cypher.begin()
     index, doc_type = self._index_and_mapping(namespace)
     updater = NodesAndRelationshipsUpdater()
     updater.run_update(update_spec, doc_id, doc_type)
     for statement in updater.statements_with_params:
         for key in statement.keys():
             tx.append(key, statement[key])
     tx.commit()
 def update(self, document_id, update_spec, namespace, timestamp, doc=None):
     """Apply updates given in update_spec to the document whose id
     matches that of doc.
     """
     index, doc_type = self._index_and_mapping(namespace)
     document = {}
     if not doc:
         try:
             document = self.elastic.get(index=index, doc_type=doc_type, id=u(document_id))
          except es_exceptions.NotFoundError as e:
             return (document_id, e)
 def docs_to_upsert():
     doc = None
     for doc in docs:
         self.mapGeoFields(doc)
         doc_id = u(doc.get("_id"))
         document_action = {
             "_index": index_name,
             "_type": doc_type,
             "_id": doc_id,
             "_source": self._formatter.format_document(doc)
         }
         yield document_action
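mapGeoFields is not shown anywhere in these examples; a speculative sketch, assuming it rewrites MongoDB GeoJSON points into the Elasticsearch geo_point form (the field name and behavior are pure assumptions):

def mapGeoFields(self, doc):
    # Hypothetical: convert a GeoJSON Point to {'lat': ..., 'lon': ...}.
    loc = doc.get('location')
    if isinstance(loc, dict) and loc.get('type') == 'Point':
        lon, lat = loc['coordinates']
        doc['location'] = {'lat': lat, 'lon': lon}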
    def upsert(self, doc):
        """Update or insert a document into Solr

        This method should call whatever add/insert/update method exists for
        the backend engine and add the document in there. The input will
        always be one mongo document, represented as a Python dictionary.
        """
        if self.auto_commit_interval is not None:
            self.solr.add([self._clean_doc(doc)],
                          commit=(self.auto_commit_interval == 0),
                          commitWithin=u(self.auto_commit_interval))
        else:
            self.solr.add([self._clean_doc(doc)], commit=False)
 def update(self, document_id, update_spec, namespace, timestamp):
     """Apply updates given in update_spec to the document whose id
     matches that of doc.
     """
     index, doc_type = self._index_and_mapping(namespace)
     document = self.elastic.get(index=index, doc_type=doc_type,
                                 id=u(document_id))
     updated = self.apply_update(document['_source'], update_spec)
     # _id is immutable in MongoDB, so won't have changed in update
     updated['_id'] = document['_id']
     self.upsert(updated, namespace, timestamp)
     # upsert() strips metadata, so only _id + fields in _source still here
     return updated
 def upsert(self, doc, namespace, timestamp):
     """Inserts a document into Neo4j."""
     index, doc_type = self._index_and_mapping(namespace)
     doc_id = u(doc.pop("_id"))
     metadata = {"_ts": timestamp}
     doc = self._formatter.format_document(doc)
     builder = NodesAndRelationshipsBuilder(doc, doc_type, doc_id, metadata)
     self.apply_id_constraint(builder.doc_types)
     tx = self.graph.cypher.begin()
     for statement in builder.query_nodes.keys():
         tx.append(statement, builder.query_nodes[statement])
     for relationship in builder.relationships_query.keys():
         tx.append(relationship, builder.relationships_query[relationship])
     tx.commit()
Example #56
 def update(self, document_id, update_spec, namespace, timestamp):
     """Apply updates given in update_spec to the document whose id
     matches that of doc.
     """
     self.commit()
     index, doc_type = self._index_and_mapping(namespace)
     document = self.elastic.get(index=index, doc_type=doc_type,
                                 id=u(document_id))
     updated = self.apply_update(document['_source'], update_spec)
     # _id is immutable in MongoDB, so won't have changed in update
     updated['_id'] = document['_id']
     self.upsert(updated, namespace, timestamp)
     # upsert() strips metadata, so only _id + fields in _source still here
     return updated
 def _search_doc_by_id(self, index, doc_type, doc_id):
     """Search document in Elasticsearch by _id"""
     result = self.elastic.search(index=index, doc_type=doc_type,
                                  body={
                                      'query': {
                                          'ids': {
                                              'type': doc_type,
                                              'values': [u(doc_id)]
                                          }
                                      }
                                  })
     if result['hits']['total'] == 1:
         return result['hits']['hits'][0]
     else:
         return None
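A usage sketch for _search_doc_by_id: resolving a document by _id when a plain GET is not possible, for example to recover parent routing before a delete (index, type, and id values are illustrative):

hit = doc_manager._search_doc_by_id("catalog", "variant", "someid")
if hit is not None:
    source = hit["_source"]
    parent_id = hit.get("_parent")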