def remove(self, document_id, namespace, timestamp):
    """Remove a document from Elasticsearch."""
    index, doc_type = self._index_and_mapping(namespace)
    if doc_type in self.routing and 'parentField' in self.routing[doc_type]:
        # We can't use delete() directly here and have to do a full search
        # first. This is due to the fact that Elasticsearch needs the parent
        # ID to know where to route the delete request. We might not have
        # the parent ID available in our remove request though.
        document = self._search_doc_by_id(index, doc_type, document_id)
        if document is None:
            LOG.error('Could not find document with ID "%s" in '
                      'Elasticsearch to apply remove', u(document_id))
            return
        parent_id = self._get_parent_id(doc_type, document)
        self.elastic.delete(index=index, doc_type=doc_type,
                            id=u(document_id), parent=parent_id,
                            refresh=(self.auto_commit_interval == 0))
    else:
        self.elastic.delete(index=index, doc_type=doc_type,
                            id=u(document_id),
                            refresh=(self.auto_commit_interval == 0))
    self.elastic.delete(index=self.meta_index_name, doc_type=self.meta_type,
                        id=u(document_id),
                        refresh=(self.auto_commit_interval == 0))

def check_update(update_spec):
    updated = self.conn.test.command(
        SON([('findAndModify', 'test'),
             ('query', {"a": 0}),
             ('update', update_spec),
             ('new', True)]))['value']
    # Stringify _id to match what will be retrieved from Solr
    updated[u('_id')] = u(updated['_id'])
    # Flatten the MongoDB document to match Solr
    updated = docman._clean_doc(updated, 'dummy.namespace', 0)
    # Allow some time for update to propagate
    time.sleep(3)
    replicated = list(self._search("a:0"))[0]
    # Remove add'l fields until these are stored in a separate Solr core
    updated.pop('_ts')
    replicated.pop('_ts')
    updated.pop('ns')
    replicated.pop('ns')
    # Remove field added by Solr
    replicated.pop("_version_")
    self.assertEqual(replicated, updated)

def check_update(update_spec):
    updated = self.conn.test.command(
        SON([('findAndModify', 'test'),
             ('query', {"a": 0}),
             ('update', update_spec),
             ('new', True)]))['value']
    # Stringify _id to match what will be retrieved from Solr
    updated[u('_id')] = u(updated['_id'])
    # Flatten the MongoDB document to match Solr
    updated = docman._clean_doc(updated, 'dummy.namespace', 0)
    # Remove add'l fields until these are stored in a separate Solr core
    updated.pop('_ts')
    updated.pop('ns')

    def update_worked():
        replicated = list(self._search("a:0"))[0]
        # Remove add'l fields until these are stored in a separate
        # Solr core
        replicated.pop('_ts')
        replicated.pop('ns')
        # Remove field added by Solr
        replicated.pop("_version_")
        return replicated == updated

    # Allow some time for update to propagate
    assert_soon(update_worked)

def remove(self, document_id, namespace, timestamp):
    """Remove a document from Elasticsearch."""
    index, doc_type = self._index_and_mapping(namespace)
    parent_args = {}
    if self._is_child_type(index, doc_type):
        # We can't use delete() directly here and have to do a full search
        # first. This is due to the fact that Elasticsearch needs the
        # parent ID to know where to route the delete request. We do
        # not have the parent ID available in our remove request though.
        document = self._search_doc_by_id(index, doc_type, document_id)
        if document is None:
            LOG.error('Could not find document with ID "%s" in '
                      'Elasticsearch to apply remove', u(document_id))
            return
        parent_id = self._get_parent_id_from_elastic(document)
        if parent_id is not None:
            parent_args['parent'] = parent_id

    self.elastic.delete(index=index, doc_type=doc_type, id=u(document_id),
                        refresh=(self.auto_commit_interval == 0),
                        **parent_args)
    self.elastic.delete(index=self.meta_index_name, doc_type=self.meta_type,
                        id=u(document_id),
                        refresh=(self.auto_commit_interval == 0))

def remove(self, doc):
    """Remove a document from Elasticsearch."""
    self.elastic.delete(index=doc['ns'], doc_type=self.doc_type,
                        id=u(doc["_id"]),
                        refresh=(self.auto_commit_interval == 0))
    self.elastic.delete(index=self.meta_index_name, doc_type=self.meta_type,
                        id=u(doc["_id"]),
                        refresh=(self.auto_commit_interval == 0))

def remove(self, document_id, namespace, timestamp):
    """Remove a document from Elasticsearch."""
    index, doc_type = self._index_and_mapping(namespace)
    self.elastic.delete(index=index, doc_type=doc_type, id=u(document_id),
                        refresh=(self.auto_commit_interval == 0))
    self.elastic.delete(index=self.meta_index_name, doc_type=self.meta_type,
                        id=u(document_id),
                        refresh=(self.auto_commit_interval == 0))

def remove(self, document_id, namespace, timestamp):
    """Remove a document and its child documents from Solr by document id."""
    if document_id:
        self.solr.delete(id=u(document_id),
                         commit=(self.auto_commit_interval == 0))
        self.solr.delete(q=u("_id:" + document_id + "_*"),
                         commit=(self.auto_commit_interval == 0))
    else:
        raise errors.OperationFailed(
            "Cannot delete Solr document: the id (%s) is not valid"
            % str(document_id))

def build_nodes_query(self, doc_type, document, id):
    self.doc_types.append(doc_type)
    parameters = {'_id': id}
    if self.is_dict(self.metadata):
        parameters.update(self.metadata)
    for key in document.keys():
        if self.is_reference(key):
            self.build_node_with_reference(doc_type, key, id, document[key])
            continue
        if self.is_objectid(document[key]):
            parameters.update({key: u(document[key])})
            self.build_node_with_reference(doc_type, key, id, document[key])
            continue
        # TODO: handle arrays of ObjectIds
        if document[key] is None:
            continue
        elif self.is_dict(document[key]):
            self.build_relationships_query(doc_type, key, id, id)
            self.build_nodes_query(key, document[key], id)
        elif self.is_json_array(document[key]):
            for json in self.format_params(document[key]):
                json_key = key + str(document[key].index(json))
                self.build_relationships_query(doc_type, json_key, id, id)
                self.build_nodes_query(json_key, json, id)
        elif self.is_multimensional_array(document[key]):
            parameters.update(self.flatenned_property(key, document[key]))
        else:
            parameters.update({key: self.format_params(document[key])})
    query = "CREATE (c:Document:`{doc_type}` {{parameters}})".format(
        doc_type=doc_type)
    self.query_nodes.update({query: {"parameters": parameters}})

def bulk_upsert(self, docs, namespace, timestamp):
    """Insert multiple documents into Neo4j.

    The maximum chunk size is 1000: transaction blocks won't have more
    than 1000 statements.
    """
    metadata = {"_ts": timestamp}
    tx = self.graph.cypher.begin()
    for doc in docs:
        index, doc_type = self._index_and_mapping(namespace)
        doc_id = u(doc.pop("uid"))
        doc = self._formatter.format_document(doc)
        builder = NodesAndRelationshipsBuilder(doc, doc_type, doc_id, metadata)
        self.apply_id_constraint(builder.doc_types)
        for statement in builder.query_nodes.keys():
            tx.append(statement, builder.query_nodes[statement])
        # Add the queries collected in the builder's cypher list
        for query in builder.cypher_list:
            tx.append(query)
        for relationship, params in builder.relationships_query:
            tx.append(relationship, params)
        for statement in builder.statements_with_params:
            for key in statement.keys():
                tx.append(key, statement[key])
    try:
        tx.commit()
    except Exception as e:
        LOG.error('{}'.format(e.message))

def iterate_chunks():
    more_chunks = True
    while more_chunks:
        tx = self.graph.cypher.begin()
        metadata = {"_ts": timestamp}
        for i in range(self.chunk_size):
            try:
                doc = next(docs)
                index, doc_type = self._index_and_mapping(namespace)
                doc_id = u(doc.pop("_id"))
                doc = self._formatter.format_document(doc)
                builder = NodesAndRelationshipsBuilder(
                    doc, doc_type, doc_id, metadata)
                self.apply_id_constraint(builder.doc_types)
                for statement in builder.query_nodes.keys():
                    tx.append(statement, builder.query_nodes[statement])
                for relationship in builder.relationships_query.keys():
                    tx.append(relationship,
                              builder.relationships_query[relationship])
            except StopIteration:
                more_chunks = False
                if i > 0:
                    yield tx
                break
        if more_chunks:
            yield tx

def iterate_chunks():
    more_chunks = True
    while more_chunks:
        tx = self.graph.cypher.begin()
        metadata = {"_ts": timestamp}
        for i in range(self.chunk_size):
            try:
                doc = next(docs)
                index, doc_type = self._index_and_mapping(namespace)
                doc_id = u(doc.pop("_id"))
                doc = self._formatter.format_document(doc)
                builder = NodesAndRelationshipsBuilder(
                    doc, doc_type, doc_id, metadata)
                self.apply_id_constraint(builder.doc_types)
                for statement in builder.query_nodes.keys():
                    tx.append(statement, builder.query_nodes[statement])
                for relationship in builder.relationships_query.keys():
                    tx.append(relationship,
                              builder.relationships_query[relationship])
            except StopIteration:
                more_chunks = False
                if i > 0:
                    yield tx
                break
        if more_chunks:
            yield tx

def docs_to_upsert():
    # Remove metadata and redundant _id
    # index, doc_type = self._index_and_mapping(namespace)
    index = "catalog"
    doc_type = "variant"
    doc_id = u(doc.pop("_id"))
    # Remove parent field
    # parent_id = self._get_parent_id_from_mongodb(index, doc_type, doc)
    document_action = {
        "_index": index,
        "_type": doc_type,
        "_id": doc_id,
        "_source": self._formatter.format_document(doc)
    }
    document_meta = {
        "_index": self.meta_index_name,
        "_type": self.meta_type,
        "_id": doc_id,
        "_source": {
            "ns": namespace,
            "_ts": timestamp
        }
    }
    # if parent_id is not None:
    #     document_action["_parent"] = parent_id
    yield document_action
    yield document_meta

def upsert(self, doc, namespace, timestamp, update_spec=None):
    """Insert a document into Elasticsearch."""
    index, doc_type = self._index_and_mapping(namespace)
    # No need to duplicate '_id' in source document
    doc_id = u(doc.pop("_id"))
    metadata = {
        'ns': namespace,
        '_ts': timestamp
    }
    # Index the source document, using lowercase namespace as index name.
    action = {
        '_op_type': 'index',
        '_index': index,
        '_type': doc_type,
        '_id': doc_id,
        '_source': self._formatter.format_document(doc)
    }
    # Index document metadata with original namespace (mixed upper/lower).
    meta_action = {
        '_op_type': 'index',
        '_index': self.meta_index_name,
        '_type': self.meta_type,
        '_id': doc_id,
        '_source': bson.json_util.dumps(metadata)
    }
    self.index(action, meta_action, doc, update_spec)
    # Leave _id, since it's part of the original document
    doc['_id'] = doc_id

def update(self, document_id, update_spec, namespace, timestamp):
    """Apply updates given in update_spec to the document whose id
    matches that of doc.
    """
    self.commit()
    # index, doc_type = self._index_and_mapping(namespace)
    # generate custom document_id
    index, doc_type = self._index_and_mapping(namespace)
    if doc_type == "facility_variant":
        if document_id:
            # document = self.elastic.get(index="catalog",
            #                             doc_type="variant",
            #                             id=u(document_id))
            result = self.elastic.search(
                index="catalog", doc_type="variant",
                body={"query": {"match": {"facility_variant_id": u(document_id)}}})
            if result['hits']['total'] == 1:
                document = result['hits']['hits'][0]
                if "_source" in document:
                    elasticDoc = document['_source']
                    if elasticDoc:
                        # import pdb; pdb.set_trace()
                        # variant_id = ObjectId(doc['variant_id'])
                        # variantDoc = m_variant.find_one({"_id": variant_id})
                        if elasticDoc and "$set" in update_spec:
                            updatedValues = update_spec['$set']
                            for item in updatedValues:
                                if item in elasticDoc:
                                    elasticDoc[str(item)] = updatedValues[item]
                        else:
                            elasticDoc['status'] = update_spec['status']
                            elasticDoc['comment'] = update_spec['comment']
                            elasticDoc['reason'] = update_spec['reason']
                            elasticDoc['is_available'] = update_spec['is_available']
                            elasticDoc['mrp'] = update_spec['mrp']
                            elasticDoc['selling_price'] = update_spec['selling_price']
                            elasticDoc['discount'] = update_spec['discount']
                        elasticDoc['_id'] = document['_id']
                        elasticDoc['is_direct_update'] = True
                        self.upsert(elasticDoc, namespace, timestamp)
    else:
        if "_id" in update_spec:
            self.upsert(update_spec, namespace, timestamp)
        else:
            # update_spec["_id"] = document_id
            variantDoc = m_variant.find_one({"_id": document_id})
            if variantDoc and "$set" in update_spec:
                updatedValues = update_spec['$set']
                for item in updatedValues:
                    if str(item) == "reason":
                        variantDoc['variant_reason'] = updatedValues[item]
                    else:
                        variantDoc[str(item)] = updatedValues[item]
                variantDoc['variant_id'] = str(document_id)
                self.upsert(variantDoc, namespace, timestamp)

def docs_to_upsert():
    doc = None
    for doc in docs:
        # Remove metadata and redundant _id
        index, doc_type = self._index_and_mapping(namespace)
        doc_id = u(doc.pop("_id"))
        document_action = {
            '_index': index,
            '_type': doc_type,
            '_id': doc_id,
            'pipeline': 'geoip',
            '_source': self._formatter.format_document(doc)
        }
        document_meta = {
            '_index': self.meta_index_name,
            '_type': self.meta_type,
            '_id': doc_id,
            'pipeline': 'geoip',
            '_source': {
                'ns': namespace,
                '_ts': timestamp
            }
        }
        yield document_action
        yield document_meta
    if doc is None:
        raise errors.EmptyDocsError(
            "Cannot upsert an empty sequence of "
            "documents into Elastic Search")

def update(self, document_id, update_spec, namespace, timestamp):
    """Apply updates given in update_spec to the document whose id
    matches that of doc.
    """
    index, doc_type = self._index_and_mapping(namespace)
    with self.lock:
        # Check if document source is stored in local buffer
        document = self.BulkBuffer.get_from_sources(
            index, doc_type, u(document_id))
    if document:
        # Document source collected from local buffer
        # Perform apply_update on it and then it will be
        # ready for committing to Elasticsearch
        updated = self.apply_update(document, update_spec)
        # _id is immutable in MongoDB, so won't have changed in update
        updated['_id'] = document_id
        self.upsert(updated, namespace, timestamp)
    else:
        # Document source needs to be retrieved from Elasticsearch
        # before performing update. Pass update_spec to upsert function
        updated = {"_id": document_id}
        self.upsert(updated, namespace, timestamp, update_spec)
    # upsert() strips metadata, so only _id + fields in _source still here
    return updated

def upsert(self, doc, namespace, timestamp):
    """Insert a document into Elasticsearch."""
    index, doc_type = self._index_and_mapping(namespace)
    # No need to duplicate '_id' in source document
    doc_id = u(doc.pop("_id"))
    metadata = {"ns": namespace, "_ts": timestamp}
    parent_id = self._get_parent_id_from_mongodb(index, doc_type, doc)
    parent_args = {}
    if parent_id is not None:
        parent_args['parent'] = parent_id
    elasticDocs = elastic_doc(doc)
    for elasticDoc in elasticDocs:
        doc = elasticDoc
        # Index the source document, using lowercase namespace as index name.
        self.elastic.index(index=index, doc_type=doc_type, id=doc_id,
                           body=self._formatter.format_document(doc),
                           refresh=(self.auto_commit_interval == 0),
                           **parent_args)
    # Index document metadata with original namespace (mixed upper/lower).
    self.elastic.index(index=self.meta_index_name, doc_type=self.meta_type,
                       body=bson.json_util.dumps(metadata), id=doc_id,
                       refresh=(self.auto_commit_interval == 0))
    # Leave _id, since it's part of the original document
    doc['_id'] = doc_id

def remove(self, document_id, namespace, timestamp):
    """Remove a document from Solr by document id."""
    self.solr.delete(id=u(document_id),
                     commit=(self.auto_commit_interval == 0))

def upsert(self, doc, namespace, timestamp, update_spec=None):
    """Insert a document into Elasticsearch."""
    index, doc_type = self._index_and_mapping(namespace)
    # No need to duplicate '_id' in source document
    doc_id = u(doc.pop("_id"))
    metadata = {'ns': namespace, '_ts': timestamp}
    # Index the source document, using lowercase namespace as index name.
    action = {
        '_op_type': 'index',
        '_index': index,
        '_type': doc_type,
        '_id': doc_id,
        '_source': self._formatter.format_document(doc)
    }
    # Index document metadata with original namespace (mixed upper/lower).
    meta_action = {
        '_op_type': 'index',
        '_index': self.meta_index_name,
        '_type': self.meta_type,
        '_id': doc_id,
        '_source': bson.json_util.dumps(metadata)
    }
    self.index(action, meta_action, doc, update_spec)
    # Leave _id, since it's part of the original document
    doc['_id'] = doc_id

def upsert(self, doc, namespace, timestamp):
    """Inserts a document into Neo4j."""
    index, doc_type = self._index_and_mapping(namespace)
    doc_id = u(doc.pop("uid"))
    metadata = {"_ts": timestamp}
    doc = self._formatter.format_document(doc)
    builder = NodesAndRelationshipsBuilder(doc, doc_type, doc_id, metadata)
    self.apply_id_constraint(builder.doc_types)
    tx = self.graph.cypher.begin()
    for statement in builder.query_nodes.keys():
        tx.append(statement, builder.query_nodes[statement])
    # Add the queries collected in the builder's cypher list
    for query in builder.cypher_list:
        tx.append(query)
    for relationship, params in builder.relationships_query:
        tx.append(relationship, params)
    for statement in builder.statements_with_params:
        for key in statement.keys():
            tx.append(key, statement[key])
    commit_result = None
    try:
        commit_result = tx.commit()
        print(commit_result)
    except Exception as e:
        LOG.error('{}'.format(e.message))
    if commit_result:
        nodeids_list = self._get_nodeids(commit_result)
        self.create_geospatial_indices(nodeids_list)

def _upsert(self, cursor, doc, namespace, timestamp):
    doc_id = compat.u(doc["_id"])
    log.debug("Upsert %s into %s", doc_id, namespace)
    cursor.execute(
        u"""INSERT INTO "{table}" ("{id}", _ts, document)
            VALUES (%(id)s, %(ts)s, %(doc)s)
            ON CONFLICT ("{id}")
            DO UPDATE SET (_ts, document) = (%(ts)s, %(doc)s);""".format(
            table=namespace, id=self.unique_key),
        {"id": doc_id, "ts": timestamp,
         "doc": psycopg2.extras.Json(
             self._formatter.format_document(doc))})

def update(self, document_id, update_spec, namespace, timestamp):
    """Apply updates given in update_spec to the document whose id
    matches that of doc.
    """
    # Commit outstanding changes so that the document to be updated is the
    # same version to which the changes apply.
    self.commit()
    # Need to escape special characters in the document_id.
    document_id = ''.join(map(
        lambda c: '\\' + c if c in ESCAPE_CHARACTERS else c,
        u(document_id)
    ))
    query = "%s:%s" % (self.unique_key, document_id)
    results = self.solr.search(query)
    if not len(results):
        # Document may not be retrievable yet
        self.commit()
        results = self.solr.search(query)
    # Results is an iterable containing only 1 result
    for doc in results:
        # Remove metadata previously stored by Mongo Connector.
        doc.pop('ns')
        doc.pop('_ts')
        updated = self.apply_update(doc, update_spec)
        # A _version_ of 0 will always apply the update
        updated['_version_'] = 0
        self.upsert(updated, namespace, timestamp)
        return updated

def update(self, document_id, update_spec, namespace, timestamp):
    """Apply updates given in update_spec to the document whose id
    matches that of doc.
    """
    self.commit()
    index, doc_type = self._index_and_mapping(namespace)
    if doc_type in self.routing and 'parentField' in self.routing[doc_type]:
        # We can't use get() here and have to do a full search instead.
        # This is due to the fact that Elasticsearch needs the parent ID to
        # know where to route the get request. We might not have the parent
        # ID available in our update request though.
        document = self._search_doc_by_id(index, doc_type, document_id)
        if document is None:
            LOG.error('Could not find document with ID "%s" in '
                      'Elasticsearch to apply update', u(document_id))
            return None
    else:
        document = self.elastic.get(index=index, doc_type=doc_type,
                                    id=u(document_id))
    updated = self.apply_update(document['_source'], update_spec)
    # _id is immutable in MongoDB, so won't have changed in update
    updated['_id'] = document['_id']
    if '_parent' in document:
        updated['_parent'] = document['_parent']
    self.upsert(updated, namespace, timestamp)
    # upsert() strips metadata, so only _id + fields in _source still here
    return updated

def update(self, document_id, update_spec, namespace, timestamp):
    """Apply updates given in update_spec to the document whose id
    matches that of doc.
    """
    index, doc_type = self._index_and_mapping(namespace)
    with self.lock:
        # Check if document source is stored in local buffer
        document = self.BulkBuffer.get_from_sources(index, doc_type,
                                                    u(document_id))
    if document:
        # Document source collected from local buffer
        # Perform apply_update on it and then it will be
        # ready for committing to Elasticsearch
        updated = self.apply_update(document, update_spec)
        # _id is immutable in MongoDB, so won't have changed in update
        updated['_id'] = document_id
        self.upsert(updated, namespace, timestamp)
    else:
        # Document source needs to be retrieved from Elasticsearch
        # before performing update. Pass update_spec to upsert function
        updated = {"_id": document_id}
        self.upsert(updated, namespace, timestamp, update_spec)
    # upsert() strips metadata, so only _id + fields in _source still here
    return updated

def upsert(self, doc, namespace, timestamp):
    """Update or insert a document into Solr

    This method should call whatever add/insert/update method exists for
    the backend engine and add the document in there. The input will
    always be one mongo document, represented as a Python dictionary.
    """
    logging.debug("before insert the raw doc is: (%s)" % str(doc))
    docs = self._clean_doc(doc, namespace, timestamp)
    logging.debug("before insert the processed doc is: (%s)" % str(doc))
    if docs is None:
        return None
    if not isinstance(docs, list):
        docs = [docs]
    docid = doc.get("_id")
    # self.remove(docid, namespace, timestamp)
    # TODO: delete the child nodes for this file
    # if docid:
    #     logging.info("remove solr document which id is %s_*, timestamp is %s"
    #                  % (str(docid), str(timestamp)))
    #     self.solr.delete(q=u("_id:" + docid + "_*"),
    #                      commit=(self.auto_commit_interval == 0))
    # else:
    #     raise errors.OperationFailed(
    #         "Cannot delete Solr document: the id (%s) is not valid"
    #         % str(docid))
    try:
        if self.auto_commit_interval is not None:
            self.solr.add(docs,
                          commit=(self.auto_commit_interval == 0),
                          commitWithin=u(self.auto_commit_interval))
        else:
            self.solr.add(docs, commit=False)
        logging.debug("insert into solr docs: (%s)" % str(docs))
    except UnicodeDecodeError:
        logging.exception(
            "Unable to process document due to UnicodeDecodeError, %r"
            % str(docs))

def build_nodes_query(self, doc_type, document, id):
    self.doc_types.append(doc_type)
    parameters = {'_id': id}
    if self.is_dict(self.metadata):
        parameters.update(self.metadata)
    for key in document.keys():
        if self.is_reference(key):
            self.build_node_with_reference(doc_type, key, id, document[key])
            continue
        if self.is_objectid(document[key]):
            parameters.update({key: u(document[key])})
            self.build_node_with_reference(doc_type, key, id, document[key])
            continue
        # TODO: handle arrays of ObjectIds
        if document[key] is None:
            continue
        elif self.is_dict(document[key]):
            self.build_relationships_query(doc_type, key, id, id)
            self.build_nodes_query(key, document[key], id)
        elif self.is_json_array(document[key]):
            for json in self.format_params(document[key]):
                json_key = key + str(document[key].index(json))
                self.build_relationships_query(doc_type, json_key, id, id)
                self.build_nodes_query(json_key, json, id)
        elif self.is_multimensional_array(document[key]):
            parameters.update(self.flatenned_property(key, document[key]))
        else:
            parameters.update({key: self.format_params(document[key])})
    query = "CREATE (c:Document:`{doc_type}` {{parameters}})".format(
        doc_type=doc_type)
    self.query_nodes.update({query: {"parameters": parameters}})

def docs_to_upsert():
    doc = None
    for doc in docs:
        # Remove metadata and redundant _id
        index, doc_type = self._index_and_mapping(namespace)
        doc_id = u(doc.pop("_id"))
        document_action = {
            "_index": index,
            "_type": doc_type,
            "_id": doc_id,
            "_source": self._formatter.format_document(doc)
        }
        document_meta = {
            "_index": self.meta_index_name,
            "_type": self.meta_type,
            "_id": doc_id,
            "_source": {
                "ns": namespace,
                "_ts": timestamp
            }
        }
        yield document_action
        yield document_meta
    if doc is None:
        raise errors.EmptyDocsError(
            "Cannot upsert an empty sequence of "
            "documents into Elastic Search")

def upsert(self, doc, namespace, timestamp):
    """Insert a document into Elasticsearch."""
    index, doc_type = self._index_and_mapping(namespace)
    # No need to duplicate '_id' in source document
    doc_id = u(doc.pop("_id"))
    metadata = {
        "ns": namespace,
        "_ts": timestamp
    }
    parent_id = self._get_parent_id(doc_type, doc)
    # Index the source document, using lowercase namespace as index name.
    if parent_id is None:
        self.elastic.index(index=index, doc_type=doc_type,
                           body=self._formatter.format_document(doc),
                           id=doc_id,
                           refresh=(self.auto_commit_interval == 0))
    else:
        self.elastic.index(index=index, doc_type=doc_type,
                           body=self._formatter.format_document(doc),
                           id=doc_id, parent=parent_id,
                           refresh=(self.auto_commit_interval == 0))
    # Index document metadata with original namespace (mixed upper/lower).
    self.elastic.index(index=self.meta_index_name, doc_type=self.meta_type,
                       body=bson.json_util.dumps(metadata), id=doc_id,
                       refresh=(self.auto_commit_interval == 0))
    # Leave _id, since it's part of the original document
    doc['_id'] = doc_id

def update(self, document_id, update_spec, namespace, timestamp):
    """Apply updates given in update_spec to the document whose id
    matches that of doc.
    """
    # Commit outstanding changes so that the document to be updated is the
    # same version to which the changes apply.
    self.commit()
    # Need to escape special characters in the document_id.
    document_id = ''.join(
        map(lambda c: '\\' + c if c in ESCAPE_CHARACTERS else c,
            u(document_id)))
    query = "%s:%s" % (self.unique_key, document_id)
    results = self.solr.search(query)
    if not len(results):
        # Document may not be retrievable yet
        self.commit()
        results = self.solr.search(query)
    # Results is an iterable containing only 1 result
    for doc in results:
        # Remove metadata previously stored by Mongo Connector.
        if 'ns' in doc:
            doc.pop('ns')
        if '_ts' in doc:
            doc.pop('_ts')
        updated = self.apply_update(doc, update_spec)
        # A _version_ of 0 will always apply the update
        updated['_version_'] = 0
        self.upsert(updated, namespace, timestamp)
        return updated

def docs_to_upsert():
    doc = None
    for doc in docs:
        # Remove metadata and redundant _id
        index, doc_type = self._index_and_mapping(namespace)
        doc_id = u(doc.pop("_id"))
        document_action = {
            "_index": index,
            "_type": doc_type,
            "_id": doc_id,
            "_source": self._formatter.format_document(doc)
        }
        document_meta = {
            "_index": self.meta_index_name,
            "_type": self.meta_type,
            "_id": doc_id,
            "_source": {
                "ns": index,
                "_ts": timestamp
            }
        }
        yield document_action
        yield document_meta
    if not doc:
        raise errors.EmptyDocsError(
            "Cannot upsert an empty sequence of "
            "documents into Elastic Search")

def remove(self, document_id, namespace, timestamp):
    """Removes a document from Neo4j."""
    doc_id = u(document_id)
    statement = "CALL json.delete({doc_id})"
    params_dict = {"doc_id": doc_id}
    tx = self.graph.cypher.begin()
    tx.append(statement, params_dict)
    tx.commit()

def build_node_with_objectid_reference(self, root_type, key, doc_id,
                                       document_key):
    if document_key is None:
        return
    parameters = {'_id': u(document_key)}
    statement = "MERGE (d:Document {{_id: {{parameters}}._id}})"
    self.query_nodes.update({statement: {"parameters": parameters}})
    self.build_relationships_query(root_type, 'Document', doc_id,
                                   document_key)  # FIXME: missing doc_type

def upsert(self, doc, namespace, timestamp):
    """Inserts a document into Neo4j."""
    doc_id = u(doc.pop("_id"))
    statement = "CALL json.upsert({doc_id},{doc})"
    params_dict = {"doc_id": doc_id, "doc": dumps(doc)}
    tx = self.graph.cypher.begin()
    tx.append(statement, params_dict)
    tx.commit()

def _clean_doc(self, doc, namespace, timestamp):
    """Reformats the given document before insertion into Solr.

    This method reformats the document in the following ways:
      - removes extraneous fields that aren't defined in schema.xml
      - unwinds arrays in order to find and later flatten sub-documents
      - flattens the document so that there are no sub-documents, and
        every value is associated with its dot-separated path of keys
      - inserts namespace and timestamp metadata into the document in
        order to handle rollbacks

    An example:
      {"a": 2, "b": {"c": {"d": 5}}, "e": [6, 7, 8]}

    becomes:
      {"a": 2, "b.c.d": 5, "e.0": 6, "e.1": 7, "e.2": 8}
    """
    # Translate the _id field to whatever unique key we're using.
    # _id may not exist in the doc, if we retrieved it from Solr
    # as part of update.
    if '_id' in doc:
        doc[self.unique_key] = u(doc.pop("_id"))

    # Update namespace and timestamp metadata
    if 'ns' in doc or '_ts' in doc:
        raise errors.OperationFailed(
            'Need to set "ns" and "_ts" fields, but these fields already '
            'exist in the document %r!' % doc)
    doc['ns'] = namespace
    doc['_ts'] = timestamp

    # SOLR cannot index fields within sub-documents, so flatten documents
    # with the dot-separated path to each value as the respective key
    # flat_doc = self._formatter.format_document(doc)
    flat_doc = doc

    # Only include fields that are explicitly provided in the
    # schema or match one of the dynamic field patterns, if
    # we were able to retrieve the schema
    if len(self.field_list) + len(self._dynamic_field_regexes) > 0:
        def include_field(field):
            return field in self.field_list or any(
                regex.match(field) for regex in self._dynamic_field_regexes)
        return dict(
            (k, v) for k, v in flat_doc.items() if include_field(k))
    return flat_doc

def remove(self, doc):
    """Removes the document from the doc dict."""
    doc_id = doc["_id"]
    try:
        del self.doc_dict[doc_id]
        self.removed_dict[doc_id] = {
            "_id": doc_id,
            "ns": doc["ns"],
            "_ts": doc["_ts"]
        }
    except KeyError:
        raise OperationFailed("Document does not exist: %s" % u(doc))

def upsert(self, doc, namespace, timestamp):
    print("In upsert")
    flat_doc = self._clean_doc(doc, namespace, timestamp)
    print(flat_doc)
    docs = self.reformat(flat_doc)
    if self.auto_commit_interval is not None:
        self.solr.add(docs,
                      commit=(self.auto_commit_interval == 0),
                      commitWithin=u(self.auto_commit_interval))
    else:
        self.solr.add(docs, commit=False)

def remove(self, document_id, namespace, timestamp):
    """Removes a document from Neo4j."""
    doc_id = u(document_id)
    index, doc_type = self._index_and_mapping(namespace)
    params_dict = {"doc_id": doc_id}
    tx = self.graph.cypher.begin()
    statement = ("MATCH (d:Document) WHERE d._id={doc_id} "
                 "OPTIONAL MATCH (d)-[r]-() DELETE d, r")
    tx.append(statement, params_dict)
    tx.commit()

def remove(self, document_id, namespace, timestamp):
    """Removes a document from Neo4j."""
    doc_id = u(document_id)
    index, doc_type = self._index_and_mapping(namespace)
    params_dict = {"doc_id": doc_id}
    tx = self.graph.begin()
    statement = ("MATCH (d:Document) WHERE d._id={doc_id} "
                 "OPTIONAL MATCH (d)-[r]-() DELETE d, r")
    tx.run(statement, params_dict)
    tx.commit()

def remove(self, document_id, namespace, timestamp):
    """Removes the document from the doc dict."""
    try:
        entry = self.doc_dict[document_id]
        entry.doc = None
        entry.update(namespace, timestamp)
    except KeyError:
        raise OperationFailed("Document does not exist: %s" % u(document_id))

def remove(self, document_id, namespace, timestamp):
    """Remove a document from Elasticsearch."""
    index, doc_type = self._index_and_mapping(namespace)
    action = {
        '_op_type': 'delete',
        '_index': index,
        '_type': doc_type,
        '_id': u(document_id)
    }
    meta_action = {
        '_op_type': 'delete',
        '_index': self.meta_index_name,
        '_type': self.meta_type,
        '_id': u(document_id)
    }
    self.index(action, meta_action)

def update(self, document_id, update_spec, namespace, timestamp):
    doc_id = u(document_id)
    tx = self.graph.begin()
    index, doc_type = self._index_and_mapping(namespace)
    updater = NodesAndRelationshipsUpdater()
    updater.run_update(update_spec, doc_id, doc_type.replace("-", "_"))
    for statement in updater.statements_with_params:
        for key in statement.keys():
            tx.run(key, statement[key])
    tx.commit()

def update(self, document_id, update_spec, namespace, timestamp):
    doc_id = u(document_id)
    tx = self.graph.cypher.begin()
    index, doc_type = self._index_and_mapping(namespace)
    updater = NodesAndRelationshipsUpdater()
    updater.run_update(update_spec, doc_id, doc_type)
    for statement in updater.statements_with_params:
        for key in statement.keys():
            tx.append(key, statement[key])
    tx.commit()

def _clean_doc(self, doc, namespace, timestamp):
    """Reformats the given document before insertion into Solr.

    This method reformats the document in the following ways:
      - removes extraneous fields that aren't defined in schema.xml
      - unwinds arrays in order to find and later flatten sub-documents
      - flattens the document so that there are no sub-documents, and
        every value is associated with its dot-separated path of keys
      - inserts namespace and timestamp metadata into the document in
        order to handle rollbacks

    An example:
      {"a": 2, "b": {"c": {"d": 5}}, "e": [6, 7, 8]}

    becomes:
      {"a": 2, "b.c.d": 5, "e.0": 6, "e.1": 7, "e.2": 8}
    """
    # Translate the _id field to whatever unique key we're using.
    # _id may not exist in the doc, if we retrieved it from Solr
    # as part of update.
    if '_id' in doc:
        doc[self.unique_key] = u(doc.pop("_id"))

    # Update namespace and timestamp metadata
    if 'ns' in doc or '_ts' in doc:
        raise errors.OperationFailed(
            'Need to set "ns" and "_ts" fields, but these fields already '
            'exist in the document %r!' % doc)
    doc['ns'] = namespace
    doc['_ts'] = timestamp

    # SOLR cannot index fields within sub-documents, so flatten documents
    # with the dot-separated path to each value as the respective key
    # flat_doc = self._formatter.format_document(doc)
    flat_doc = doc

    # Only include fields that are explicitly provided in the
    # schema or match one of the dynamic field patterns, if
    # we were able to retrieve the schema
    if len(self.field_list) + len(self._dynamic_field_regexes) > 0:
        def include_field(field):
            return field in self.field_list or any(
                regex.match(field) for regex in self._dynamic_field_regexes)
        return dict((k, v) for k, v in flat_doc.items() if include_field(k))
    return flat_doc

def update(self, document_id, update_spec, namespace, timestamp, doc=None):
    """Apply updates given in update_spec to the document whose id
    matches that of doc.
    """
    index, doc_type = self._index_and_mapping(namespace)
    document = {}
    if not doc:
        try:
            document = self.elastic.get(index=index, doc_type=doc_type,
                                        id=u(document_id))
        except es_exceptions.NotFoundError as e:
            return (document_id, e)

def docs_to_upsert():
    doc = None
    for doc in docs:
        self.mapGeoFields(doc)
        doc_id = u(doc.get("_id"))
        document_action = {
            "_index": index_name,
            "_type": doc_type,
            "_id": doc_id,
            "_source": self._formatter.format_document(doc)
        }
        yield document_action

def upsert(self, doc):
    """Update or insert a document into Solr

    This method should call whatever add/insert/update method exists for
    the backend engine and add the document in there. The input will
    always be one mongo document, represented as a Python dictionary.
    """
    if self.auto_commit_interval is not None:
        self.solr.add([self._clean_doc(doc)],
                      commit=(self.auto_commit_interval == 0),
                      commitWithin=u(self.auto_commit_interval))
    else:
        self.solr.add([self._clean_doc(doc)], commit=False)

def update(self, document_id, update_spec, namespace, timestamp):
    """Apply updates given in update_spec to the document whose id
    matches that of doc.
    """
    index, doc_type = self._index_and_mapping(namespace)
    document = self.elastic.get(index=index, doc_type=doc_type,
                                id=u(document_id))
    updated = self.apply_update(document['_source'], update_spec)
    # _id is immutable in MongoDB, so won't have changed in update
    updated['_id'] = document['_id']
    self.upsert(updated, namespace, timestamp)
    # upsert() strips metadata, so only _id + fields in _source still here
    return updated

def remove(self, doc):
    """Removes the document from the doc dict."""
    doc_id = doc["_id"]
    try:
        del self.doc_dict[doc_id]
        self.removed_dict[doc_id] = {
            '_id': doc_id,
            'ns': doc['ns'],
            '_ts': doc['_ts']
        }
    except KeyError:
        raise OperationFailed("Document does not exist: %s" % u(doc))

def upsert(self, doc, namespace, timestamp):
    """Inserts a document into Neo4j."""
    index, doc_type = self._index_and_mapping(namespace)
    doc_id = u(doc.pop("_id"))
    metadata = {"_ts": timestamp}
    doc = self._formatter.format_document(doc)
    builder = NodesAndRelationshipsBuilder(doc, doc_type, doc_id, metadata)
    self.apply_id_constraint(builder.doc_types)
    tx = self.graph.cypher.begin()
    for statement in builder.query_nodes.keys():
        tx.append(statement, builder.query_nodes[statement])
    for relationship in builder.relationships_query.keys():
        tx.append(relationship, builder.relationships_query[relationship])
    tx.commit()

def update(self, document_id, update_spec, namespace, timestamp):
    """Apply updates given in update_spec to the document whose id
    matches that of doc.
    """
    self.commit()
    index, doc_type = self._index_and_mapping(namespace)
    document = self.elastic.get(index=index, doc_type=doc_type,
                                id=u(document_id))
    updated = self.apply_update(document['_source'], update_spec)
    # _id is immutable in MongoDB, so won't have changed in update
    updated['_id'] = document['_id']
    self.upsert(updated, namespace, timestamp)
    # upsert() strips metadata, so only _id + fields in _source still here
    return updated

def _search_doc_by_id(self, index, doc_type, doc_id):
    """Search document in Elasticsearch by _id"""
    result = self.elastic.search(index=index, doc_type=doc_type,
                                 body={
                                     'query': {
                                         'ids': {
                                             'type': doc_type,
                                             'values': [u(doc_id)]
                                         }
                                     }
                                 })
    if result['hits']['total'] == 1:
        return result['hits']['hits'][0]
    else:
        return None