def bulk_upsert(self, documents, namespace, timestamp):
    LOG.info('Inspecting %s...', namespace)

    if is_mapped(self.mappings, namespace):
        try:
            LOG.info('Mapping found for %s !...', namespace)
            LOG.info('Deleting all rows before update %s !...', namespace)

            db, collection = db_and_collection(namespace)

            for linked_table in self.get_linked_tables(db, collection):
                sql_delete_rows(self.pgsql.cursor(), linked_table)

            sql_delete_rows(self.pgsql.cursor(), collection)
            self.commit()

            self._bulk_upsert(documents, namespace)
            LOG.info('%s done.', namespace)

        except psycopg2.Error:
            LOG.error(
                "Impossible to bulk insert documents in namespace %s: %s",
                namespace, documents)

            if not self.quiet:
                LOG.error("Traceback:\n%s", traceback.format_exc())

def is_id_autogenerated(mappings, namespace):
    primary_key = get_primary_key(mappings, namespace)

    db, collection = db_and_collection(namespace)
    mapped_to_primary_key = [k for k, v in iteritems(mappings[db][collection])
                             if 'dest' in v and v['dest'] == primary_key]
    return len(mapped_to_primary_key) == 0

def update(self, document_id, update_spec, namespace, timestamp):
    db, collection = db_and_collection(namespace)
    updated_document = self.get_document_by_id(db, collection, document_id)

    primary_key = self.mappings[db][collection]['pk']
    mapped_field = self.mappings[db][collection].get(primary_key, {})
    field_type = mapped_field.get('type')
    doc_id = to_sql_value(document_id, vtype=field_type)

    if updated_document is None:
        return

    for arrayField in get_any_array_fields(self.mappings, db, collection,
                                           updated_document):
        dest = self.mappings[db][collection][arrayField]['dest']
        fk = self.mappings[db][collection][arrayField]['fk']

        sql_delete_rows_where(
            self.pgsql.cursor(), dest,
            "{0} = {1}".format(fk, doc_id)
        )

    self._upsert(namespace, updated_document, self.pgsql.cursor(), timestamp)
    self.commit()

def _upsert(self, namespace, document, cursor, timestamp):
    db, collection = db_and_collection(namespace)
    mapped_document = get_mapped_document(self.mappings, document, namespace)

    if mapped_document:
        sql_insert(cursor, collection, mapped_document,
                   self.mappings[db][collection]['pk'])

        self._upsert_array_fields(collection, cursor, db, document,
                                  mapped_document, namespace, timestamp)
        self.upsert_scalar_array_fields(collection, cursor, db, document,
                                        mapped_document, namespace, timestamp)

def remove(self, document_id, namespace, timestamp):
    if not is_mapped(self.mappings, namespace):
        return

    with self.pgsql.cursor() as cursor:
        db, collection = db_and_collection(namespace)
        primary_key = self.mappings[db][collection]['pk']
        cursor.execute("DELETE from {0} WHERE {1} = '{2}';".format(
            collection.lower(), primary_key, str(document_id)))
        self.commit()

def _upsert(self, namespace, document, cursor, timestamp):
    db, collection = db_and_collection(namespace)
    mapped_document = get_mapped_document(self.mappings, document, namespace)

    if mapped_document:
        primary_key = self.mappings[db][collection]['pk']
        sql_insert(cursor, collection, mapped_document, primary_key)

        self._upsert_array_fields(collection, cursor, db, document,
                                  mapped_document, namespace, timestamp)
        self.upsert_scalar_array_fields(collection, cursor, db, document,
                                        mapped_document, namespace, timestamp)

def _upsert(self, namespace, document, cursor, timestamp):
    db, collection = db_and_collection(namespace)

    primary_key = self.mappings[db][collection]['pk']
    sql_delete_rows_where(
        cursor, collection,
        '{0} = {1}'.format(primary_key,
                           to_sql_value(document[primary_key])))

    sql_bulk_insert(cursor, self.mappings, namespace, [document])
    self.commit()

def remove(self, document_id, namespace, timestamp):
    if not is_mapped(self.mappings, namespace):
        return

    with self.pgsql.cursor() as cursor:
        db, collection = db_and_collection(namespace)
        primary_key = self.mappings[db][collection]['pk']
        mapped_field = self.mappings[db][collection].get(primary_key, {})
        field_type = mapped_field.get('type')
        doc_id = to_sql_value(document_id, vtype=field_type)

        cursor.execute("DELETE from {0} WHERE {1} = {2};".format(
            collection.lower(), primary_key, doc_id))
        self.commit()

def bulk_upsert(self, documents, namespace, timestamp):
    LOG.info('Inspecting %s...', namespace)

    if is_mapped(self.mappings, namespace):
        LOG.info('Mapping found for %s !...', namespace)
        LOG.info('Deleting all rows before update %s !...', namespace)

        db, collection = db_and_collection(namespace)

        for linked_table in self.get_linked_tables(db, collection):
            sql_delete_rows(self.pgsql.cursor(), linked_table)

        sql_delete_rows(self.pgsql.cursor(), collection)
        self.commit()

        self._bulk_upsert(documents, namespace)
        LOG.info('%s done.', namespace)

def get_mapped_document(mappings, document, namespace):
    cleaned_and_flatten_document = _clean_and_flatten_doc(
        mappings, document, namespace)

    db, collection = db_and_collection(namespace)
    keys = list(cleaned_and_flatten_document)

    for key in keys:
        field_mapping = mappings[db][collection][key]

        if 'dest' in field_mapping:
            mappedKey = field_mapping['dest']
            cleaned_and_flatten_document[
                mappedKey] = cleaned_and_flatten_document.pop(key)

    return cleaned_and_flatten_document

def _sql_bulk_insert(query, mappings, namespace, documents):
    if not documents:
        return

    db, collection = db_and_collection(namespace)

    primary_key = mappings[db][collection]['pk']
    keys = [(k, v['dest']) for k, v in iteritems(mappings[db][collection])
            if 'dest' in v and
            v['type'] not in [ARRAY_TYPE, ARRAY_OF_SCALARS_TYPE]]
    keys.sort(key=lambda x: x[1])

    for document in documents:
        mapped_document = get_mapped_document(mappings, document, namespace)
        values = [
            to_sql_value(extract_creation_date(mapped_document, primary_key),
                         vtype='TIMESTAMP')
        ]

        for key, mapkey in keys:
            field_mapping = mappings[db][collection][key]

            if mapkey in mapped_document:
                values.append(
                    to_sql_value(mapped_document[mapkey],
                                 vtype=field_mapping['type']))
            else:
                values.append(to_sql_value(None, vtype=field_mapping['type']))

        subquery = {
            'collection': collection,
            'document': {
                'raw': document,
                'mapped': mapped_document
            },
            'keys': ['_creationDate'] + [k[1] for k in keys],
            'values': values,
            'pk': primary_key,
            'queries': []
        }
        query.append(subquery)

        insert_document_arrays(collection, subquery['queries'], db, document,
                               mapped_document, mappings, primary_key)
        insert_scalar_arrays(collection, subquery['queries'], db, document,
                             mapped_document, mappings, primary_key)

def update(self, document_id, update_spec, namespace, timestamp):
    db, collection = db_and_collection(namespace)
    updated_document = self.get_document_by_id(db, collection, document_id)

    if updated_document is None:
        return

    for arrayField in get_any_array_fields(self.mappings, db, collection,
                                           updated_document):
        dest = self.mappings[db][collection][arrayField]['dest']
        fk = self.mappings[db][collection][arrayField]['fk']

        sql_delete_rows_where(self.pgsql.cursor(), dest,
                              "{0} = {1}".format(fk,
                                                 to_sql_value(document_id)))

    self._upsert(namespace, updated_document, self.pgsql.cursor(), timestamp)
    self.commit()

def get_mapped_document(mappings, document, namespace):
    cleaned_and_flatten_document = _clean_and_flatten_doc(mappings, document,
                                                          namespace)

    db, collection = db_and_collection(namespace)
    keys = list(cleaned_and_flatten_document)

    for key in keys:
        field_mapping = mappings[db][collection][key]

        if 'dest' in field_mapping:
            value = cleaned_and_flatten_document.pop(key)

            if field_mapping['type'] == FIELD_PRESENCE_TYPE:
                value = value is not None

            mappedKey = field_mapping['dest']
            cleaned_and_flatten_document[mappedKey] = value

    return cleaned_and_flatten_document

def sql_bulk_insert(cursor, mappings, namespace, documents):
    if not documents:
        return

    db, collection = db_and_collection(namespace)

    primary_key = mappings[db][collection]['pk']
    keys = [
        v['dest'] for k, v in iteritems(mappings[db][collection])
        if 'dest' in v and v['type'] != ARRAY_TYPE and
        v['type'] != ARRAY_OF_SCALARS_TYPE
    ]
    keys.sort()

    values = []

    for document in documents:
        mapped_document = get_mapped_document(mappings, document, namespace)
        document_values = [
            to_sql_value(
                extract_creation_date(mapped_document,
                                      mappings[db][collection]['pk']))
        ]

        if not mapped_document:
            break

        for key in keys:
            if key in mapped_document:
                document_values.append(to_sql_value(mapped_document[key]))
            else:
                document_values.append(to_sql_value(None))
        values.append(u"({0})".format(u','.join(document_values)))

        insert_document_arrays(collection, cursor, db, document,
                               mapped_document, mappings, primary_key)
        insert_scalar_arrays(collection, cursor, db, document,
                             mapped_document, mappings, primary_key)

    if values:
        sql = u"INSERT INTO {0} ({1}) VALUES {2}".format(
            collection, u','.join(['_creationDate'] + keys),
            u",".join(values))

        cursor.execute(sql)

def update(self, document_id, update_spec, namespace, timestamp):
    # TODO update this to grab doc, apply_update, and then update (return None if doc not in Postgres)
    db, collection = db_and_collection(namespace)
    updated_document = self.get_document_by_id(db, collection, document_id)

    if updated_document is None:
        return

    for arrayField in get_any_array_fields(self.mappings, db, collection,
                                           updated_document):
        dest = self.mappings[db][collection][arrayField]['dest']
        fk = self.mappings[db][collection][arrayField]['fk']

        sql_delete_rows_where(self.pgsql.cursor(), dest,
                              "{0} = {1}".format(fk,
                                                 to_sql_value(document_id)))

    self._upsert(namespace, updated_document, self.pgsql.cursor(), timestamp)
    self.commit()

def _clean_and_flatten_doc(mappings, doc, namespace):
    """Reformats the given document before insertion into PostgreSQL.

    This method reformats the document in the following ways:

      - removes extraneous fields that aren't defined in the mapping
      - unwinds arrays in order to find and later flatten sub-documents
      - flattens the document so that there are no sub-documents, and
        every value is associated with its dot-separated path of keys

    An example:
      {"a": 2,
       "b": {
         "c": {
           "d": 5
         }
       },
       "e": [6, 7, 8]
      }

    becomes:
      {"a": 2, "b.c.d": 5, "e.0": 6, "e.1": 7, "e.2": 8}
    """
    # PGSQL cannot index fields within sub-documents, so flatten documents
    # with the dot-separated path to each value as the respective key
    flat_doc = _formatter.format_document(doc)

    # Extract column names and mappings for this table
    db, coll = db_and_collection(namespace)
    if db in mappings:
        mappings_db = mappings[db]
        if coll in mappings_db:
            mappings_coll = mappings_db[coll]

            # Only include fields that are explicitly provided in the schema
            def include_field(field):
                return field in mappings_coll

            return dict(
                (k, v) for k, v in flat_doc.items() if include_field(k))
    return {}

def sql_bulk_insert(cursor, mappings, namespace, documents):
    if not documents:
        return

    db, collection = db_and_collection(namespace)

    primary_key = mappings[db][collection]['pk']
    keys = unique([
        v['dest'] for k, v in iteritems(mappings[db][collection])
        if 'dest' in v and v['type'] != ARRAY_TYPE and
        v['type'] != ARRAY_OF_SCALARS_TYPE
    ])
    values = []

    for document in documents:
        mapped_document = get_mapped_document(mappings, document, namespace)
        document_values = [to_sql_value(
            extract_creation_date(mapped_document,
                                  mappings[db][collection]['pk']))]

        if not mapped_document:
            break

        for key in keys:
            if key in mapped_document:
                document_values.append(to_sql_value(mapped_document[key]))
            else:
                document_values.append(to_sql_value(None))
        values.append(u"({0})".format(u','.join(document_values)))

        insert_document_arrays(collection, cursor, db, document,
                               mapped_document, mappings, primary_key)
        insert_scalar_arrays(collection, cursor, db, document,
                             mapped_document, mappings, primary_key)

    if values:
        sql = u"INSERT INTO {0} ({1}) VALUES {2}".format(
            collection,
            u','.join(['_creationDate'] + keys),
            u",".join(values)
        )
        cursor.execute(sql)

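# Illustrative only: with a single mapped TEXT column 'name' and the document
# {'name': 'Alice'}, the statement assembled above would come out roughly as
#   INSERT INTO col (_creationDate,name) VALUES (NULL,'Alice')
# assuming to_sql_value quotes string values and renders missing values as
# NULL; the exact rendering is defined by the project's to_sql_value helper.
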
def test_db_and_collection(self):
    ns = 'db.col'
    got = utils.db_and_collection(ns)

    self.assertEqual(len(got), 2)
    self.assertEqual(got, ['db', 'col'])

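# Note: db_and_collection itself is not shown in these snippets. A minimal
# sketch consistent with the test above would split the namespace on the
# first dot and return the [database, collection] pair; the helper in the
# actual project may differ.
def db_and_collection(namespace):
    # 'db.col' -> ['db', 'col']; only the first dot separates db from collection
    return namespace.split('.', 1)
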
def get_mapped_field(mappings, namespace, field_name):
    db, collection = db_and_collection(namespace)

    return mappings[db][collection][field_name]['dest']

def get_primary_key(mappings, namespace):
    db, collection = db_and_collection(namespace)

    return mappings[db][collection]['pk']

def is_mapped(mappings, namespace, field_name=None):
    db, collection = db_and_collection(namespace)

    return db in mappings and collection in mappings[db] and \
        (field_name is None or field_name in mappings[db][collection])
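
# Illustrative only: a hypothetical mappings dict, inferred from how the
# helpers above index it (mappings[db][collection]['pk'] plus per-field
# 'dest' and 'type' entries). The project's real mapping file may differ.
mappings = {
    'db': {
        'col': {
            'pk': 'id',
            '_id': {'dest': 'id', 'type': 'TEXT'},
            'address.street': {'dest': 'street', 'type': 'TEXT'},
        }
    }
}

assert is_mapped(mappings, 'db.col')
assert is_mapped(mappings, 'db.col', 'address.street')
assert get_primary_key(mappings, 'db.col') == 'id'
assert get_mapped_field(mappings, 'db.col', 'address.street') == 'street'
assert not is_id_autogenerated(mappings, 'db.col')  # '_id' maps onto the pk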