    def bulk_upsert(self, documents, namespace, timestamp):
        LOG.info('Inspecting %s...', namespace)

        if is_mapped(self.mappings, namespace):
            try:
                LOG.info('Mapping found for %s !...', namespace)
                LOG.info('Deleting all rows before update %s !...', namespace)

                db, collection = db_and_collection(namespace)
                for linked_table in self.get_linked_tables(db, collection):
                    sql_delete_rows(self.pgsql.cursor(), linked_table)

                sql_delete_rows(self.pgsql.cursor(), collection)
                self.commit()

                self._bulk_upsert(documents, namespace)
                LOG.info('%s done.', namespace)

            except psycopg2.Error:
                LOG.error(
                    "Impossible to bulk insert documents in namespace %s: %s",
                    namespace, documents)

                if not self.quiet:
                    LOG.error("Traceback:\n%s", traceback.format_exc())
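# sql_delete_rows is a project helper that is not shown in this listing; a
# minimal sketch of what these calls presumably amount to (an assumption,
# not the actual implementation):
def sql_delete_rows(cursor, table):
    # Empty the table so the bulk upsert starts from a clean slate.
    cursor.execute(u"DELETE FROM {0}".format(table.lower()))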
def is_id_autogenerated(mappings, namespace):
    primary_key = get_primary_key(mappings, namespace)

    db, collection = db_and_collection(namespace)
    mapped_to_primary_key = [k for k, v in iteritems(mappings[db][collection]) if
                             'dest' in v and v['dest'] == primary_key]
    return len(mapped_to_primary_key) == 0
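# Usage sketch with a hypothetical mapping: no field maps its 'dest' onto
# the declared primary key, so the id is treated as autogenerated by
# PostgreSQL (db_and_collection, get_primary_key and iteritems are assumed
# to come from the surrounding project).
example_mappings = {
    'db': {'col': {'pk': 'id',
                   'name': {'dest': 'name', 'type': 'TEXT'}}}
}
assert is_id_autogenerated(example_mappings, 'db.col')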
Example #3
    def update(self, document_id, update_spec, namespace, timestamp):
        db, collection = db_and_collection(namespace)
        updated_document = self.get_document_by_id(db, collection, document_id)
        primary_key = self.mappings[db][collection]['pk']
        mapped_field = self.mappings[db][collection].get(primary_key, {})
        field_type = mapped_field.get('type')
        doc_id = to_sql_value(document_id, vtype=field_type)

        if updated_document is None:
            return

        for arrayField in get_any_array_fields(self.mappings, db, collection, updated_document):
            dest = self.mappings[db][collection][arrayField]['dest']
            fk = self.mappings[db][collection][arrayField]['fk']
            sql_delete_rows_where(
                self.pgsql.cursor(),
                dest,
                "{0} = {1}".format(fk, doc_id)
            )

        self._upsert(namespace,
                     updated_document,
                     self.pgsql.cursor(), timestamp)

        self.commit()
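# to_sql_value is a project helper that is not shown in this listing. A
# minimal sketch of the behaviour these snippets rely on (an assumption,
# not the real implementation): render a Python value as a SQL literal,
# optionally guided by the mapped column type.
def to_sql_value(value, vtype=None):
    # vtype is the mapped column type ('TEXT', 'INT', ...); this sketch
    # only looks at the Python type of the value.
    if value is None:
        return u'NULL'
    if isinstance(value, bool):
        return u'TRUE' if value else u'FALSE'
    if isinstance(value, (int, float)):
        return u'{0}'.format(value)
    # Quote everything else, doubling embedded single quotes.
    return u"'{0}'".format(u'{0}'.format(value).replace(u"'", u"''"))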
Example #5
    def _upsert(self, namespace, document, cursor, timestamp):
        db, collection = db_and_collection(namespace)

        mapped_document = get_mapped_document(self.mappings, document, namespace)

        if mapped_document:
            sql_insert(cursor, collection, mapped_document, self.mappings[db][collection]['pk'])

            self._upsert_array_fields(collection, cursor, db, document, mapped_document, namespace, timestamp)
            self.upsert_scalar_array_fields(collection, cursor, db, document, mapped_document, namespace, timestamp)
Example #6
    def remove(self, document_id, namespace, timestamp):
        if not is_mapped(self.mappings, namespace):
            return

        with self.pgsql.cursor() as cursor:
            db, collection = db_and_collection(namespace)
            primary_key = self.mappings[db][collection]['pk']
            cursor.execute("DELETE from {0} WHERE {1} = '{2}';".format(
                collection.lower(), primary_key, str(document_id)))
            self.commit()
    def _upsert(self, namespace, document, cursor, timestamp):
        db, collection = db_and_collection(namespace)

        mapped_document = get_mapped_document(self.mappings, document, namespace)

        if mapped_document:
            primary_key = self.mappings[db][collection]['pk']
            sql_insert(cursor, collection, mapped_document, primary_key)

            self._upsert_array_fields(collection, cursor, db, document, mapped_document, namespace, timestamp)
            self.upsert_scalar_array_fields(collection, cursor, db, document, mapped_document, namespace, timestamp)
    def _upsert(self, namespace, document, cursor, timestamp):
        db, collection = db_and_collection(namespace)
        primary_key = self.mappings[db][collection]['pk']

        sql_delete_rows_where(
            cursor, collection,
            '{0} = {1}'.format(primary_key,
                               to_sql_value(document[primary_key])))

        sql_bulk_insert(cursor, self.mappings, namespace, [document])
        self.commit()
    def remove(self, document_id, namespace, timestamp):
        if not is_mapped(self.mappings, namespace):
            return

        with self.pgsql.cursor() as cursor:
            db, collection = db_and_collection(namespace)
            primary_key = self.mappings[db][collection]['pk']
            mapped_field = self.mappings[db][collection].get(primary_key, {})
            field_type = mapped_field.get('type')
            doc_id = to_sql_value(document_id, vtype=field_type)
            cursor.execute("DELETE from {0} WHERE {1} = {2};".format(
                collection.lower(), primary_key, doc_id))
            self.commit()
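    # With a TEXT-typed primary key, the statement generated above for
    # document_id 'abc123' would be (illustrative):
    #
    #   DELETE from col WHERE id = 'abc123';
    #
    # to_sql_value does the quoting, so numeric keys are emitted unquoted
    # while string keys are quoted.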
    def bulk_upsert(self, documents, namespace, timestamp):
        LOG.info('Inspecting %s...', namespace)

        if is_mapped(self.mappings, namespace):
            LOG.info('Mapping found for %s !...', namespace)
            LOG.info('Deleting all rows before update %s !...', namespace)

            db, collection = db_and_collection(namespace)
            for linked_table in self.get_linked_tables(db, collection):
                sql_delete_rows(self.pgsql.cursor(), linked_table)

            sql_delete_rows(self.pgsql.cursor(), collection)
            self.commit()

            self._bulk_upsert(documents, namespace)
            LOG.info('%s done.', namespace)
def get_mapped_document(mappings, document, namespace):
    cleaned_and_flatten_document = _clean_and_flatten_doc(
        mappings, document, namespace)

    db, collection = db_and_collection(namespace)
    keys = list(cleaned_and_flatten_document)

    for key in keys:
        field_mapping = mappings[db][collection][key]

        if 'dest' in field_mapping:
            mappedKey = field_mapping['dest']
            cleaned_and_flatten_document[
                mappedKey] = cleaned_and_flatten_document.pop(key)

    return cleaned_and_flatten_document
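# Illustrative run under a hypothetical mapping: the flattened source key
# 'name' is renamed to its 'dest' column before insertion.
example_mappings = {
    'db': {'col': {'pk': 'id',
                   'name': {'dest': 'full_name', 'type': 'TEXT'}}}
}
# get_mapped_document(example_mappings, {'name': 'Ada'}, 'db.col')
# would return {'full_name': 'Ada'}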
Example #14
def _sql_bulk_insert(query, mappings, namespace, documents):
    if not documents:
        return

    db, collection = db_and_collection(namespace)

    primary_key = mappings[db][collection]['pk']
    keys = [(k, v['dest']) for k, v in iteritems(mappings[db][collection])
            if 'dest' in v
            and v['type'] not in [ARRAY_TYPE, ARRAY_OF_SCALARS_TYPE]]
    keys.sort(key=lambda x: x[1])

    for document in documents:
        mapped_document = get_mapped_document(mappings, document, namespace)
        values = [
            to_sql_value(extract_creation_date(mapped_document, primary_key),
                         vtype='TIMESTAMP')
        ]

        for key, mapkey in keys:
            field_mapping = mappings[db][collection][key]

            if mapkey in mapped_document:
                values.append(
                    to_sql_value(mapped_document[mapkey],
                                 vtype=field_mapping['type']))

            else:
                values.append(to_sql_value(None, vtype=field_mapping['type']))

        subquery = {
            'collection': collection,
            'document': {
                'raw': document,
                'mapped': mapped_document
            },
            'keys': ['_creationDate'] + [k[1] for k in keys],
            'values': values,
            'pk': primary_key,
            'queries': []
        }
        query.append(subquery)

        insert_document_arrays(collection, subquery['queries'], db, document,
                               mapped_document, mappings, primary_key)
        insert_scalar_arrays(collection, subquery['queries'], db, document,
                             mapped_document, mappings, primary_key)
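# For one document under a hypothetical mapping with a single mapped
# column 'full_name', the appended subquery is a plain dict describing one
# row (illustrative shape, not captured output):
#
#   {'collection': 'col',
#    'document': {'raw': {...}, 'mapped': {...}},
#    'keys': ['_creationDate', 'full_name'],
#    'values': ["'2020-01-01T00:00:00'", "'Ada'"],
#    'pk': 'id',
#    'queries': []}   # later filled with nested inserts for array fields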
    def update(self, document_id, update_spec, namespace, timestamp):
        db, collection = db_and_collection(namespace)
        updated_document = self.get_document_by_id(db, collection, document_id)

        if updated_document is None:
            return

        for arrayField in get_any_array_fields(self.mappings, db, collection, updated_document):
            dest = self.mappings[db][collection][arrayField]['dest']
            fk = self.mappings[db][collection][arrayField]['fk']
            sql_delete_rows_where(self.pgsql.cursor(), dest,
                                  "{0} = {1}".format(fk, to_sql_value(document_id)))

        self._upsert(namespace,
                     updated_document,
                     self.pgsql.cursor(), timestamp)

        self.commit()
Example #16
def get_mapped_document(mappings, document, namespace):
    cleaned_and_flatten_document = _clean_and_flatten_doc(mappings, document, namespace)

    db, collection = db_and_collection(namespace)
    keys = list(cleaned_and_flatten_document)

    for key in keys:
        field_mapping = mappings[db][collection][key]

        if 'dest' in field_mapping:
            value = cleaned_and_flatten_document.pop(key)
            if field_mapping['type'] == FIELD_PRESENCE_TYPE:
                value = value is not None

            mappedKey = field_mapping['dest']
            cleaned_and_flatten_document[mappedKey] = value

    return cleaned_and_flatten_document
Example #18
def sql_bulk_insert(cursor, mappings, namespace, documents):
    if not documents:
        return

    db, collection = db_and_collection(namespace)

    primary_key = mappings[db][collection]['pk']
    keys = [
        v['dest'] for k, v in iteritems(mappings[db][collection])
        if 'dest' in v and v['type'] != ARRAY_TYPE
        and v['type'] != ARRAY_OF_SCALARS_TYPE
    ]
    keys.sort()

    values = []

    for document in documents:
        mapped_document = get_mapped_document(mappings, document, namespace)

        # Skip documents that map to nothing instead of aborting the rest
        # of the batch, and only build values for usable documents.
        if not mapped_document:
            continue

        document_values = [
            to_sql_value(
                extract_creation_date(mapped_document, primary_key))
        ]

        for key in keys:
            if key in mapped_document:
                document_values.append(to_sql_value(mapped_document[key]))
            else:
                document_values.append(to_sql_value(None))
        values.append(u"({0})".format(u','.join(document_values)))

        insert_document_arrays(collection, cursor, db, document,
                               mapped_document, mappings, primary_key)
        insert_scalar_arrays(collection, cursor, db, document, mapped_document,
                             mappings, primary_key)

    if values:
        sql = u"INSERT INTO {0} ({1}) VALUES {2}".format(
            collection, u','.join(['_creationDate'] + keys), u",".join(values))
        cursor.execute(sql)
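# Under the same hypothetical single-column mapping, two documents yield
# one multi-row statement along these lines (illustrative):
#
#   INSERT INTO col (_creationDate,full_name)
#   VALUES ('2020-01-01T00:00:00','Ada'),(NULL,'Grace')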
Example #19
    def update(self, document_id, update_spec, namespace, timestamp):
        # TODO update this to grab doc, apply_update, and then update (return None if doc not in Postgres)

        db, collection = db_and_collection(namespace)
        updated_document = self.get_document_by_id(db, collection, document_id)

        if updated_document is None:
            return

        for arrayField in get_any_array_fields(self.mappings, db, collection, updated_document):
            dest = self.mappings[db][collection][arrayField]['dest']
            fk = self.mappings[db][collection][arrayField]['fk']
            sql_delete_rows_where(self.pgsql.cursor(), dest,
                                  "{0} = {1}".format(fk, to_sql_value(document_id)))

        self._upsert(namespace,
                     updated_document,
                     self.pgsql.cursor(), timestamp)

        self.commit()
def _clean_and_flatten_doc(mappings, doc, namespace):
    """Reformats the given document before insertion into Solr.
    This method reformats the document in the following ways:
      - removes extraneous fields that aren't defined in schema.xml
      - unwinds arrays in order to find and later flatten sub-documents
      - flattens the document so that there are no sub-documents, and every
        value is associated with its dot-separated path of keys
      - inserts namespace and timestamp metadata into the document in order
        to handle rollbacks
    An example:
      {"a": 2,
       "b": {
         "c": {
           "d": 5
         }
       },
       "e": [6, 7, 8]
      }
    becomes:
      {"a": 2, "b.c.d": 5, "e.0": 6, "e.1": 7, "e.2": 8}
    """

    # PGSQL cannot index fields within sub-documents, so flatten documents
    # with the dot-separated path to each value as the respective key
    flat_doc = _formatter.format_document(doc)

    # Extract column names and mappings for this table
    db, coll = db_and_collection(namespace)
    if db in mappings:
        mappings_db = mappings[db]
        if coll in mappings_db:
            mappings_coll = mappings_db[coll]

            # Only include fields that are explicitly provided in the schema
            def include_field(field):
                return field in mappings_coll

            return dict(
                (k, v) for k, v in flat_doc.items() if include_field(k))
    return {}
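# Reproducing the docstring example with a hypothetical mapping that lists
# every flattened path (_formatter is assumed to be the module's
# document-flattening formatter):
example_mappings = {
    'db': {'col': {'pk': 'a',
                   'a': {'dest': 'a', 'type': 'INT'},
                   'b.c.d': {'dest': 'b_c_d', 'type': 'INT'}}}
}
# _clean_and_flatten_doc(example_mappings,
#                        {'a': 2, 'b': {'c': {'d': 5}}}, 'db.col')
# would return {'a': 2, 'b.c.d': 5}; the rename to 'dest' columns happens
# later, in get_mapped_document.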
Example #21
def sql_bulk_insert(cursor, mappings, namespace, documents):
    if not documents:
        return

    db, collection = db_and_collection(namespace)

    primary_key = mappings[db][collection]['pk']
    keys = unique([
        v['dest'] for k, v in iteritems(mappings[db][collection])
        if 'dest' in v and v['type'] != ARRAY_TYPE
           and v['type'] != ARRAY_OF_SCALARS_TYPE
    ])

    values = []

    for document in documents:
        mapped_document = get_mapped_document(mappings, document, namespace)

        # Skip unmappable documents rather than breaking out and silently
        # dropping the remainder of the batch.
        if not mapped_document:
            continue

        document_values = [
            to_sql_value(extract_creation_date(mapped_document, primary_key))
        ]

        for key in keys:
            if key in mapped_document:
                document_values.append(to_sql_value(mapped_document[key]))
            else:
                document_values.append(to_sql_value(None))
        values.append(u"({0})".format(u','.join(document_values)))

        insert_document_arrays(collection, cursor, db, document, mapped_document, mappings, primary_key)
        insert_scalar_arrays(collection, cursor, db, document, mapped_document, mappings, primary_key)

    if values:
        sql = u"INSERT INTO {0} ({1}) VALUES {2}".format(
            collection,
            u','.join(['_creationDate'] + keys),
            u",".join(values)
        )
        cursor.execute(sql)
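# unique() is a project helper that is not shown in this listing; a
# plausible order-preserving implementation (an assumption):
def unique(items):
    seen = set()
    result = []
    for item in items:
        if item not in seen:
            seen.add(item)
            result.append(item)
    return result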
Example #22
    def test_db_and_collection(self):
        ns = 'db.col'
        got = utils.db_and_collection(ns)

        self.assertEqual(len(got), 2)
        self.assertEqual(got, ['db', 'col'])
def get_mapped_field(mappings, namespace, field_name):
    db, collection = db_and_collection(namespace)
    return mappings[db][collection][field_name]['dest']
def get_primary_key(mappings, namespace):
    db, collection = db_and_collection(namespace)
    return mappings[db][collection]['pk']
Example #26
def is_mapped(mappings, namespace, field_name=None):
    db, collection = db_and_collection(namespace)
    return db in mappings and collection in mappings[db] and \
           (field_name is None or field_name in mappings[db][collection])
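# The mapping helpers above share one layout; a combined usage sketch with
# the same hypothetical mapping used earlier:
example_mappings = {
    'db': {'col': {'pk': 'id',
                   'name': {'dest': 'full_name', 'type': 'TEXT'}}}
}
assert is_mapped(example_mappings, 'db.col')
assert is_mapped(example_mappings, 'db.col', 'name')
assert get_primary_key(example_mappings, 'db.col') == 'id'
assert get_mapped_field(example_mappings, 'db.col', 'name') == 'full_name'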