def test_flattener(self):
    """Exercise DocumentFlattener on flat, nested, and list-bearing docs."""
    formatter = DocumentFlattener()

    # Already-flat document: format_document should match a plain
    # per-value transform_value pass.
    expected_flat = {
        key: formatter.transform_value(value)
        for key, value in self.doc.items()
    }
    self.assertEqual(expected_flat, formatter.format_document(self.doc))

    # Nested document: every key gains a "doc." path prefix.
    flattened_nested = formatter.format_document(self.doc_nested)
    expected_nested = {
        "doc.%s" % key: formatter.transform_value(value)
        for key, value in self.doc.items()
    }
    self.assertEqual(flattened_nested, expected_nested)

    # Document holding a list: list indices become path components.
    expected_list = {
        "list.0.%s" % key: formatter.transform_value(value)
        for key, value in self.doc.items()
    }
    expected_list.update(
        ("list.1.%s" % key, value)
        for key, value in flattened_nested.items()
    )
    expected_list.update(
        ("list.2.%d" % index, formatter.transform_value(value))
        for index, value in enumerate(self.lst)
    )
    self.assertEqual(formatter.format_document(self.doc_list), expected_list)
def __init__(self, url, unique_key='_id', auto_commit_interval=None,
             chunk_size=100, **kwargs):
    """Connect to PostgreSQL and MongoDB and load the collection mappings.

    Raises InvalidConfiguration when the mandatory 'mongoUrl' kwarg is
    missing or when the mapping file does not exist on disk.
    """
    if 'mongoUrl' not in kwargs:
        raise InvalidConfiguration("The MongoUrl parameter is mandatory.")

    self.url = url
    self.unique_key = unique_key
    self.auto_commit_interval = auto_commit_interval
    self.chunk_size = chunk_size
    self._formatter = DocumentFlattener()
    self.pgsql = psycopg2.connect(url)
    self.insert_accumulator = {}
    self.client = MongoClient(kwargs['mongoUrl'])
    self.quiet = kwargs.get('quiet', False)

    mapping_path = kwargs.get('mappingFile', DEFAULT_MAPPINGS_JSON_FILE_NAME)

    # Teach psycopg2 how to serialize BSON ObjectIds.
    register_adapter(ObjectId, object_id_adapter)

    if not os.path.isfile(mapping_path):
        raise InvalidConfiguration("no mapping file found at " + mapping_path)

    with open(mapping_path) as mappings_file:
        self.mappings = json.load(mappings_file)

    validate_mapping(self.mappings)
    self.pgsql.set_session(deferrable=True)
    self._init_schema()
def __init__(self, url, unique_key='_id', auto_commit_interval=None,
             chunk_size=100, **kwargs):
    """Open PostgreSQL and MongoDB connections and read the mapping file.

    Raises InvalidConfiguration when 'mongoUrl' is absent from kwargs or
    when the mapping file is missing.
    """
    if 'mongoUrl' not in kwargs:
        raise InvalidConfiguration("The MongoUrl parameter is mandatory.")

    self.url = url
    self.unique_key = unique_key
    self.auto_commit_interval = auto_commit_interval
    self.chunk_size = chunk_size
    self._formatter = DocumentFlattener()
    self.pgsql = psycopg2.connect(url)
    self.insert_accumulator = {}
    self.client = MongoClient(kwargs['mongoUrl'])

    # Teach psycopg2 how to serialize BSON ObjectIds.
    register_adapter(ObjectId, object_id_adapter)

    if not os.path.isfile(MAPPINGS_JSON_FILE_NAME):
        raise InvalidConfiguration("no mapping file found")
    with open(MAPPINGS_JSON_FILE_NAME) as mappings_file:
        self.mappings = json.load(mappings_file)

    self._init_schema()
def __init__(self, url, unique_key='_id', auto_commit_interval=None,
             chunk_size=100, **kwargs):
    """Connect to PostgreSQL and load/validate the collection mappings.

    NOTE(review): unlike the sibling doc managers, the 'mongoUrl'
    validation and MongoClient construction are disabled here, so
    self.client is never set -- confirm this is intentional before
    relying on it.
    """
    # Fixed from the Python 2 statement form `print kwargs`, which is a
    # syntax error under Python 3.
    # NOTE(review): printing kwargs may leak connection credentials to
    # stdout; consider a debug-level logger instead.
    print(kwargs)

    self.url = url
    self.unique_key = unique_key
    self.auto_commit_interval = auto_commit_interval
    self.chunk_size = chunk_size
    self._formatter = DocumentFlattener()
    self.pgsql = psycopg2.connect(url)
    self.insert_accumulator = {}

    # Teach psycopg2 how to serialize BSON ObjectIds.
    register_adapter(ObjectId, object_id_adapter)

    # TODO - remove mapping checks as we are just going to
    # translate to jsonb with an id column
    if not os.path.isfile(MAPPINGS_JSON_FILE_NAME):
        raise InvalidConfiguration("no mapping file found")
    with open(MAPPINGS_JSON_FILE_NAME) as mappings_file:
        self.mappings = json.load(mappings_file)
    validate_mapping(self.mappings)

    # TODO - this should create a table with an id pk column and
    # also a jsonb 'jdoc' column only
    self._init_schema()
def __init__(self, url, auto_commit_interval=DEFAULT_COMMIT_INTERVAL,
             unique_key='_id', chunk_size=DEFAULT_MAX_BULK, **kwargs):
    """Verify Solr URL and establish a connection.
    """
    self.solr = Solr(url)
    self.unique_key = unique_key
    # pysolr expects commit intervals in milliseconds, so convert the
    # seconds-based argument; None means "no automatic commit".
    if auto_commit_interval is None:
        self.auto_commit_interval = None
    else:
        self.auto_commit_interval = auto_commit_interval * 1000
    self.chunk_size = chunk_size
    self.field_list = []
    self._build_fields()
    self._formatter = DocumentFlattener()
def __init__(self, url, auto_commit_interval=DEFAULT_COMMIT_INTERVAL,
             unique_key='_id', chunk_size=DEFAULT_MAX_BULK, **kwargs):
    """Verify Solr URL and establish a connection.

    Optional kwargs:
        clientOptions: dict of extra keyword arguments forwarded to Solr().
        content_type: dict whose keys name the document types this
            manager accepts; when omitted, every type is accepted.

    Raises errors.InvalidConfiguration when content_type is given but
    is not a dict.
    """
    self.url = url
    self.solr = Solr(url, **kwargs.get('clientOptions', {}))
    self.unique_key = unique_key
    # pysolr does things in milliseconds
    if auto_commit_interval is not None:
        self.auto_commit_interval = auto_commit_interval * 1000
    else:
        self.auto_commit_interval = None
    self.chunk_size = chunk_size
    self.field_list = []
    self._build_fields()
    self._formatter = DocumentFlattener()

    self._content_type = kwargs.get("content_type", None)
    # Use lazy %-style logging arguments so the message is only
    # formatted when the log level is actually enabled.
    logging.info("begin to init content_type args, value is %s",
                 self._content_type)
    if self._content_type is None:
        logging.info("content_type args is none, will receive all type")
        self._receive_all_type = True
    else:
        logging.debug("begin to check content_type args")
        self._receive_all_type = False
        if isinstance(self._content_type, dict):
            # No need for the former dict(...) copy; read the keys of
            # the mapping directly.
            self._content_type_list = self._content_type.keys()
            logging.debug("the support type list is %s",
                          self._content_type_list)
        else:
            # Reworded from the garbled "args content type is not is dict".
            raise errors.InvalidConfiguration(
                "The content_type argument must be a dict")
# coding: utf8 from future.utils import iteritems from mongo_connector.doc_managers.formatters import DocumentFlattener from mongo_connector.doc_managers.utils import db_and_collection, ARRAY_OF_SCALARS_TYPE _formatter = DocumentFlattener() def _clean_and_flatten_doc(mappings, doc, namespace): """Reformats the given document before insertion into Solr. This method reformats the document in the following ways: - removes extraneous fields that aren't defined in schema.xml - unwinds arrays in order to find and later flatten sub-documents - flattens the document so that there are no sub-documents, and every value is associated with its dot-separated path of keys - inserts namespace and timestamp metadata into the document in order to handle rollbacks An example: {"a": 2, "b": { "c": { "d": 5 } }, "e": [6, 7, 8] } becomes: {"a": 2, "b.c.d": 5, "e.0": 6, "e.1": 7, "e.2": 8} """