def insert(self, new_values):
    """ Insert new documents into Elastic Search """
    if self.rowid_column not in new_values:
        log2pg(
            'INSERT requires "{rowid}" column. Missing in: {values}'.format(
                rowid=self.rowid_column, values=new_values),
            logging.ERROR)
        return (0, 0)
    document_id = new_values[self.rowid_column]
    new_values.pop(self.rowid_column, None)
    try:
        response = self.client.index(index=self.index,
                                     doc_type=self.doc_type,
                                     id=document_id,
                                     body=new_values)
        return response
    except Exception as exception:
        log2pg(
            "INDEX for /{index}/{doc_type}/{document_id} and document "
            "{document} failed: {exception}".format(
                index=self.index,
                doc_type=self.doc_type,
                document_id=document_id,
                document=new_values,
                exception=exception),
            logging.ERROR)
        return (0, 0)
def insert(self, new_values):
    """ Insert new documents into Elastic Search """
    if self.rowid_column not in new_values:
        log2pg(
            'INSERT requires "{rowid}" column. Missing in: {values}'.format(
                rowid=self.rowid_column, values=new_values),
            logging.ERROR)
        return (0, 0)
    document_id = new_values[self.rowid_column]
    new_values.pop(self.rowid_column, None)
    # Columns declared as JSON arrive as strings; decode them before indexing.
    for key in self.json_columns.intersection(new_values.keys()):
        new_values[key] = json.loads(new_values[key])
    try:
        response = self.client.index(id=document_id,
                                     body=new_values,
                                     **self.arguments)
        return response
    except Exception as exception:
        log2pg(
            "INDEX for {path}/{document_id} and document "
            "{document} failed: {exception}".format(
                path=self.path,
                document_id=document_id,
                document=new_values,
                exception=exception),
            logging.ERROR)
        return (0, 0)
def _read_by_id(self, row_id):
    try:
        arguments = dict(self.arguments)
        results = self.client.search(
            body={"query": {"ids": {"values": [row_id]}}},
            **arguments)["hits"]["hits"]
        if results:
            return self._convert_response_row(results[0], self.columns, None, None)
        log2pg(
            "SEARCH for {path} row_id {row_id} returned nothing".format(
                path=self.path, row_id=row_id),
            logging.WARNING)
        return {self.rowid_column: row_id}
    except Exception as exception:
        log2pg(
            "SEARCH for {path} row_id {row_id} failed: {exception}".format(
                path=self.path, row_id=row_id, exception=exception),
            logging.ERROR)
        return {}
def update(self, document_id, new_values):
    """ Update existing documents in Elastic Search """
    new_values.pop(self.rowid_column, None)
    # Columns declared as JSON arrive as strings; decode them before indexing.
    for key in self.json_columns.intersection(new_values.keys()):
        new_values[key] = json.loads(new_values[key])
    try:
        response = self.client.index(id=document_id,
                                     body=new_values,
                                     refresh=self.refresh,
                                     **self.arguments)
        if self.complete_returning:
            # Re-read the document so RETURNING can expose all columns.
            return self._read_by_id(response["_id"])
        return {self.rowid_column: response["_id"]}
    except Exception as exception:
        log2pg(
            "INDEX for {path}/{document_id} and document "
            "{document} failed: {exception}".format(
                path=self.path,
                document_id=document_id,
                document=new_values,
                exception=exception),
            logging.ERROR)
        return (0, 0)
def build_spec(self, quals):
    Q = {}
    comp_mapper = {
        '>': '$gt',
        '>=': '$gte',
        '<=': '$lte',
        '<': '$lt',
        '<>': '$ne',
    }
    for qual in quals:
        val_formatter = self.fields[qual.field_name]['formatter']
        vform = lambda val: val_formatter(val) if val_formatter is not None else val
        if qual.operator == '=':
            Q[qual.field_name] = vform(qual.value)
        elif qual.operator in comp_mapper:
            comp = Q.setdefault(qual.field_name, {})
            comp[comp_mapper[qual.operator]] = vform(qual.value)
            Q[qual.field_name] = comp
        else:
            log2pg('Qual operator {} not implemented yet: {}'.format(
                qual.operator, qual))
    return Q
def execute(self, quals, columns):
    """ Execute the query """
    try:
        query = self._get_query(quals)
        if query:
            response = self.client.search(index=self.index,
                                          doc_type=self.doc_type,
                                          size=self.scroll_size,
                                          scroll=self.scroll_duration,
                                          q=query)
        else:
            response = self.client.search(index=self.index,
                                          doc_type=self.doc_type,
                                          size=self.scroll_size,
                                          scroll=self.scroll_duration)
        while True:
            scroll_id = response['_scroll_id']
            for result in response['hits']['hits']:
                yield self._convert_response_row(result, columns, query)
            if len(response['hits']['hits']) < self.scroll_size:
                return
            response = self.client.scroll(scroll_id=scroll_id,
                                          scroll=self.scroll_duration)
    except Exception as exception:
        log2pg(
            "SEARCH for /{index}/{doc_type} failed: {exception}".format(
                index=self.index, doc_type=self.doc_type,
                exception=exception),
            logging.ERROR)
        return
def execute(self, quals, columns):
    """ Execute the query """
    try:
        query = self._get_query(quals)
        if query:
            response = self.client.search(size=self.scroll_size,
                                          scroll=self.scroll_duration,
                                          q=query,
                                          **self.arguments)
        else:
            response = self.client.search(size=self.scroll_size,
                                          scroll=self.scroll_duration,
                                          **self.arguments)
        while True:
            scroll_id = response["_scroll_id"]
            for result in response["hits"]["hits"]:
                yield self._convert_response_row(result, columns, query)
            if len(response["hits"]["hits"]) < self.scroll_size:
                return
            response = self.client.scroll(scroll_id=scroll_id,
                                          scroll=self.scroll_duration)
    except Exception as exception:
        log2pg(
            "SEARCH for {path} failed: {exception}".format(
                path=self.path, exception=exception),
            logging.ERROR)
        return
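# The scroll loop above is the standard elasticsearch-py pagination pattern.
# A minimal standalone sketch of the same pattern, assuming a local cluster;
# the index name, query string, and page size are hypothetical and not taken
# from the FDW code:
from elasticsearch import Elasticsearch

es = Elasticsearch(["http://localhost:9200"])
resp = es.search(index="my-index", size=100, scroll="10m", q="status:active")
while True:
    for hit in resp["hits"]["hits"]:
        print(hit["_source"])  # the FDW yields a converted row dict instead
    if len(resp["hits"]["hits"]) < 100:
        break  # a short page means the cursor is exhausted
    resp = es.scroll(scroll_id=resp["_scroll_id"], scroll="10m")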
def insert(self, new_values):
    log2pg('MARK Insert Request - new values: %s' % new_values, logging.DEBUG)
    if 'id' not in new_values:
        # No explicit return: assuming log2pg wraps Multicorn's log_to_postgres,
        # logging at ERROR level aborts the statement here.
        log2pg('INSERT requires "id" column. Missing in: %s' % new_values,
               logging.ERROR)
    id = new_values['id']
    new_values.pop('id', None)
    return self.es_index(id, new_values)
def delete(self, id):
    conn = httplib.HTTPConnection(self.host, self.port)
    conn.request("DELETE", "/%s/%s/%s" % (self.node, self.index, id))
    resp = conn.getresponse()
    if resp.status != 200:
        log2pg('Failed to delete: %s' % resp.read(), logging.ERROR)
        return
    raw = resp.read()
    return json.loads(raw)
def insert(self, new_values):
    log2pg('MARK Insert Request - new values: %s' % new_values, logging.DEBUG)
    if self.rowid_column not in new_values:
        log2pg('INSERT requires "%s" column. Missing in: %s'
               % (self.rowid_column, new_values), logging.ERROR)
    id = new_values.pop(self.rowid_column)
    return self.es_index(id, new_values)
def delete(self, document_id):
    """ Delete documents from Elastic Search """
    try:
        response = self.client.delete(id=document_id, **self.arguments)
        return response
    except Exception as exception:
        log2pg(
            "DELETE for {path}/{document_id} failed: {exception}".format(
                path=self.path, document_id=document_id, exception=exception),
            logging.ERROR)
        return (0, 0)
def delete(self, document_id):
    """ Delete documents from Elastic Search """
    try:
        response = self.client.delete(index=self.index,
                                      doc_type=self.doc_type,
                                      id=document_id)
        return response
    except Exception as exception:
        log2pg(
            "DELETE for /{index}/{doc_type}/{document_id} failed: {exception}".format(
                index=self.index,
                doc_type=self.doc_type,
                document_id=document_id,
                exception=exception),
            logging.ERROR)
        return (0, 0)
def execute(self, quals, columns, aggs=None, group_clauses=None):
    """ Execute the query """
    try:
        query, query_string = self._get_query(quals,
                                              aggs=aggs,
                                              group_clauses=group_clauses)
        is_aggregation = aggs or group_clauses
        if query:
            # Aggregations need no hits and no scroll cursor: size=0 returns
            # only the aggregation buckets in a single response.
            response = self.client.search(
                size=self.scroll_size if not is_aggregation else 0,
                scroll=self.scroll_duration if not is_aggregation else None,
                body=query,
                **self.arguments)
        else:
            response = self.client.search(size=self.scroll_size,
                                          scroll=self.scroll_duration,
                                          **self.arguments)
        if not response["hits"]["hits"] and not is_aggregation:
            return
        if is_aggregation:
            yield from self._handle_aggregation_response(
                query, response, aggs, group_clauses)
            return
        while True:
            self.scroll_id = response["_scroll_id"]
            for result in response["hits"]["hits"]:
                yield self._convert_response_row(result, columns, query_string)
            if len(response["hits"]["hits"]) < self.scroll_size:
                return
            response = self.client.scroll(scroll_id=self.scroll_id,
                                          scroll=self.scroll_duration)
    except Exception as exception:
        log2pg(
            "SEARCH for {path} failed: {exception}".format(
                path=self.path, exception=exception),
            logging.ERROR)
        return
def get_rel_size(self, quals, columns):
    """ Helps the planner by returning costs.
        Returns a tuple of the form (number of rows, average row width) """
    try:
        query = self._get_query(quals)
        q_dict = json.loads(query.encode('utf-8'))
        response = self.client.count(body=q_dict, index=self.index)
        return (response["count"], len(columns) * 100)
    except Exception as exception:
        log2pg(
            "COUNT for {path} failed: {exception}".format(
                path=self.path, exception=exception),
            logging.ERROR)
        return (0, 0)
def get_rel_size(self, quals, columns):
    """ Helps the planner by returning costs.
        Returns a tuple of the form (number of rows, average row width) """
    try:
        query = self._get_query(quals)
        if query:
            response = self.client.count(q=query, **self.arguments)
        else:
            response = self.client.count(**self.arguments)
        return (response["count"], len(columns) * 100)
    except Exception as exception:
        log2pg(
            "COUNT for {path} failed: {exception}".format(
                path=self.path, exception=exception),
            logging.ERROR)
        return (0, 0)
def execute(self, quals, columns):
    """ Execute the query """
    try:
        query = self._get_query(quals)
        if query:
            response = self.client.search(index=self.index,
                                          doc_type=self.doc_type,
                                          q=query)
        else:
            response = self.client.search(index=self.index,
                                          doc_type=self.doc_type)
        return self._convert_response(response, columns, query)
    except Exception as exception:
        log2pg(
            "SEARCH for /{index}/{doc_type} failed: {exception}".format(
                index=self.index, doc_type=self.doc_type,
                exception=exception),
            logging.ERROR)
        return (0, 0)
def delete(self, document_id):
    """ Delete documents from Elastic Search """
    # Read the row first so it can still be reported via RETURNING afterwards.
    if self.complete_returning:
        document = self._read_by_id(document_id)
    else:
        document = {self.rowid_column: document_id}
    try:
        self.client.delete(id=document_id,
                           refresh=self.refresh,
                           **self.arguments)
        return document
    except Exception as exception:
        log2pg(
            "DELETE for {path}/{document_id} failed: {exception}".format(
                path=self.path, document_id=document_id, exception=exception),
            logging.ERROR)
        return (0, 0)
def update(self, document_id, new_values):
    """ Update existing documents in Elastic Search """
    new_values.pop(self.rowid_column, None)
    try:
        response = self.client.index(index=self.index,
                                     doc_type=self.doc_type,
                                     id=document_id,
                                     body=new_values)
        return response
    except Exception as exception:
        log2pg(
            "INDEX for /{index}/{doc_type}/{document_id} and document "
            "{document} failed: {exception}".format(
                index=self.index,
                doc_type=self.doc_type,
                document_id=document_id,
                document=new_values,
                exception=exception),
            logging.ERROR)
        return (0, 0)
def get_rel_size(self, quals, columns):
    """ Helps the planner by returning costs.
        Returns a tuple of the form (number of rows, average row width) """
    try:
        query = self._get_query(quals)
        if query:
            response = self.client.count(index=self.index,
                                         doc_type=self.doc_type,
                                         q=query)
        else:
            response = self.client.count(index=self.index,
                                         doc_type=self.doc_type)
        return (response["count"], len(columns) * 100)
    except Exception as exception:
        log2pg(
            "COUNT for /{index}/{doc_type} failed: {exception}".format(
                index=self.index, doc_type=self.doc_type,
                exception=exception),
            logging.ERROR)
        return (0, 0)
def build_spec(self, quals, trans=True):
    Q = {}
    comp_mapper = {
        '=': '$eq',
        '>': '$gt',
        '>=': '$gte',
        '<=': '$lte',
        '<>': '$ne',
        '<': '$lt',
        (u'=', True): '$in',
        (u'<>', False): '$nin',
        '~~': '$regex',
    }
    # TODO '!~~', '~~*', '!~~*', other binary ones that are composable
    for qual in quals:
        val_formatter = self.fields[qual.field_name]['formatter']
        vform = lambda val: val_formatter(val) if val is not None and val_formatter is not None else val
        if self.debug: log2pg('vform {} val_formatter: {} '.format(vform, val_formatter))
        if trans and 'options' in self.fields[qual.field_name] and 'mname' in self.fields[qual.field_name]['options']:
            mongo_field_name = self.fields[qual.field_name]['options']['mname']
        else:
            mongo_field_name = qual.field_name
        if self.debug: log2pg('Qual field_name: {} operator: {} value: {}'.format(mongo_field_name, qual.operator, qual.value))
        if qual.operator in comp_mapper:
            comp = Q.setdefault(mongo_field_name, {})
            if qual.operator == '~~':
                comp[comp_mapper[qual.operator]] = vform(qual.value.replace('%', '.*'))
            else:
                comp[comp_mapper[qual.operator]] = vform(qual.value)
            Q[mongo_field_name] = comp
            if self.debug: log2pg('Qual {} comp {}'.format(qual.operator, qual.value))
        else:
            log2pg('Qual operator {} not implemented for value {}'.format(qual.operator, qual.value))
    return Q
def plan(self, quals, columns):
    # Base pipeline
    pipe = []
    if self.pipe:
        pipe.extend(self.pipe)

    # Project (rename fields)
    fields = dict([(k, True) for k in columns])
    projectFields = {}
    for f in fields:
        if 'options' in self.fields[f] and 'mname' in self.fields[f]['options']:
            projectFields[f] = '$' + self.fields[f]['options']['mname']
        else:
            projectFields[f] = fields[f]
    if len(projectFields) > 0:
        pipe.append({"$project": projectFields})
    if self.debug: log2pg('projectFields: {}'.format(projectFields))

    # Match
    Q = self.build_spec(quals)
    if Q:
        pipe.append({"$match": Q})
    if self.debug: log2pg('matchFields: {}'.format(Q))

    # optimization 1: if columns include field(s) with an equality predicate in
    # the query, we don't have to fetch them, as we add them back later
    eqfields = dict([(q.field_name, q.value) for q in quals
                     if q.operator == '='])
    for f in eqfields:
        fields.pop(f)

    if len(fields) == 0:
        # optimization 2: no fields need to be returned, just get counts
        pipe.append({"$count": "rows"})
    elif len(eqfields) > 0:
        # remove constant fields, which get added back later
        pipe.append({"$project": fields})

    # push-down filters through the user-supplied pipeline
    pipe = self.optimize(pipe)
    return (fields, eqfields, pipe)
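# Worked example (hypothetical names and values): for a foreign table with
# columns 'name' and 'age', a single qual age = 30, no 'mname' renames, no
# user-supplied pipe, and optimize() acting as identity, plan() produces:
#
#   fields before eq-pruning: {'name': True, 'age': True}
#   Q from build_spec:        {'age': {'$eq': 30}}
#   eqfields:                 {'age': 30}   (re-injected into each row later)
#   pipe: [{'$project': {'name': True, 'age': True}},
#          {'$match': {'age': {'$eq': 30}}},
#          {'$project': {'name': True}}]   # the constant 'age' is not fetched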
def update(self, document_id, new_values):
    """ Update existing documents in Elastic Search """
    new_values.pop(self.rowid_column, None)
    try:
        response = self.client.index(id=document_id,
                                     body=new_values,
                                     **self.arguments)
        return response
    except Exception as exception:
        log2pg(
            "INDEX for {path}/{document_id} and document "
            "{document} failed: {exception}".format(
                path=self.path,
                document_id=document_id,
                document=new_values,
                exception=exception),
            logging.ERROR)
        return (0, 0)
def build_spec(self, quals):
    Q = {}
    comp_mapper = {'>': '$gt', '>=': '$gte', '<=': '$lte', '<': '$lt'}
    for qual in quals:
        val_formatter = self.fields[qual.field_name]['formatter']
        vform = lambda val: val_formatter(val) if val_formatter is not None else val
        if qual.operator == '=':
            Q[qual.field_name] = vform(qual.value)
        elif qual.operator in ('>', '>=', '<=', '<'):
            comp = Q.setdefault(qual.field_name, {})
            comp[comp_mapper[qual.operator]] = vform(qual.value)
            Q[qual.field_name] = comp
        else:
            log2pg('Qual operator {} not implemented yet: {}'.format(
                qual.operator, qual))
    return Q
def execute(self, quals, columns):
    """ Execute the query """
    try:
        query = self._get_query(quals)
        q_dict = json.loads(query.encode('utf-8'))
        pg_id = self._get_pg_id(quals)
        response = self.client.search(body=q_dict,
                                      index=self.index,
                                      size=self.size,
                                      explain=self.explain)
        for result in response["hits"]["hits"]:
            yield self._format_out(result, pg_id=pg_id, query=query)
        return
    except Exception as exception:
        log2pg(
            "SEARCH for {path} failed: {exception}".format(
                path=self.path, exception=exception),
            logging.ERROR)
        return
def insert(self, new_values):
    """ Publish a new / updated / deleted document into RabbitMQ """
    log2pg('MARK Request - new values: %s' % new_values, logging.DEBUG)
    # Each of these ERROR logs aborts the statement, assuming log2pg wraps
    # Multicorn's log_to_postgres.
    if 'table' not in new_values:
        log2pg('It requires "table" column. Missing in: %s' % new_values,
               logging.ERROR)
    if 'id' not in new_values:
        log2pg('It requires "id" column. Missing in: %s' % new_values,
               logging.ERROR)
    if 'action' not in new_values:
        log2pg('It requires "action" column. Missing in: %s' % new_values,
               logging.ERROR)
    return self.rabbitmq_publish(new_values)
def build_spec(self, quals, trans=True):
    Q = {}
    comp_mapper = {
        '=': '$eq',
        '>': '$gt',
        '>=': '$gte',
        '<=': '$lte',
        '<>': '$ne',
        '<': '$lt',
        (u'=', True): '$in',
        (u'<>', False): '$nin',
        '~~': '$regex',
    }
    # TODO '!~~', '~~*', '!~~*', other binary ones that are composable
    for qual in quals:
        val_formatter = self.fields[qual.field_name]['formatter']
        vform = lambda val: val_formatter(val) if val is not None and val_formatter is not None else val
        if self.debug: log2pg('vform {} val_formatter: {} '.format(vform, val_formatter))
        if trans and 'options' in self.fields[qual.field_name] and 'mname' in self.fields[qual.field_name]['options']:
            mongo_field_name = self.fields[qual.field_name]['options']['mname']
        else:
            mongo_field_name = qual.field_name
        if self.debug: log2pg('Qual field_name: {} operator: {} value: {}'.format(mongo_field_name, qual.operator, qual.value))
        if qual.operator in comp_mapper:
            comp = Q.setdefault(mongo_field_name, {})
            if qual.operator == '~~':
                comp[comp_mapper[qual.operator]] = vform(qual.value.strip('%').replace('%', '.*').replace('_', '.'))
            else:
                comp[comp_mapper[qual.operator]] = vform(qual.value)
            Q[mongo_field_name] = comp
            if self.debug: log2pg('Qual {} comp {}'.format(qual.operator, comp[comp_mapper[qual.operator]]))
        else:
            log2pg('Qual operator {} not implemented for value {}'.format(qual.operator, qual.value))
    return Q
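# A sketch of the inputs the qual translation above receives. The Qual
# namedtuple stands in for Multicorn's Qual objects; the column names and
# values are made up, and it assumes columns with no 'formatter' and no
# 'mname' option.
from collections import namedtuple

Qual = namedtuple('Qual', ['field_name', 'operator', 'value'])

quals = [Qual('age', '>=', 21), Qual('name', '~~', 'A%n')]
# With the comp_mapper above, build_spec(quals) returns roughly:
#   {'age': {'$gte': 21}, 'name': {'$regex': 'A.*n'}}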
def execute(self, quals, columns, sortkeys=None):
    fields, eqfields, pipe = self.plan(quals, columns)
    if self.debug: t0 = time.time()
    if self.debug: log2pg('Calling aggregate with {} stage pipe {} '.format(len(pipe), pipe))
    cur = self.coll.aggregate(pipe, cursor={})
    if self.debug: t1 = time.time()
    if self.debug: docCount = 0
    if self.debug: log2pg('cur is returned {} with total {} so far'.format(cur, t1 - t0))
    if len(fields) == 0:
        for res in cur:
            docCount = res['rows']
            break
        for x in xrange(docCount):
            if eqfields:
                yield eqfields
            else:
                yield {}
    else:
        for doc in cur:
            doc = dict([(col, dict_traverser(self.fields[col]['path'], doc))
                        for col in columns])
            doc.update(eqfields)
            yield doc
            if self.debug: docCount = docCount + 1
    if self.debug: t2 = time.time()
    if self.debug: log2pg('Python rows {} Python_duration {} {} {}ms'.format(docCount, (t1 - t0) * 1000, (t2 - t1) * 1000, (t2 - t0) * 1000))
def execute(self, quals, columns):
    """ Should execute the query, but we don't handle it (for now?) """
    # Logging at ERROR level aborts the query before anything is yielded;
    # the yield below only serves to make this function a generator.
    log2pg("SELECT isn't implemented for RabbitMQ", logging.ERROR)
    yield {0, 0}
def execute(self, quals, columns, d={}):
    if self.debug: t0 = time.time()

    ## Only request fields of interest:
    fields = dict([(k, True) for k in columns])
    Q = self.build_spec(quals)

    # optimization: if columns include field(s) with an equality predicate in
    # the query, then we don't have to fetch them
    eqfields = dict([(q.field_name, q.value) for q in quals
                     if q.operator == '='])
    for f in eqfields:
        fields.pop(f)
    # instead we will inject the exact equality expression into the result set

    if len(fields) == 0:
        # no fields need to be returned, just get counts
        if not self.pipe:
            docCount = self.coll.find(Q).count()
        else:
            # there's a pipe with unwind
            arr = self.pipe[0]['$unwind']  # may not be a safe assumption in the future
            countpipe = []
            if Q:
                countpipe.append({'$match': Q})
            # hack: everyone just gets array size,
            # TODO: this only works for one $unwind for now
            countpipe.append({'$project': {'_id': 0, 'arrsize': {'$size': arr}}})
            countpipe.append({'$group': {'_id': None, 'sum': {'$sum': '$arrsize'}}})
            cur = self.coll.aggregate(countpipe, cursor={})
            for res in cur:
                docCount = res['sum']
                break
        for x in xrange(docCount):
            if eqfields:
                yield eqfields
            else:
                yield d
        # we are done
        if self.debug: t1 = time.time()
    else:
        # we have one or more fields requested, with or without pipe
        if '_id' not in fields:
            fields['_id'] = False
        if self.debug: log2pg('fields: {}'.format(columns))
        if self.debug: log2pg('fields: {}'.format(fields))
        pipe = []
        projectFields = {}
        transkeys = [k for k in self.fields.keys()
                     if 'mname' in self.fields[k].get('options', {})]
        transfields = set(fields.keys()) & set(transkeys)
        if self.debug: log2pg('transfields {} fieldskeys {} transkeys {}'.format(transfields, fields.keys(), transkeys))
        for f in fields:
            # there are some fields wanted returned which must be transformed
            if self.debug: log2pg('f {} hasoptions {} self.field[f] {}'.format(f, 'options' in self.fields[f], self.fields[f]))
            if 'options' in self.fields[f] and 'mname' in self.fields[f]['options']:
                if self.debug: log2pg('self field {} options {}'.format(f, self.fields[f]['options']['mname']))
                projectFields[f] = '$' + self.fields[f]['options']['mname']
            else:
                projectFields[f] = fields[f]
        if self.debug: log2pg('projectFields: {}'.format(projectFields))
        # if there was field transformation we have to use the pipeline
        if self.pipe or transfields:
            if self.pipe:
                pipe.extend(self.pipe)
            if Q:
                pipe.insert(0, {"$match": Q})
            pipe.append({"$project": projectFields})
            if transfields and Q:
                # only needed if quals fields are array members, can check that TODO
                postQ = self.build_spec(quals, False)
                if Q != postQ:
                    pipe.append({"$match": postQ})
            if self.debug: log2pg('Calling aggregate with {} stage pipe {} '.format(len(pipe), pipe))
            cur = self.coll.aggregate(pipe, cursor={})
        else:
            if self.debug: log2pg('Calling find')
            cur = self.coll.find(Q, fields)
        if self.debug: t1 = time.time()
        if self.debug: docCount = 0
        if self.debug: log2pg('cur is returned {} with total {} so far'.format(cur, t1 - t0))
        for doc in cur:
            doc.update(eqfields)
            yield dict([(col, dict_traverser(self.fields[col]['path'], doc))
                        for col in columns])
            if self.debug: docCount = docCount + 1
        if self.debug: t2 = time.time()
        if self.debug: log2pg('Python rows {} Python_duration {} {} {}ms'.format(docCount, (t1 - t0) * 1000, (t2 - t1) * 1000, (t2 - t0) * 1000))
def __init__(self, options, columns):
    super(Yamfdw, self).__init__(options, columns)
    self.host_name = options.get('host', 'localhost')
    self.port = int(options.get('port', '27017'))
    self.user = options.get('user')
    self.password = options.get('password')
    self.db_name = options.get('db', 'test')
    self.collection_name = options.get('collection', 'test')
    self.conn = MongoClient(host=self.host_name, port=self.port)
    self.auth_db = options.get('auth_db', self.db_name)
    if self.user:
        self.conn.userprofile.authenticate(self.user, self.password,
                                           source=self.auth_db)
    self.db = getattr(self.conn, self.db_name)
    self.coll = getattr(self.db, self.collection_name)
    self.debug = options.get('debug', False)

    # if we need to validate or transform any fields this is a place to do it
    # we need column definitions for types to validate we're passing back correct types
    # self.db.add_son_manipulator(Transform(columns))
    if self.debug: log2pg('collection cols: {}'.format(columns))

    self.stats = self.db.command("collstats", self.collection_name)
    self.count = self.stats["count"]
    if self.debug: log2pg('self.stats: {} '.format(self.stats))

    self.indexes = {}
    if self.stats["nindexes"] > 1:
        indexdict = self.coll.index_information()
        if sys.version_info[0] < 3:
            self.indexes = dict([(idesc['key'][0][0], idesc.get('unique', False))
                                 for iname, idesc in indexdict.iteritems()])
        else:
            self.indexes = dict([(idesc['key'][0][0], idesc.get('unique', False))
                                 for iname, idesc in indexdict.items()])
    if self.debug: log2pg('self.indexes: {} '.format(self.indexes))

    self.fields = dict([(col, {'formatter': coltype_formatter(coldef.type_name,
                                                              coldef.options.get('type', None)),
                               'options': coldef.options,
                               'path': col.split('.')})
                        for (col, coldef) in columns.items()])
    if self.debug: log2pg('self.fields: {} \n columns.items {}'.format(self.fields, columns.items()))

    self.pipe = options.get('pipe')
    if self.pipe:
        self.pipe = json.loads(self.pipe)
        if self.debug: log2pg('pipe is {}'.format(self.pipe))
    else:
        self.pkeys = [(('_id',), 1), ]
        for f in self.fields:
            # calculate selectivity of each field (once per session)
            if f == '_id':
                continue
            # check for unique indexes and set those to 1
            if f in self.indexes and self.indexes.get(f):
                self.pkeys.append(((f,), 1))
            elif f in self.indexes:
                self.pkeys.append(((f,), min((self.count / 10), 1000)))
            else:
                self.pkeys.append(((f,), self.count))
def execute(self, quals, columns, d={}):
    if self.debug: t0 = time.time()

    ## Only request fields of interest:
    fields = dict([(k, True) for k in columns])
    Q = self.build_spec(quals)

    # optimization: if columns include field(s) with an equality predicate in
    # the query, then we don't have to fetch them
    eqfields = dict([(q.field_name, q.value) for q in quals
                     if q.operator == '='])
    for f in eqfields:
        fields.pop(f)
    # instead we will inject the exact equality expression into the result set

    if len(fields) == 0:
        # no fields need to be returned, just get counts
        if not self.pipe:
            docCount = self.coll.find(Q).count()
        else:
            # there's a pipe with unwind
            arr = self.pipe[0]['$unwind']  # may not be a safe assumption in the future
            countpipe = []
            if Q:
                countpipe.append({'$match': Q})
            # hack: everyone just gets array size,
            # TODO: this only works for one $unwind for now
            countpipe.append({'$project': {'_id': 0, 'arrsize': {'$size': arr}}})
            countpipe.append({'$group': {'_id': None, 'sum': {'$sum': '$arrsize'}}})
            cur = self.coll.aggregate(countpipe, cursor={})
            for res in cur:
                docCount = res['sum']
                break
        if sys.version_info[0] < 3:
            for x in xrange(docCount):
                if eqfields:
                    yield eqfields
                else:
                    yield d
        else:
            for x in range(docCount):
                if eqfields:
                    yield eqfields
                else:
                    yield d
        # we are done
        if self.debug: t1 = time.time()
    else:
        # we have one or more fields requested, with or without pipe
        if '_id' not in fields:
            fields['_id'] = False
        if self.debug: log2pg('fields: {}'.format(columns))
        if self.debug: log2pg('fields: {}'.format(fields))
        pipe = []
        projectFields = {}
        transkeys = [k for k in self.fields.keys()
                     if 'mname' in self.fields[k].get('options', {})]
        transfields = set(fields.keys()) & set(transkeys)
        if self.debug: log2pg('transfields {} fieldskeys {} transkeys {}'.format(transfields, fields.keys(), transkeys))
        for f in fields:
            # there are some fields wanted returned which must be transformed
            if self.debug: log2pg('f {} hasoptions {} self.field[f] {}'.format(f, 'options' in self.fields[f], self.fields[f]))
            if 'options' in self.fields[f] and 'mname' in self.fields[f]['options']:
                if self.debug: log2pg('self field {} options {}'.format(f, self.fields[f]['options']['mname']))
                projectFields[f] = '$' + self.fields[f]['options']['mname']
            else:
                projectFields[f] = fields[f]
        if self.debug: log2pg('projectFields: {}'.format(projectFields))
        # if there was field transformation we have to use the pipeline
        if self.pipe or transfields:
            if self.pipe:
                pipe.extend(self.pipe)
            if Q:
                pipe.insert(0, {"$match": Q})
            pipe.append({"$project": projectFields})
            if transfields and Q:
                # only needed if quals fields are array members, can check that TODO
                postQ = self.build_spec(quals, False)
                if Q != postQ:
                    pipe.append({"$match": postQ})
            if self.debug: log2pg('Calling aggregate with {} stage pipe {} '.format(len(pipe), pipe))
            cur = self.coll.aggregate(pipe, cursor={})
        else:
            if self.debug: log2pg('Calling find')
            cur = self.coll.find(Q, fields)
        if self.debug: t1 = time.time()
        if self.debug: docCount = 0
        if self.debug: log2pg('cur is returned {} with total {} so far'.format(cur, t1 - t0))
        for doc in cur:
            doc.update(eqfields)
            yield dict([(col, dict_traverser(self.fields[col]['path'], doc))
                        for col in columns])
            if self.debug: docCount = docCount + 1
        if self.debug: t2 = time.time()
        if self.debug: log2pg('Python rows {} Python_duration {} {} {}ms'.format(docCount, (t1 - t0) * 1000, (t2 - t1) * 1000, (t2 - t0) * 1000))
def __init__(self, options, columns):
    super(Yamfdw, self).__init__(options, columns)
    self.host_name = options.get('host', 'localhost')
    self.port = int(options.get('port', '27017'))
    self.user = options.get('user')
    self.password = options.get('password')
    self.db_name = options.get('db', 'test')
    self.collection_name = options.get('collection', 'test')
    self.conn = MongoClient(host=self.host_name, port=self.port)
    self.auth_db = options.get('auth_db', self.db_name)
    if self.user:
        self.conn.userprofile.authenticate(self.user, self.password,
                                           source=self.auth_db)
    self.db = getattr(self.conn, self.db_name)
    self.coll = getattr(self.db, self.collection_name)
    self.debug = options.get('debug', False)

    # if we need to validate or transform any fields this is a place to do it
    # we need column definitions for types to validate we're passing back correct types
    # self.db.add_son_manipulator(Transform(columns))
    if self.debug: log2pg('collection cols: {}'.format(columns))

    self.stats = self.db.command("collstats", self.collection_name)
    self.count = self.stats["count"]
    if self.debug: log2pg('self.stats: {} '.format(self.stats))

    self.indexes = {}
    if self.stats["nindexes"] > 1:
        indexdict = self.coll.index_information()
        self.indexes = dict([(idesc['key'][0][0], idesc.get('unique', False))
                             for iname, idesc in indexdict.iteritems()])
    if self.debug: log2pg('self.indexes: {} '.format(self.indexes))

    self.fields = dict([(col, {'formatter': coltype_formatter(coldef.type_name,
                                                              coldef.options.get('type', None)),
                               'options': coldef.options,
                               'path': col.split('.')})
                        for (col, coldef) in columns.items()])
    if self.debug: log2pg('self.fields: {} \n columns.items {}'.format(self.fields, columns.items()))

    self.pipe = options.get('pipe')
    if self.pipe:
        self.pipe = json.loads(self.pipe)
        if self.debug: log2pg('pipe is {}'.format(self.pipe))
    else:
        self.pkeys = [(('_id',), 1), ]
        for f in self.fields:
            # calculate selectivity of each field (once per session)
            if f == '_id':
                continue
            # check for unique indexes and set those to 1
            if f in self.indexes and self.indexes.get(f):
                self.pkeys.append(((f,), 1))
            elif f in self.indexes:
                self.pkeys.append(((f,), min((self.count / 10), 1000)))
            else:
                self.pkeys.append(((f,), self.count))
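# Illustration of the selectivity hints built above (hypothetical column
# names and numbers): with count == 100000, a unique index on 'email', a
# non-unique index on 'age', and an unindexed column 'city', the loop yields:
#   self.pkeys == [(('_id',), 1),
#                  (('email',), 1),         # unique index: one row
#                  (('age',), 1000),        # min(100000 / 10, 1000)
#                  (('city',), 100000)]     # unindexed: assume full count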