def _element_to_bson(key, value, check_keys): if not isinstance(key, basestring): raise InvalidDocument("documents must have only string keys, " "key was %r" % key) if check_keys: if key.startswith("$"): raise InvalidDocument("key %r must not start with '$'" % key) if "." in key: raise InvalidDocument("key %r must not contain '.'" % key) name = _make_c_string(key, True) if isinstance(value, float): return "\x01" + name + struct.pack("<d", value) # Use Binary w/ subtype 3 for UUID instances try: import uuid if isinstance(value, uuid.UUID): value = Binary(value.bytes, subtype=3) except ImportError: pass if isinstance(value, Binary): subtype = value.subtype if subtype == 2: value = struct.pack("<i", len(value)) + value return "\x05%s%s%s%s" % (name, struct.pack("<i", len(value)), chr(subtype), value) if isinstance(value, Code): cstring = _make_c_string(value) scope = _dict_to_bson(value.scope, False, False) full_length = struct.pack("<i", 8 + len(cstring) + len(scope)) length = struct.pack("<i", len(cstring)) return "\x0F" + name + full_length + length + cstring + scope if isinstance(value, str): cstring = _make_c_string(value) length = struct.pack("<i", len(cstring)) return "\x02" + name + length + cstring if isinstance(value, unicode): cstring = _make_c_string(value) length = struct.pack("<i", len(cstring)) return "\x02" + name + length + cstring if isinstance(value, dict): return "\x03" + name + _dict_to_bson(value, check_keys, False) if isinstance(value, (list, tuple)): as_dict = SON(zip([str(i) for i in range(len(value))], value)) return "\x04" + name + _dict_to_bson(as_dict, check_keys, False) if isinstance(value, ObjectId): return "\x07" + name + value.binary if value is True: return "\x08" + name + "\x01" if value is False: return "\x08" + name + "\x00" if isinstance(value, int): # TODO this is an ugly way to check for this... if value > MAX_INT64 or value < MIN_INT64: raise OverflowError("BSON can only handle up to 8-byte ints") if value > MAX_INT32 or value < MIN_INT32: return "\x12" + name + struct.pack("<q", value) return "\x10" + name + struct.pack("<i", value) if isinstance(value, long): # XXX No long type in Python 3 if value > MAX_INT64 or value < MIN_INT64: raise OverflowError("BSON can only handle up to 8-byte ints") return "\x12" + name + struct.pack("<q", value) if isinstance(value, datetime.datetime): if value.utcoffset() is not None: value = value - value.utcoffset() millis = int(calendar.timegm(value.timetuple()) * 1000 + value.microsecond / 1000) return "\x09" + name + struct.pack("<q", millis) if isinstance(value, Timestamp): time = struct.pack("<I", value.time) inc = struct.pack("<I", value.inc) return "\x11" + name + inc + time if value is None: return "\x0A" + name if isinstance(value, RE_TYPE): pattern = value.pattern flags = "" if value.flags & re.IGNORECASE: flags += "i" if value.flags & re.LOCALE: flags += "l" if value.flags & re.MULTILINE: flags += "m" if value.flags & re.DOTALL: flags += "s" if value.flags & re.UNICODE: flags += "u" if value.flags & re.VERBOSE: flags += "x" return "\x0B" + name + _make_c_string(pattern, True) + \ _make_c_string(flags) if isinstance(value, DBRef): return _element_to_bson(key, value.as_doc(), False) if isinstance(value, MinKey): return "\xFF" + name if isinstance(value, MaxKey): return "\x7F" + name raise InvalidDocument("cannot convert value of type %s to bson" % type(value))
def save(self):
    data = SON()
    data.update(self._data)
    self.db().insert(data)
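# Minimal usage sketch (an assumption, not from the original source): a document
# class that keeps its fields in `self._data` and exposes its collection through
# `db()`, mirroring the save() pattern above. The snippet's insert() is the
# legacy driver call; PyMongo 3+ exposes insert_one() instead.
from bson.son import SON


class Record(object):
    def __init__(self, collection, **fields):
        self._collection = collection
        self._data = fields

    def db(self):
        # The pymongo collection this record is stored in.
        return self._collection

    def save(self):
        data = SON()                 # keeps field order in the stored document
        data.update(self._data)
        self.db().insert_one(data)   # insert_one() is the PyMongo 3+ equivalent

# record = Record(some_collection, name="a", value=1)
# record.save()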
def charges(): client = pymongo.MongoClient(os.environ['MONGO_URI']) db = client.va_circuit_court charges = db.criminal_cases.aggregate([{ '$group': { '_id': { 'CodeSection': '$CodeSection', 'Race': '$Race' }, 'charge': { '$first': '$Charge' }, 'court': { '$first': '$Court' }, 'caseNumber': { '$first': '$CaseNumber' }, 'avgSentence': { '$avg': '$SentenceTimeDays' }, 'avgSentenceSuspended': { '$avg': '$SentenceSuspendedDays' }, 'count': { '$sum': 1 } } }, { '$group': { '_id': { 'CodeSection': '$_id.CodeSection' }, 'races': { '$push': { 'race': '$_id.Race', 'avgSentence': '$avgSentence', 'avgSentenceSuspended': '$avgSentenceSuspended', 'count': '$count' } }, 'count': { '$sum': '$count' }, 'avgSentence': { '$avg': '$avgSentence' }, 'avgSentenceSuspended': { '$avg': '$avgSentenceSuspended' }, 'charge': { '$first': '$charge' }, 'court': { '$first': '$court' }, 'caseNumber': { '$first': '$caseNumber' } } }, { '$match': { 'count': { '$gt': 50 } } }, { '$sort': SON([('_id.CodeSection', 1)]) }])['result'] charges_amended = db.criminal_cases.aggregate([{ '$match': { 'AmendedCharge': { '$ne': None } } }, { '$group': { '_id': { 'CodeSection': '$CodeSection', 'Race': '$Race' }, 'charge': { '$first': '$Charge' }, 'court': { '$first': '$Court' }, 'caseNumber': { '$first': '$CaseNumber' }, 'avgSentence': { '$avg': '$SentenceTimeDays' }, 'avgSentenceSuspended': { '$avg': '$SentenceSuspendedDays' }, 'count': { '$sum': 1 } } }, { '$group': { '_id': { 'CodeSection': '$_id.CodeSection' }, 'races': { '$push': { 'race': '$_id.Race', 'avgSentence': '$avgSentence', 'avgSentenceSuspended': '$avgSentenceSuspended', 'count': '$count' } }, 'count': { '$sum': '$count' }, 'avgSentence': { '$avg': '$avgSentence' }, 'avgSentenceSuspended': { '$avg': '$avgSentenceSuspended' }, 'charge': { '$first': '$charge' }, 'court': { '$first': '$court' }, 'caseNumber': { '$first': '$caseNumber' } } }, { '$sort': SON([('_id.CodeSection', 1)]) }])['result'] for charge in charges: charge['amended'] = { 'count': 0, 'avgSentence': 0, 'avgSentenceSuspended': 0, 'races': [] } for charge_amended in charges_amended: if charge_amended['_id']['CodeSection'] == charge['_id'][ 'CodeSection']: charge['amended'] = charge_amended break charge['races_dict'] = { 'White Caucasian (Non-Hispanic)': { 'count': 0, 'avgSentence': 0, 'avgSentenceSuspended': 0 }, 'Black (Non-Hispanic)': { 'count': 0, 'avgSentence': 0, 'avgSentenceSuspended': 0 } } charge['amended']['races_dict'] = { 'White Caucasian (Non-Hispanic)': { 'count': 0, 'avgSentence': 0, 'avgSentenceSuspended': 0 }, 'Black (Non-Hispanic)': { 'count': 0, 'avgSentence': 0, 'avgSentenceSuspended': 0 } } for race in charge['races']: if 'race' in race: charge['races_dict'][race['race']] = race for race in charge['amended']['races']: if 'race' in race: charge['amended']['races_dict'][race['race']] = race return render_template('charges.html', charges=charges, charges_amended=charges_amended)
def main(uri): client = MongoClient(uri) #connect to database db = client.get_default_database() #by Email: pipeline = [{ "$match": { "E-mail Address": { "$nin": ["null", "?"] }, } }, { "$group": { "_id": { "E-mail Address": "$E-mail Address" }, "uniqueIds": { "$addToSet": "$_id" }, "count": { "$sum": 1 } } }, { "$match": { "count": { "$gt": 1 } } }, { "$sort": SON([("count", -1), ("_id", -1)]) }] ''' #by First AND Last Name pipeline = [ {"$match":{"First Name":{"$nin":["null","?"]},"Last Name":{"$nin":["null","?"]}}}, {"$group":{"_id":{"First Name":"$First Name", "Last Name":"$Last Name"},"uniqueIds":{"$addToSet":"$_id"},"count": {"$sum": 1}}}, {"$match":{"count": {"$gt": 1}}}, {"$sort": SON([("count", -1), ("_id", -1)])} ] ''' #send data to list data = list(db.cleancontacts.aggregate(pipeline)) #db.command('aggregate', 'contacts', pipeline=pipeline, explain=True) #pprint.pprint(data) #put it in a json object json_string = dumps(data, json_options=RELAXED_JSON_OPTIONS) json_data = json.loads(json_string) #print(json_data) new_ids = [] num_iter = 0 error_count = 0 #iterate over json objects for contact in json_data: num_iter += 1 ids = contact["uniqueIds"] #print(ids) ids_to_merge = [] #iterate over id values in json object contact for id in ids: id_value = id["$oid"] #print(id_value) #send ids to an array ids_to_merge.append(id_value) #get the first object from the id array, send to json aggregated_contact_string = dumps(db.cleancontacts.find_one( {"_id": ObjectId(ids_to_merge[0])}), json_options=RELAXED_JSON_OPTIONS) aggregated_contact = json.loads(aggregated_contact_string) #pprint.pprint(aggregated_contact) for id in ids_to_merge[1:]: #get data from contacts collection, iterating by id over the array contact_data = db.cleancontacts.find_one({"_id": ObjectId(id)}) #pprint.pprint(contact_data) count = 0 for field in contact_data: #iterate over each field and append to json object aggregated_contact, in the correct field, if necessary try: #print(field, aggregated_contact[field], contact_data[field]) if aggregated_contact[field] == None or aggregated_contact[ field] == "null" or aggregated_contact[ field] == "" or bool( re.search(str(contact_data[field]), str(aggregated_contact[field]))): aggregated_contact[field] = contact_data[field] else: aggregated_contact[field] = str( aggregated_contact[field]) + ', ' + str( contact_data[field]) except: error_count += 1 #get rid of id fields in json object aggregated_contact.pop('_id', None) #pprint.pprint(aggregated_contact) #delete old contacts (now aggregated) for id in ids_to_merge: db.cleancontacts.delete_one({"_id": ObjectId(id)}) #send new json object to database post_id = db.cleancontacts.insert_one(aggregated_contact).inserted_id #print(post_id) new_ids.append(str(post_id)) #db.cleancontacts.delete_one({"_id":ObjectId(post_id)}) #print out information about post if num_iter % 10 == 0: print("Created " + str(num_iter) + " aggregates from the duplicates.") print("Deleted " + str(num_iter) + " duplicate contacts, replaced with " + str(len(new_ids)) + " new aggregates. IDs: " + ' '.join(str(value) for value in new_ids) + "key errors: " + str(error_count))
def run_query(self, query, user): db = self._get_db() logger.debug( "mongodb connection string: %s", self.configuration["connectionString"] ) logger.debug("mongodb got query: %s", query) try: query_data = parse_query_json(query) except ValueError: return None, "Invalid query format. The query is not a valid JSON." if "collection" not in query_data: return None, "'collection' must have a value to run a query" else: collection = query_data["collection"] q = query_data.get("query", None) f = None aggregate = query_data.get("aggregate", None) if aggregate: for step in aggregate: if "$sort" in step: sort_list = [] for sort_item in step["$sort"]: sort_list.append((sort_item["name"], sort_item["direction"])) step["$sort"] = SON(sort_list) if "fields" in query_data: f = query_data["fields"] s = None if "sort" in query_data and query_data["sort"]: s = [] for field_data in query_data["sort"]: s.append((field_data["name"], field_data["direction"])) columns = [] rows = [] cursor = None if q or (not q and not aggregate): if s: cursor = db[collection].find(q, f).sort(s) else: cursor = db[collection].find(q, f) if "skip" in query_data: cursor = cursor.skip(query_data["skip"]) if "limit" in query_data: cursor = cursor.limit(query_data["limit"]) if "count" in query_data: cursor = cursor.count() elif aggregate: allow_disk_use = query_data.get("allowDiskUse", False) r = db[collection].aggregate(aggregate, allowDiskUse=allow_disk_use) # Backwards compatibility with older pymongo versions. # # Older pymongo version would return a dictionary from an aggregate command. # The dict would contain a "result" key which would hold the cursor. # Newer ones return pymongo.command_cursor.CommandCursor. if isinstance(r, dict): cursor = r["result"] else: cursor = r if "count" in query_data: columns.append( {"name": "count", "friendly_name": "count", "type": TYPE_INTEGER} ) rows.append({"count": cursor}) else: rows, columns = parse_results(cursor) if f: ordered_columns = [] for k in sorted(f, key=f.get): column = _get_column_by_name(columns, k) if column: ordered_columns.append(column) columns = ordered_columns if query_data.get("sortColumns"): reverse = query_data["sortColumns"] == "desc" columns = sorted(columns, key=lambda col: col["name"], reverse=reverse) data = {"columns": columns, "rows": rows} error = None json_data = json_dumps(data, cls=MongoDBJSONEncoder) return json_data, error
def _authenticate_gssapi(credentials, sock_info): """Authenticate using GSSAPI. """ if not HAVE_KERBEROS: raise ConfigurationError('The "kerberos" module must be ' 'installed to use GSSAPI authentication.') try: username = credentials.username password = credentials.password props = credentials.mechanism_properties # Starting here and continuing through the while loop below - establish # the security context. See RFC 4752, Section 3.1, first paragraph. host = sock_info.address[0] if props.canonicalize_host_name: host = _canonicalize_hostname(host) service = props.service_name + '@' + host if props.service_realm is not None: service = service + '@' + props.service_realm if password is not None: if _USE_PRINCIPAL: # Note that, though we use unquote_plus for unquoting URI # options, we use quote here. Microsoft's UrlUnescape (used # by WinKerberos) doesn't support +. principal = ":".join((quote(username), quote(password))) result, ctx = kerberos.authGSSClientInit( service, principal, gssflags=kerberos.GSS_C_MUTUAL_FLAG) else: if '@' in username: user, domain = username.split('@', 1) else: user, domain = username, None result, ctx = kerberos.authGSSClientInit( service, gssflags=kerberos.GSS_C_MUTUAL_FLAG, user=user, domain=domain, password=password) else: result, ctx = kerberos.authGSSClientInit( service, gssflags=kerberos.GSS_C_MUTUAL_FLAG) if result != kerberos.AUTH_GSS_COMPLETE: raise OperationFailure('Kerberos context failed to initialize.') try: # pykerberos uses a weird mix of exceptions and return values # to indicate errors. # 0 == continue, 1 == complete, -1 == error # Only authGSSClientStep can return 0. if kerberos.authGSSClientStep(ctx, '') != 0: raise OperationFailure('Unknown kerberos ' 'failure in step function.') # Start a SASL conversation with mongod/s # Note: pykerberos deals with base64 encoded byte strings. # Since mongo accepts base64 strings as the payload we don't # have to use bson.binary.Binary. payload = kerberos.authGSSClientResponse(ctx) cmd = SON([('saslStart', 1), ('mechanism', 'GSSAPI'), ('payload', payload), ('autoAuthorize', 1)]) response = sock_info.command('$external', cmd) # Limit how many times we loop to catch protocol / library issues for _ in range(10): result = kerberos.authGSSClientStep(ctx, str(response['payload'])) if result == -1: raise OperationFailure('Unknown kerberos ' 'failure in step function.') payload = kerberos.authGSSClientResponse(ctx) or '' cmd = SON([('saslContinue', 1), ('conversationId', response['conversationId']), ('payload', payload)]) response = sock_info.command('$external', cmd) if result == kerberos.AUTH_GSS_COMPLETE: break else: raise OperationFailure('Kerberos ' 'authentication failed to complete.') # Once the security context is established actually authenticate. # See RFC 4752, Section 3.1, last two paragraphs. if kerberos.authGSSClientUnwrap(ctx, str( response['payload'])) != 1: raise OperationFailure('Unknown kerberos ' 'failure during GSS_Unwrap step.') if kerberos.authGSSClientWrap( ctx, kerberos.authGSSClientResponse(ctx), username) != 1: raise OperationFailure('Unknown kerberos ' 'failure during GSS_Wrap step.') payload = kerberos.authGSSClientResponse(ctx) cmd = SON([('saslContinue', 1), ('conversationId', response['conversationId']), ('payload', payload)]) sock_info.command('$external', cmd) finally: kerberos.authGSSClientClean(ctx) except kerberos.KrbError as exc: raise OperationFailure(str(exc))
def searchMongoAlerts(mozdefdb): attackers = mozdefdb['attackers'] alerts = mozdefdb['alerts'] # search the last X alerts for IP addresses # aggregated by CIDR mask/24 # aggregate IPv4 addresses in the most recent alerts # to find common attackers. ipv4TopHits = alerts.aggregate([ { "$sort": { "utcepoch": -1 } }, # reverse sort the current alerts { "$limit": 100 }, #most recent 100 { "$match": { "events.documentsource.details.sourceipaddress": { "$exists": True } } }, # must have an ip address { "$match": { "attackerid": { "$exists": False } } }, # must not be already related to an attacker { "$group": { "_id": { "ipaddress": "$events.documentsource.details.sourceipaddress" } } }, # grab ip address from the events { "$unwind": "$_id.ipaddress" }, # separate all ips from their alerts { "$group": { "_id": "$_id.ipaddress", "hitcount": { "$sum": 1 } } }, # count by ip { "$match": { "hitcount": { "$gt": 10 } } }, # limit to those with 10 observances { "$sort": SON([("hitcount", -1), ("_id", -1)]) }, # sort { "$limit": 10 } # top 10 ]) for ip in ipv4TopHits['result']: if netaddr.valid_ipv4(ip['_id']): ipcidr = netaddr.IPNetwork(ip['_id']) # expand it to a /24 CIDR # todo: lookup ipwhois for asn_cidr value # potentially with a max mask value (i.e. asn is /8, limit attackers to /24) ipcidr.prefixlen = 24 # append to or create attacker. # does this match an existing attacker's indicators if not ipcidr.ip.is_loopback() and not ipcidr.ip.is_private( ) and not ipcidr.ip.is_reserved(): logger.debug('searching for alert ip ' + str(ipcidr)) attacker = attackers.find_one( {'indicators.ipv4address': str(ipcidr)}) if attacker is None: # new attacker # generate a meteor-compatible ID # save the ES document type, index, id # and add a sub list for future events logger.debug('new attacker from alerts') newAttacker = genNewAttacker() # str to get the ip/cidr rather than netblock cidr. # i.e. '1.2.3.4/24' not '1.2.3.0/24' newAttacker['indicators'].append( dict(ipv4address=str(ipcidr))) matchingalerts = alerts.find({ "events.documentsource.details.sourceipaddress": str(ipcidr.ip), }) if matchingalerts is not None: # update list of alerts this attacker matched. for alert in matchingalerts: newAttacker['alerts'].append( dict(alertid=alert['_id'])) # update alert with attackerID alert['attackerid'] = newAttacker['_id'] alerts.save(alert) #add the events from this alert: #add the events from this alert: for e in alert['events']: newAttacker['events'].append(e) newAttacker['alertscount'] = len(newAttacker['alerts']) newAttacker['eventscount'] = len(newAttacker['events']) if newAttacker['eventscount'] > 0: newAttacker['lastseentimestamp'] = toUTC( newAttacker['events'][-1]['documentsource'] ['utctimestamp'], 'UTC') attackers.insert(newAttacker) #upate geoIP info latestGeoIP = [ a['events'] for a in alerts.find({ "events.documentsource.details.sourceipaddress": str(ipcidr.ip), }) ][-1][0]['documentsource'] updateAttackerGeoIP(mozdefdb, newAttacker['_id'], latestGeoIP) else: logger.debug('found existing attacker in alerts') # if alert not present in this attackers list # append this to the list # todo: trim the list at X (i.e. last 100) # search alerts without attackerid matchingalerts = alerts.find({ "events.documentsource.details.sourceipaddress": str(ipcidr.ip), "attackerid": { "$exists": False } }) if matchingalerts is not None: #attacker['eventscount'] = len(attacker['events']) logger.debug('matched alert with attacker') # update list of alerts this attacker matched. 
for alert in matchingalerts: attacker['alerts'].append( dict(alertid=alert['_id'])) # update alert with attackerID alert['attackerid'] = attacker['_id'] alerts.save(alert) #add the events from this alert: for e in alert['events']: attacker['events'].append(e) # geo ip could have changed, update it # to the latest updateAttackerGeoIP( mozdefdb, attacker['_id'], alert['events'][-1]['documentsource']) # update last seen time attacker['lastseentimestamp'] = toUTC( attacker['events'][-1]['documentsource'] ['utctimestamp'], 'UTC') # update counts attacker['alertscount'] = len(attacker['alerts']) attacker['eventscount'] = len(attacker['events']) attackers.save(attacker)
def __query_spec(self): """Get the spec to use for a query. """ operators = {} if self.__ordering: operators["$orderby"] = self.__ordering if self.__explain: operators["$explain"] = True if self.__hint: operators["$hint"] = self.__hint if self.__snapshot: operators["$snapshot"] = True if self.__max_scan: operators["$maxScan"] = self.__max_scan if self.__collection.database.connection.is_mongos: read_pref = { 'mode': read_preferences.mongos_mode(self.__read_preference)} if self.__tag_sets and self.__tag_sets != [{}]: read_pref['tags'] = self.__tag_sets operators['$readPreference'] = read_pref if operators: # Make a shallow copy so we can cleanly rewind or clone. spec = self.__spec.copy() # Only commands that can be run on secondaries should have any # operators added to the spec. Command queries can be issued # by db.command or calling find_one on $cmd directly is_cmd = self.collection.name == "$cmd" if is_cmd: # Don't change commands that can't be sent to secondaries command_name = spec.keys()[0].lower() if command_name not in secondary_ok_commands: return spec elif command_name == 'mapreduce': # mapreduce shouldn't be changed if its not inline out = spec.get('out') if not isinstance(out, dict) or not out.get('inline'): return spec elif "$query" not in spec: # $query has to come first spec = SON({"$query": spec}) if not isinstance(spec, SON): # Ensure the spec is SON. As order is important this will # ensure its set before merging in any extra operators. spec = SON(spec) spec.update(operators) return spec # Have to wrap with $query if "query" is the first key. # We can't just use $query anytime "query" is a key as # that breaks commands like count and find_and_modify. # Checking spec.keys()[0] covers the case that the spec # was passed as an instance of SON or OrderedDict. elif ("query" in self.__spec and (len(self.__spec) == 1 or self.__spec.keys()[0] == "query")): return SON({"$query": self.__spec}) return self.__spec
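# Standalone illustration (not part of the cursor class above) of how a plain
# filter is merged with query modifiers while keeping "$query" in front, using
# nothing but bson.son.SON. The field names below are invented for the example.
from bson.son import SON

spec = {"status": "A"}                          # plain filter
operators = {"$orderby": SON([("qty", -1)]),    # sort modifier
             "$maxScan": 100}

wrapped = SON({"$query": spec})   # "$query" has to come first
wrapped.update(operators)

# list(wrapped)[0] == "$query"  -- the filter stays ahead of the modifiers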
def test_create():
    cmd = parse_spec(SON([("create", "foo")]))
    assert cmd.name == "create"
    assert cmd.coll == "foo"
    assert cmd.tags == {}
    assert cmd.metrics == {}
6: "$hashNum", 7: "$date" } pipe = [{ "$match": { "user": user_num } }, { "$group": { "_id": metric_dict[metric], "count": { "$sum": 1 } } }, { "$sort": SON([("_id", -1)]) }] result = db.command('aggregate', 'calls', pipeline=pipe) bins = {} for each_bin in result['result']: bins[each_bin['_id']] = each_bin['count'] print bins if metric is 7: times = bins.keys() if not times: pass elif group_time is 0: # group by hour plt.hist([t.hour for t in times], bins=24) # to bin by hour plt.title('Histogram of call times by hour for user #{num}'.format(
def __init__(self, args, ver):
    '''
    Collects the attributes needed by the collection-splitting-by-chromosome
    function, which is built for multi-process execution. These attributes
    must never be modified later inside the parallel processes. They are
    derived mostly from the arguments supplied by the researcher.

    A few non-obvious but important details about the attributes.

    Quasi-extension of the collections: it is needed, at the very least, to
    determine the sorting rules and the formatting of the output files.

    Sorting of src-db-VCF and src-db-BED: it is done by coordinates so that
    the output tables can be tabix-indexed.

    Projection (field selection): for src-db-VCF it is extremely hard to
    implement, because the corresponding collections contain assorted nested
    structures and MongoDB forbids dot notation when addressing the array
    elements being selected. As for src-db-BED, once only a subset of the
    fields is kept, compliance with the BED format specification can no
    longer be guaranteed, so the output is produced as nothing more than
    plain tab-separated text (trg-(db-)TSV).
    '''
    client = MongoClient()
    self.src_db_name = args.src_db_name
    self.src_coll_names = client[self.src_db_name].list_collection_names()
    src_coll_ext = self.src_coll_names[0].rsplit('.', maxsplit=1)[1]
    if '/' in args.trg_place:
        self.trg_dir_path = os.path.normpath(args.trg_place)
    elif args.trg_place != self.src_db_name:
        self.trg_db_name = args.trg_place
        resolve_db_existence(self.trg_db_name)
    else:
        raise DbAlreadyExistsError()
    if src_coll_ext == 'vcf':
        self.chrom_field_name = '#CHROM'
    elif src_coll_ext == 'bed':
        self.chrom_field_name = 'chrom'
    elif args.chrom_field_name is None:
        self.chrom_field_name = list(client[self.src_db_name][self.src_coll_names[0]].find_one())[1]
    else:
        self.chrom_field_name = args.chrom_field_name
    self.mongo_aggr_draft = [{'$match': {self.chrom_field_name: None}}]
    if src_coll_ext == 'vcf':
        self.mongo_aggr_draft.append({'$sort': SON([('#CHROM', ASCENDING),
                                                    ('POS', ASCENDING)])})
    elif src_coll_ext == 'bed':
        self.mongo_aggr_draft.append({'$sort': SON([('chrom', ASCENDING),
                                                    ('start', ASCENDING),
                                                    ('end', ASCENDING)])})
    if args.proj_fields is None or src_coll_ext == 'vcf':
        self.mongo_findone_args = [None, None]
        self.trg_file_fmt = src_coll_ext
    else:
        mongo_project = {field_name: 1 for field_name in args.proj_fields.split(',')}
        self.mongo_aggr_draft.append({'$project': mongo_project})
        self.mongo_findone_args = [None, mongo_project]
        self.trg_file_fmt = 'tsv'
    if args.sec_delimiter == 'colon':
        self.sec_delimiter = ':'
    elif args.sec_delimiter == 'comma':
        self.sec_delimiter = ','
    elif args.sec_delimiter == 'low_line':
        self.sec_delimiter = '_'
    elif args.sec_delimiter == 'pipe':
        self.sec_delimiter = '|'
    elif args.sec_delimiter == 'semicolon':
        self.sec_delimiter = ';'
    if args.ind_field_names is None:
        self.ind_field_names = args.ind_field_names
    else:
        self.ind_field_names = args.ind_field_names.split(',')
    self.ver = ver
    client.close()
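# Hedged sketch of how the prepared `mongo_aggr_draft` might be consumed for a
# single chromosome. The actual per-chromosome worker is not part of this
# snippet; the function name and arguments below are placeholders.
import copy

from pymongo import MongoClient


def fetch_one_chrom(src_db_name, src_coll_name, chrom_field_name,
                    mongo_aggr_draft, chrom):
    client = MongoClient()
    # Work on a copy so the shared draft is never mutated across processes.
    mongo_aggr = copy.deepcopy(mongo_aggr_draft)
    mongo_aggr[0]['$match'][chrom_field_name] = chrom
    docs = list(client[src_db_name][src_coll_name].aggregate(mongo_aggr))
    client.close()
    return docs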
    del df['_id']

    return df


def upload3(id, j):
    step0 = time.time()
    frames = []
    for i in range(1, 50):
        frames.append(read_mongo(sensors, {"id": id}, page_num=i))
    step1 = time.time()
    df = pd.concat(frames)
    print j, ": Data loaded... (%ss)" % (round((step1 - step0), 1))
    return df


pipeline = [
    {"$match": {"id": int(id), "ts": {"$gt": install_date}}},
    {"$sort": SON([("ts", 1)])}
]
sensor_data = sensors.aggregate(pipeline, allowDiskUse=True)


def read_mongo(collection, chunksize=1000, page_num=1, no_id=True):
    # Calculate number of documents to skip
    skips = chunksize * (page_num - 1)
    print skips
    # Sorry, this is in spanish
    # https://www.toptal.com/python/c%C3%B3digo-buggy-python-los-10-errores-m%C3%A1s-comunes-que-cometen-los-desarrolladores-python/es
    # Make a query to the specific DB and Collection
    pipeline = [
        {"$match": {"id": 209, "ts": {"$gt": install_date}}},
        {"$skip": skips},
        {"$limit": chunksize},
def _execute_command( self, generator, write_concern, session, sock_info, op_id, retryable, full_result, final_write_concern=None, ): db_name = self.collection.database.name client = self.collection.database.client listeners = client._event_listeners if not self.current_run: self.current_run = next(generator) self.next_run = None run = self.current_run # sock_info.command validates the session, but we use # sock_info.write_command. sock_info.validate_session(client, session) last_run = False while run: if not self.retrying: self.next_run = next(generator, None) if self.next_run is None: last_run = True cmd_name = _COMMANDS[run.op_type] bwc = self.bulk_ctx_class( db_name, cmd_name, sock_info, op_id, listeners, session, run.op_type, self.collection.codec_options, ) while run.idx_offset < len(run.ops): # If this is the last possible operation, use the # final write concern. if last_run and (len(run.ops) - run.idx_offset) == 1: write_concern = final_write_concern or write_concern cmd = SON([(cmd_name, self.collection.name), ("ordered", self.ordered)]) if self.comment: cmd["comment"] = self.comment if not write_concern.is_server_default: cmd["writeConcern"] = write_concern.document if self.bypass_doc_val: cmd["bypassDocumentValidation"] = True if self.let is not None and run.op_type in (_DELETE, _UPDATE): cmd["let"] = self.let if session: # Start a new retryable write unless one was already # started for this command. if retryable and not self.started_retryable_write: session._start_retryable_write() self.started_retryable_write = True session._apply_to(cmd, retryable, ReadPreference.PRIMARY, sock_info) sock_info.send_cluster_time(cmd, session, client) sock_info.add_server_api(cmd) ops = islice(run.ops, run.idx_offset, None) # Run as many ops as possible in one command. if write_concern.acknowledged: result, to_send = bwc.execute(cmd, ops, client) # Retryable writeConcernErrors halt the execution of this run. wce = result.get("writeConcernError", {}) if wce.get("code", 0) in _RETRYABLE_ERROR_CODES: # Synthesize the full bulk result without modifying the # current one because this write operation may be retried. full = copy.deepcopy(full_result) _merge_command(run, full, run.idx_offset, result) _raise_bulk_write_error(full) _merge_command(run, full_result, run.idx_offset, result) # We're no longer in a retry once a command succeeds. self.retrying = False self.started_retryable_write = False if self.ordered and "writeErrors" in result: break else: to_send = bwc.execute_unack(cmd, ops, client) run.idx_offset += len(to_send) # We're supposed to continue if errors are # at the write concern level (e.g. wtimeout) if self.ordered and full_result["writeErrors"]: break # Reset our state self.current_run = run = self.next_run
def add_delete(self, selector, limit):
    """Create a delete document and add it to the list of ops."""
    cmd = SON([('q', selector), ('limit', limit)])
    self.ops.append((_DELETE, cmd))
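# Standalone illustration (not part of the bulk class above) of the
# sub-document add_delete() builds and where it ends up in the wire command;
# the filter below is invented for the example.
from bson.son import SON

delete_doc = SON([('q', {'status': 'D'}), ('limit', 1)])
# Inside the server's delete command this is sent as:
#   {"delete": "<collection>", "ordered": True, "deletes": [delete_doc]}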
async def test_query_array_of_documents(self): db = self.db # Start Example 29 # Subdocument key order matters in a few of these examples so we have # to use bson.son.SON instead of a Python dict. from bson.son import SON await db.inventory.insert_many([{ "item": "journal", "instock": [ SON([("warehouse", "A"), ("qty", 5)]), SON([("warehouse", "C"), ("qty", 15)]) ] }, { "item": "notebook", "instock": [SON([("warehouse", "C"), ("qty", 5)])] }, { "item": "paper", "instock": [ SON([("warehouse", "A"), ("qty", 60)]), SON([("warehouse", "B"), ("qty", 15)]) ] }, { "item": "planner", "instock": [ SON([("warehouse", "A"), ("qty", 40)]), SON([("warehouse", "B"), ("qty", 5)]) ] }, { "item": "postcard", "instock": [ SON([("warehouse", "B"), ("qty", 15)]), SON([("warehouse", "C"), ("qty", 35)]) ] }]) # End Example 29 # Start Example 30 cursor = db.inventory.find( {"instock": SON([("warehouse", "A"), ("qty", 5)])}) # End Example 30 self.assertEqual(await count(cursor), 1) # Start Example 31 cursor = db.inventory.find( {"instock": SON([("qty", 5), ("warehouse", "A")])}) # End Example 31 self.assertEqual(await count(cursor), 0) # Start Example 32 cursor = db.inventory.find({'instock.0.qty': {"$lte": 20}}) # End Example 32 self.assertEqual(await count(cursor), 3) # Start Example 33 cursor = db.inventory.find({'instock.qty': {"$lte": 20}}) # End Example 33 self.assertEqual(await count(cursor), 5) # Start Example 34 cursor = db.inventory.find( {"instock": { "$elemMatch": { "qty": 5, "warehouse": "A" } }}) # End Example 34 self.assertEqual(await count(cursor), 1) # Start Example 35 cursor = db.inventory.find( {"instock": { "$elemMatch": { "qty": { "$gt": 10, "$lte": 20 } } }}) # End Example 35 self.assertEqual(await count(cursor), 3) # Start Example 36 cursor = db.inventory.find({"instock.qty": {"$gt": 10, "$lte": 20}}) # End Example 36 self.assertEqual(await count(cursor), 4) # Start Example 37 cursor = db.inventory.find({ "instock.qty": 5, "instock.warehouse": "A" }) # End Example 37 self.assertEqual(await count(cursor), 2)
def test_empty():
    cmd = parse_spec(SON([]))
    assert cmd is None
def _authenticate_scram(credentials, sock_info, mechanism): """Authenticate using SCRAM.""" username = credentials.username if mechanism == 'SCRAM-SHA-256': digest = "sha256" digestmod = hashlib.sha256 data = saslprep(credentials.password).encode("utf-8") else: digest = "sha1" digestmod = hashlib.sha1 data = _password_digest(username, credentials.password).encode("utf-8") source = credentials.source cache = credentials.cache # Make local _hmac = hmac.HMAC ctx = sock_info.auth_ctx.get(credentials) if ctx and ctx.speculate_succeeded(): nonce, first_bare = ctx.scram_data res = ctx.speculative_authenticate else: nonce, first_bare, cmd = _authenticate_scram_start( credentials, mechanism) res = sock_info.command(source, cmd) server_first = res['payload'] parsed = _parse_scram_response(server_first) iterations = int(parsed[b'i']) if iterations < 4096: raise OperationFailure("Server returned an invalid iteration count.") salt = parsed[b's'] rnonce = parsed[b'r'] if not rnonce.startswith(nonce): raise OperationFailure("Server returned an invalid nonce.") without_proof = b"c=biws,r=" + rnonce if cache.data: client_key, server_key, csalt, citerations = cache.data else: client_key, server_key, csalt, citerations = None, None, None, None # Salt and / or iterations could change for a number of different # reasons. Either changing invalidates the cache. if not client_key or salt != csalt or iterations != citerations: salted_pass = hashlib.pbkdf2_hmac(digest, data, standard_b64decode(salt), iterations) client_key = _hmac(salted_pass, b"Client Key", digestmod).digest() server_key = _hmac(salted_pass, b"Server Key", digestmod).digest() cache.data = (client_key, server_key, salt, iterations) stored_key = digestmod(client_key).digest() auth_msg = b",".join((first_bare, server_first, without_proof)) client_sig = _hmac(stored_key, auth_msg, digestmod).digest() client_proof = b"p=" + standard_b64encode(_xor(client_key, client_sig)) client_final = b",".join((without_proof, client_proof)) server_sig = standard_b64encode( _hmac(server_key, auth_msg, digestmod).digest()) cmd = SON([('saslContinue', 1), ('conversationId', res['conversationId']), ('payload', Binary(client_final))]) res = sock_info.command(source, cmd) parsed = _parse_scram_response(res['payload']) if not hmac.compare_digest(parsed[b'v'], server_sig): raise OperationFailure("Server returned an invalid signature.") # A third empty challenge may be required if the server does not support # skipEmptyExchange: SERVER-44857. if not res['done']: cmd = SON([('saslContinue', 1), ('conversationId', res['conversationId']), ('payload', Binary(b''))]) res = sock_info.command(source, cmd) if not res['done']: raise OperationFailure('SASL conversation failed to complete.')
def get_logged_task_info(client, task_id):
    # task name
    task_name = client.find_one({
        "_meta.task_id": task_id,
        "_meta.task_name": {"$exists": True}
    })
    if task_name is not None:
        task_name = task_name["_meta"]["task_name"]

    # date
    task_start = client.find({
        "_meta.task_id": task_id
    }).sort([("$natural", pymongo.ASCENDING)]).limit(1)
    if task_start.alive:
        task_start = task_start.next()["_meta"]["inserted_at"].strftime(
            "%Y/%m/%d %H:%M:%S")
    else:
        task_start = "N/A"

    task_end = client.find({
        "_meta.task_id": task_id
    }).sort([("$natural", pymongo.DESCENDING)]).limit(1)
    if task_end.alive:
        task_end = task_end.next()["_meta"]["inserted_at"].strftime(
            "%Y/%m/%d %H:%M:%S")
    else:
        task_end = "N/A"

    # count
    data_size = client.find({"_meta.task_id": task_id}).count()

    # stat
    stat = client.aggregate([
        {"$match": {"_meta.task_id": task_id}},
        {"$group": {"_id": "$_meta.stored_type", "size": {"$sum": 1}}},
        {"$sort": SON([("_id", pymongo.ASCENDING)])},
    ])
    if stat["ok"] != 1.0:
        stat = None
    else:
        stat = {d["_id"]: d["size"] for d in stat["result"]}

    return {
        "Task ID": task_id,
        "Task Name": task_name,
        "Date": task_start + " - " + task_end,
        "Data Size": data_size,
        "Data": stat,
    }
def speculate_command(self):
    cmd = SON([('authenticate', 1), ('mechanism', 'MONGODB-X509')])
    if self.credentials.username is not None:
        cmd['user'] = self.credentials.username
    return cmd
def printDbStats(self): #data01 = ( self.db.command( { "serverStatus" : 1, "repl": 0, "metrics": 0, "locks": 1, "wiredTiger": 0 } ) ) data01 = {} data01 = ( self.db.command( "serverStatus" )) Host01 = data01["host"][0:14] Version01 = data01["version"] Connections01 = data01["connections"]["current"] Warning = data01["asserts"]["warning"] UMess = data01["asserts"]["user"] MaxMem = data01["wiredTiger"]["cache"]["maximum bytes configured"] CurrMem = data01["wiredTiger"]["cache"]["bytes currently in the cache"] Inser = data01["opcounters"]["insert"] query = data01["opcounters"]["query"] Updat = data01["opcounters"]["update"] delet = data01["opcounters"]["delete"] getmo = data01["opcounters"]["getmore"] comma = data01["opcounters"]["command"] Scan = data01["metrics"]["operation"]["scanAndOrder"] WConfl = data01["metrics"]["operation"]["writeConflicts"] CurTimeout = data01["metrics"]["cursor"]["timedOut"] """ print("\n\n"+"="*20,"\n"+"="*20) print("="*2 +" "+ Host01 +" "+ self.thetime() +" "+"="*2) print("="*63) template01="%15s%8s%10s%15s%15s" header01=('Host','Version','Cur_Conn','#ofWarning','#ofUserMessage') print( template01 % header01) print("="*63) print( template01 % (Host01,Version01,Connections01,Warning,UMess)) template02="%12s%12s%12s%12s%12s%12s%12s%12s" header02=('MaxMem MB','CurrMem MB','insert','query','update','delete','getmore','command') print( template02 % header02) print("="*96) print( template02 % (MaxMem,CurrMem,Inser,query,Updat,delet,getmo,comma)) template03="%15s%15s%15s" header03=('scanAndOrder','writeConflicts','CursorTimedOut') print( template03 % header03) print("="*45) print( template03 % (Scan,WConfl,CurTimeout)) """ """ self.matr01={'TS': int(self.thetime()) ,'Host': Host01, 'Version': Version01, 'CurrConn': Connections01, 'NofWarning': Warning, 'NofUserMessage': UMess, 'MaxMem': MaxMem, 'CurrMem': CurrMem, 'Insert': Inser, 'Query': query, 'Update': Updat, 'Delete': delet, 'Getmore': getmo, 'Command': comma, 'ScanAndOrder': Scan, 'WriteConflicts': WConfl, 'CursorTimedOut': CurTimeout } """ self.matr01=SON([('TS', int(self.thetime()) ) ,('Host', Host01), ('Version', Version01), ('CurrConn', Connections01), ('NofWarning', Warning), ('NofUserMessage', UMess), ('MaxMem', MaxMem), ('CurrMem', CurrMem), ('Insert', Inser), ('Query', query), ('Update', Updat), ('Delete', delet), ('Getmore', getmo), ('Command', comma), ('ScanAndOrder', Scan), ('WriteConflicts', WConfl), ('CursorTimedOut', CurTimeout) ])
def get_imdh_data(lat, long, n, variable): #get list of lat longs start_lat = lat - 0.25 * n end_lat = lat + 0.25 * n start_long = long - 0.25 * n end_long = long + 0.25 * n a1_lat = np.arange(start_lat, lat, 0.25) a2_lat = np.arange(lat, (end_lat + 0.25), 0.25) a1_long = np.arange(start_long, long, 0.25) a2_long = np.arange(long, (end_long + 0.25), 0.25) lats = list(a1_lat) + list(a2_lat) longs = list(a1_long) + list(a2_long) ''' start_lat = lat - 1 * n end_lat = lat + 1 * n start_long = long - 1 * n end_long = long + 1 * n a1_lat = np.arange(start_lat, lat, 1) a2_lat = np.arange(lat, (end_lat + 1), 1) a1_long = np.arange(start_long, long, 1) a2_long = np.arange(long, (end_long + 1), 1) lats = list(a1_lat) + list(a2_lat) longs = list(a1_long) + list(a2_long) ''' # extract data from database online db = config.get_db() imdhist = db.imdhist imdhist.create_index("lt") # 25.0,25.25,25.5,25.75,26.00 92.0,92.25,92.5,92.75,93.0 pipeline = [{ "$match": { "id": variable, "lt": { "$in": lats }, "ln": { "$in": longs } } }, { "$group": { "_id": "$ts", "val": { "$push": "$val" }, "lat": { "$push": "$lt" }, "long": { "$push": "$ln" } } }, { "$sort": SON([("_id", 1)]) }] imdh = list(imdhist.aggregate(pipeline, allowDiskUse=True)) ''' pipeline_temp = [ {"$match": {"id": "t", "lt": {"$in": lats}, "ln": {"$in": longs}}}, {"$group": {"_id": "$ts", "val": {"$push": "$val"}, "lat": {"$push": "$lt"}, "long": {"$push": "$ln"}}}, {"$sort": SON([("_id", 1)])} ] imdh_temp = list(imdhist.aggregate(pipeline_temp, allowDiskUse=True))''' return imdh
def prepare_spec_arguments(spec, arguments, opname, entity_map, with_txn_callback): for arg_name in list(arguments): c2s = camel_to_snake(arg_name) # PyMongo accepts sort as list of tuples. if arg_name == "sort": sort_dict = arguments[arg_name] arguments[arg_name] = list(iteritems(sort_dict)) # Named "key" instead not fieldName. if arg_name == "fieldName": arguments["key"] = arguments.pop(arg_name) # Aggregate uses "batchSize", while find uses batch_size. elif ((arg_name == "batchSize" or arg_name == "allowDiskUse") and opname == "aggregate"): continue # Requires boolean returnDocument. elif arg_name == "returnDocument": arguments[c2s] = getattr(ReturnDocument, arguments.pop(arg_name).upper()) elif c2s == "requests": # Parse each request into a bulk write model. requests = [] for request in arguments["requests"]: if 'name' in request: # CRUD v2 format bulk_model = camel_to_upper_camel(request["name"]) bulk_class = getattr(operations, bulk_model) bulk_arguments = camel_to_snake_args(request["arguments"]) else: # Unified test format bulk_model, spec = next(iteritems(request)) bulk_class = getattr(operations, camel_to_upper_camel(bulk_model)) bulk_arguments = camel_to_snake_args(spec) requests.append(bulk_class(**dict(bulk_arguments))) arguments["requests"] = requests elif arg_name == "session": arguments['session'] = entity_map[arguments['session']] elif (opname in ('command', 'run_admin_command') and arg_name == 'command'): # Ensure the first key is the command name. ordered_command = SON([(spec['command_name'], 1)]) ordered_command.update(arguments['command']) arguments['command'] = ordered_command elif opname == 'open_download_stream' and arg_name == 'id': arguments['file_id'] = arguments.pop(arg_name) elif opname != 'find' and c2s == 'max_time_ms': # find is the only method that accepts snake_case max_time_ms. # All other methods take kwargs which must use the server's # camelCase maxTimeMS. See PYTHON-1855. arguments['maxTimeMS'] = arguments.pop('max_time_ms') elif opname == 'with_transaction' and arg_name == 'callback': if 'operations' in arguments[arg_name]: # CRUD v2 format callback_ops = arguments[arg_name]['operations'] else: # Unified test format callback_ops = arguments[arg_name] arguments['callback'] = lambda _: with_txn_callback( copy.deepcopy(callback_ops)) elif opname == 'drop_collection' and arg_name == 'collection': arguments['name_or_collection'] = arguments.pop(arg_name) elif opname == 'create_collection' and arg_name == 'collection': arguments['name'] = arguments.pop(arg_name) elif opname == 'create_index' and arg_name == 'keys': arguments['keys'] = list(arguments.pop(arg_name).items()) elif opname == 'drop_index' and arg_name == 'name': arguments['index_or_name'] = arguments.pop(arg_name) else: arguments[c2s] = arguments.pop(arg_name)
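# Standalone illustration of the trick used in the 'command' branch above:
# building the command with SON guarantees the command name stays the first
# key even after merging in the remaining arguments (the collMod/validator
# values are made up for the example).
from bson.son import SON

ordered_command = SON([('collMod', 1)])
ordered_command.update({'validator': {'x': {'$exists': True}}})
assert list(ordered_command)[0] == 'collMod'   # command name stays first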
            },
        }
    },
    'topics': {
        'pagination': False,
        'datasource': {
            'source': 'documents',
            'aggregation': {
                'pipeline': [
                    {"$unwind": "$topics"},
                    {"$group": {"_id": "$topics"}},
                    {"$sort": SON([("_id", 1)])}
                ]
            }
        }
    },
    'places': {
        'pagination': False,
        'datasource': {
            'source': 'documents',
            'aggregation': {
                'pipeline': [
                    {"$unwind": "$places"},
                    {"$group": {"_id": "$places"}
            # TCP_KEEPIDLE and friends. Don't attempt to set the
            # values there.
            default = sock.getsockopt(socket.IPPROTO_TCP, sockopt)
            if default > max_value:
                sock.setsockopt(socket.IPPROTO_TCP, sockopt, max_value)
        except socket.error:
            pass


def _set_keepalive_times(sock):
    _set_tcp_option(sock, 'TCP_KEEPIDLE', _MAX_TCP_KEEPIDLE)
    _set_tcp_option(sock, 'TCP_KEEPINTVL', _MAX_TCP_KEEPINTVL)
    _set_tcp_option(sock, 'TCP_KEEPCNT', _MAX_TCP_KEEPCNT)


_METADATA = SON([
    ('driver', SON([('name', 'PyMongo'), ('version', __version__)])),
])

if sys.platform.startswith('linux'):
    # platform.linux_distribution was deprecated in Python 3.5.
    if sys.version_info[:2] < (3, 5):
        # Distro name and version (e.g. Ubuntu 16.04 xenial)
        _name = ' '.join(
            [part for part in platform.linux_distribution() if part])
    else:
        _name = platform.system()
    _METADATA['os'] = SON([
        ('type', platform.system()),
        ('name', _name),
        ('architecture', platform.machine()),
        # Kernel version (e.g. 4.4.0-17-generic).
"item": "帆布", "quantity": 100, "tags": ["棉布"], "size": {"height": 28, "weight": 35.5, "uom": "cm"} } ) result_set1 = db.inventory.find({}) for item in result_set1: pprint(item) # 带条件的查询 from bson.son import SON db.inventory.insert_many([ {"item": "journal", "qty": 25, "size": SON([("h", 14), ("w", 21), ("uom", "cm")]), "status": "A"}, {"item": "notebook", "qty": 50, "size": SON([("h", 8.5), ("w", 11), ("uom", "in")]), "status": "A"}, {"item": "paper", "qty": 100, "size": SON([("h", 8.5), ("w", 11), ("uom", "in")]), "status": "D"}, {"item": "planner", "qty": 75, "size": SON([("h", 22.85), ("w", 30), ("uom", "cm")]), "status": "D"}, {"item": "postcard", "qty": 45,
def command(self, dbname, spec, slave_ok=False, read_preference=ReadPreference.PRIMARY, codec_options=DEFAULT_CODEC_OPTIONS, check=True, allowable_errors=None, check_keys=False, read_concern=None, write_concern=None, parse_write_concern_error=False, collation=None, session=None, client=None, retryable_write=False, publish_events=True): """Execute a command or raise an error. :Parameters: - `dbname`: name of the database on which to run the command - `spec`: a command document as a dict, SON, or mapping object - `slave_ok`: whether to set the SlaveOkay wire protocol bit - `read_preference`: a read preference - `codec_options`: a CodecOptions instance - `check`: raise OperationFailure if there are errors - `allowable_errors`: errors to ignore if `check` is True - `check_keys`: if True, check `spec` for invalid keys - `read_concern`: The read concern for this command. - `write_concern`: The write concern for this command. - `parse_write_concern_error`: Whether to parse the ``writeConcernError`` field in the command response. - `collation`: The collation for this command. - `session`: optional ClientSession instance. - `client`: optional MongoClient for gossipping $clusterTime. - `retryable_write`: True if this command is a retryable write. - `publish_events`: Should we publish events for this command? """ self.validate_session(client, session) if (read_concern and self.max_wire_version < 4 and not read_concern.ok_for_legacy): raise ConfigurationError( 'read concern level of %s is not valid ' 'with a max wire version of %d.' % (read_concern.level, self.max_wire_version)) if not (write_concern is None or write_concern.acknowledged or collation is None): raise ConfigurationError( 'Collation is unsupported for unacknowledged writes.') if self.max_wire_version >= 5 and write_concern: spec['writeConcern'] = write_concern.document elif self.max_wire_version < 5 and collation is not None: raise ConfigurationError( 'Must be connected to MongoDB 3.4+ to use a collation.') if (client or session) and not isinstance(spec, ORDERED_TYPES): # Ensure command name remains in first place. spec = SON(spec) if session: spec['lsid'] = session._use_lsid() if retryable_write: spec['txnNumber'] = session._transaction_id() self.send_cluster_time(spec, session, client) listeners = self.listeners if publish_events else None try: return command(self.sock, dbname, spec, slave_ok, self.is_mongos, read_preference, codec_options, session, client, check, allowable_errors, self.address, check_keys, listeners, self.max_bson_size, read_concern, parse_write_concern_error=parse_write_concern_error, collation=collation) except OperationFailure: raise # Catch socket.error, KeyboardInterrupt, etc. and close ourselves. except BaseException as error: self._raise_connection_failure(error)
def _school(urn): if request.method == 'POST': mongo.db['school-address'].find_one_and_update({'school': urn}, { '$set': { 'school': urn, 'address': request.form['address'], 'address-match': 'byhand' } }, upsert=True) return redirect("/school/" + urn, code=303) edubase = latest(mongo.db.edubase.find({'URN': urn})) if not edubase: return abort(404) key = uprn = postcode = '' address = street = {} addresses = parents = children = streets = [] key = '' doc = mongo.db['school-address'].find_one({'school': urn}) if doc: key = doc['address'] if key != '': key = key.split(";")[0] uprn = decode(key) addresses = llist(mongo.db.address.find({'address': key})) address = latest(addresses) if address: street = latest(mongo.db.street.find({'street': address['street']})) children = sorted_naturally( llist(mongo.db.address.find({'parent-address': key}))) parents = address_parents(address) addresses = addresses + children + parents postcode = mongo.db['address-postcode'].find_one({'address': key})['postcode'] point = [] if edubase['Easting']: lat, lon = pyproj.transform(osgb36, wgs84, edubase['Easting'], edubase['Northing']) point = [lon, lat] addresses = addresses + llist( mongo.db.address.find({ 'point': SON([('$nearSphere', [lat, lon]), ('$maxDistance', 0.00004)]) })) streets = streets + llist( mongo.db.street.find({ 'point': SON([('$nearSphere', [lat, lon]), ('$maxDistance', 0.00004)]) })) guesses = {} ignore = ['the'] words = n7e(edubase['EstablishmentName'], ignore).split() words = words + [ 'school', 'academy', 'infant', 'junior', 'middle', 'college', 'jmi', 'campus' ] for a in addresses: if set(words).intersection(set(n7e(a['name'], ignore).split())): guesses[a['address'] + ":" + a['name']] = a guesses = [guesses[k] for k in sorted(guesses)] return render_template("school.html", edubase=edubase, guesses=guesses, point=point, address=address, addresses=addresses, streets=streets, street=street, postcode=postcode, uprn=uprn, parents=parents, children=children)
async def test_query_embedded_documents(self): db = self.db # Start Example 14 # Subdocument key order matters in a few of these examples so we have # to use bson.son.SON instead of a Python dict. from bson.son import SON await db.inventory.insert_many([{ "item": "journal", "qty": 25, "size": SON([("h", 14), ("w", 21), ("uom", "cm")]), "status": "A" }, { "item": "notebook", "qty": 50, "size": SON([("h", 8.5), ("w", 11), ("uom", "in")]), "status": "A" }, { "item": "paper", "qty": 100, "size": SON([("h", 8.5), ("w", 11), ("uom", "in")]), "status": "D" }, { "item": "planner", "qty": 75, "size": SON([("h", 22.85), ("w", 30), ("uom", "cm")]), "status": "D" }, { "item": "postcard", "qty": 45, "size": SON([("h", 10), ("w", 15.25), ("uom", "cm")]), "status": "A" }]) # End Example 14 # Start Example 15 cursor = db.inventory.find( {"size": SON([("h", 14), ("w", 21), ("uom", "cm")])}) # End Example 15 self.assertEqual(await count(cursor), 1) # Start Example 16 cursor = db.inventory.find( {"size": SON([("w", 21), ("h", 14), ("uom", "cm")])}) # End Example 16 self.assertEqual(await count(cursor), 0) # Start Example 17 cursor = db.inventory.find({"size.uom": "in"}) # End Example 17 self.assertEqual(await count(cursor), 2) # Start Example 18 cursor = db.inventory.find({"size.h": {"$lt": 15}}) # End Example 18 self.assertEqual(await count(cursor), 4) # Start Example 19 cursor = db.inventory.find({ "size.h": { "$lt": 15 }, "size.uom": "in", "status": "D" }) # End Example 19 self.assertEqual(await count(cursor), 1)
def graph(): categories = request.get_json(force=True)['categories'] print categories category = categories[0]['category'] sub_category = categories[1]['category'] sort_by = categories[0]['sort'] if sort_by == 'alpha': sort_by = '_id.' + category sort_direction = int(categories[0]['sortDirection']) sort = (sort_by, sort_direction) filters = categories[0]['filter'] first_group_stage = { '$group': { '_id': { category: '$' + category }, 'count': { '$sum': 1 } } } second_group_stage = None if sub_category != '': first_group_stage['$group']['_id'][sub_category] = '$' + sub_category second_group_stage = { '$group': { '_id': { category: '$_id.' + category, }, 'data': { '$push': { sub_category: '$_id.' + sub_category, 'count': '$count' } }, 'count': { '$sum': '$count' } } } sort_stage = {'$sort': SON([sort])} client = pymongo.MongoClient(os.environ['MONGO_URI']) db = client.va_circuit_court data = None if second_group_stage is None: data = db.criminal_cases.aggregate([first_group_stage, sort_stage])['result'] else: data = db.criminal_cases.aggregate( [first_group_stage, second_group_stage, sort_stage])['result'] sub_category_names = [] if sub_category != '': for group in data: for sub_category_group in group['data']: sub_category_name = 'None' if sub_category in sub_category_group: sub_category_name = sub_category_group[sub_category] if sub_category_name not in sub_category_names: sub_category_names.append(sub_category_name) group[sub_category_name] = sub_category_group['count'] print pprint(data) pprint(sub_category_names) values = [str(x['_id'][category]) for x in data] labels = [v for v in values if v not in filters][:20] bar_chart = pygal.Bar(height=450, style=LightStyle, x_label_rotation=70) bar_chart.title = 'VA Circuit Court Cases in 2014' bar_chart.x_labels = labels if sub_category == '': bar_chart.add(category, [ x['count'] for x in data if str(x['_id'][category]) not in filters ][:20]) else: for item in sub_category_names[:10]: item_counts = [] for x in data: if str(x['_id'][category]) in filters: continue if item in x: item_counts.append(x[item]) else: item_counts.append(0) bar_chart.add(item, item_counts[:20]) return str(render_template('stats_filters.html', category=category, filter_values=sorted(values), filters_unchecked=filters)) + \ bar_chart.render()
def __send_message(self, operation): """Send a query or getmore operation and handles the response. If operation is ``None`` this is an exhaust cursor, which reads the next result batch off the exhaust socket instead of sending getMore messages to the server. Can raise ConnectionFailure. """ client = self.__collection.database.client listeners = client._event_listeners publish = listeners.enabled_for_commands from_command = False start = datetime.datetime.now() def duration(): return datetime.datetime.now() - start if operation: kwargs = { "read_preference": self.__read_preference, "exhaust": self.__exhaust, } if self.__address is not None: kwargs["address"] = self.__address try: response = client._send_message_with_response(operation, **kwargs) self.__address = response.address if self.__exhaust: # 'response' is an ExhaustResponse. self.__exhaust_mgr = _SocketManager(response.socket_info, response.pool) cmd_name = operation.name reply = response.data rqst_id = response.request_id from_command = response.from_command except AutoReconnect: # Don't try to send kill cursors on another socket # or to another server. It can cause a _pinValue # assertion on some server releases if we get here # due to a socket timeout. self.__killed = True raise else: # Exhaust cursor - no getMore message. rqst_id = 0 cmd_name = 'getMore' if publish: # Fake a getMore command. cmd = SON([('getMore', self.__id), ('collection', self.__collection.name)]) if self.__batch_size: cmd['batchSize'] = self.__batch_size if self.__max_time_ms: cmd['maxTimeMS'] = self.__max_time_ms listeners.publish_command_start( cmd, self.__collection.database.name, 0, self.__address) try: reply = self.__exhaust_mgr.sock.receive_message(None) except Exception as exc: if publish: listeners.publish_command_failure( duration(), _convert_exception(exc), cmd_name, rqst_id, self.__address) if isinstance(exc, ConnectionFailure): self.__die() raise try: docs = self._unpack_response(response=reply, cursor_id=self.__id, codec_options=self.__codec_options) if from_command: first = docs[0] client._receive_cluster_time(first, self.__session) helpers._check_command_response(first) except OperationFailure as exc: self.__killed = True # Make sure exhaust socket is returned immediately, if necessary. self.__die() if publish: listeners.publish_command_failure( duration(), exc.details, cmd_name, rqst_id, self.__address) # If this is a tailable cursor the error is likely # due to capped collection roll over. Setting # self.__killed to True ensures Cursor.alive will be # False. No need to re-raise. if self.__query_flags & _QUERY_OPTIONS["tailable_cursor"]: return raise except NotMasterError as exc: # Don't send kill cursors to another server after a "not master" # error. It's completely pointless. self.__killed = True # Make sure exhaust socket is returned immediately, if necessary. self.__die() if publish: listeners.publish_command_failure( duration(), exc.details, cmd_name, rqst_id, self.__address) client._reset_server_and_request_check(self.__address) raise except Exception as exc: if publish: listeners.publish_command_failure( duration(), _convert_exception(exc), cmd_name, rqst_id, self.__address) raise if publish: # Must publish in find / getMore / explain command response format. 
if from_command: res = docs[0] elif cmd_name == "explain": res = docs[0] if reply.number_returned else {} else: res = {"cursor": {"id": reply.cursor_id, "ns": self.__collection.full_name}, "ok": 1} if cmd_name == "find": res["cursor"]["firstBatch"] = docs else: res["cursor"]["nextBatch"] = docs listeners.publish_command_success( duration(), res, cmd_name, rqst_id, self.__address) if from_command and cmd_name != "explain": cursor = docs[0]['cursor'] self.__id = cursor['id'] if cmd_name == 'find': documents = cursor['firstBatch'] else: documents = cursor['nextBatch'] self.__data = deque(documents) self.__retrieved += len(documents) else: self.__id = reply.cursor_id self.__data = deque(docs) self.__retrieved += reply.number_returned if self.__id == 0: self.__killed = True if self.__limit and self.__id and self.__limit <= self.__retrieved: self.__die() # Don't wait for garbage collection to call __del__, return the # socket to the pool now. if self.__exhaust and self.__id == 0: self.__exhaust_mgr.close()