def command(self, command, value=1, check=True, allowable_errors=None, **kwargs):
    if isinstance(command, basestring):
        command = {command: value}
    command.update(**kwargs)
    if 'filemd5' in command:
        checksum = md5()
        for chunk in self.chef.file.chunks.find().sort('n'):
            checksum.update(chunk['data'])
        return dict(md5=checksum.hexdigest())
    elif 'findandmodify' in command:
        coll = self._collections[command['findandmodify']]
        before = coll.find_one(command['query'], sort=command.get('sort'))
        upsert = False
        if before is None:
            upsert = True
            if command.get('upsert'):
                before = dict(command['query'])
                coll.insert(before)
            else:
                raise OperationFailure('No matching object found')
        coll.update(command['query'], command['update'])
        if command.get('new', False) or upsert:
            return dict(value=coll.find_one(dict(_id=before['_id'])))
        return dict(value=before)
    elif 'mapreduce' in command:
        collection = command.pop('mapreduce')
        return self._handle_mapreduce(collection, **command)
    elif 'distinct' in command:
        collection = self._collections[command['distinct']]
        key = command['key']
        return list(set(_lookup(d, key) for d in collection.find()))
    else:
        raise NotImplementedError(repr(command))

def pymongo_query(query, logtype, local=False):
    """
    generic Django view helper for netshed; takes a query dictionary and the
    type of log and returns the resulting lines of logs
    """
    loglines = []
    collections = []
    # limit(0) means "no limit" in pymongo, so default to 0 when the key is
    # absent (the original read query['limit'] only conditionally, leaving
    # limit unbound otherwise)
    limit = int(query.get('limit', 0))
    # pymongo raw query
    if 'date' in query:
        if query['date'] == 'all':
            collections = connect_db(logtype, local).collection_names()
        else:
            collections.append(logtype + '_' + query['date'])
            try:
                # search two days if start time > end time
                if span_two_days(query['start_hr'], query['start_min'],
                                 query['end_hr'], query['end_min']):
                    collections.append(logtype + increment_date(query['date']))
            except KeyError:
                pass
    # get a collection object for each collection to search on, then query
    for collection in collections:
        collection = connect_collection(logtype, collection, local)
        results = [log for log in
                   collection.find(format_query_input(query)).limit(limit)]
        results = sorted(results, key=lambda log: log['time'])
        loglines += [log['line'] for log in results]
    return loglines

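# A minimal usage sketch for pymongo_query, assuming a 'firewall' logtype and
# the date/limit formats implied by the code above; all field values here are
# hypothetical.
sample_query = {
    'date': '2021-08-02',   # selects the 'firewall_2021-08-02' collection
    'start_hr': '09', 'start_min': '00',
    'end_hr': '17', 'end_min': '30',
    'limit': '500',         # parsed with int() inside pymongo_query
}
lines = pymongo_query(sample_query, 'firewall', local=True)
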
def get_available_rooms_db(bookedDates, startTime, endTime):
    roomArray = []
    bookedRooms = collection.find({
        "booked": {
            '$elemMatch': {
                'date': bookedDates,
                'startTime': startTime,
                'endTime': endTime
            }
        }
    })
    for bookedRoom in bookedRooms:
        roomArray.append(bookedRoom["roomName"])
    return roomArray

def func():  # executed once every 2 seconds via threading.Timer below
    aa = ''
    datas = list(collection.find({}, {"_id": 0}))
    for userid in datas:
        daili()
        try:
            time.sleep(5)
            # userid = 'trumpchinese1'
            userid = userid['userName']
            paramsTwo = (('variables',
                          '{"screen_name":"' + userid + '","withHighlightedLabel":true}'),)
            paramss = dict(paramsTwo)
            response = ss.get(
                'https://twitter.com/i/api/graphql/hc-pka9A7gyS3xODIafnrQ/UserByScreenName',
                headers=headerss, params=paramss)
            content = response.content
            id = re.compile('"rest_id":"(.*?)",').findall(str(content))
            httpId = id[0]
            article(httpId, params, aa)
            threading.Timer(2, func).start()
        except Exception as err:
            pass

def set_features(collection_name):
    raw_list = []
    raw_res = []
    raw_ids = []
    counter = Counter(100)
    collection = client.wiki[collection_name]  # type: collection.Collection
    for raw in collection.find({}):
        if "rwords" not in raw or len(raw["rwords"]) == 0:
            continue
        if len(raw["revs"]) <= 1 or "f" not in raw or len(raw["f"]) < 25:
            continue
        if any(f not in raw["f"] for f in OK_FEATURES):
            continue
        raw_list.append([x for n, x in raw["f"].items() if n in OK_FEATURES])
        raw_res.append(1 if raw["vandal"] else 0)
        raw_ids.append(raw["_id"])
        counter.tick()
    pred = frst.predict_proba(raw_list)
    for i, x in enumerate(pred[:, 1]):
        collection.update_one({"_id": raw_ids[i]},
                              {"$set": {"f.forest_score": x}})

def check_reference_is_valid(collection: pymongo.collection.Collection,
                             id_: bson.ObjectId) -> bool:
    """
    Check if a given id exists within the given collection
    :param collection: The pymongo collection to search
    :param id_: The id to find
    :return: True if the id exists within the collection, false otherwise
    """
    return collection.find({'_id': id_}).count() > 0

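# Note: Cursor.count() was deprecated in PyMongo 3.7 and removed in PyMongo 4,
# so the function above breaks on current drivers. A sketch of an equivalent
# check for modern PyMongo, under the same signature:
def check_reference_is_valid_v4(collection: pymongo.collection.Collection,
                                id_: bson.ObjectId) -> bool:
    # count_documents runs server-side; limit=1 stops counting at the first match
    return collection.count_documents({'_id': id_}, limit=1) > 0
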
def lista_paises1():
    # NOTE: the query result is never used; the function always returns the
    # same confirmation message
    data = collection.find(
        {'$or': [{"languages.eng": "English"},
                 {"languages.eng": "Spanish"}]})
    return {'mensaje': 'correcto'}

def command(self, command, value=1, check=True, allowable_errors=None, **kwargs):
    if isinstance(command, six.string_types):
        command = {command: value}
    command.update(**kwargs)
    if 'filemd5' in command:
        checksum = md5()
        for chunk in self.chef.file.chunks.find().sort('n'):
            checksum.update(chunk['data'])
        return dict(md5=checksum.hexdigest())
    elif 'findandmodify' in command:
        coll = self._collections[command['findandmodify']]
        before = coll.find_one(command['query'], sort=command.get('sort'))
        upsert = False
        if before is None:
            upsert = True
            if command.get('upsert'):
                before = dict(command['query'])
                coll.insert(before)
            else:
                raise OperationFailure('No matching object found')
        coll.update(command['query'], command['update'])
        if command.get('new', False) or upsert:
            return dict(value=coll.find_one(dict(_id=before['_id'])))
        return dict(value=before)
    elif 'mapreduce' in command:
        collection = command.pop('mapreduce')
        return self._handle_mapreduce(collection, **command)
    elif 'distinct' in command:
        collection = self._collections[command['distinct']]
        key = command['key']
        filter = command.get('filter')
        return list(
            set(_lookup(d, key) for d in collection.find(filter=filter)))
    elif 'getlasterror' in command:
        return dict(connectionId=None, err=None, n=0, ok=1.0)
    elif 'collstats' in command:
        collection = self._collections[command['collstats']]
        # We simulate everything based on the first object's size; it probably
        # doesn't make sense to go through all the objects to compute this.
        # Also, instead of evaluating their in-memory size we use pickle,
        # as Python stores references.
        first_object_size = len(
            pickle.dumps(next(iter(collection._data.values()), {})))
        return {
            "ns": '%s.%s' % (collection.database.name, collection.name),
            "count": len(collection._data),
            "size": first_object_size * len(collection._data),
            "avgObjSize": first_object_size,
            "storageSize": first_object_size * len(collection._data),
        }
    else:
        raise NotImplementedError(repr(command))

def basic_map_reduce_test(self):
    map = Code(open('./map.js', 'r').read())
    reduce = Code(open('./reduce.js', 'r').read())
    result = self.collection.map_reduce(map, reduce, {"query": {}})
    # result = self.collection.map_reduce(map, reduce)
    print(result)
    collection = self.db[result["result"]]
    for item in collection.find():
        print(item)

def get_notebook_filenames(self, notebook):
    """return would-be filenames with the notebook_*.note pattern"""
    self.db = self.check_connection()
    collection = self.db[notebook]
    uids = collection.find().distinct('_id')
    file_list = []
    for noteid in uids:
        file_list.append(notebook + '_' + str(noteid) + '.note')
    return file_list

def find(self, table, dic):
    '''
    :param table: str   the collection in the database
    :param dic: dict    the query conditions
    :return: list       the list of matching records
    '''
    collection = self.db[table]
    rep = list(collection.find(dic))
    return rep

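# A self-contained sketch of the same pattern with raw pymongo, assuming a
# local mongod; the database name, collection name, and filter are hypothetical.
from pymongo import MongoClient

_client = MongoClient('mongodb://localhost:27017/')
records = list(_client['test_db']['users'].find({'age': {'$gte': 18}}))
print(len(records), 'matching records')
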
def get_all_routes():
    global client
    try:
        db = client['route']
        collection = db['routeCollection']
        routes = [route for route in collection.find()]
        for route in routes:
            route['_id'] = str(route['_id'])
        return routes
    except Exception as e:
        print('Exception in get all routes: ' + str(e))

def index():
    client = MongoClient('localhost', 27017)
    print(client)  # if this prints, the connection succeeded
    # user authentication: connect to the target database, authenticating with
    # username and password (the system default database is admin)
    db = client.article
    # db.authenticate('root', 'your password')
    # select the collection to use, i.e. what we usually call the table
    collection = db.article_list_zhongguoyouzheng
    datas = list(collection.find({"push_state": 0},
                                 {"push_state": 0, "only_id": 0}))
    data = json.dumps(datas, cls=JSONEncoder, ensure_ascii=False)
    return data

def get_route(source_city: str, dest_city: str):
    global client
    try:
        db = client['route']
        collection = db['routeCollection']
        routes = [route for route in collection.find({
            'source_city': source_city,
            'dest_city': dest_city
        })]
        if len(routes) > 0:
            return routes[0]
        return None
    except Exception as e:
        print('Exception in get route: ' + str(e))

def get_all_flights_by_route():
    global client
    flights = []
    try:
        route_db = client['route']
        route_collection = route_db['routeCollection']
        flight_db = client['flight']
        collection = flight_db['flightCollection']
        for each in route_collection.find():
            flights.extend(flight for flight in collection.find(
                {'route_id': ObjectId(each['_id'])}))
        # print(flights)
        return flights
    except Exception as e:
        print('Exception in get flights by route: ' + str(e))

def set_features(collection_name):
    raw_list = []
    raw_res = []
    raw_ids = []
    raw_list_opp = []
    counter = Counter(100)
    collection = client.wiki[collection_name]  # type: collection.Collection
    for raw in collection.find({}, {"_id": 1, TEXT_FEATURE_KEY: 1, "vandal": 1}):
        if TEXT_FEATURE_KEY not in raw:  # or len(raw[TEXT_FEATURE_KEY]) == 0:
            continue
        filtered = {x: sign(y)
                    for x, y in raw[TEXT_FEATURE_KEY].items()}  # if not x.isdigit()
        filtered2 = {x: 1 for x, y in filtered.items()
                     if y > 0 and not check_rgb(x) and ' ' not in x}
        raw_list.append(filtered2)
        raw_list_opp.append({x: y * (-1) for x, y in filtered.items()
                             if y < 0 and not check_rgb(x) and ' ' not in x})
        # raw_list.append(raw[TEXT_FEATURE_KEY])
        raw_res.append(1 if raw["vandal"] else 0)
        raw_ids.append(raw["_id"])
        counter.tick()
    pred = lr.predict_proba(fh.transform(raw_list))
    pred2 = lr2.predict_proba(fh.transform(raw_list_opp))
    for i, x in enumerate(pred[:, 1]):
        collection.update_one(
            {"_id": raw_ids[i]},
            {"$set": {
                "f.t_biscore": x,  # max(x, pred2[i, 1])
                'f.t_biscore_opp': pred2[i, 1],
            }})

def load_data_from_mongo(collection_name):
    """
    Load data from the specified collection within the mongo yelp database
    into a pandas dataframe.

    Args:
        collection_name (string): Name of collection to load.

    Returns:
        Dataframe: Semi-flattened mongo collection.
    """
    collection = access_specific_collection(collection_name)
    data = list(collection.find({}))
    df = pd.json_normalize(data, errors="ignore")
    print(df.head(5))
    return df

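# To illustrate the "semi-flattened" result: pd.json_normalize expands nested
# sub-documents into dotted column names. A self-contained example with fake
# records shaped loosely like yelp documents (field names are illustrative):
import pandas as pd

docs = [{"name": "Cafe A", "attributes": {"wifi": True, "parking": False}}]
print(pd.json_normalize(docs))  # columns: name, attributes.wifi, attributes.parking
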
def request_cancel(flight_id: str, date: str, email: str):
    # NOTE: date is handled as a 'YYYY-MM-DD'-style string below (it is sliced
    # for day and month), so the original datetime annotation was incorrect
    global client
    try:
        user_db = client['users']
        user_collection = user_db['usersCollection']
        usr = user_collection.find_one({'email': email})
        cancel_db = client['cancels']
        collection = cancel_db['cancelsCollection']
        for each in usr['bookings']:
            if each['flight_id'] == flight_id and each['date'] == date:
                item = {
                    'user_id': usr['_id'],
                    'flight_id': flight_id,
                    'e_count': each['e_count'],
                    'b_count': each['b_count'],
                    'date': date,
                }
                d = datetime.date.today()
                x = int(date[len(date) - 2:])
                month = int(date[len(date) - 5:len(date) - 3])
                if x - d.day < 2 and month == d.month:
                    return -2
                if d.month != month and x >= 30 and d.day <= 2:
                    return -2
                if collection.find(item).count() != 0:
                    return -1
                result = collection.insert_one(item)
                print('inserted into db',
                      collection.find_one({'flight_id': flight_id}))
                return 1
    except Exception as e:
        print('Exception in requesting cancel: ' + str(e))

def get_flights_by_route(source_city: str, dest_city: str):
    global client
    route = get_route(source_city=source_city, dest_city=dest_city)
    if route is None:
        return
    try:
        flight_db = client['flight']
        collection = flight_db['flightCollection']
        flights = [flight for flight in
                   collection.find({'route_id': route.get('_id')})]
        for flight in flights:
            flight['route_id'] = str(flight['route_id'])
            flight['_id'] = str(flight['_id'])
        return flights
    except Exception as e:
        print('Exception in get flights by route: ' + str(e))

def get_route_from_flight_id(flight_id: str):
    global client
    route = []
    route_id = "temp"
    try:
        route_db = client['route']
        route_collection = route_db['routeCollection']
        flight_db = client['flight']
        collection = flight_db['flightCollection']
        for each in collection.find():
            if str(each['_id']) == flight_id:
                route_id = each['route_id']
        # iterating a Collection object directly raises TypeError in pymongo,
        # so the routes must be queried with find(); route_id is also cast to
        # str so an ObjectId compares equal to the stringified _id
        for each in route_collection.find():
            if str(each['_id']) == str(route_id):
                route.append(each['source_city'])
                route.append(each['dest_city'])
        print('route: ', route)
        return route
    except Exception as e:
        print('Exception in get route by flight id: ' + str(e))

def load_data_from_mongo_in_batches(collection_name, batch_size=5000):
    """
    Load data from the specified collection within the mongo yelp database
    into a pandas dataframe, one batch at a time to avoid memory errors.

    Args:
        collection_name (string): Name of collection to load.
        batch_size (int): Number of records to load in at one time.
            Defaults to 5000. Depends on memory usage.

    Returns:
        Dataframe: Semi-flattened mongo collection.
    """
    collection = access_specific_collection(collection_name)
    cursor = collection.find()
    df = pd.DataFrame()
    # pass batch_size through instead of the hard-coded 5000 the original used
    for batch in batched(cursor, batch_size):
        df = df.append(batch, ignore_index=True)
    return df

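# The batched() helper is not defined in this snippet, and itertools.batched
# only exists on Python 3.12+. A minimal sketch of an equivalent generator that
# yields lists of up to n documents from any iterable, including a pymongo cursor:
from itertools import islice


def batched(iterable, n):
    """Yield successive lists of at most n items from iterable."""
    it = iter(iterable)
    while True:
        chunk = list(islice(it, n))
        if not chunk:
            return
        yield chunk
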
def command(self, command, value=1, check=True, allowable_errors=None, **kwargs):
    if isinstance(command, basestring):
        command = {command: value}
    command.update(**kwargs)
    if 'filemd5' in command:
        checksum = md5()
        for chunk in self.chef.file.chunks.find().sort('n'):
            checksum.update(chunk['data'])
        return dict(md5=checksum.hexdigest())
    elif 'findandmodify' in command:
        coll = self._collections[command['findandmodify']]
        before = coll.find_one(command['query'], sort=command.get('sort'))
        upsert = False
        if before is None:
            upsert = True
            if command.get('upsert'):
                before = dict(command['query'])
                coll.insert(before)
            else:
                raise OperationFailure('No matching object found')
        coll.update(command['query'], command['update'])
        if command.get('new', False) or upsert:
            return dict(value=coll.find_one(dict(_id=before['_id'])))
        return dict(value=before)
    elif 'mapreduce' in command:
        collection = command.pop('mapreduce')
        return self._handle_mapreduce(collection, **command)
    elif 'distinct' in command:
        collection = self._collections[command['distinct']]
        key = command['key']
        return list(set(_lookup(d, key) for d in collection.find()))
    elif 'getlasterror' in command:
        return dict(connectionId=None, err=None, n=0, ok=1.0)
    else:
        raise NotImplementedError(repr(command))

def set_features(collection_name):
    raw_list = []
    raw_ids = []
    counter = Counter(100)
    collection = client.wiki[collection_name]  # type: collection.Collection
    for raw in collection.find({}):
        if len(raw["revs"]) <= 1:
            continue
        if "rwords" not in raw:
            continue
        distr = calculate(raw)
        collection.update_one(
            {"_id": raw["_id"]},
            {"$set": {"f.t_charscore": BHdist(distr, good_distr)}})
        raw_list.append({key: value for key, value in distr.items()})
        raw_ids.append(raw["_id"])
        counter.tick()

def update(self, ii, data_to_send):
    # Access the collection corresponding to the current time-step:
    collection_name = '%s' % ii
    try:
        collection = self.db.create_collection(
            collection_name, **{'capped': True, 'size': 100000})
    except (pymongo.errors.OperationFailure, pymongo.errors.CollectionInvalid):
        collection = self.db[collection_name]
    # Push my data:
    collection.insert({
        "rank": self.rank(),
        'data': json.dumps(data_to_send),
    })
    # Get data:
    max_record = len(self.outside_rank_list)
    cursor = collection.find({'rank': {"$in": self.outside_rank_list}},
                             cursor_type=CursorType.TAILABLE_AWAIT)
    result_dict = {}
    while len(result_dict) < max_record:
        try:
            found_document = cursor.next()
            result_dict[found_document['rank']] = found_document['data']
        except StopIteration:
            pass
    for source_rank, payload in result_dict.items():
        self.received_message_dict_external[source_rank][ii] = json.loads(payload)

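# Background for the rendezvous pattern above (a sketch, assuming a local
# mongod; database and collection names are hypothetical): tailable cursors
# require a capped collection, and CursorType.TAILABLE_AWAIT makes the server
# block briefly for new documents instead of returning once the collection is
# exhausted.
from pymongo import MongoClient, CursorType

_db = MongoClient()['demo']
try:
    _events = _db.create_collection('events', capped=True, size=100000)
except Exception:
    _events = _db['events']
_events.insert_one({'rank': 0, 'data': '{}'})
_tail = _events.find({}, cursor_type=CursorType.TAILABLE_AWAIT)
print(next(_tail))
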
def check_collection(collection: pymongo.collection.Collection,
                     db_client: database.client.DatabaseClient):
    """
    Check all the entities in a collection
    :param collection: the collection whose entities should be checked
    :param db_client: the database client used to deserialize entities
    :return:
    """
    all_entities = collection.find()
    for s_entity in all_entities:
        # patch the entity type if appropriate
        if '.' not in s_entity['_type']:
            qual_types = database.entity_registry.find_potential_entity_classes(
                s_entity['_type'])
            if len(qual_types) == 1 and qual_types[0] != s_entity['_type']:
                logging.getLogger(__name__).error(
                    "Entity {0} had unqualified type {1}".format(
                        s_entity['_id'], s_entity['_type']))
                collection.update_one({'_id': s_entity['_id']},
                                      {'$set': {'_type': qual_types[0]}})
        # Try to deserialize the entity, and validate it if we succeed
        try:
            entity = db_client.deserialize_entity(s_entity)
        except Exception:
            entity = None
            logging.getLogger(__name__).error(
                "Exception occurred deserializing object {0}:\n{1}".format(
                    s_entity['_id'], traceback.format_exc()))
        if entity is not None and hasattr(entity, 'validate'):
            if not entity.validate():
                logging.getLogger(__name__).error(
                    "Entity {0} ({1}) failed validation".format(
                        entity.identifier, s_entity['_type']))

    '?1',
    'sec-fetch-dest': 'document',
    'accept-language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7,ja;q=0.6',
    'cookie': 'xhsTrackerId=5994dcc9-8274-48c7-c897-954a0fc3b1f8; _ga=GA1.2.1321150109.1626320152; smidV2=202107271802381b04791812bfde1e5152773c38891bc800efd1442f4be7e20; xhsTracker=url=index&searchengine=baidu; timestamp2=20210802f5ec7988daec2eeaa389bbfb; timestamp2.sig=5bNd_MzQomdySNZPcQIAzx-IQjEN72xKzJX1QkBNbJ0; _gid=GA1.2.98589560.1627867537; extra_exp_ids=gif_exp1,ques_clt1; _gat=1',
}

# agentUrl = "http://192.168.1.26:16666/get/"
# res = requests.get(agentUrl)
#
# agenContent = res.content.decode("utf-8")
# dataip = re.compile('"proxy": "(.*?)",').findall(str(agenContent))
# ip = dataip[0]
# proxy = {
#     'https://' + ip,
# }
# requests.proxies = proxy
datas = list(
    collection.find({"push_state": 0},
                    {"url": 1, "_id": 0}).sort([("download_time", -1)]))
response = ss.get(
    'https://www.xiaohongshu.com/discovery/item/61011452000000002103e959',
    headers=headersss)
print(response.status_code)

def my_job():
    # agentUrl = "http://192.168.1.26:16666/get/"
    # res = requests.get(agentUrl)
    #
    # agenContent = res.content.decode("utf-8")
    # dataip = re.compile('"proxy": "(.*?)",').findall(str(agenContent))
    # ip = dataip[0]
    # proxy = {
    #     'https://' + ip,
    # }
    # requests.proxies = proxy
    datas = list(
        collection.find({"push_state": 0},
                        {"url": 1, "_id": 0}).sort([("pub_time", -1)]))
    for ur in datas:
        try:
            print(ur['url'])
            response = ss.get(ur['url'], headers=headersss)
            if response.status_code == 461:
                print()
            content = response.content.decode('utf-8')
            contentWEB = re.compile('相关笔记[\s\S]*?.查看更多').findall(str(content))
            id = re.compile('/item/(.*?)"').findall(str(contentWEB[0]))
            title = re.compile('<p class="desc" .*?>(.*?)</p>').findall(
                str(contentWEB[0]))
            likeCount = re.compile(
                '<span class="counts".*?>(.*?)</span>').findall(str(contentWEB[0]))
            type = re.compile(' <i class="(.*?)"').findall(str(contentWEB[0]))
            for a, b, c, d in zip(id, title, type, likeCount):
                if '万' in str(d):
                    # '万' means "ten thousand", e.g. '1.2万' -> 12000
                    num = d.replace('万', '')
                    d = float(num) * 10000
                    d = int(d)
                try:
                    imgText = ''
                    videoText = ''
                    url = 'https://www.xiaohongshu.com/discovery/item/' + str(a)
                    # url = 'https://www.xiaohongshu.com/discovery/item/60f8be50000000002103c8f8'
                    response = ss.get(url, headers=headersss)
                    content = response.content.decode('utf-8')
                    articleContent = re.compile(
                        '"description": "(.*?)",').findall(str(content))
                    time = re.compile('发布于 (.*?)</span>').findall(str(content))
                    pubTime = datetime.datetime.strptime(
                        time[0], '%Y-%m-%d %H:%M').strftime('%Y-%m-%d %H:%M:%S')
                    if c == 'normal':
                        contentText = re.compile('<body>([\s\S]*?.)<h1').findall(
                            str(content))
                        contentImg = re.compile(':url\((.*?)\?').findall(
                            str(contentText))
                        for i in contentImg:
                            # str.find returns 0 (falsy) when the URL starts
                            # with 'http', so only those hit the else branch
                            if str.find(str(i), 'http'):
                                print()
                            else:
                                imgText += "<img src='" + str(i) + "'></br>"
                    else:
                        content = response.content.decode('utf-8')
                        content = re.compile('<video [\s\S]*?.src="(.*?)"').findall(
                            str(content))
                        ac = content[0].encode('utf-8').decode('unicode_escape')
                        aa = str(ac)
                        aa = aa.replace('&amp;', '&')
                        videoText += '<video src="' + aa + '" controls="controls"></br>'
                    contentText = imgText + videoText + articleContent[0]
                    site = "小红书"
                    siteId = 1048926
                    data = []
                    articleStatue = 0
                    downloadTime = datetime.datetime.now().strftime(
                        '%Y-%m-%d %H:%M:%S')
                    data.append(InsertOne({
                        "url": url,
                        "title": b,
                        "pub_time": pubTime,
                        "content": contentText,
                        "download_time": downloadTime,
                        "site": site,
                        "site_id": siteId,
                        "aid": a,
                        'push_state': articleStatue,
                        'like_num': d,
                    }))
                    insertdb(data)
                except Exception as err:
                    import traceback
                    traceback.print_exc()
        except Exception as err:
            import traceback
            traceback.print_exc()

def my_job():
    # read the data; the original repeated an identical request/insert block
    # three times in nested try/except chains, which collapses to a loop over
    # the three URL variants with the same behavior
    try:
        datas = list(collection.find({"Code": {"$ne": 200}}))
        agentUrl = "http://47.96.91.228:82/get/"
        res = requests.get(agentUrl)
        agenContent = res.content.decode("utf-8")
        dataip = re.compile('"proxy": "(.*?)",').findall(str(agenContent))
        ip = dataip[0]
        proxy = {'https://': ip}
        for line in datas:
            paramsss = dict(line)
            url = paramsss['url'].replace("www.", "")
            urlName = paramsss['urlName']
            candidates = ["http://" + url, "http://www." + url,
                          "https://www." + url]
            data = []
            record = None
            # try the bare http://, http://www. and https://www. forms in turn
            for candidate in candidates:
                try:
                    res = requests.get(candidate, verify=False, proxies=proxy,
                                       allow_redirects=True, timeout=5)
                    httpCode = res.status_code
                    # a 404 is recorded as Code 0; every other status is kept
                    record = {"urlName": urlName, "url": url,
                              "xinUrl": res.url,
                              "Code": 0 if httpCode == 404 else httpCode}
                    break
                except Exception:
                    continue
            if record is None:
                # none of the three variants was reachable
                record = {"urlName": urlName, "url": url,
                          "xinUrl": url, "Code": 0}
            data.append(InsertOne(record))
            insertdb(data)
    except Exception:
        traceback.print_exc()

'''
Created on May 24, 2018

@author: vikasy
'''
import pymongo
import pprint
from pymongo import MongoClient, collection

client = MongoClient()
client = MongoClient('mongodb://localhost:27017/')
db = client.meetingrooms
collection = db.meetingRoomDetails

try:
    bookedRoomsDetails = collection.find({
        "booked": {
            '$elemMatch': {
                'date': '2018-05-25',
                'startTime': '15:30',
                'endTime': '16:30'
            }
        }
    })
    for posts in bookedRoomsDetails:
        pprint.pprint(posts)
    print("Rows fetched")
except Exception as e:
    print(e)

# bookedRoomsDetails = collection.find({'$not': [{'availableSlots': {'$elemMatch': {'startTime': startTime, 'endTime': endTime, 'isAvailable': 0}}}]})