def combine_files():
    """Write one JSON line per restaurant holding all of its review text.

    For every document in ``yelp.restaurants`` the ``text`` of each matching
    ``yelp.review2`` document is concatenated.  Restaurants without any
    review text are skipped.  Every written line of
    ``RestaurantCombinedReviews.json`` is
    ``json.dumps(vars(Business(business_id, combined_text, categories)))``.

    :return: None
    """
    # Both collections live in the same database, so one client suffices.
    yelp = MongoClient('localhost', 29017).yelp
    restaurant_collection = yelp.restaurants
    review2_collection = yelp.review2

    line = 0
    # The context manager closes the file even when a query or dump fails.
    with open("RestaurantCombinedReviews.json", 'w') as output_file:
        for entry in restaurant_collection.find():
            business_id = entry["business_id"]
            category = entry["categories"]
            review2_cursor = review2_collection.find(
                {"business_id": business_id})
            # join() avoids the quadratic cost of repeated concatenation.
            review_text = "".join(
                review["text"] for review in review2_cursor)
            if not review_text:
                continue
            line += 1
            obj = Business(business_id, review_text, category)
            output_file.write(json.dumps(vars(obj)))
            output_file.write("\n")
            if line % 100 == 0:
                # Progress indicator: one number per 100 written lines.
                print(line)
def main():
    """Count word bigrams and trigrams of a text file in MongoDB.

    Reads the UTF-8 file named by ``sys.argv[1]`` line by line.  Every pair
    of adjacent words increments ``grade`` of the matching document in
    ``db.words``; every triple of adjacent words does the same in
    ``db.words2``.  N-grams never span two lines.  Finally all counters
    found in both collections are dumped to ``output.txt``.
    """
    database = MongoClient().db
    words = database.words
    words2 = database.words2

    # Both files are opened up front (as before) and closed on any exit.
    with open(sys.argv[1], encoding='utf-8') as source, \
            open("output.txt", "w", encoding='utf-8') as output:
        for line in source:
            prev = None
            pprev = None
            for word in line.split():
                if prev is not None:
                    # A single upsert replaces the former find/update/insert
                    # sequence: it creates the counter at 1 or increments it.
                    words.update(
                        {"first": prev, "second": word},
                        {"$inc": {"grade": 1}},
                        upsert=True)
                    if pprev is not None:
                        words2.update(
                            {"first": pprev, "second": prev, "third": word},
                            {"$inc": {"grade": 1}},
                            upsert=True)
                pprev = prev
                prev = word

        for i in words.find():
            output.write("first: %s second: %s grade: %s\n"
                         % (i["first"], i["second"], i["grade"]))
        for i in words2.find():
            output.write("first: %s second: %s third: %s grade: %s\n"
                         % (i["first"], i["second"], i["third"], i["grade"]))
class MongoCorpus(SimpleCorpus):
    """Corpus wrapper around a MongoDB collection.

    Subset the corpus by setting a query.  If "aggregate" is used, it
    overrides "query"; in that case filter with "$match" inside the
    aggregation pipeline.
    """

    def __init__(self, db, collection, aggregate=None, query=None):
        """Bind to ``collection`` of database ``db`` on the default client.

        :param aggregate: optional aggregation pipeline (list of stages)
        :param query: optional filter document for ``find``
        """
        self.client = MongoClient()[db][collection]
        # Fresh containers per instance instead of shared mutable defaults.
        self.aggregate_arg = [] if aggregate is None else aggregate
        self.find_arg = {} if query is None else query

    def __iter__(self):
        """Yield every selected document as a dictionary.

        The caller can pick the right key to feed only the document text.
        """
        if len(self.aggregate_arg) == 0:
            collection = self.client.find(self.find_arg,
                                          no_cursor_timeout=True)
        else:
            collection = self.client.aggregate(self.aggregate_arg)
        try:
            for doc in collection:
                yield doc
        finally:
            # Also runs when the consumer stops early, so the no-timeout
            # cursor is not left open on the server.
            collection.close()

    def __len__(self):
        """Return the number of documents the corpus would iterate over."""
        if len(self.aggregate_arg) == 0:
            return self.client.find(self.find_arg).count()
        counting = self.aggregate_arg + [
            {"$group": {"_id": "null", "count": {"$sum": 1}}}]
        # An empty pipeline result yields no group document at all.
        d = next(self.client.aggregate(counting), None)
        return 0 if d is None else d['count']
def pull_mongo_data(db, col):
    """Collect up to ``stories_per_agency`` stories for every source.

    Relies on the module-level names ``stories_per_agency``, ``predict``
    and ``alignment_6``.

    :param db: database name
    :param col: collection name
    :return: list of story documents
    """
    collection = MongoClient()[db][col]

    # Take inventory of the distinct sources present in the collection.
    source_docs = collection.find({}, {"source": 1})
    all_sources = list({story["source"] for story in source_docs})

    stories = []
    for source in all_sources:
        per_source = collection.find(
            {"source": source}).limit(stories_per_agency)
        print(source + " " + str(per_source.count()))
        stories.extend(per_source)

    # Keep only aligned sources when predicting 'aln6'.
    if predict == "aln6":
        stories = [story for story in stories
                   if story["source"] in alignment_6]

    return stories
def get_restaurant_reviews():
    """Dump the reviews that belong to restaurant businesses as JSON lines.

    For every ``business_id`` in ``yelp.restaurants`` the matching
    ``yelp.review2`` documents are read; each review with non-empty text is
    written to ``RestaurantReviews.json`` as
    ``json.dumps(vars(Review(business_id, text, stars)))``.

    :return: None
    """
    yelp = MongoClient('localhost', 29017).yelp
    restaurant_cursor = yelp.restaurants.find(
        {}, {"business_id": 1, "_id": 0})

    line = 0
    # The original never closed this file; ``with`` flushes and closes it.
    with open("RestaurantReviews.json", "w") as output_file:
        for res_entry in restaurant_cursor:
            review_cursor = yelp.review2.find(
                {"business_id": res_entry["business_id"]})
            for review_entry in review_cursor:
                business_id = review_entry["business_id"]
                text = review_entry["text"]
                stars = review_entry["stars"]
                if not text:
                    continue
                line += 1
                obj = Review(business_id, text, stars)
                output_file.write(json.dumps(vars(obj)))
                output_file.write("\n")
                if line % 100 == 0:
                    # Progress indicator every 100 written reviews.
                    print(line)
class TestBucketIntegration(unittest.TestCase):
    """Integration tests for ``bucket.Bucket`` against a live MongoDB."""

    def setUp(self):
        self.db = database.Database(HOST, PORT, DB_NAME)
        self.bucket = bucket.Bucket(self.db, BUCKET)
        # Direct handle used to seed and inspect the bucket's collection.
        self.mongo_collection = MongoClient(HOST, PORT)[DB_NAME][BUCKET]

    def setup__timestamp_data(self):
        """Seed three records, deliberately out of chronological order."""
        fixtures = (
            ('last', d_tz(2013, 3, 1), d_tz(2013, 2, 25)),
            ('first', d_tz(2013, 1, 1), d_tz(2012, 12, 31)),
            ('second', d_tz(2013, 2, 1), d_tz(2013, 1, 28)),
        )
        for doc_id, timestamp, week_start in fixtures:
            self.mongo_collection.save({
                "_id": doc_id,
                "_timestamp": timestamp,
                "_week_start_at": week_start
            })

    def tearDown(self):
        self.mongo_collection.drop()

    def test_that_records_get_sent_to_mongo_correctly(self):
        self.bucket.store(Record({'foo': 'bar'}))

        stored = list(self.mongo_collection.find())

        assert_that(stored, only_contains(has_entries({"foo": "bar"})))

    def test_that_a_list_of_records_get_sent_to_mongo_correctly(self):
        names = ('Groucho', 'Harpo', 'Chico')
        self.bucket.store([Record({'name': name}) for name in names])

        stored = list(self.mongo_collection.find())

        assert_that(stored, only_contains(
            *[has_entries({'name': name}) for name in names]))

    def test_period_queries_get_sorted_by__week_start_at(self):
        self.setup__timestamp_data()

        result = Query.create(period="week").execute(self.bucket.repository)

        expected_starts = (
            d_tz(2012, 12, 31), d_tz(2013, 1, 28), d_tz(2013, 2, 25))
        assert_that(result.data(), contains(
            *[has_entry('_start_at', start) for start in expected_starts]))
class MongoStore(Store):
    """``Store`` implementation backed by a single MongoDB collection."""

    def __init__(self, db, collection, url='mongodb://localhost'):
        client = MongoClient(url)
        self.collection = client[db][collection]

    def fetch(self, oid):
        """Return the document whose ``_id`` is ``oid``, or None."""
        by_id = {'_id': oid}
        return self.collection.find_one(by_id)

    def fetch_all(self):
        """Return a cursor over every stored document."""
        return self.collection.find()

    def iter_ids(self):
        """Yield the ``_id`` of every stored document."""
        id_only = self.collection.find({}, {'_id': True})
        for document in id_only:
            yield document['_id']

    def save(self, obj):
        """Store ``obj`` via the collection's ``save``."""
        self.collection.save(obj)

    def save_many(self, obj_iter):
        """Insert all documents of ``obj_iter`` in one call."""
        self.collection.insert(obj_iter)

    def flush(self):
        """Discard everything by dropping the collection."""
        self.collection.drop()

    def delete(self, oid):
        """Delete the document whose ``_id`` is ``oid``."""
        by_id = {'_id': oid}
        self.collection.delete_one(by_id)
def POST(self):
    """Register a new diagram version and copy the previous one into it.

    Reads the JSON request body (``project_name``, ``viewpoint``,
    ``diagram_name``, ``date``), appends a record to the project's
    ``diagram_versions`` collection and, when an earlier version exists,
    duplicates its non-deleted elements and connections under the new
    version number.

    :return: ``build_response(token=None)`` on success, otherwise
        ``build_response`` carrying an error message.
    """
    error_message = None
    try:
        data = cherrypy.request.json
        client = MongoClient(DATABASE_ADDRESS, DATABASE_PORT)
        db = client[data['project_name']]
        version = db.diagram_versions.count() + 1
        db.diagram_versions.insert({'viewpoint': data['viewpoint'],
                                    'diagram_name': data['diagram_name'],
                                    'diagram_version': str(version),
                                    'date': data['date'],
                                    'deleted': False})
        if version > 1:
            # Insert elements from previous version in new one
            editor = MongoClient().editor
            # The original referenced an undefined name ``ver`` here, so the
            # copy always failed with a swallowed NameError.
            # NOTE(review): versions are written as str in diagram_versions
            # but as int on copied elements; match both until confirmed.
            previous = version - 1
            source_filter = {'viewpoint': data['viewpoint'],
                             'diagram_name': data['diagram_name'],
                             'diagram_version': {
                                 '$in': [previous, str(previous)]},
                             'deleted': False}
            for collection in (editor.diagram_elements,
                               editor.diagram_connections):
                # Materialize first so the loop does not read its own inserts.
                for doc in list(collection.find(source_filter)):
                    # Drop the old _id, otherwise the insert collides with
                    # the document being copied.
                    doc.pop('_id', None)
                    doc['diagram_version'] = version
                    collection.insert(doc)
        return build_response(token=None)
    except Exception:
        if not error_message:
            error_message = EXCEPTION_PROCESSING_ERROR
        return build_response(error_message=error_message, token=None)
class MongoIterator(object):
    """Iterable view over a MongoDB collection with default filter/paging."""

    def __init__(self, uri, db, collection, skip=0, limit=0, filter=None):
        self._collection = MongoClient(uri)[db][collection]
        self._skip = skip
        self._limit = limit
        self._filter = filter

    def __iter__(self):
        return self.stream()

    def stream(self, conditions=None, projection=None, skip=None,
               limit=None):
        """Return a cursor; arguments fall back to the instance defaults.

        :param conditions: filter document (default: the stored filter)
        :param projection: iterable of field names to include
        :param skip: number of documents to skip
        :param limit: maximum number of documents
        """
        if projection:
            fields = {name: 1 for name in projection}
        else:
            fields = {}
        if fields:
            fields['_id'] = False  # skip internal id
        else:
            fields = None
        return self._collection.find(conditions or self._filter,
                                     fields,
                                     skip=skip or self._skip,
                                     limit=limit or self._limit)

    def size(self):
        """Return the document count, honouring the stored filter if set."""
        if self._filter:
            return self._collection.find(self._filter).count()
        return self._collection.count()

    @property
    def filter(self):
        return self._filter

    @filter.setter
    def filter(self, conditions):
        self._filter = conditions
class halo:
    """Fetch and cache the "halo" of URIs from the DBpedia SPARQL endpoint.

    Query templates come from ``../config/config.json``; results are stored
    in the ``halo.halos`` collection, and the URIs to process are read from
    ``semantified.terms``.
    """

    def __init__(self):
        # ``with`` closes the config file handle the original left open.
        with open("../config/config.json") as config_file:
            self.config = json.load(config_file)
        self.sparql = SPARQLWrapper("http://dbpedia.org/sparql")
        self.termDB = MongoClient()["semantified"]["terms"]
        self.halodb = MongoClient()["halo"]["halos"]

    def run(self, query):
        """Execute ``query`` and return its result bindings.

        On any error the query is printed and retried after 60 seconds,
        indefinitely.  The retry is a loop rather than recursion so a long
        outage cannot exhaust the recursion limit, and ``Exception``
        (instead of a bare ``except``) lets Ctrl-C interrupt the wait.
        """
        while True:
            try:
                self.sparql.setQuery(query)
                self.sparql.setReturnFormat(JSON)
                result = self.sparql.query()
                body = result.response.read().encode('ascii', 'ignore')
                fixed_body = body.decode("ascii")
                result = jsonlayer.decode(fixed_body)
                return result["results"]["bindings"]
            except Exception:
                print(query)
                time.sleep(60)

    def makeQuery(self, uri, querykey):
        """Fill ``uri`` into the configured template named ``querykey``."""
        return self.config[querykey] % (uri)

    def insert(self, obj):
        """Upsert ``obj`` into the halo collection, keyed by its ``_id``."""
        self.halodb.update({"_id": obj["_id"]}, obj, True)

    def isprocessed(self, uri):
        """Return True when a halo document for ``uri`` already exists."""
        return self.halodb.find_one({"_id": uri}) is not None

    def getHalo(self, uri):
        """Compute and store the halo of ``uri`` unless already stored."""
        if self.isprocessed(uri):
            print("previously processed uri : " + uri )
            return
        result = self.run(self.makeQuery(uri, "queryone"))
        result.extend(self.run(self.makeQuery(uri, "querytwo")))
        record = {"_id": uri, "uri": uri, "halo": {}}
        for each in result:
            halouri = each["aura"]["value"]
            # Dots are replaced because they are used as document keys.
            record["halo"][halouri.replace(".", "$")] = {
                "halouri": halouri,
                "count": each["auraCount"]["value"],
                "label": each["label"]["value"],
            }
        self.insert(record)
        print("processed halo for : " + uri)

    def getdatadb(self):
        """Return a non-expiring cursor over all term documents."""
        return self.termDB.find(timeout=False)

    def processhalofromdb(self):
        """Process the halo of every URI listed in the term documents."""
        for each in self.getdatadb():
            # An explicit loop: ``map`` is lazy on Python 3 and would never
            # call getHalo there.
            for uri in each["allURI"]:
                self.getHalo(uri)
def most_frequent_pairs(topic_id):
    """Compute the most frequent stemmed word pairs for a Reddit topic.

    Stop words are skipped.  Relies on the module-level names ``fromtime``,
    ``r`` (regex), ``st`` (stemmer), ``stop_words`` and ``itemgetter``.

    :param topic_id: topic id (subreddit_id)
    :type topic_id: str
    :return: list of ``((word_a, word_b), count)`` sorted from most to
        least frequent
    """
    from collections import Counter
    from itertools import combinations

    articles = MongoClient().reddit.articles
    res = articles.find({'subreddit_id': topic_id,
                         'created_utc': {'$gt': fromtime}},
                        {'_id': 0, 'title': 1})

    # Frequency of every stemmed word found in the topic's titles.
    topwords = Counter()
    for art in res:
        words = r.sub(' ', art['title'].lower()).split()  # alphachars only
        stems = [st.stem(w) for w in words
                 if (w not in stop_words) and (len(w) > 1)]
        # stemmed words longer than 2 chars only
        topwords.update(stem for stem in stems if len(stem) > 2)

    # Keep words with frequency >= 3.  A new list is built instead of
    # deleting from the dict while iterating it, which raises RuntimeError
    # on Python 3.
    frequent = [word for word, count in topwords.items() if count >= 3]

    # For each article matched by full-text search, the set of top words
    # it contains.
    matrix = {}
    for word in frequent:
        res = articles.find({'$text': {'$search': word}},
                            {'_id': 0, 'id_reddit': 1})
        for item in res:
            matrix.setdefault(item['id_reddit'], set()).add(word)

    # Count every unordered pair within sets of cardinality >= 2.
    pairs = Counter()
    for found in matrix.values():
        if len(found) >= 2:
            pairs.update(combinations(sorted(found), 2))

    return sorted(pairs.items(), key=itemgetter(1), reverse=True)
class TvrainData:
    """Access to the ``tvrain`` articles and sequences stored in MongoDB."""

    def __init__(self):
        """Connect using ``MONGODB_URL`` and ensure the ``time`` index."""
        # One client serves both collections.
        tvrain = MongoClient(os.environ['MONGODB_URL']).tvrain
        self.sequences = tvrain.sequences
        self.collection = tvrain.articles
        self.collection.create_index("time")

    def get_random_articles(self, n):
        """Return N consecutive articles (by time) from a random offset."""
        total = self.collection.count()
        # The offset is capped so a full window of n articles fits.  The
        # former randint(0, total) could skip everything and return [].
        offset = random.randint(0, max(total - n, 0))
        articles = self.collection.find().sort(
            "time", 1).skip(offset).limit(n)
        return list(articles)

    def get_article_id(self, url):
        """Get id by url.  Fails if no article has that url."""
        return self.collection.find_one({'url': url})['_id']

    def get_articles_data(self, articles_urls):
        """Get data from MongoDB for article urls.

        :param articles_urls: ['article_url', ...]
        :return: list of MongoDB documents (None where a url is unknown)
        """
        return [self.collection.find_one({'url': url})
                for url in articles_urls]

    def iterate_articles(self, except_articles, skip=0, limit=None,
                         query=None):
        """Iterate through all articles except those with excluded ids.

        :param except_articles: list of ids to leave out
        :param skip: number of matching documents to skip
        :param limit: optional maximum number of documents to read
        :param query: optional filter document
        :return: generator of documents
        """
        if query is None:
            query = {}
        data = self.collection.find(query).skip(skip)
        if limit is not None:
            data = data.limit(limit)
        for value in data:
            if value['_id'] not in except_articles:
                yield value

    def get_sequences(self):
        """Return the sequences used for training."""
        # NOTE(review): a negative limit may restrict the result to a
        # single batch/document -- confirm this really returns everything.
        return list(self.sequences.find().limit(-1))
def __call__(self, pair, frame=False):
    """Return raw chart data for ``pair`` from the mongo database.

    The collection is updated/filled from the API first.  The date of each
    candle is stored as its '_id' and the 'date' field is removed.  Use
    'frame' to restrict the amount of data returned, e.g.
    'frame=api.YEAR' returns the last year of data.

    :param pair: market pair; the collection name is ``pair + 'chart'``
    :param frame: look-back window in seconds (default: ten years)
    :return: list of candle documents sorted by '_id'
    """
    if not frame:
        frame = self.api.YEAR * 10
    dbcolName = pair + 'chart'
    # get db connection
    db = MongoClient()['poloniex'][dbcolName]

    # Newest candle of the last 20 minutes, if any.  An explicit emptiness
    # check replaces the former bare ``except`` so that real database
    # errors are no longer mistaken for "no data".
    recent = list(db.find({"_id": {"$gt": time() - 60 * 20}}))
    last = max(recent, key=itemgetter('_id')) if recent else None

    if last is None:
        # no entries found, get all 5min data from poloniex
        logger.warning('%s collection is empty!', dbcolName)
        new = self.api.returnChartData(pair,
                                       period=60 * 5,
                                       start=time() - self.api.YEAR * 13)
    else:
        new = self.api.returnChartData(pair,
                                       period=60 * 5,
                                       start=int(last['_id']))

    # add new candles
    updateSize = len(new)
    logger.info('Updating %s with %s new entrys!',
                dbcolName, str(updateSize))
    for position, candle in enumerate(new, 1):
        # show the progress on a single, rewritten console line
        print("\r%s/%s" % (str(position), str(updateSize)),
              end=" complete ")
        date = candle.pop('date')
        db.update_one({'_id': date}, {"$set": candle}, upsert=True)
    print('')

    logger.debug('Getting chart data from db')
    # return data from db (sorted just in case...)
    return sorted(
        list(db.find({"_id": {"$gt": time() - frame}})),
        key=itemgetter('_id'))
class DisIO:
    """Reads sentences from ``orig.sentences`` and reports overlap scores."""

    def __init__(self):
        self.db = MongoClient(
            'localhost', 27017).get_database('orig').get_collection('sentences')

    def sen_from_mongo(self):
        """Return the ``text`` of all sentence documents concatenated.

        Prints a progress line every 10000 documents.
        """
        pieces = []
        for count, sen in enumerate(self.db.find({}), 1):
            pieces.append(sen['text'])
            if count % 10000 == 0:
                print("mongo" + str(count))
        # A single join instead of quadratic repeated concatenation.
        return "".join(pieces)

    def re_to_text(self, cut):
        """Print the mean ``jieba_overlap`` and ``thulac_overlap`` of ``cut``.

        :param cut: sequence of dicts holding both overlap values
        """
        length = len(cut)
        if length == 0:
            print("NO Results")
            return
        jieba_sum = 0.0
        thulac_sum = 0.0
        for i, item in enumerate(cut):
            if i % 10000 == 0:
                print("dis" + str(i))
            jieba_sum += item["jieba_overlap"]
            thulac_sum += item["thulac_overlap"]
        print("jieba:" + str(jieba_sum / length) + " thulac:" +
              str(thulac_sum / length) + "\n")
def find_users():
    """Search users by their name, surname and birth date."""
    # http://localhost:8080/find_users?name=Luz
    # http://localhost:8080/find_users?name=Luz&surname=Romero
    # http://localhost:8080/find_users?name=Luz&&surname=Romero&birthdate=2006-08-14
    name = request.query.name
    surname = request.query.surname
    birthdate = request.query.birthdate
    print (name, surname, birthdate)

    # Value to filter on for each accepted query parameter.
    accepted = {"name": name, "surname": surname, "birthdate": birthdate}

    consulta = {}
    for x in dict(request.query):
        if x not in accepted:
            # Any unknown parameter aborts the search with the error page.
            return template('error.tpl', error=("La consulta no admite el parametro " + str(x)))
        consulta[x] = accepted[x]

    collection = MongoClient('localhost',27017).giw.usuarios
    cursor = collection.find(consulta)
    count = make_table_ten_columns(cursor, "find_users_result")
    return template('find_users_result.tpl', usuarios = count, name=name, surname=surname, birthdate=birthdate)
class ProductService:
    """CRUD-style access to the ``Store.product`` MongoDB collection."""

    def __init__(self):
        self.schema = Product()
        # Falls back to a local server when MONGODB_URI is unset or empty.
        self.mongodb_uri = os.getenv('MONGODB_URI') or 'localhost:27017'
        self.mongodb_name = 'Store'
        self.mongodb_collection = 'product'
        self.db_connection = MongoClient(self.mongodb_uri).get_database(
            self.mongodb_name).get_collection(self.mongodb_collection)

    def create_product(self, payload):
        """Validate ``payload`` with the schema and insert it.

        :return: the inserted document id
        :raises ValidationError: propagated unchanged from ``schema.load``
        """
        data = self.schema.load(payload)
        return self.db_connection.insert_one(data).inserted_id

    def get_product(self, product_id):
        """Return the dumped, enabled product with the given UUID string.

        :raises FileNotFoundError: when no enabled product matches
        """
        product = self.db_connection.find_one({
            '_id': UUID(product_id),
            'enabled': True
        })
        if product:
            return self.schema.dump(product)
        raise FileNotFoundError()

    def get_all_products(self):
        """Return all enabled products serialized with ``schema.dumps``.

        :raises FileNotFoundError: when there is no enabled product
        """
        # A cursor object is always truthy, so the original emptiness check
        # could never fail; materializing makes the check meaningful.
        products = list(self.db_connection.find({'enabled': True}))
        if products:
            return self.schema.dumps(products, many=True)
        raise FileNotFoundError()
class Ticker(object):
    """Keeps the ``poloniex.ticker`` collection updated from an API object.

    Calling the instance returns the currently stored ticker documents.
    """

    # Class-level defaults make stop() safe before start() was ever called.
    _running = False
    _thread = None

    def __init__(self, api, interval=1):
        """
        :param api: object providing ``returnTicker()``
        :param interval: seconds to sleep between two updates
        """
        self.api = api
        self.db = MongoClient().poloniex['ticker']
        self.interval = interval

    def updateTicker(self):
        """Fetch the ticker once and upsert one document per market."""
        tick = self.api.returnTicker()
        for market in tick:
            self.db.update_one({'_id': market},
                               {'$set': tick[market]},
                               upsert=True)
        logger.info('Ticker updated')

    def __call__(self):
        return list(self.db.find())

    def run(self):
        """Update in a loop until stop() clears the running flag."""
        self._running = True
        while self._running:
            self.updateTicker()
            sleep(self.interval)

    def start(self):
        """Run the update loop in a daemon thread."""
        self._thread = Thread(target=self.run)
        self._thread.daemon = True
        self._thread.start()
        logger.info('Ticker started')

    def stop(self):
        """Ask the update loop to finish and wait for its thread, if any."""
        self._running = False
        if self._thread is not None:
            self._thread.join()
            self._thread = None
        logger.info('Ticker stopped')
class ReadThread(Thread):
    """Thread that issues reads in a loop and publishes a reads/sec figure.

    The rate is written to the module-level ``ReadsPerSec``; the loop ends
    once the module-level ``ShouldExit`` becomes true.
    """

    def __init__(self, host, port, db, coll_name, conn_options):
        Thread.__init__(self)
        client = MongoClient(host, port, **conn_options)
        self.coll = client[db][coll_name]

    def run(self):
        global ReadsPerSec
        global ShouldExit
        ReadsPerSec = 0
        window = timedelta(seconds=1.8)
        window_start = datetime.now()
        completed = 0
        while not ShouldExit:
            # Ten full reads between two clock checks.
            for _ in range(10):
                list(self.coll.find({'not_a_key': {'$ne': 1}}))
                completed += 1
            moment = datetime.now()
            elapsed = moment - window_start
            if elapsed < window:
                continue
            # Publish the rate for the finished window and start a new one.
            ReadsPerSec = completed / elapsed.total_seconds()
            completed = 0
            window_start = moment
def get_events_ids_by_project_id(project_id):
    """Return the ids (as strings) of all events of a project.

    :param project_id: project id as a string; converted to ``ObjectId``
    :return: list of event id strings
    """
    client = MongoClient(mongo_ip, mongo_port)
    event_types_collection = client[mongo_db_name][event_collection_name]
    # Only the _id field is needed from each matching document.
    matching = event_types_collection.find(
        {'project_id': ObjectId(project_id)}, {'_id': 1})
    ids = []
    for document in matching:
        ids.append(str(document['_id']))
    return ids
def main():
    """Split the stored pages into batches and process them in parallel.

    Starts ``GlobalSettings.WORKERS`` processes running ``worker`` with the
    arguments ``(worker_number, offset, size)``; the last worker also takes
    the remainder.  Waits for all of them and prints their exit codes.
    """
    # Get the count of pages stored in MongoDb
    pages_db = MongoClient(GlobalSettings.MONGO_URI)[
        GlobalSettings.DATABASE_DOT][GlobalSettings.COLLECTION_PAGES]
    count = pages_db.find().count()

    # Integer division: offsets and sizes must stay ints on Python 3 too.
    workers = GlobalSettings.WORKERS
    batch, left = divmod(count, workers)

    # Kick off each worker until all batches have completed processing
    jobs = []
    for i in range(workers):
        size = batch
        if i == (workers - 1):
            size += left
        p = multiprocessing.Process(target=worker,
                                    args=((i + 1), i * batch, size))
        jobs.append(p)
        p.start()

    for j in jobs:
        j.join()
        print('%s.exitcode = %s' % (j.name, j.exitcode))
def insert_questions_from_followed_question():
    """Copy followed questions into ``zhihu_network.questions``.

    Every question found in ``zhihu.user_followed_questions`` that is not
    yet known is word-segmented; titles with at least three words are
    inserted as ``{'_id': id, 'title': 'w1 w2 ...'}``.
    """
    in_db = MongoClient().zhihu.user_followed_questions
    out_db = MongoClient().zhihu_network.questions

    # Only the ids are needed, so ask the server for nothing else.
    existed_question_id = {q['_id'] for q in out_db.find({}, {'_id': 1})}

    segmentor = Segmentor()
    segmentor.load("/Users/sunxiaofei/workspace/ltp_data/cws.model")

    for u in in_db.find():
        for q in u['questions']:
            if q['id'] in existed_question_id:
                continue
            # Marked as seen even if it is rejected below, so the same
            # question is segmented at most once per run.
            existed_question_id.add(q['id'])
            title = q['title'].strip().replace(
                '\n', ' ').replace('\r', ' ').replace('\b', ' ')
            words = segmentor.segment(title)
            if len(words) < 3:
                continue
            out_db.insert({'_id': q['id'], 'title': ' '.join(words)})
class Database:
    """Thin wrapper around one collection of the configured database."""

    DATABASE = db_config.DATABASE
    LINK = db_config.MONGO_LINK.format(os.environ.get('LOGIN'),
                                       os.environ.get('PASSWORD'))

    def __init__(self, collection):
        """Bind to ``collection``; exit the process on ConnectionFailure."""
        from pymongo.errors import ConnectionFailure
        try:
            self.db = MongoClient(self.LINK)[self.DATABASE][collection]
            # NOTE(review): depending on the driver version the client may
            # connect lazily, in which case this message does not prove
            # that the server is reachable -- confirm.
            print('Connect to {} successful'.format(collection))
        except ConnectionFailure:
            print('Connecting to {} error'.format(collection))
            sys.exit()

    @staticmethod
    def _as_filter(_id):
        """Turn a bare id into ``{'_id': id}``; pass filters/None through."""
        if _id is None or isinstance(_id, dict):
            return _id
        return {'_id': _id}

    def add_doc(self, doc):
        """Save ``doc`` unless its ``_id`` already exists.

        :return: True when the document was saved, False otherwise
        """
        if self.db.find_one({'_id': doc['_id']}) is None:
            self.db.save(doc)
            return True
        return False

    def change_doc(self, _id, mode):
        """Apply ``{'$set': mode}`` to the document selected by ``_id``.

        ``_id`` may be a filter document or, like in get_one_doc, a bare id.
        """
        self.db.find_one_and_update(self._as_filter(_id), {'$set': mode})

    def delete_doc(self, _id):
        """Delete the document selected by ``_id`` (filter or bare id)."""
        self.db.find_one_and_delete(self._as_filter(_id))

    def get_docs(self, query=None):
        """Return a cursor over the documents matching ``query`` (all if None)."""
        if query is None:
            query = {}
        return self.db.find(query)

    def get_one_doc(self, _id):
        """Return the single document selected by ``_id`` or None."""
        return self.db.find_one(_id)
def process_cursor(skip_n, limit_n):
    """Copy one slice of Yelp business documents from MongoDB to MySQL.

    Only businesses whose categories contain 'Restaurants' and whose state
    is 'ON' or 'BC' are inserted into the ``business`` table.

    :param skip_n: number of MongoDB documents to skip
    :param limit_n: maximum number of MongoDB documents to read
    """
    print('Starting process', skip_n // limit_n, '...')

    # Connect to the MongoDB
    collection = MongoClient().yelp_review.business_data
    cursor = collection.find({}).skip(skip_n).limit(limit_n)

    # Connect to the MySQL
    mydb = mysql.connector.connect(user='******', password='******',
                                   database='yelp_review')
    try:
        mycursor = mydb.cursor()
        try:
            # Parameterized insert; the driver escapes the values.
            sql = ('INSERT INTO business (business_id, name, city, state, '
                   'stars, review_count) VALUES (%s, %s, %s, %s, %s, %s)')
            for doc in cursor:
                if doc['categories'] is None:
                    continue
                if ('Restaurants' in doc['categories']) and (
                        doc['state'] in ['ON', 'BC']):
                    val = (doc['business_id'],
                           # Non-ASCII characters are dropped from the name.
                           doc['name'].encode("ascii", "ignore").decode(),
                           doc['city'], doc['state'], doc['stars'],
                           doc['review_count'])
                    mycursor.execute(sql, val)
                    mydb.commit()
        finally:
            mycursor.close()
    finally:
        # Release the MySQL connection even when an insert fails.
        mydb.close()

    print('Completed process', skip_n // limit_n, '...')
def feature_click(self):
    """Print the 'ttarticle' links found in the text of the wblogs.

    Ensures a unique index on ``wblogId`` when the ``click`` collection is
    empty, then scans the wblogs of ``self.all_wblog`` that are not in
    ``self.swblog`` followed by every document of ``wblog.swblog``.
    """
    def show_article_link(content):
        # Rebuild the link around the 'ttarticle' marker, if present.
        if 'ttarticle' not in content:
            return
        pieces = content.split('ttarticle')
        print('https:' + pieces[0].split(':')[-1] +
              'ttarticle' + pieces[1].split('&')[0])

    col = self.mdb.click
    if not col.find_one():
        logging.info('click为空,设置主键为wblogId')
        col.create_index([('wblogId', pymongo.DESCENDING)], unique=True)

    ws = MongoClient().wblog.swblog
    ww = MongoClient().wblog.wblog

    for wblogId in self.all_wblog:
        if wblogId not in self.swblog:
            wblog = ww.find_one({'wblogId': str(wblogId)})
            show_article_link(wblog['json_text']['text'])

    for wblog in ws.find():
        show_article_link(wblog['json_text']['text'])
def main(): logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) business_ids=["9-pGDHbyIoP_KhguG6vI1Q","ArtsD3RqfCVjIRSZunIh_g","CVakWZjk_j44AB-Jbe0DPQ","iYk5QEI3IZmr25L3QWz4KQ","7Hmr1TDJah-14zprHUMlqw","J6i_Tt4dI7IUTIG9xaC8cg","7Hmr1TDJah-14zprHUMlqw","cjJvvEbpo9b_76hV_lyFXg","rtqtZ0_kOA-GP33mn6-Kpg","T-LhjPRqlS7hLGRmSMBbfA,9pGDHbyPOP-KhjikG6vI1Q","DohsD3RqfCPjIRSZun_Ihg","POakdwajk-j44ABJbe0DPQ","Yk5QEI3IZmr25L3QWz_4KQ","LOP1TDJah14zprHUM_lqw","POMTt4dI7IUTIG_9xaC8cg","JKLsr1TDJah14zprHUMlqw","PPJvvEbpo9b-76hV_lyFXg","MCd_tZ0kOAGP33mn6K-pg","PLLhjPRqlSfdvfvd-MBbfA"] business_id="T-LhjPRqlS7hLGRmSMBbfA" reviews_collection = MongoClient(Settings.MONGO_CONNECTION_STRING)[Settings.REVIEWS_DATABASE][ Settings.TEST_COLLECTION] reviews_cursor = reviews_collection.find({},{'text':1,'business':1}) reviews_cursor2 = reviews_collection.find({},{'business':1}) for reviews in reviews_cursor: if reviews["business"]==business_id: predict=Predict() predict.run(reviews["text"]) """
def run(self, new_review):
    """Infer the topics of ``new_review`` and store their labels.

    The review's lemmatized nouns are turned into a bag of words and passed
    through ``self.lda``.  The label of every returned topic id is
    collected, and the list is written to the ``topic`` field of a test
    review whose ``topic`` is the empty string.
    """
    # Human-readable label(s) for each of the 50 LDA topic ids.
    topics = {
        0: "trip",
        1: "experience",
        2: "location",
        3: ["drinks", "margarita"],
        4: ["dessert", "fancy", "garden"],
        5: "parking",
        6: "facility",
        7: "con",
        8: ["pizza", "delivery", "pie"],
        9: "appointment",
        10: ["bbq", "lunch", "sandwich", "meat", "american"],
        11: ["seafood", "shrimp", "fish", "salmon"],
        12: ["location", "ice cream"],
        13: ["hotdog", "american"],
        14: ["customer service", "experience"],
        15: ["pasta", "calamari"],
        16: "juice",
        17: ["burritto", "student", "refund"],
        18: ["steak", "meat", "lover"],
        19: ["game", "video"],
        20: "security",
        21: ["cheese", "salad", "vegetables"],
        22: ["parking", "tax"],
        23: ["buffet", "price", "quality"],
        24: "pancake",
        25: ["cake", "cupcake", "bakery"],
        26: ["popcorn", "environment"],
        27: "kid-friendly",
        28: ["breakfast", "brunch", "waffle"],
        29: ["music", "friend", "club", "bar"],
        30: ["hotel", "casino"],
        31: "architecture",
        32: "kid-friendly",
        33: "view",
        34: ["service", "price", "staff"],
        35: ["chocolate", "candy", "dessert"],
        36: "crepe",
        37: ["time", "service"],
        38: "online",
        39: "pricey",
        40: "pub",
        41: ["burger", "order", "service"],
        42: ["ambience", "beef"],
        43: ["spicy", "chicken", "meat"],
        44: ["price", "discount"],
        45: ["wine", "service", "dessert"],
        46: ["bar", "night", "bartender"],
        47: ["taco", "mexican"],
        48: ["cafe", "coffee"],
        49: ["party", "birthday"],
    }

    lemmas = self.extract_lemmatized_nouns(new_review)
    bag_of_words = self.dictionary.doc2bow(lemmas)
    new_review_lda = self.lda[bag_of_words]

    reviews_collection = MongoClient(Settings.MONGO_CONNECTION_STRING)[Settings.REVIEWS_DATABASE][
        Settings.TEST_COLLECTION]

    print (new_review_lda)
    # Each entry is unpacked as a pair; only its first item is used.
    answer = [topics[topic_id] for topic_id, _ in new_review_lda]
    print (new_review_lda)

    reviews_collection.update({'topic': ""}, {'$set': {"topic": answer}})
def find_birth_month():
    """Search the users born in a given month.

    Exactly one query parameter, ``month``, holding a Spanish month name is
    accepted; anything else renders the error page.
    """
    # http://localhost:8080/find_birth_month?month=abril
    month = request.query.month
    # Month name -> the "-MM-" fragment looked for inside the birth date.
    parser = {"enero": "-01-", "febrero": "-02-", "marzo": "-03-",
              "abril": "-04-", "mayo": "-05-", "junio": "-06-",
              "julio": "-07-", "agosto": "-08-", "septiembre": "-09-",
              "octubre": "-10-", "noviembre": "-11-", "diciembre": "-12-"}
    print(month)

    # A request without any parameter used to fall through the validation
    # loop and crash on an unbound ``consulta``; it is now rejected like
    # every other invalid request.
    if list(dict(request.query)) != ["month"] or month not in parser:
        return template('error.tpl', error="Los parametros introducidos son incorrectos")
    consulta = parser[month]
    print(consulta)

    collection = MongoClient('localhost', 27017).giw.usuarios
    cursor = collection.find(
        {"birthdate": {"$regex": consulta}}).sort("birthdate", 1)
    count = make_table_ten_columns(cursor, "find_birth_month_result")
    return template('find_birth_month_result.tpl', usuarios=count, month=month)
def get_training_reviews(size_of_training_review=None):
    """Write training records from the review2 collection to a JSON file.

    Records with empty text are skipped.  The file
    ``TrainingReviews_2.json`` holds a JSON array of the form::

        [
            {"text": "Sample Text","label":"1"},
            {"text": "Sample Text","label":"2"}
        ]

    where each element is ``json.dumps(vars(TrainReview(text, stars)))``.

    :param size_of_training_review: maximum number of records to write,
        e.g. 1000 or 2000; None writes every record with text
    :return: None
    """
    review_collection = MongoClient('localhost', 29017).yelp.review2
    review_cursor = review_collection.find()

    line = 0
    # The previous record is held back until it is known whether another
    # one follows.  Only then is the separating comma written, so the
    # array stays valid JSON even when the collection holds fewer records
    # than requested.
    pending = None
    with open("TrainingReviews_2.json", "w") as training_file:
        training_file.write("[\n")
        for entry in review_cursor:
            if (size_of_training_review is not None
                    and line >= size_of_training_review):
                break
            text = entry["text"]
            rating = entry["stars"]
            if not text:
                continue
            if pending is not None:
                training_file.write("\t" + pending + ",\n")
            pending = json.dumps(vars(TrainReview(text, rating)))
            line += 1
            if line % 100 == 0:
                # Progress indicator every 100 records.
                print(line)
        if pending is not None:
            training_file.write(pending)
        training_file.write("]")
def find_leap_year():
    """Search users born in a leap year whose credit card expires in a year.

    Exactly one query parameter, ``exp``, holding a two-character expiry
    year is accepted; anything else renders the error page.
    """
    # http://localhost:8080/find_leap_year?exp=20
    exp = request.query.exp

    # A request without any parameter used to fall through the validation
    # loop and crash on an unbound ``consulta``; it is now rejected like
    # every other invalid request.
    if list(dict(request.query)) != ["exp"] or len(exp) != 2:
        return template('error.tpl', error="Los parametros introducidos son incorrectos")
    consulta = exp

    # Server-side JavaScript predicate: true when the first four characters
    # of "birthdate" form a leap year.
    bisiesto = """function() { if ("birthdate" in this) { let year = Number(this["birthdate"].substr(0, 4)); if (year%4==0 && (!(year%100==0) || (year%400==0))) return true; else return false; } else return false }"""

    collection = MongoClient('localhost', 27017).giw.usuarios
    cursor = collection.find({"credit_card.expire.year": consulta,
                              "$where": bisiesto})
    count = make_table_ten_columns(cursor, "find_leap_year_result")
    return template('find_leap_year_result.tpl', usuarios=count, exp=consulta)
def main():
    """Run topic detection on the news corpus and log the topic matrix.

    The matrix returned by ``FindTopics.run()`` is written, followed by a
    newline, to ``.\\lda_topics2.log``.
    """
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    # Connect to MongoDB and read the data to be classified.
    corpus_collection = MongoClient(
        "mongodb://39.108.180.114:27017")["ennews"]["news"]
    reviews_cursor = corpus_collection.find(no_cursor_timeout=True)
    try:
        # Classify.
        find_topics = FindTopics(corpus_collection, reviews_cursor)
        topics_matrix = find_topics.run()

        # Output the topic matrix.  Writing to the file directly replaces
        # the former sys.stdout swap, which would have left stdout
        # redirected if anything failed in between.  The backslash is now
        # escaped explicitly; the path is unchanged.
        with open(".\\lda_topics2.log", "w") as log_file:
            log_file.write(str(topics_matrix) + "\n")
    finally:
        # The cursor has no server-side timeout, so always close it.
        reviews_cursor.close()
class RemoteIO:
    """Reads sentences from the remote ``splited_sentences`` collection."""

    def __init__(self):
        time_counter(print_to_console=False)
        print("初始化 RemoteIO")
        self.db = MongoClient('192.168.68.11', 20000).get_database(
            "tokenizer_qiao").get_collection('splited_sentences')
        self.sentence_size = self.db.find().count()
        # Upper bound of the random jump; halved on every wrap-around.
        self.step = self.sentence_size
        self.skip = 0
        time_counter("初始化完毕")

    def read_sentence_randomly(self):
        """Return the document a random jump ahead of the current position.

        The returned document's ``analysed`` counter is incremented.

        :return: the document, or None when the step has shrunk to zero or
            no document exists at the chosen position
        """
        # Wrap around with a halved step once the window would pass the end.
        while self.skip + self.step >= self.sentence_size:
            print("skip:%d, step:%d, size:%d" %
                  (self.skip, self.step, self.sentence_size))
            if self.step == 0:
                return None
            self.skip = 0
            self.step = int(self.step / 2)

        if self.step + self.skip >= self.sentence_size:
            return None

        random_step = random.randint(0, self.step)
        pipeline = [{"$skip": self.skip + random_step}, {"$limit": 1}]
        self.skip += random_step
        docs = list(self.db.aggregate(pipeline))
        if not docs:
            # Nothing at that offset; the original went on to index None
            # here and raised TypeError.
            return None
        doc = docs[0]
        self.db.update({"_id": doc["_id"]}, {"$inc": {"analysed": 1}})
        time_counter("已获取到")
        return doc

    def read_sentence_from_remote(self):
        """Return a cursor over all sentence documents."""
        return self.db.find()
def load_tags(size, start):
    """POS-tag ``size`` reviews beginning at offset ``start``.

    Each review text is lower-cased, split into sentences and tokens,
    stripped of English stop words and punctuation, and tagged.  One
    document per review is inserted into the tags collection with the
    review/business ids, the original text and the tagged words.

    :param size: number of reviews to process
    :param start: number of reviews to skip before the first one
    """
    database = MongoClient(Settings.MONGO_CONNECTION_STRING)[
        Settings.BUSINESS_DATABASE]
    reviews_collection = database[Settings.REVIEWS_COLLECTION]
    tags_collection = database[Settings.TAGS_REVIEWS_COLLECTION]

    # Sets give constant-time membership tests for every token.
    stopwords = set(nltk.corpus.stopwords.words('english'))
    puncs = set(string.punctuation)

    batch_size = 1000
    for batch in range(0, size, batch_size):
        # The last batch is shortened so that exactly ``size`` reviews are
        # read instead of overrunning into the next worker's range.
        wanted = min(batch_size, size - batch)
        # NOTE(review): skip/limit without a sort presumably relies on a
        # stable natural order -- confirm.
        reviews_cursor = reviews_collection.find().skip(
            start + batch).limit(wanted)
        for review in reviews_cursor:
            words = []
            for sentence in nltk.sent_tokenize(review["text"].lower()):
                tokens = nltk.word_tokenize(sentence)
                content = [
                    token for token in tokens
                    if token not in stopwords and token not in puncs
                ]
                words.extend({"word": word, "pos": tag}
                             for word, tag in nltk.pos_tag(content))
            tags_collection.insert({
                "review_id": review["review_id"],
                "business_id": review["business_id"],
                "text": review["text"],
                "words": words
            })
def cal_dimensions():  # compute every dimension
    """Derive location, job and ability for each LinkedIn profile.

    Reads every document of ``linkedin.linkedin_userinfo`` and inserts one
    document per profile into ``shulianxunying.linkedin_dimension`` with
    the keys ``location``, ``job``, ``ability``, ``character`` and ``uid``.
    """
    conn = MongoClient("192.168.4.250", 27017)['linkedin']['linkedin_userinfo']
    write_location = MongoClient("192.168.2.254", 27017)['shulianxunying']['linkedin_dimension']
    count = 0
    # Kept for its side effect of (re)creating the log file; nothing is
    # written to it in this function.  It is now closed on exit.
    logfd = open("log_linkedin.txt", 'w')
    try:
        for cv in conn.find(timeout=False):
            post = {}
            count += 1
            if count % 1000 == 0:
                print(count)

            # The original initialised the industry value twice and never
            # the location.  Profiles without a location then raised
            # NameError or silently reused the previous profile's value.
            BasicInfo_location_highlight = ''
            BasicInfo_industry_highlight = ''
            if 'BasicInfo' in cv:
                if "location_highlight" in cv['BasicInfo']:
                    BasicInfo_location_highlight = cv['BasicInfo']['location_highlight']
                if "industry_highlight" in cv["BasicInfo"]:
                    BasicInfo_industry_highlight = cv['BasicInfo']['industry_highlight']

            positionsMpr = cv.get('positionsMpr', [])
            educationsMpr = cv.get('educationsMpr', [])
            skillsMpr = cv.get('skillsMpr', [])
            honorsMpr = cv.get('honorsMpr', [])
            projectsMpr = cv.get('projectsMpr', [])
            patentsMpr = cv.get('patentsMpr', [])

            # Work location.
            location = cal_location(BasicInfo_location_highlight,
                                    positionsMpr, educationsMpr)
            post['location'] = location if location else 'null'

            # Career direction.
            job = cal_job(positionsMpr, skillsMpr,
                          BasicInfo_industry_highlight)
            post['job'] = job if job else 'null'

            # Professional ability.
            ability = major_ability(job, skillsMpr, positionsMpr,
                                    educationsMpr, honorsMpr, projectsMpr,
                                    patentsMpr)
            post['ability'] = ability if ability else 0.

            # Professional character (cannot be characterised for now).
            post['character'] = 'null'
            post['uid'] = cv['_id']
            write_location.insert(post)
    finally:
        logfd.close()
def gen_review_coll_with_id():
    """
    Copy the review2 collection into review_counter, adding a sequential id.

    The id ("counter", starting at 1) was required to process the records in
    parallel when calculating the sentiment; the resulting collection is used
    by "ParallelProcess.py". Only business_id, text, stars, review_id and
    user_id are copied from each source record.
    :return: None
    """
    # One client serves both the source and the target collection.
    db = MongoClient('localhost', 29017).yelp
    review_counter = db.review_counter
    copied_fields = ("business_id", "text", "stars", "review_id", "user_id")

    for counter, entry in enumerate(db.review2.find(), 1):
        record = {field: entry[field] for field in copied_fields}
        record["counter"] = counter
        review_counter.insert(record)
        # Report progress after the insert so the printed number equals the
        # number of records actually written.
        if counter % 100 == 0:
            print(counter)
def get_all_from_mongo(dataset):
    """Return, as a list, every document of ``lSSVM.base`` whose
    ``dataset_name`` equals ``dataset``."""
    experiments = MongoClient('localhost', 27018)['lSSVM']['base']
    return list(experiments.find({'dataset_name': dataset}))
def loop():
    """Run the hourly refresh cycle forever.

    Each cycle starts a new object via ``run_osrm`` on the port selected by
    ``flag``, flips ``flag``, terminates the previously started object, reads
    all orders, appends a fresh pair of ``Mu(2)`` objects to ``mus`` (keeping
    only the newest pair) and then sleeps for one hour.
    """
    p = []
    while True:
        p.append(run_osrm(osrm_port.value + 1 - flag.value))
        flag.value = 1 - flag.value
        if len(p) > 1:
            p[0].terminate()
            p = p[1:]

        orders = MongoClient()[db_name][order['collection']['name']]
        # A cursor can be consumed only once; materialise it so that the
        # second pass below sees the same orders instead of nothing.
        _orders = list(orders.find({}))

        r = [[
            *o['receiver']['coordinates'],
            time.mktime(o['timeline']['init']['at'].timetuple())
        ] for o in _orders]
        rm = Mu(2)
        if False:  # feeding the model is switched off in the original code
            rm.extend(np.array(r))

        t = [[
            *o['transmitter']['coordinates'],
            time.mktime(o['timeline']['init']['at'].timetuple())
        ] for o in _orders]
        tm = Mu(2)
        if False:  # feeding the model is switched off in the original code
            tm.extend(np.array(t))

        mus.extend([tm, rm])
        # Drop older pairs so that only the most recent (tm, rm) remains.
        while len(mus) > 2:
            mus.pop(0)
            mus.pop(0)
        time.sleep(60 * 60)
def run(host, database, collection, field=None, value=None):
    """Return a JSON string of the documents selected by ``data.<field>``.

    The collection ``database.collection`` on ``host`` is queried with
    ``{"data.<field>": {"$in": [value]}}`` and the resulting cursor is
    serialised with ``bson.json_util.dumps``.
    """
    graph = MongoClient(host)[database][collection]
    key = "data.%s" % (field)
    matches = graph.find({key: {"$in": [value]}})
    return bson.json_util.dumps(matches)
def save_mongo(item):
    """Insert ``item`` into ``db_newGroup.heneng`` unless a document with the
    same ``source_url`` is already stored.

    Prints '已存在' ("already exists") when the item is skipped.
    """
    conn = MongoClient(host='localhost', port=27017)['db_newGroup']['heneng']
    # find_one stops at the first match instead of counting all of them.
    if conn.find_one({'source_url': item['source_url']}) is not None:
        print('已存在')
        return
    conn.insert_one(item)
class DB:
    """Thin wrapper around the ``local.connections`` MongoDB collection."""

    def __init__(self):
        self.collection = MongoClient().local.connections

    def REMOVEALL(self):
        """Delete every document in the collection."""
        self.collection.delete_many({})

    def remove(self, connection, field):
        """Delete the documents matching ``connection`` on the chosen field.

        :param connection: object providing ``ip`` and ``hostname``
        :param field: selector; a value containing "ip" deletes by ip,
            otherwise one containing "hostname" deletes by hostname
        :return: True when a delete was issued, False for an unknown field
        """
        if "ip" in field:
            self.collection.delete_many({"ip": connection.ip})
        elif "hostname" in field:
            self.collection.delete_many({"hostname": connection.hostname})
        else:
            return False
        return True

    def insert(self, connection):
        """Store the ip, hostname and stringified unique id of ``connection``.

        The socket itself is not stored. The previous attempt to JSON-encode
        it was unused and raised before anything could be inserted.
        """
        self.collection.insert_one({
            "ip": connection.ip,
            "hostname": connection.hostname,
            "uniq": str(connection.unique)
        })

    def getAllConnectionsPrint(self):
        """Return all stored documents as a list."""
        return list(self.collection.find())

    def getCollection(self):
        """Return the underlying collection object."""
        return self.collection
def show(self):
    """
    Build a reduced retweet graph for display in the UI.

    Samples 500 users and 500 wblogs, then fills ``self.retweet_edge`` with,
    for every sampled user, the sampled wblog ids found in that user's
    ``paMid``/``orMid`` columns, plus the ids from ``self.swblog`` on which
    the user appears in the comment collection.
    :return:
    """
    self.all_user = random.sample(self.all_user, 500)
    self.all_wblog = random.sample(self.all_wblog, 500)

    for uid in self.all_user:
        self.retweet_edge[uid] = []
        rows = self.sqlhelper.select_sql(
            'SELECT paMid, orMid FROM wblog WHERE uid=%s' % uid)
        for row in rows:
            # Parent id first, then original id, as strings.
            for mid in (str(row[0]), str(row[1])):
                if mid in self.all_wblog:
                    self.retweet_edge[uid].append(mid)

    mdb = MongoClient().comment.comment
    for wblogId in self.swblog:
        for res in mdb.find({'wblogId': wblogId}):
            try:
                uid = res['json_text']['user']['id']
                known_user = uid in self.retweet_edge
                if known_user and wblogId not in self.retweet_edge[uid]:
                    self.retweet_edge[uid].append(wblogId)
            except Exception as e:
                logging.error('%s. The wblogId is %s' % (e, str(wblogId)))
def people(bot, update, args):
    """Send the chat up to five people matching a full-text search.

    ``args`` are joined with spaces into the search string, which is run as a
    ``$text`` query against ``jacobs.jpeople``. One message per match is sent
    with ``bot.send_message`` to the chat the update came from.
    """
    limit = 5
    chat_id = update.message.chat_id
    collection = MongoClient('mongodb://localhost:27017/')['jacobs']['jpeople']
    query_str = " ".join(args)
    print(f'query for people: {query_str}')

    # Apply the cap in the query itself. The previous counter was never
    # incremented, so every match was sent.
    results = collection.find({'$text': {'$search': query_str}}).limit(limit)

    # Build all messages before sending any, so a malformed document does not
    # leave the chat with a partial answer.
    msgs = []
    for result in results:
        first_name = result['firstName']
        last_name = result['lastName']
        email = result['email']
        year = result['year']
        major = result['majorShort']
        college = result['college']
        room = result['room']
        country = result['country']
        reply_str = f'Name: {first_name} {last_name}\n'
        reply_str += f'Email: {email}, \nYear: {year},\nMajor: {major}, \nCountry: {country}\n'
        reply_str += f'College: {college},\nRoom: {room}'
        msgs.append(reply_str)

    for msg in msgs:
        bot.send_message(chat_id=chat_id, text=msg)
def _getDocById(self, collObj: MongoClient, userId):
    """Return the document whose "id" equals userId -- an empty dict when no
    such document exists."""
    matches = list(collObj.find({"id": userId}))
    # At most one document is expected per id, so only the first is used.
    return matches[0] if matches else {}
def email_birthdate():
    """Find users whose birth date lies between two dates.

    Example request:
    http://localhost:8080/find_email_birthdate?from=1973-01-01&to=1990-12-31

    Exactly the parameters ``from`` (inclusive) and ``to`` (exclusive) must be
    supplied; anything else renders the error template.
    """
    params = dict(request.query)
    # Validate before reading the values, so that a missing parameter yields
    # the error page rather than an unhandled KeyError.
    if set(params) != {"from", "to"}:
        return template('error.tpl', error="Los parametros introducidos son incorrectos")

    ini = request.query['from']
    fin = request.query['to']
    print (ini, fin)

    collection = MongoClient('localhost', 27017).giw.usuarios
    query = {"birthdate": {"$gte": ini, "$lt": fin}}
    cursor = collection.find(query, {"id": 1, "email": 1, "birthdate": 1})
    count = make_table_three_columns(cursor, "email_birthdate_result")
    return template('email_birthdate_result.tpl', usuarios=count, fecha1=ini, fecha2=fin)
class DisIO:
    """Reads sentences from MongoDB and writes segmentation comparisons."""

    def __init__(self):
        self.db = MongoClient('localhost', 20000).get_database('orig').get_collection('sentences')

    def sen_from_mongo(self):
        """Return the ``text`` of every stored sentence, concatenated."""
        return "".join(sen['text'] for sen in self.db.find({}))

    def re_to_text(self, path, cut=None):
        """Append a per-sentence report and an average line to ``path``.

        :param path: file to append to (UTF-8)
        :param cut: sequence of dicts with the keys ``sentence``, ``result``,
            ``jieba``, ``jieba_overlap``, ``thulac`` and ``thulac_overlap``;
            ``None`` or an empty sequence writes only the summary line
        """
        entries = [] if cut is None else cut
        length = len(entries)
        jieba_sum = 0.0
        thulac_sum = 0.0
        with open(path, 'a', encoding='utf-8') as dis:
            for entry in entries:
                jieba_sum += entry["jieba_overlap"]
                thulac_sum += entry["thulac_overlap"]
                dis.write("origin: " + entry["sentence"] + "\n")
                dis.write("result: " + str(entry["result"]) + "\n")
                dis.write("jieba: " + str(entry["jieba"]) + " " + str(entry["jieba_overlap"]) + "\n")
                dis.write("thulac: " + str(entry["thulac"]) + " " + str(entry["thulac_overlap"]) + "\n\n")

            # Compute both averages up front. Writing them as inline
            # conditional expressions inside one concatenation mis-parsed and
            # dropped most of the summary line.
            if length == 0:
                jieba_avg = thulac_avg = "n/a"
            else:
                jieba_avg = str(jieba_sum / length)
                thulac_avg = str(thulac_sum / length)
            dis.write("jieba:" + jieba_avg + " thulac:" + thulac_avg + "\n")
def find_likes_not_ending():
    """Find users none of whose hobbies end with a given suffix.

    Example request: http://localhost:8080/find_likes_not_ending?ending=s

    ``ending`` must be the only query parameter; anything else renders the
    error template.
    """
    import re

    ending = request.query.ending
    print (ending)

    # Reject missing or unexpected parameters up front; previously a request
    # without parameters reached the query with `consulta` unbound.
    if set(dict(request.query)) != {"ending"}:
        return template('error.tpl', error="Los parametros introducidos son incorrectos")

    # Anchor the pattern at the end of the string so it means "ends with"
    # rather than "contains", and escape the user-supplied suffix so it is
    # matched literally.
    consulta = ".*" + re.escape(ending.lower()) + "$"

    query = {"likes": {"$not": {"$elemMatch": {"$regex": consulta}}}}
    collection = MongoClient('localhost', 27017).giw.usuarios
    cursor = collection.find(query)
    count = make_table_ten_columns(cursor, "find_likes_not_ending_result")
    return template('find_likes_not_ending_result.tpl', usuarios=count, ending=ending)
class CorpusIO:
    """Reads training edges from MongoDB and (de)serialises corpora as JSON."""

    def __init__(self):
        self.db = MongoClient(
            'localhost', 27017).get_database('chinese').get_collection('train_edges')

    def read_from_mongo(self, limit=20):
        """Yield ``(src_name, des_name, weight)`` tuples from the collection.

        :param limit: maximum number of edges to yield; ``None`` yields all.
            Exactly ``limit`` edges are produced (the previous check let one
            extra edge through).
        """
        for cnt, doc in enumerate(self.db.find({}), 1):
            if limit is not None and cnt > limit:
                break
            if cnt % 10000 == 0:
                print(cnt)
            yield (doc['src_name'], doc['des_name'], doc['weight'])

    def save_as_json(self, corpus_json, path):
        """Write ``corpus_json`` to ``path`` as UTF-8 JSON."""
        # The context manager guarantees the data is flushed and the handle
        # released even if serialisation fails.
        with open(path, 'w', encoding='utf-8') as file:
            json.dump(corpus_json, file)
        print('corpus network saved to %s' % path)

    def load_as_json(self, path):
        """Return the object stored as UTF-8 JSON in ``path``."""
        with open(path, 'r', encoding='utf-8') as file:
            return json.load(file)
def test_iteration(self, collection, num_topics, file):
    """Collect up to 10000 tweets that have text and at least one hashtag.

    Scans ``twitter.<collection>`` and gathers ``(text, _id, hashtags)``
    tuples together with the set of distinct hashtags.

    NOTE(review): in the visible code the collected values are neither
    returned nor stored, and ``num_topics`` / ``file`` are unused -- confirm
    whether the remainder of this function is missing.
    """
    dbcollections = MongoClient().twitter[collection]
    tweets = []
    count = 0
    unique_hashtags = set()
    allcount = 0
    for tweet in dbcollections.find({}, {"_id": 1, "entities": 1, "text": 1}):
        allcount += 1
        # `in` replaces dict.has_key, which does not exist in Python 3.
        if 'text' not in tweet:
            continue
        hashtags = []
        if 'entities' in tweet and 'hashtags' in tweet['entities']:
            hashtags = [tag['text'] for tag in tweet['entities']['hashtags']]
            unique_hashtags.update(hashtags)
        if not hashtags:
            continue
        tweets.append((tweet['text'], tweet['_id'], hashtags))
        count += 1
        if count == 10000:
            break
class EventsTestMixin(object):
    """
    Helpers and setup for running tests that evaluate events emitted
    """

    def setUp(self):
        super(EventsTestMixin, self).setUp()
        self.event_collection = MongoClient()["test"]["events"]
        # Every test starts from an empty events collection.
        self.event_collection.drop()
        self.start_time = datetime.now()

    def assert_event_emitted_num_times(self, event_name, event_time, event_user_id, num_times_emitted):
        """
        Tests the number of times a particular event was emitted.

        :param event_name: Expected event name (e.g., "edx.course.enrollment.activated")
        :param event_time: Latest expected time, after which the event would fire
            (e.g., the beginning of the test case)
        :param event_user_id: user_id expected in the event
        :param num_times_emitted: number of times the event is expected to appear
            since the event_time
        """
        criteria = {
            "name": event_name,
            "time": {"$gt": event_time},
            "event.user_id": int(event_user_id),
        }
        matching = self.event_collection.find(criteria).count()
        self.assertEqual(matching, num_times_emitted)
def find():
    """Dump selected fields of every document to ``log.txt`` as CSV.

    Writes a header line followed by one line per document of
    ``shulianxunying.combine_v4_dimension`` containing experience,
    changefreq, worktime, ability, refresh and uid.
    """
    # Change the source database address here.
    conn = MongoClient("192.168.4.249", 27017)['shulianxunying']['combine_v4_dimension']
    columns = ('experience', 'changefreq', 'worktime', 'ability', 'refresh')

    # The context manager closes the file even if a document lacks a field.
    with open("log.txt", 'w') as fd:
        fd.write("experience,changefreq,worktime,ability,refresh,uid\n")
        # NOTE(review): `timeout=False` is the legacy cursor option; confirm
        # the installed PyMongo version before changing it.
        for cv in conn.find(timeout=False):
            # uid is appended as-is (not passed through str()), as before.
            fields = [str(cv[name]) for name in columns] + [cv['uid']]
            fd.write(",".join(fields) + "\n")
def get_sample_reviews(ratings=None, size=None):
    """
    Function used to query the Mongo Collection for a particular rating and
    number of records and append the review text and rating to the JSON-lines
    file "Ratings.json". The review text is whitespace-normalised first.
    Nothing is queried or written unless both arguments are truthy, but the
    file is still opened (and therefore created) in that case.
    :param ratings: Review rating in the Mongo Collection
    :param size: Number of records to be retrieved
    :return: None
    """
    combined_collection = MongoClient('localhost', 29017).yelp.RestaurantReviews
    file_name = "Ratings.json"
    # The context manager releases the handle even when a query or a record
    # fails part-way through.
    with open(file_name, 'a') as ratings_file:
        if not (ratings and size):
            return
        combined_cursor = combined_collection.find(
            {"stars": ratings},
            {"text": 1, "stars": 1, "_id": 0},
        ).limit(size)
        for entry in combined_cursor:
            # Collapse any run of whitespace in the review to a single space.
            normalised = ' '.join(entry["text"].split())
            obj = SampleEntry(normalised, entry["stars"])
            ratings_file.write(json.dumps(vars(obj)))
            ratings_file.write("\n")
def player():
    """Play queued tracks one after another, forever.

    Writes the current pid to ``PID_FN`` and then polls the
    ``for-music-player.queue`` collection. The oldest entry (by ``date``)
    matching ``FILTER`` is downloaded with wget into a temporary .mp3 file,
    played with ``PLAY_AUDIO_COMMAND`` and marked as played. When the queue is
    empty the loop sleeps ``WAIT_SEC`` seconds. When ``MAX_ITERATION_NUM`` is
    non-negative the process exits once that many iterations have elapsed
    and the queue is not empty.
    """
    import os

    logging.basicConfig(level=logging.INFO)
    with open(PID_FN, "w") as f:
        f.write(str(getpid()))

    coll = MongoClient()["for-music-player"].queue
    i = 0
    while True:
        i += 1
        # One query both tests for emptiness and fetches the entry, so the
        # queue cannot become empty between the check and the fetch.
        obj = coll.find_one(filter=FILTER, sort=[("date", 1)])
        if obj is None:
            print(f"> queue empty. wait {WAIT_SEC} sec...")
            sleep(WAIT_SEC)
        elif MAX_ITERATION_NUM >= 0 and i >= MAX_ITERATION_NUM:
            exit()
        else:
            fd, fn = mkstemp(suffix=".mp3")
            try:
                coll.update_one({"_id": obj["_id"]},
                                {"$set": {"start": datetime.now()}})
                # NOTE(review): obj['path'] is interpolated into a command
                # string; if queue entries can be untrusted this is an
                # injection risk -- confirm how myexec runs the command.
                myexec(f"wget \"{obj['path']}\" -O \"{fn}\"")
                myexec(f"{PLAY_AUDIO_COMMAND} \"{fn}\"")
            finally:
                close(fd)
                # Remove the download; otherwise every played track leaves a
                # file behind in the temp directory.
                try:
                    os.remove(fn)
                except OSError:
                    pass
            coll.update_one({"_id": obj["_id"]},
                            {"$set": {"played": True, "end": datetime.now()}})
def validate_all_human_protein():
    """Run every human protein record through the validator.

    Selects the ``wikidata_src.mygene`` documents with taxid 9606 that have
    an ``entrezgene`` field, prints how many there are, and pushes them
    through ``HelperBot.validate_docs`` and ``HelperBot.tag_mygene_docs``.
    """
    mygene = MongoClient().wikidata_src.mygene
    sources = MongoClient().wikidata_src.mygene_sources
    metadata = sources.find_one()

    human_with_entrez = {'taxid': 9606, 'entrezgene': {'$exists': True}}
    cursor = mygene.find(human_with_entrez)
    total = mygene.find(human_with_entrez).count()
    print("total number of records: {}".format(total))

    validated = HelperBot.validate_docs(cursor, 'eukaryotic', 'P351')
    # Consume the result completely; the values themselves are not needed.
    for _ in HelperBot.tag_mygene_docs(validated, metadata):
        pass
def main():
    """Scrape every row of "22nd feb.csv" and store the results.

    For each row ``gather_information`` is called. A successful result is
    inserted into ``tubules.members``. With error code 1 the result's data is
    treated as a list of registration numbers: those not yet in the database
    are fetched with ``multiple_regs`` and the successful ones inserted.
    Any other failure only prints the message.
    """
    col = MongoClient()["tubules"]["members"]
    # newline="" is what the csv module expects, so that line breaks inside
    # quoted fields are parsed correctly.
    with open("22nd feb.csv", "r", newline="") as in_file:
        for index, row in enumerate(csv.reader(in_file)):
            print("{}/{}".format(index, N_ROWS))
            scraped_data = gather_information(row)

            if scraped_data["success"]:
                r = col.insert_one(scraped_data["data"])
                print(r.inserted_id)
            elif scraped_data["error_code"] == 1:
                print(scraped_data["message"])
                print(scraped_data["data"])
                in_db = [
                    reg["Registration Number"]
                    for reg in col.find(
                        {"Registration Number": {"$in": scraped_data["data"]}},
                        {"Registration Number": 1})
                ]
                print(in_db)
                if len(in_db) < len(scraped_data["data"]):
                    missing = [regs for regs in scraped_data["data"]
                               if regs not in in_db]
                    responses = multiple_regs(missing)
                    try:
                        r = col.insert_many(
                            [response["data"] for response in responses
                             if response["success"]])
                        print(r.inserted_ids)
                    except Exception as e:
                        # Best effort: report the failure and go on with the
                        # next row.
                        print(e.args)
                else:
                    print("None saved to Database")
            else:
                print(scraped_data["message"])
def parent():
    """Fork two workers and wait until both have signed off.

    Two marker documents (``_id`` 1 and 2) are inserted first. Each forked
    child runs ``child1`` / ``child2`` and then deletes its own marker via a
    client created after the fork. The parent polls every ``SLEEP`` seconds
    until no marker is left.
    """
    SLEEP = 10
    p = MongoClient().client["MP"].p
    p.drop()
    p.insert_one({"_id": 1})
    p.insert_one({"_id": 2})

    isParent = True
    newpid1 = os.fork()
    if newpid1 == 0:
        # First child.
        isParent = False
        child1()
        p = MongoClient().client["MP"].p
        p.delete_one({"_id": 1})
    else:
        newpid2 = os.fork()
        if newpid2 == 0:
            # Second child.
            isParent = False
            child2()
            p = MongoClient().client["MP"].p
            p.delete_one({"_id": 2})

    if not isParent:
        print("PROCESS FINISHED")
        return

    while True:
        # Query the count once per poll and reuse it for the message.
        remaining = p.find({}).count()
        if remaining == 0:
            break
        print("MAIN PROCESS WAITING: %i" % remaining)
        time.sleep(SLEEP)
    print("MAIN PROCESS FINISHED")
def run(host=None, db=None, coll=None, node=None, outgoing="true", incoming="true", undirected="true", offset=0, limit=0):
    """Return, as a JSON string, the links incident on ``node``.

    :param host: MongoDB host
    :param db: database name
    :param coll: collection name
    :param node: id of the node, converted with ``ObjectId``
    :param outgoing: JSON boolean text; include directed links whose source
        is the node
    :param incoming: JSON boolean text; include directed links whose target
        is the node
    :param undirected: JSON boolean text; include undirected links touching
        the node
    :param offset: number of results to skip
    :param limit: maximum number of results, passed straight to ``find``
    :return: JSON array of the matching link documents; ``[]`` when all three
        direction flags are false
    """
    # Connect to the mongo collection.
    graph = MongoClient(host)[db][coll]

    outgoing = json.loads(outgoing)
    incoming = json.loads(incoming)
    undirected = json.loads(undirected)
    offset = int(offset)
    limit = int(limit)

    # Construct the query according to the given options.
    oid = ObjectId(node)
    clauses = []
    if outgoing or incoming:
        endpoints = []
        if outgoing:
            endpoints.append({"source": oid})
        if incoming:
            endpoints.append({"target": oid})
        # A link counts as directed when "undirected" is absent or False.
        is_directed = {"$or": [{"undirected": {"$not": {"$exists": 1}}},
                               {"undirected": False}]}
        clauses.append({"$and": [is_directed, {"$or": endpoints}]})
    if undirected:
        clauses.append({"$and": [{"undirected": True},
                                 {"$or": [{"source": oid}, {"target": oid}]}]})

    # With no direction selected nothing can match, and an empty "$or" array
    # is rejected by the server, so answer directly.
    if not clauses:
        return bson.json_util.dumps([])

    query = {"type": "link", "$or": clauses}
    return bson.json_util.dumps(list(graph.find(query, skip=offset, limit=limit)))
def getDoubanBasic(doubanID):
    """Return the first document whose ``id`` equals ``doubanID``, or None
    when there is none."""
    coll = MongoClient()[DB][DoubanBasic]
    # find_one answers in a single round trip and returns None on no match,
    # replacing the earlier count-then-index pair of queries.
    return coll.find_one({'id': doubanID})
def progress(db_url, session, tasks_per_stage):
    """Print how far the current stage of a session has progressed.

    :param db_url: MongoDB URL; when empty it is taken from the environment
        variable ``RADICAL_PILOT_DBURL`` or, failing that, from
        ``<session>/<session>.json``
    :param session: session name, also used as the collection name
    :param tasks_per_stage: number of units per stage; ``-1`` means all
        units form a single stage
    """
    # Find the db_url
    if not db_url:
        try:
            db_url = os.environ['RADICAL_PILOT_DBURL']
        except KeyError:
            # Close the session file as soon as it has been parsed.
            with open(os.path.join(session, session + ".json")) as session_file:
                rp = json.load(session_file)
            db_url = rp['session']['cfg']['dburl']

    # The database name is the last path component of the URL.
    db = db_url.split('/')[-1]
    collection = MongoClient(db_url)[db][session]

    # One boolean per unit: True when that unit is DONE.
    done_flags = [unit['state'] == 'DONE'
                  for unit in collection.find() if unit['type'] == 'unit']
    if not done_flags:
        click.echo('There are no units in the session.')
        return

    total = len(done_flags)
    done = sum(done_flags)
    if tasks_per_stage == -1:
        tasks_per_stage = total

    stage, completed = divmod(done, tasks_per_stage)
    percentage = round(completed/tasks_per_stage * 100, 2)

    if done == total:
        # If all the tasks finished then the above gives incorrect result.
        stage -= 1
        completed = tasks_per_stage
        percentage = 100

    click.echo("Stage {} progress: {}/{} ({}%)".format(stage, completed, tasks_per_stage, percentage))
def retrieve_zones():
    """Retrieve (geo)zones with a `flag` or a `blazon` value."""
    # One "$exists" clause per field; a zone qualifies when either is present.
    either_present = [{field: {'$exists': True}} for field in ('flag', 'blazon')]
    return MongoClient().geozones.geozones.find({'$or': either_present})