def find(db: Database, match_time: bool = False):
    """Return groups of records that share the same ``hash`` value.

    The aggregation pipeline passed to ``db.aggregate`` groups records by
    their ``hash`` field, keeps only groups containing more than one record,
    and sorts the groups on their largest ``file_size`` with sort order ``-1``
    (descending in MongoDB's ``$sort``).

    Each group is a mapping with these keys:

    * ``_id`` -- the shared hash value.
    * ``total`` -- how many records fell into the group.
    * ``file_size`` -- the maximum ``file_size`` among those records.
    * ``items`` -- one entry per record with ``file_name`` (taken from the
      record's ``_id``), ``file_size``, ``image_size`` and ``capture_time``.

    Args:
        db: Object whose ``aggregate`` method accepts the pipeline.
        match_time: When true, only groups for which ``same_time(group)``
            is truthy are kept.

    Returns:
        A list with the (optionally filtered) duplicate groups.
    """
    group_by_hash = {
        "$group": {
            "_id": "$hash",
            "total": {"$sum": 1},
            "file_size": {"$max": "$file_size"},
            "items": {
                "$push": {
                    "file_name": "$_id",
                    "file_size": "$file_size",
                    "image_size": "$image_size",
                    "capture_time": "$capture_time",
                },
            },
        },
    }
    only_repeated = {"$match": {"total": {"$gt": 1}}}
    largest_first = {"$sort": {"file_size": -1}}

    groups = db.aggregate([group_by_hash, only_repeated, largest_first])
    if not match_time:
        return list(groups)
    return [group for group in groups if same_time(group)]
def getTweetsTimeSeries(self, mongoDbCollection: Database, stockTicker, fromDate, toDate):
    """Return the number of tweets per calendar day for one stock ticker.

    The aggregation keeps documents whose ``stockTicker`` equals
    ``stockTicker`` and whose ``date`` satisfies
    ``fromDate <= date < toDate``, then groups them by the day-of-month,
    month and year of ``date`` and counts the documents in each group.

    Args:
        mongoDbCollection: Object whose ``aggregate`` method accepts the
            pipeline and yields the grouped rows.
        stockTicker: Ticker value to match against the ``stockTicker`` field.
        fromDate: Inclusive lower bound for the ``date`` field.
        toDate: Exclusive upper bound for the ``date`` field.

    Returns:
        A list of ``{"date": datetime, "tweets": int}`` dicts, one per day
        that had at least one matching document, sorted by ascending date.
        Each ``date`` is a naive ``datetime`` built from year/month/day only,
        so its time component is midnight.
    """
    pipeline = [
        {"$match": {"stockTicker": stockTicker,
                    "date": {"$gte": fromDate, "$lt": toDate}}},
        {"$group": {
            "_id": {"day": {"$dayOfMonth": "$date"},
                    "month": {"$month": "$date"},
                    "year": {"$year": "$date"}},
            "count": {"$sum": 1}}},
    ]

    # The grouped ``_id`` comes back as separate year/month/day parts, so it
    # is reassembled into a datetime on the client side.
    # NOTE(review): the date parts are extracted by the server, presumably in
    # UTC -- confirm if local-day buckets are expected.
    # TODO upgrade mongo DB to 3.6 to do a projection with $dateFromParts
    daily_counts = [
        {
            "date": datetime(year=row["_id"]["year"],
                             month=row["_id"]["month"],
                             day=row["_id"]["day"]),
            "tweets": row["count"],
        }
        for row in mongoDbCollection.aggregate(pipeline)
    ]
    # The pipeline has no $sort stage, so order the days here.
    daily_counts.sort(key=lambda entry: entry["date"])
    return daily_counts