# NOTE(review): removed a stray module-level duplicate of
# InstagramTimeSeries.__init__ that had been pasted here, above the class.
# It was byte-identical to the real method defined inside the class below
# and was never callable as intended at module scope.
class InstagramTimeSeries:
    """Hourly time series of Instagram activity for one region.

    Pulls photo documents from MongoDB for the half-open window
    [start_timestamp, end_timestamp), optionally counts distinct users
    instead of raw photos, and exposes the result as a resampled pandas
    Series, plus helpers for exponentially-weighted smoothing and for
    building training/prediction points for a regression model.

    NOTE(review): written against a legacy pandas API (``pandas.ewma``,
    ``resample(freq, how=...)``) — those calls were kept as-is so existing
    environments keep working; they are removed in pandas >= 0.23/1.0.
    """

    def __init__(self, region, start_timestamp, end_timestamp, freq="1h"):
        # super(InstagramTimeSeries, self).__init__(region, start_timestamp, end_timestamp, freq)
        self.start_timestamp = start_timestamp  # unix seconds (int/float/str)
        self.end_timestamp = end_timestamp      # unix seconds, exclusive
        self.region = region                    # region code matched against "region.code"
        self._db = MongoDBInterface()
        self._db.setDB(InstagramConfig.db)
        self._db.setCollection(InstagramConfig.posts_collection)
        self.days_to_predict = 1  # horizon (in days) used by dataPrepare()
        self.freq = freq          # pandas resample frequency, e.g. "1h"

    def rangeQuery(self, region, startTimestamp, endTimestamp):
        """Query posts in [startTimestamp, endTimestamp), newest first.

        When `region` is falsy the query is time-bounded only (all regions).
        """
        conditions = {
            "created_time": {"$gte": startTimestamp, "$lt": endTimestamp},
        }
        if region:
            conditions["region.code"] = region
        return self._db.getAllDocuments(conditions).sort([("created_time", -1)])

    def getRawSeries(self):
        """Return the un-resampled Series built by buildTimeSeries()."""
        return self.series

    def buildTimeSeries(self, count_people=True, avoid_flooding=True):
        """Return a pandas Series object.

        count_people = True means we only want to count single users instead
        of # of photos for that region.
        avoid_flooding = True means we want to avoid a single user flooding
        many photos into Instagram in a short time. The window is 5 minutes:
        within it, repeat uploads from the same user count as one.

        NOTE(review): `avoid_flooding` is currently never consulted — the
        5-minute window is always applied when count_people is True. Kept
        as-is to preserve behavior; confirm intended semantics with callers.
        """
        window_avoid_flooding = 300  # seconds; de-dup window per user
        data = []
        photo_cnt = 0
        for photo in self.rangeQuery(self.region, self.start_timestamp, self.end_timestamp):
            data.append({"user": photo["user"], "created_time": photo["created_time"]})
            photo_cnt += 1
            if photo_cnt % 10000 == 0:
                # progress indicator (fixed: was a Python-2 print statement,
                # inconsistent with the print() calls used elsewhere here)
                print(photo_cnt)
        # Mongo returned newest-first; the series must be chronological.
        data.sort(key=lambda x: x["created_time"])
        print(len(data))

        user_last_upload = {}  # username -> unix time of last counted upload
        # Sentinel data point at the window start: VERY IMPORTANT — it pins
        # the size/extent of the resampled time series in pandas so the
        # output always spans the full requested window.
        counts = [1]
        dates = [datetime.utcfromtimestamp(float(self.start_timestamp))]

        for photo_json in data:
            user = photo_json["user"]["username"]
            created = float(photo_json["created_time"])
            utc_date = datetime.utcfromtimestamp(created)
            if count_people:
                if user not in user_last_upload:
                    user_last_upload[user] = int(photo_json["created_time"])
                    dates.append(utc_date)
                    counts.append(1)
                elif created - float(user_last_upload[user]) > window_avoid_flooding:
                    # Same user, but outside the flooding window: count again.
                    user_last_upload[user] = int(photo_json["created_time"])
                    dates.append(utc_date)
                    counts.append(1)
            else:
                dates.append(utc_date)
                counts.append(1)

        # Matching sentinel just before the window end (see note above).
        counts.append(1)
        dates.append(datetime.utcfromtimestamp(float(self.end_timestamp) - 1))

        self.series = Series(counts, index=dates)
        print(self.series.count())

        # Fallback so self.series2 always exists: previously, if resample
        # raised, the print/return below crashed with AttributeError.
        self.series2 = self.series
        try:
            # Legacy pandas resample signature (removed in modern pandas).
            self.series2 = self.series.resample(self.freq, how="sum", label="right")
            # self.series2 = self.series2.fillna(0)  # fill NaN values with zeros
        except Exception as e:
            # Not enough data to resample; keep the raw series as best effort.
            print(e)
        print(self.series2.count())
        return self.series2

    def smoothSeriesEwma(self, series, span=5.0, adjust=True, halflife=None, min_periods=0, how="mean"):
        """Exponentially-weighted moving average of `series` (legacy pandas API)."""
        return pandas.ewma(
            series,
            com=None,
            span=span,
            halflife=halflife,
            min_periods=min_periods,
            freq="1h",
            adjust=adjust,
            how=how,
            ignore_na=True,
        )

    def smoothSeriesEwmstd(self, series, span=5.0, adjust=True, halflife=None, min_periods=0):
        """Exponentially-weighted moving standard deviation (legacy pandas API)."""
        return pandas.ewmstd(
            series, com=None, span=span, halflife=halflife, min_periods=min_periods, adjust=adjust, ignore_na=True
        )

    def smoothSeriesEwmvar(self, series, span=5.0, adjust=True, halflife=None, min_periods=0):
        """Exponentially-weighted moving variance (legacy pandas API).

        Fixed: this previously called pandas.ewmstd and therefore returned
        the standard deviation instead of the variance.
        """
        return pandas.ewmvar(
            series, com=None, span=span, halflife=halflife, min_periods=min_periods, adjust=adjust, ignore_na=True
        )

    def dataPrepare(self, serie):
        """This is to return the 'future data points' that you want to predict.

        e.g. predict for each hour tomorrow how many people will show up at
        Times Square.

        Returns (training, testing, align, converted_align) where:
          training       — [(days since series start, value), ...] per index
          testing        — days-since-start offsets for the future hours
          align          — the corresponding future datetimes
          converted_align — the same future points as unix timestamps

        Raises:
            Exception: if the series has fewer than 3 data points.
        """
        ts = serie
        index = ts.index
        if len(index) < 3:
            raise Exception("Only %d data points" % (len(index)))
        start_date = ts.index[0]

        # Training is in the format of
        # (days from beginning of the timeseries, number of data at that time).
        training = []
        for idx in index:
            delta = idx - start_date
            days_diff = delta.days + delta.seconds / (24 * 3600.0)
            training.append((days_diff, ts[idx]))

        nearest_current_date = index[-1]
        testing = []
        align = []
        converted_align = []
        # NOTE(review): 25 hours per predicted day (not 24) — possibly an
        # intentional one-hour overlap; confirm before changing.
        for hour in range(25 * self.days_to_predict):
            next_date = nearest_current_date + timedelta(seconds=3600 * (hour + 1))
            delta = next_date - start_date
            days_from_start = (delta.seconds + delta.days * 86400) / (3600 * 24.0)
            testing.append(days_from_start)
            align.append(next_date)
            converted_align.append(calendar.timegm(next_date.utctimetuple()))
        return training, testing, align, converted_align