def map_func(status):
    """Return a normalised copy of ``status`` tagged with query and platform.

    The incoming mapping is copied first, so the caller's object is not
    modified. ``query`` and ``platform`` are not parameters of this
    function; they are resolved from the surrounding scope.
    """
    enriched = dict(status)
    # Coerce the id to a unicode string and run the datetime through the
    # shared parser.
    enriched['id'] = unicode(enriched['id'])
    enriched['datetime'] = datetimes.extract_datetime(enriched['datetime'])
    # Tag the record with where it came from.
    enriched['query'] = query
    enriched['platform'] = platform.title()
    return enriched
def map_func(status):
    """Return a normalised copy of ``status`` tagged with the query.

    The incoming mapping is copied first, so the caller's object is not
    modified. ``query`` is not a parameter of this function; it is resolved
    from the surrounding scope.
    """
    enriched = dict(status)
    # Coerce the id to a unicode string and run the datetime through the
    # shared parser.
    enriched['id'] = unicode(enriched['id'])
    enriched['datetime'] = datetimes.extract_datetime(enriched['datetime'])
    enriched['query'] = query
    return enriched
def get_posts(self):
    """Yield ``(keyword, post_dict)`` pairs built from ``self._get_post()``.

    Items whose ``text`` type is ``'comment'``, or whose event is anything
    other than ``'add'``, are logged and skipped. For every remaining item a
    flat dict is built from the nested status payload. A ``'location'`` entry
    is added only when ``self._get_location`` does not raise ``IndexError``.
    """
    for raw in self._get_post():
        payload = raw['text']
        is_added_status = (payload['type'] != 'comment'
                           and payload['event'] == 'add')
        if not is_added_status:
            logger.debug("Discard a comment for keyword <%s>"
                         % raw['match_info']['keyword'])
            # TODO: parse comments
            continue

        keyword = raw['match_info']['keyword']
        status = payload['status']

        record = {
            'id': status['mid'],
            'datetime': datetimes.extract_datetime(status['created_at']),
        }
        author = status['user']
        record.update({
            'username': author['name'],
            'uid': author['id'],
            'text': status['text'],
            'shares': status['reposts_count'],
            'replies': status['comments_count'],
            'uri': status['statusurl'].replace("http://", ""),
            'reach': author['followers_count'],
            'gender': author['gender'],
            'platform': 'sina weibo',
        })

        try:
            record['location'] = self._get_location(
                author['city_name'],
                author['province_name'],
                author['city_coordinates'])
        except IndexError:
            # Empty coordinates: leave the record without a location.
            pass

        logger.debug("Got 1 post for keyword <%s>" % keyword)
        yield (keyword, record)
def fetch_historic_platform(query, platform):
    """Fetch historic posts for ``query`` from ``platform`` and push them.

    Returns early (with a log message) when the subscription is falsy or
    when it already reports historic data for this platform. Otherwise the
    earliest datetime is first set to "now" as a lock, the platform is
    searched with ``historic=True``, the results are pushed, and the earliest
    datetime is updated from the results when one is found.
    """
    subscription = Subscription(query)
    manager = SubscriptionManager()

    if not subscription:
        logger.warning(
            u"%s not found in the subscription list. (Maybe have been deleted.)"
            % query)
        return

    if subscription.has_historic_data(platform):
        logger.debug(u"Already fetched historic from %s for %s. Skipping..."
                     % (platform, query))
        return

    # Lock the subscription before the search starts.
    manager.mark_earliest_datetime(query, datetimes.now(), platform)

    run_search = active_platforms[platform]().search
    query_obj = subscription.get_query_obj()
    age_filter = get_age_filter(subscription.get_created_datetime(), True)
    results = run_search(query_obj, historic=True, age_filter=age_filter)

    logger.info("Pushing results to SQS...")
    push(results)

    logger.info("Updating earlist_datetime...")
    earliest = None
    post_datetimes = [item['datetime'] for item in results]
    for candidate in post_datetimes:
        # A falsy running value is always replaced; otherwise keep the
        # smaller of the two.
        if not earliest or earliest > candidate:
            earliest = candidate

    if earliest:
        parsed_earliest = datetimes.extract_datetime(earliest)
        manager.mark_earliest_datetime(query, parsed_earliest, platform)

    logger.info(u"Finished fetching historic %s from %s" % (query, platform))
def fetch_platform(query, platform):
    """Fetch new posts for ``query`` from ``platform`` and push them.

    Returns early when the subscription is not ready for its next fetch on
    this platform. Otherwise the next-query datetime is marked as a lock, the
    platform is searched with ``historic=False``, the results are pushed, the
    latest datetime is updated when one is found, and the next-query datetime
    is rescheduled from the number of results.
    """
    subscription = Subscription(query)
    manager = SubscriptionManager()
    logger.debug(u"Received request to fetch new posts for %s" % query)

    if not subscription.ready_for_next(platform):
        logger.debug(u"Not ready for next fetch(%s, %s). Skipping..."
                     % (query, platform))
        return

    # Lock the subscription before the search starts.
    manager.mark_next_query_datetime(query, platform)

    run_search = active_platforms[platform]().search
    query_obj = subscription.get_query_obj()
    age_filter = get_age_filter(subscription.get_latest_datetime(platform),
                                historic=False)
    results = run_search(query_obj, historic=False, age_filter=age_filter)

    logger.debug("Pushing results to SQS...")
    push(results)

    logger.debug("Updating latest_datetime...")
    latest = None
    post_datetimes = [item['datetime'] for item in results]
    for candidate in post_datetimes:
        # A falsy running value is always replaced; otherwise keep the
        # larger of the two.
        if not latest or latest < candidate:
            latest = candidate

    if latest:
        parsed_latest = datetimes.extract_datetime(latest)
        manager.mark_latest_datetime(query, parsed_latest, platform)

    logger.debug("Updating next_query_datetime...")
    # A fresh platform instance is created here, separate from the one used
    # for the search above.
    next_time = active_platforms[platform]().next_query_time(len(results))
    manager.mark_next_query_datetime(query, platform, next_time)

    logger.info(u"Finished fetching %s from %s" % (query, platform))
def is_older(p_datetime_str):
    """Tell whether the parsed datetime is more than 5 minutes before baseline.

    ``baseline_datetime`` is resolved from the surrounding scope. Returns
    ``False`` when ``datetimes.extract_datetime`` yields a falsy value.
    """
    parsed = datetimes.extract_datetime(p_datetime_str)
    if not parsed:
        return False
    # Strictly older than the baseline minus a five minute margin.
    cutoff = baseline_datetime - datetime.timedelta(minutes=5)
    return parsed < cutoff
def is_newer(p_datetime_str):
    """Tell whether the parsed datetime is at or after the baseline.

    ``baseline_datetime`` is resolved from the surrounding scope. Returns
    ``False`` when ``datetimes.extract_datetime`` yields a falsy value.
    """
    parsed = datetimes.extract_datetime(p_datetime_str)
    if not parsed:
        return False
    return parsed >= baseline_datetime