def get_service_channel(self, lookup_by_page_ids=True):
    candidates = self.get_attached_service_channels()
    if not candidates and lookup_by_page_ids:
        # Fall back to lookup by token / page ids
        if self.facebook_access_token:
            candidates = FacebookServiceChannel.objects.find(
                account=self.account,
                facebook_access_token=self.facebook_access_token)[:]
        if not candidates:
            candidates = FacebookServiceChannel.objects.find(
                account=self.account,
                facebook_page_ids__in=self.facebook_page_ids)[:]
        if not candidates:
            return None
        if len(candidates) == 1:
            return candidates[0]
        else:
            LOGGER.error("We have multiple candidates for service channel "
                         "matching for enterprise channel %s" % self)
            return None
    if len(candidates) > 1:
        LOGGER.warn("We have multiple candidates for service channel matching "
                    "for enterprise channel %s" % self)
    if candidates:
        return candidates[0]
def __get_channel_events(channel, user):
    tracked_pages = [page for page in channel.all_facebook_pages]
    if not tracked_pages:
        try:
            tracked_pages = facebook_driver.GraphAPI(
                channel.get_access_token(user),
                channel=channel).get_object('/me/accounts')['data']
        except facebook.GraphAPIError as e:
            app.logger.error(e)

    events = []
    for page in tracked_pages:
        token = page['access_token']
        api = facebook_driver.GraphAPI(token)
        try:
            res = api.get_object(page['id'] + '/events')
            for itm in res['data']:
                itm['page_id'] = page['id']
                itm['access_token'] = token
                itm['type'] = 'event'
            events.extend(res['data'])
        except facebook.GraphAPIError as e:
            # Error code 32 is a page-level request limit
            if e.result['error']['code'] == 32:
                from solariat_bottle.settings import LOGGER
                LOGGER.warn("Page has hit rate limit: id=%s name=%s",
                            page.get('id'), page.get('name'))
                continue
            raise e
    return events
def preprocess_post(self, event_json):
    if isinstance(event_json, (tuple, list)):
        message_type, data = event_json
        post_data = None
        preprocess = self.preprocessors.get(message_type)
        if preprocess is None:
            LOGGER.warn(u"Unknown message type: %s\nEvent is: %s"
                        % (message_type, event_json))
            return None
        try:
            post_data = preprocess(data)
        except Exception:
            import traceback
            traceback.print_exc()
            LOGGER.warn(u"Error parsing tweet: %s" % unicode(event_json))
        if post_data:
            return post_data
        else:
            LOGGER.info(u"Twitter event: %s" % unicode(event_json))
    elif isinstance(event_json, dict):
        # already processed
        return event_json
    return None
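# Illustrative sketch (not the actual registry): `self.preprocessors` above is
# assumed to be a dict mapping stream message types to parser callables, e.g.
#
#     self.preprocessors = {
#         'on_message': twitter_status_to_post_dict,  # hypothetical key name
#         'on_event': handle_twitter_event,           # hypothetical helper
#     }
#
# so preprocess_post(('on_message', data)) returns the parsed post dict, while
# an unknown message type or a parse failure yields None.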
def twitter_status_to_post_dict(data):
    base_url = 'https://twitter.com/%s/statuses/%s'
    if 'text' in data or 'full_text' in data:
        extended_tweet = data.get('extended_tweet') or {}
        is_retweet = 'retweeted_status' in data
        post_fields = {'twitter': {'_wrapped_data': json.dumps(data)}}
        post_fields['twitter'].update(
            _source=TweetSource.TWITTER_PUBLIC_STREAM,
            _is_retweet=is_retweet)
        if 'user' in data:
            author = data['user']
            content = (data.get('full_text')
                       or extended_tweet.get('full_text')
                       or data.get('text'))
        else:
            LOGGER.warn(u"Mis-formatted twitter data %s" % data)
            return

        def _get_tweet_lang(data):
            if 'lang' in data and data['lang']:
                return data['lang']
            elif 'lang' in author:
                return author['lang']

        post_fields['twitter'].update(
            _is_manual_retweet=not is_retweet and content.startswith('RT'))
        post_fields['lang'] = _get_tweet_lang(data)
        post_fields['user_profile'] = parse_user_profile(author)
        post_fields['content'] = content
        post_fields['url'] = base_url % (author['screen_name'], data['id_str'])
        # twitter data used to link a post to a possible thread
        post_fields['twitter'].update(TweetParser()(data))
        return post_fields
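# For reference, a sketch of the dict produced above for a plain (non-retweet)
# status; values are illustrative, not real data:
#
#     {'content': 'hello world',
#      'lang': 'en',
#      'url': 'https://twitter.com/jdoe/statuses/1234567890',
#      'user_profile': <result of parse_user_profile(author)>,
#      'twitter': {'_wrapped_data': '<original status JSON>',
#                  '_source': TweetSource.TWITTER_PUBLIC_STREAM,
#                  '_is_retweet': False,
#                  '_is_manual_retweet': False,
#                  <thread-linking fields from TweetParser()(data)>}}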
def filter_ids(keys):
    """Keep only the keys that can be coerced to int."""
    _keys = []
    for key in keys:
        try:
            _keys.append(str(int(key)))
        except ValueError:
            LOGGER.warn('incorrect id: %s', key)
    return _keys
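# Usage sketch: invalid ids are logged and dropped, valid ones are normalized
# to strings:
#
#     >>> filter_ids(['123', 456, 'abc'])
#     ['123', '456']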
def _construct_query(self, ignored_params=None, *args, **kwargs):
    """Build the appropriate filter query for the model
    from the request parameters."""
    result = {}
    if ignored_params is None:
        # By default skip reserved fields, which are used for other purposes
        ignored_params = self.reserved_fields
    _fields = self.model.fields.keys()
    # Now just iterate over the parameters and construct the required query
    for filter_name, filter_value in kwargs.items():
        if filter_name in ignored_params:
            continue
        parts = filter_name.split('__')
        if parts[0] not in _fields:
            LOGGER.warn("%s is supposed to be a filter but is not a field of %s, got %s"
                        % (filter_name, self.model, str(kwargs)))
            continue
        result[str(filter_name)] = filter_value
    return result
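# Usage sketch, assuming a model with `title` and `status` fields and
# `reserved_fields` containing 'offset' (both assumptions for illustration):
#
#     self._construct_query(title='foo', status__in=['active'], offset=20)
#     # -> {'title': 'foo', 'status__in': ['active']}
#     # 'offset' is skipped as reserved; a filter whose root is not a model
#     # field is logged and skipped as well.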
def subscribe_to_app(server, callback_url):
    while not server.started:
        LOGGER.warn("Server not started, going to sleep")
        time.sleep(5)

    LOGGER.info("Subscribing to app")
    # Now subscribe to our app on facebook
    G = facebook_driver.GraphAPI(version='2.2')
    app_access_token = FACEBOOK_APP_ID + "|" + FACEBOOK_APP_SECRET
    path = FACEBOOK_APP_ID + "/subscriptions"
    post_args = {
        'access_token': app_access_token,
        'callback_url': callback_url,
        'fields': 'feed',
        'object': 'page',
        'verify_token': 'token'
    }
    subs = G.request(G.version + "/" + path, post_args=post_args)
    if subs:
        print "Subscription response was: " + str(subs)
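# Note: after this POST, Facebook verifies the subscription by sending a GET
# request to callback_url carrying hub.mode, hub.challenge and
# hub.verify_token parameters; the callback endpoint must check that
# hub.verify_token matches the 'verify_token' sent above ('token' here) and
# echo back hub.challenge, otherwise the subscription is rejected.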
def mark_and_sweep_topics(channel_or_tag, time_slot, rank=None):
    ''' Given any time slot, this algorithm will remove all root topics
    that are not in the top list '''
    # Reset everything with a counter. We use a random number generator so
    # that we have to be very specific on what to remove, to avoid accidents.
    counter = random.randrange(MARKED_TO_KEEP + 1, BIGGEST_COUNTER_VALUE)
    update = mark_to_remove(channel_or_tag, time_slot, counter)
    initial_count = update['n']
    #logger.debug("Reset %s items" % initial_count)

    # Now recursively mark items to keep
    marked = mark_items_to_keep(channel_or_tag, time_slot, rank=rank)
    #logger.debug("Marked %d items to keep" % marked)

    # Now remove what is left
    update = remove_records(counter)
    removed = update['n']
    #logger.debug("Removed %d items" % removed)

    stats = initial_count, marked, removed
    if (initial_count > (marked + removed) + VERIFICATION_DELTA
            or initial_count < (marked + removed) - VERIFICATION_DELTA):
        msg_info = {
            "channel": channel_or_tag.title,
            "initial_count": initial_count,
            "marked": marked,
            "removed": removed,
            "real_sum": marked + removed
        }
        subject = """[!]checksum for topics FAILED during purging. Channel: %s""" % channel_or_tag.title
        body = MSG_TEMPLATE % msg_info
        send_notification_for_team(subject=subject, body=body)
        LOGGER.warn(
            "invalid checksum for topics:: channel: %s; %d initially, %d marked, and %d removed",
            channel_or_tag.title, *stats)
    return stats
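# How the mark-and-sweep above stays safe: every record in the slot is first
# stamped with a random counter guaranteed to be greater than MARKED_TO_KEEP,
# so remove_records(counter) can only delete records stamped by this exact
# run. The checksum then demands that initial_count stay within
# (marked + removed) +/- VERIFICATION_DELTA; e.g. assuming
# VERIFICATION_DELTA = 10 with initial_count = 1000, marked = 940,
# removed = 55, initial_count lies inside [985, 1005] and no alert is sent.
# mark_and_sweep_trends below applies the same scheme to trend records.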
def prepare(self, data, target_id, channel, driver):
    matched_channels = [channel]
    if not matched_channels:
        LOGGER.warn('Matched channels for %s EnterpriseFacebookChannel not found',
                    channel.id)

    outbound_ids = []
    if channel.facebook_handle_id:
        outbound_ids.append(str(channel.facebook_handle_id))
    if channel.facebook_page_ids:
        outbound_ids.extend(channel.facebook_page_ids)

    post_fields = {}
    post_fields.update(self.handle(data, target_id))
    sender_id = post_fields.pop('sender_id')
    self.update_with_channels(post_fields, sender_id, outbound_ids,
                              matched_channels)
    self.update_with_user_profile(post_fields, sender_id, driver)
    return post_fields
def fetch_and_post(self):
    ch_id = self.subscription.channel.id
    for posts_processed, entry in enumerate(self._query_posts(),
                                            start=self.posts_processed + 1):
        if self.subscriber.stopped():
            break
        post_fields = entry.solariat_post_data
        if not post_fields:
            LOGGER.warning('no post_fields in: %s', entry)
            self.flush_buffer()
            continue
        try:
            log_state(ch_id, get_post_natural_id(post_fields),
                      PostState.REMOVED_FROM_WORKER_QUEUE)
        except KeyError:
            LOGGER.error('cannot get post id: %s', post_fields)
        channels = self._find_channels(post_fields)
        if channels:
            post_fields['channels'] = channels
            self.post_queue.put(post_fields)  # blocked by queue maxsize
            self.posts_queued += 1
            self._fetch_buffer.append(entry)
        else:
            LOGGER.warn("No channels found for queued post %s\n"
                        "queue item id: %s" % (post_fields, entry.id))
        self.posts_processed = posts_processed
        if posts_processed % self.UPDATE_PROGRESS_EVERY == 0:
            self.update_progress()
            self.subscriber.aggregate_state(self, {'running': self.progress})
    self.flush_buffer()
    self.subscriber.update_status(SUBSCRIPTION_RUNNING)
def mark_and_sweep_trends(channel_or_tag, time_slot, topics):
    counter = random.randrange(MARKED_TO_KEEP + 1, BIGGEST_COUNTER_VALUE)
    marked_to_remove = trends_mark_to_remove(time_slot, channel_or_tag, counter)
    marked_to_keep = trends_mark_to_keep(time_slot, channel_or_tag, topics)
    remove_result = trends_remove(counter)
    initial_count = marked_to_remove['n']
    marked = marked_to_keep['n']
    removed = remove_result['n']

    if (initial_count > (marked + removed) + VERIFICATION_DELTA
            or initial_count < (marked + removed) - VERIFICATION_DELTA):
        msg_info = {
            "channel": channel_or_tag.title,
            "initial_count": initial_count,
            "marked": marked,
            "removed": removed,
            "real_sum": marked + removed
        }
        subject = """[!]checksum for trends FAILED during purging. Channel: %s""" % channel_or_tag.title
        body = MSG_TEMPLATE % msg_info
        send_notification_for_team(subject=subject, body=body)
        LOGGER.warn(
            "invalid checksum for trends:: channel: %s; %d initially, %d marked, and %d removed",
            channel_or_tag.title, initial_count, marked, removed)
    return initial_count, marked, removed
def get_timerange_level(self):
    try:
        return guess_timeslot_level(parse_datetime(self.filters['from']),
                                    parse_datetime(self.filters['to']))
    except Exception:
        LOGGER.warn('Unknown period to determine the timerange level')
def post_received(self, post_field):
    """ Expose post_received functionality mainly for testing purposes.
    Could also be used for loading post data directly through the bot
    in case of historics / load_data scripts. """
    post_field = json.loads(post_field)
    print "Received " + str(post_field)
    if post_field['object'] == 'page':
        for entry in post_field['entry']:
            page_id = str(entry['id'])
            efc_channel = FacebookClient.get_facebook_channel(page_id)
            if not efc_channel:
                LOGGER.warn('Active EnterpriseFacebookChannel not found for page_id %s',
                            page_id)
                break
            check_channel_token_valid(efc_channel)
            matched_channels = FacebookClient.get_service_channels(page_id)
            if not matched_channels:
                LOGGER.warn('Active FacebookServiceChannel not found for page_id %s',
                            page_id)
                break
            G = facebook_driver.GraphAPI(efc_channel.facebook_access_token,
                                         channel=efc_channel)
            outbound_ids = []
            if efc_channel.facebook_handle_id:
                outbound_ids.append(str(efc_channel.facebook_handle_id))
            outbound_ids.extend(
                [str(u_id) for u_id in efc_channel.facebook_account_ids])
            attachments_ids = []
            for change in entry['changes']:
                if change['value']['item'] in ('photo',):
                    attachments_ids.append(change['value'].get('post_id', None))
            for change in entry['changes']:
                post_fields = {}
                if change['field'] == 'feed':
                    if change['value']['item'] == 'comment':
                        post_fields.update(self.process_comment(
                            G=G, page_id=page_id, change=change,
                            attachments_ids=attachments_ids))
                    if change['value']['item'] == 'status':
                        post_fields.update(self.process_status_update(
                            G=G, page_id=page_id, change=change,
                            attachments_ids=attachments_ids))
                    if change['value']['item'] in ('post', 'photo'):
                        post_fields.update(self.process_post(
                            G=G, page_id=page_id, change=change,
                            attachments_ids=attachments_ids))
                    sender_id = post_fields.pop('sender_id', None)
                    if change['value']['item'] in ('comment', 'status',
                                                   'post', 'photo'):
                        service_chns = []
                        for candidate in matched_channels:
                            if (sender_id in outbound_ids
                                    or sender_id in candidate.facebook_page_ids
                                    or self.facebook_user_match(
                                        G, sender_id,
                                        efc_channel.facebook_handle_id)):
                                # The sender is the authenticated user,
                                # so it's an outbound post
                                service_chns.append(
                                    str(candidate.outbound_channel.id))
                            else:
                                # Otherwise we have an inbound post
                                service_chns.append(
                                    str(candidate.inbound_channel.id))
                        post_fields['channels'] = service_chns
                        user_profile = G.get_object(sender_id)
                        post_fields['user_profile'] = {
                            'platform_data': user_profile
                        }
                        if 'name' in user_profile:
                            post_fields['user_profile']['user_name'] = \
                                user_profile['name']
                        if 'username' in user_profile:
                            post_fields['user_profile']['id'] = \
                                user_profile['username']
                        picture = G.get_object(sender_id + '/picture')
                        if picture and picture['data'] and not picture.get(
                                'is_silhouette', False):
                            post_fields['user_profile']['profile_image_url'] = \
                                picture['url']
                        if 'location' in user_profile:
                            if isinstance(user_profile['location'], dict):
                                if 'city' in user_profile['location']:
                                    post_fields['user_profile']['location'] = \
                                        user_profile['location']['city']
                                elif 'country' in user_profile['location']:
                                    post_fields['user_profile']['location'] = \
                                        user_profile['location']['country']
                            elif isinstance(user_profile['location'], basestring):
                                post_fields['user_profile']['location'] = \
                                    user_profile['location']
                        print "Pushed " + str(post_fields)
                        self.post_queue.put(post_fields)
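# For reference, a minimal real-time update payload this handler expects
# (shape follows the Graph API page webhook format; ids are illustrative):
#
#     {"object": "page",
#      "entry": [{"id": "123456789",
#                 "time": 1449070300,
#                 "changes": [{"field": "feed",
#                              "value": {"item": "comment",
#                                        "post_id": "123456789_111",
#                                        "comment_id": "123456789_222",
#                                        "sender_id": "987654321"}}]}]}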
def combine_and_split(channel_key_map=None, max_track=400, max_follow=5000,
                      fetch_missing_profiles=True):
    user_name_to_id, missing_profiles = build_uname_to_id_map(channel_key_map)
    if missing_profiles and fetch_missing_profiles:
        LOGGER.info(u"Fetching missing profiles: %s" % missing_profiles)
        from solariat_bottle.utils.tweet import TwitterApiWrapper
        from solariat_bottle.utils.oauth import get_twitter_oauth_handler
        from solariat_bottle.db.user_profiles.user_profile import UserProfile
        from solariat_bottle.daemons.helpers import parse_user_profile
        try:
            api = TwitterApiWrapper.make_api(get_twitter_oauth_handler())
            for user in api.lookup_users(screen_names=missing_profiles,
                                         include_entities=True):
                UserProfile.objects.upsert(
                    'Twitter', profile_data=parse_user_profile(user))
                uname = user.screen_name.lower()
                user_name_to_id[uname] = str(user.id)
                missing_profiles.discard(uname)
        except Exception:
            pass
    if missing_profiles:
        LOGGER.warn(u'Missing UserProfiles. '
                    u'User names won\'t be tracked: %s' % missing_profiles)

    # regroup by postfilter entry
    group_by_entry = defaultdict(lambda: {'accounts': set(), 'channels': set()})
    for channel, key_map in channel_key_map.iteritems():
        for entry_type, entries in key_map.iteritems():
            if entry_type == 'SKIPWORD':
                continue
            for entry in entries:
                entry_key = None
                if entry_type == 'USER_NAME':
                    uid = user_name_to_id.get(
                        preprocess_keyword(entry, strip_special_chars=True))
                    if not uid:
                        LOGGER.warn(u'user id not found for %s' % entry)
                        continue
                    entry_key = ('USER_ID', uid)
                if entry_type == 'USER_ID' or entry_type == 'KEYWORD':
                    updated_entry = preprocess_keyword(entry)
                    if not updated_entry:
                        LOGGER.warn('Skipped keyword %s' % entry)
                        continue
                    else:
                        entry_key = (entry_type, updated_entry)
                group_by_entry[entry_key]['channels'].add(channel)
                group_by_entry[entry_key]['accounts'].add(channel.account)

    def _minimize(group_by_entry):
        """If there is a keyword without [@#], remove the same keywords
        prefixed with @ or # and merge their channels."""
        result = defaultdict(lambda: {'accounts': set(), 'channels': set()})
        for item, channels_and_accounts in group_by_entry.iteritems():
            filter_type, value = item
            if filter_type == 'KEYWORD':
                clean_value = value.lstrip('#').lstrip('@')
                key = (filter_type, clean_value)
                if key in group_by_entry:
                    result[key]['accounts'].update(
                        channels_and_accounts['accounts'])
                    result[key]['channels'].update(
                        channels_and_accounts['channels'])
                else:
                    result[item] = channels_and_accounts
            else:
                result[item] = channels_and_accounts
        return result

    def _iter_parts(group_by_entry):
        from itertools import izip_longest, ifilter

        def _filter(entry_type):
            return ifilter(
                lambda ((filter_type, _1), _2): filter_type == entry_type,
                group_by_entry.iteritems())

        for keywords_data, user_ids_data in izip_longest(
                partition(_filter('KEYWORD'), max_track),
                partition(_filter('USER_ID'), max_follow),
                fillvalue=[]):
            merged_channels = set()
            merged_accounts = set()
            keywords = []
            user_ids = []
            for item, accounts_and_channels in chain(keywords_data,
                                                     user_ids_data):
                filter_type, value = item
                merged_channels.update(accounts_and_channels['channels'])
                merged_accounts.update(accounts_and_channels['accounts'])
                if filter_type == 'KEYWORD':
                    keywords.append(value)
                elif filter_type == 'USER_ID':
                    user_ids.append(value)
            yield keywords, user_ids, merged_accounts, merged_channels

    return list(_iter_parts(_minimize(group_by_entry)))
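# The max_track=400 / max_follow=5000 defaults mirror the per-connection
# limits of Twitter's streaming statuses/filter endpoint (at most 400 track
# keywords and 5000 follow user ids), so each (keywords, user_ids, accounts,
# channels) tuple yielded above fits on a single stream connection.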
def execute_request(self, failfast=False):
    import time
    import datetime

    retry_attempts = 0
    max_retry_attempts = self.config.get('retry_count', 10)
    min_delay = self.config.get('min_retry_delay', 60)  # 1 minute
    incr = 2
    max_delay = self.config.get('max_retry_delay', 1800)  # 30 minutes
    delay = min_delay
    method = getattr(self.api, self.method)
    exc = None
    _start = datetime.datetime.utcnow()

    while retry_attempts < max_retry_attempts:
        exc = None  # reset so a successful retry doesn't report a stale error
        try:
            params = self.get_method_params()
            if not params:
                LOGGER.warn(u"%s.%s got no params. Filters were: %s"
                            % (self.api, self.method, self.filters))
                self.result = []
            else:
                LOGGER.info(u"Executing %s.%s with params: %s\nFilters: %s"
                            % (self.api, self.method,
                               dumps(params), dumps(self.filters)))
                self.result = self.parse_response(method(**params))
                self.filtered_result = filter(self._filter_tweet, self.result)
                # update filters with the next max_id
                self.filters.update(max_id=self.max_id)
        except TweepError as e:
            exc = e
            # Non-rate-limit error while performing the request or parsing
            # the response; rate-limit errors with 420/429 statuses are
            # handled by tweepy.
            LOGGER.error(e, exc_info=True)
            # The search api may respond with {"error": "Sorry, your query is
            # too complex. Please reduce complexity and try again."}
            if "query is too complex" in unicode(e):
                break
            retry_attempts += 1
            time.sleep(delay)
            delay = min(max_delay, delay * incr)
        except TwitterApiRateLimitError as e:
            LOGGER.debug('[execute_request] rate limits, waiting %s seconds',
                         e.wait_for)
            self.subscriber.aggregate_state(
                self, {'wait_rate_limit_reset': e.wait_for})
            raise
        else:
            LOGGER.debug('[execute_request] len(self.result)=%s',
                         len(self.result))
            if len(self.result) == 0 or self.filters_fulfilled():
                self._done = True
                self.subscriber.aggregate_state(self, {'finished': True})
            else:
                self.subscriber.aggregate_state(self, {'running': self.progress})
            break

    _elapsed = datetime.datetime.utcnow() - _start
    if exc is not None:
        LOGGER.error(u"Could not retrieve results from twitter after %s"
                     % _elapsed)
        self.subscriber.aggregate_state(self, {'failed': str(exc)})
        if failfast is True:
            raise exc
        else:
            self.result = []
            self._done = True
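# With the defaults above the retry delay grows geometrically and is capped:
# 60, 120, 240, 480, 960, 1800, 1800, ... seconds (min_retry_delay=60,
# incr=2, max_retry_delay=1800), for at most retry_count=10 attempts.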
def sync(self):
    if self._in_use:
        LOGGER.warn(u"Sync auth pool while in use {}".format(self._in_use))
    self._in_use = {}
    for auth in AuthPool.get_auth_pool():
        self.put(auth)