def main():
    environment = defaults.get_environment()
    db = DB(environment, today())
    _ = FindBotsBehaviour(environment, db)
    db.disconnect()
def main():
    # parser = argparse.ArgumentParser(description='Draft stats for the given day and push to cloud for approval.')
    # parser.add_argument('date', metavar='yyyy-mm-dd',
    #                     help='the date to process')
    #
    # args = parser.parse_args()
    environment = defaults.get_environment()
    db = DB(environment, today())
    db_summary = DBSummary(environment)
    api = TwitterAPI(environment, db_summary)
    commands = db.get_commands(screen_names=db_summary.get_account_screen_names())
    processed_commands = db_summary.get_processed_commands(since_id=db.get_baseline_tweet_id())
    for command in commands:
        if command.id in processed_commands:
            logger.info(f'Skipping {command.id}. Already processed: {command.text}')
        else:
            m = re.match(r'\+([a-zA-Z0-9_]+) ([A-Z][AB]?)( t ([0-9]+))?( dl ([0-9]+))?',
                         command.text)
            if m:
                screen_name = m.group(1)
                category = m.group(2)
                rt_threshold = m.group(4)
                rt_daily_limit = m.group(6)
                db.set_tweeter_category(screen_name=screen_name,
                                        category=category,
                                        rt_threshold=rt_threshold,
                                        rt_daily_limit=rt_daily_limit)
                status_text = f'+{screen_name} set to {category}'
                if rt_threshold is not None:
                    status_text += f' rt threshold {rt_threshold}'
                if rt_daily_limit is not None:
                    status_text += f' dl {rt_daily_limit}'
                save_command(command, status_text, db_summary, api.polling_api())
            elif command.text.lower()[:5] == 'add #':
                tag_name = command.text[5:]
                logger.info(f'Adding {tag_name}')
                call('python3.7 words.py ' + tag_name, shell=True)
                tag = db.get_tag_ranges(tag=f'#{tag_name}',
                                        min_override=db.get_baseline_tweet_id())
                print(tag.name_scores)
                name_score = tag.name_scores[-2] if len(tag.name_scores) > 1 else None
                score_text = '{} / {} = {:.1f}'.format(
                    name_score.total_score, name_score.status_count,
                    name_score.total_score / max(name_score.status_count, 1)
                ) if name_score is not None else ''
                status_text = f'-{tag_name} added. {score_text} {tag.state}'
                save_command(command, status_text, db_summary, api.polling_api())
            else:
                if command.text[:2] not in ('To', 'RT'):
                    logger.info(f'Unknown command {command.id}: {command.text}')
    db_summary.disconnect()
    db.disconnect()
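

# Illustrative only: command strings the category pattern above is written to
# accept. The screen name and numbers are made up for this example.
#
#   '+some_user A'            -> category A
#   '+some_user CA t 5'       -> category CA, retweet threshold 5
#   '+some_user B t 3 dl 10'  -> category B, threshold 3, daily retweet limit 10
#
# A quick standalone check of the pattern:
#
#   >>> import re
#   >>> p = r'\+([a-zA-Z0-9_]+) ([A-Z][AB]?)( t ([0-9]+))?( dl ([0-9]+))?'
#   >>> re.match(p, '+some_user B t 3 dl 10').groups()
#   ('some_user', 'B', ' t 3', '3', ' dl 10', '10')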
def main():
    environment = defaults.get_environment()
    db = DB(environment, today())
    db_summary = DBSummary(environment)
    tl = TwitterList(environment, db, db_summary)
    tl.add_to_lists()
    tl.remove_from_lists()
    db.disconnect()
def main():
    parser = argparse.ArgumentParser(
        description='Find bots based on activity on the given day.')
    parser.add_argument('date', metavar='yyyy-mm-dd',
                        help='the date to process')
    args = parser.parse_args()
    environment = defaults.get_environment()
    db = DB(environment, args.date)
    date_to = args.date
    _ = FindBots(db, date_to)
    db.disconnect()
def main():
    # parser = argparse.ArgumentParser(description='Draft stats for the given day and push to cloud for approval.')
    # parser.add_argument('date', metavar='yyyy-mm-dd',
    #                     help='the date to process')
    #
    # args = parser.parse_args()
    environment = defaults.get_environment()
    db = DB(environment, today())
    _ = DraftTrends(environment, db)
def main():
    environment = defaults.get_environment()
    db = DB(environment, today())
    start_date = str(datetime.date.today() - timedelta(days=7))
    end_date = str(datetime.date.today() - timedelta(days=1))
    logger.info(f'Dates: {start_date} {end_date}')
    words = {}
    rows = db.get_top_hashtags(start_date, end_date)
    for row in rows:
        # Key by the lower-cased form; keep the original casing as the value.
        words[row[0].lower()] = row[0]
    logger.info(f'{len(words)} words')
    for word, hashtag in words.items():
        db.set_word_hashtag(word, hashtag)
        logger.debug(f'{word:>30} {hashtag}')
    db.disconnect()
class Words:
    def __init__(self, environment, hashtag, tags_list):
        self.env = environment
        self.hashtag = hashtag
        self.boring_words = {}
        self.banned_tags = {}
        self.data = {}
        self.t_new = 0
        self.t_foreign = 0
        self.t_skip = 0
        self.t_retweet = 0
        # self.ns_tweet_count = []
        # self.ns_total_score = []
        self.ns_score_log = []
        # self.ns_index = -1
        self.retweets = []
        self.date = today()
        self.db = DB(environment, self.date)
        # self.c = self.db.connect(self.date)
        self.db_summary = DBSummary(environment)
        self.load_metadata()
        self.CONSUMER_KEY = self.env.consumer_key
        self.CONSUMER_SECRET = self.env.consumer_secret
        self.current_token = -1
        self.hash_tags_re = re.compile(r'(?i)(?<!\w)#[\w\u064b-\u0657]+',
                                       re.UNICODE)
        self.twitters = list()
        for token in self.db_summary.get_all_tokens():
            api = twitter.Api(consumer_key=self.CONSUMER_KEY,
                              consumer_secret=self.CONSUMER_SECRET,
                              access_token_key=token.key,
                              access_token_secret=token.secret,
                              sleep_on_rate_limit=True)
            self.twitters.append(api)
        self.today_skey = 0
        self.score_names = False
        if hashtag == 'trends':
            if os.path.isfile('metadata/name_score.csv'):
                self.score_names = True
                logger.info('metadata/name_score.csv will be used for name scoring.')
            else:
                logger.info('Warning: metadata/name_score.csv does not exist '
                            'so name scoring is disabled.')
        self.batch_id = self.db.get_next_batch_id()
        self.baseline_tweet_id = self.db.get_baseline_tweet_id()
        self.today_skey = self.db.get_date_skey(self.date)
        self.loop_pos = -1
        self.all_trends = None
        if hashtag == 'trends':
            self.all_trends = self.db.get_trends()
            self.loop_pos = 0
            if tags_list is None:
                tags_list = []
                for (tag, result) in self.all_trends.items():
                    if result in ('AUTO_ADD', 'MAN_ADD'):
                        tags_list.append({'tag': tag})
            orig_tags_list = tags_list
            tags_list = []
            for tagdata in orig_tags_list:
                tags_list.append(
                    self.db.get_tag_ranges(tagdata['tag'], self.baseline_tweet_id))
            print('Tags_list:', tags_list)
            self.pull_trends(tags_list)
            self.write_data()
        elif hashtag == 'home_timeline':
            status_count = self.pull_data(hashtag)
            logger.info('{} statuses pulled.'.format(status_count))
            self.write_data()
        elif hashtag == 'lists':
            lists = self.twitters[self.db_summary.polling_token_index].GetLists(
                screen_name=self.env.polling_account)
            logger.info('{} lists for account {}.'.format(
                len(lists), self.env.polling_account))
            for l in lists:
                status_count = self.pull_data(l.slug)
                logger.info('{} statuses pulled for list {}.'.format(
                    status_count, l.slug))
                self.write_data()
        self.db.disconnect()

    @property
    def api(self):
        self.current_token += 1
        if self.current_token >= len(self.twitters):
            self.current_token = 0
        return self.twitters[self.current_token]

    def pull_trend(self, trend, trend_count, trend_position):
        self.ns_score_log = []
        status_count = 0
        request_count = 0
        index = 0
        while index < len(trend.ranges):
            # for id_range in trend.ranges:
            id_range = trend.ranges[index]
            if not id_range.processed:
                max_id = id_range.max_id
                since_id = id_range.min_id
                statuses = None
                logger.info('Range: {:>9} {:35} {:20} {:20}'.format(
                    '{:4d}/{:4d}'.format(trend_position, trend_count),
                    trend.name, since_id,
                    'None' if max_id is None else max_id))
                while statuses is None or len(statuses) >= 50:
                    if request_count >= 100:
                        new_range = Range(min_id=since_id, max_id=max_id)
                        id_range.min_id = max_id
                        trend.ranges.insert(index + 1, new_range)
                        return request_count, status_count, False
                    statuses = self.api.GetSearch(term=trend.name,
                                                  result_type='recent',
                                                  count=100,
                                                  include_entities=False,
                                                  max_id=max_id,
                                                  since_id=since_id)
                    self.get_words(statuses, trend=trend, source=trend.name)
                    status_count += len(statuses)
                    score = trend.get_average_score(10)
                    if len(statuses) > 0:
                        max_id = statuses[-1].id - 1
                    if id_range.max_id is None and len(statuses) > 0:
                        id_range.max_id = statuses[0].id
                    id_range.processed = True
                    request_count += 1
                    logger.info('{:40} {:20} {:20} {:3} {:5} {:5.2f} {}'.format(
                        trend.name, since_id,
                        'None' if max_id is None else max_id,
                        request_count, trend.get_status_count(10), score,
                        trend.state))
                    if score < 0.0 and status_count > 150:
                        trend.state = 'AUTO_DEL'
                        self.save_score_log(self.ns_score_log, trend.name, 'Negative')
                        id_range.min_id = max_id
                        return request_count, status_count, True
                    if score < 0.5 and status_count > 1000:
                        trend.state = 'AUTO_DEL'
                        self.save_score_log(self.ns_score_log, trend.name, 'Hot_Ambiguous')
                        id_range.min_id = max_id
                        return request_count, status_count, True
                    if score > 2.0:
                        if trend.state not in ('AUTO_ADD', 'MAN_ADD'):
                            trend.state = 'AUTO_ADD'
                    # after 500 tweets if we still haven't got an indication, give up
                    if trend.get_status_count(10) > 500 and trend.state == 'AUTO_DEL':
                        self.save_score_log(self.ns_score_log, trend.name, 'Ambiguous')
                        id_range.min_id = max_id
                        return request_count, status_count, True
                    # Not needed for raspberry pi
                    if request_count % 100 == 0:
                        logger.info(f'Sleeping 20 seconds at {request_count} requests.')
                        sleep(20)
            index += 1
        return request_count, status_count, True

    def pull_trends(self, trends):
        total_status_count = 0
        last_write = 0
        total_request_count = 0
        trend_count = len(trends)
        for i, trend in enumerate(trends):
            completed = False
            while not completed:
                request_count, status_count, completed = self.pull_trend(
                    trend, trend_count, i + 1)
                total_request_count += request_count
                total_status_count += status_count
            self.db.tag_history.append(trend)
            if total_request_count >= last_write + 20:
                self.write_data()
                self.batch_id += 1
                last_write = total_request_count
        return total_status_count

    def load_metadata(self):
        f = open('metadata/boring.txt', 'r')
        for line in f:
            self.boring_words[line.rstrip()] = 1
        f.close()
        f = open('metadata/banned_tags.txt', 'r')
        for line in f:
            self.banned_tags[line.rstrip()] = 1
        f.close()

    def cleanup_exit(self):
        logger.info("Deleting batch %i" % self.batch_id)
        self.db.delete_batch(self.batch_id)
        self.db.disconnect()
        exit(1)

    def twitter_search(self, q, sinceid, maxid):
        self.current_token += 1
        if self.current_token >= len(self.twitters):
            self.current_token = 0
        if maxid is None:
            result = self.twitters[self.current_token].GetSearch(
                term=q, result_type='recent', count='100',
                include_entities='false', since_id=sinceid)
        else:
            result = self.twitters[self.current_token].GetSearch(
                term=q, result_type='recent', count='100',
                include_entities='false', since_id=sinceid, max_id=maxid)
        return result

    # def oauthReq(self, url, key, secret, http_method="GET", post_body='',
    #              http_headers=None):
    #     consumer = oauth.Consumer(key=self.CONSUMER_KEY, secret=self.CONSUMER_SECRET)
    #     if self.hashtag == 'home_timeline':
    #         token = self.db.getDefaultToken()
    #     else:
    #         token = self.db.getNextToken()
    #     client = oauth.Client(consumer, token)
    #     resp, content = client.request(
    #         url,
    #         method=http_method,
    #         body=post_body,
    #         headers=http_headers
    #         , force_auth_header=True
    #     )
    #     # print "*** %s ***" % content
    #     # exit()
    #     return content

    def write_data(self):
        logger.info("Writing data.")
        for tweetdate, stats in self.data.items():
            logger.info("Saving data for %s." % tweetdate)
            for tag, words in self.data[tweetdate]['tag_words'].items():
                for i, v in words.items():
                    self.db.write_hashtag_word(tweetdate, tag, i, v)
            for tweeter, words in self.data[tweetdate]['tweeter_words'].items():
                for i, v in words.items():
                    self.db.write_tweeter_word(tweetdate, tweeter, i, v)
            for tweeter, words in self.data[tweetdate]['tweeter_mentions'].items():
                for i, v in words.items():
                    self.db.write_tweeter_mention(tweetdate, tweeter, i, v)
            for tag, count in self.data[tweetdate]['tags'].items():
                self.db.write_tag(tweetdate, tag, count)
            for tag, tweeters in self.data[tweetdate]['tag_tweeters'].items():
                for i, v in tweeters.items():
                    self.db.write_tag_tweeter(tweetdate, tag, i, v)
            for tag, tags in self.data[tweetdate]['tag_tags'].items():
                for i, v in tags.items():
                    self.db.write_tag_tag(tweetdate, tag, i, v)
        self.db.write_tweets()
        self.db.write_tag_history()
        self.db.commit()
        logger.info("Data saved.")
        self.data = {}

    @staticmethod
    def save_score_log(score_log, trend, reject_reason):
        filename = "log/reject_%s_%s_%s.log" % (trend, reject_reason,
                                                file_timestamp())
        secommon.save_list(score_log, filename)

    def calculate_name_score(self, status, trend):
        tweeter = status.user.screen_name
        tweeter_name = status.user.name
        score_candidate = (tweeter_name if status.retweeted_status is None
                           else status.retweeted_status.user.name)
        score_candidate_sn = (tweeter if status.retweeted_status is None
                              else status.retweeted_status.user.screen_name)
        trend.name_scores[-1].status_count += 1
        # self.ns_tweet_count[self.ns_index] += 1
        name_score = self.db.get_name_score(score_candidate,
                                            score_candidate_sn,
                                            status.user.location,
                                            status.user.time_zone)
        # self.ns_total_score[self.ns_index] += name_score
        trend.name_scores[-1].total_score += name_score
        score3 = '{:.2f}'.format(trend.get_average_score(3))
        score6 = '{:.2f}'.format(trend.get_average_score(6))
        self.ns_score_log.append([
            score_candidate, score_candidate_sn,
            trend.name_scores[-1].status_count, name_score,
            trend.name_scores[-1].total_score, score3, score6, tweeter_name,
            tweeter, status.id
        ])

    def process_status_words(self, status_id, status_date, status_text, tweeter):
        if status_date not in self.data:
            self.data[status_date] = {}
            self.data[status_date]['tweeter_mentions'] = {}
            self.data[status_date]['tag_words'] = {}
            self.data[status_date]['tweeter_words'] = {}
            self.data[status_date]['tags'] = {}
            self.data[status_date]['tag_tweeters'] = {}
            self.data[status_date]['tag_tags'] = {}
        # get all relevant hashtags
        relevant_hashtags = re.findall(r'(?<![A-Za-z0-9_])#([A-Za-z0-9_]+)',
                                       status_text.lower())
        tweet_hashtags = set(self.hash_tags_re.findall(status_text))
        tweet_tags = [ht[1:] for ht in tweet_hashtags]
        for tag in tweet_tags:
            if tag in self.data[status_date]['tags']:
                self.data[status_date]['tags'][tag] += 1
            else:
                self.data[status_date]['tags'][tag] = 1
            if tag not in self.data[status_date]['tag_tweeters']:
                self.data[status_date]['tag_tweeters'][tag] = {}
            if tweeter in self.data[status_date]['tag_tweeters'][tag]:
                self.data[status_date]['tag_tweeters'][tag][tweeter] += 1
            else:
                self.data[status_date]['tag_tweeters'][tag][tweeter] = 1
            if tag not in self.data[status_date]['tag_tags']:
                self.data[status_date]['tag_tags'][tag] = {}
            for tag2 in tweet_tags:
                if tag2 != tag:
                    if tag2 in self.data[status_date]['tag_tags'][tag]:
                        self.data[status_date]['tag_tags'][tag][tag2] += 1
                    else:
                        self.data[status_date]['tag_tags'][tag][tag2] = 1
        # remove links
        text = re.sub(r"(?<![A-Za-z0-9_])https?://[^ ,;'()\[\]<>{}]+", '',
                      status_text, flags=re.IGNORECASE)
        alist = re.split('[, \n.;\'\"(){\}\[\]<>:?/=+\\\`~!#^&*\\r\\n\-]+', text)
        tweetwords = list()
        for item in alist:
            nitem = item.strip(' ,.-+()[]:\'\"').lower()
            if u"\u2026" in nitem:
                # ignore words truncated with ellipsis (...)
                continue
            if nitem == '':
                continue
            if nitem in self.boring_words:
                continue
            if nitem[:1] == '@' and len(nitem) > 2:
                # Tweeter mentions
                if tweeter not in self.data[status_date]['tweeter_mentions']:
                    self.data[status_date]['tweeter_mentions'][tweeter] = {}
                if nitem[1:] in self.data[status_date]['tweeter_mentions'][tweeter]:
                    self.data[status_date]['tweeter_mentions'][tweeter][nitem[1:]] += 1
                else:
                    self.data[status_date]['tweeter_mentions'][tweeter][nitem[1:]] = 1
                continue
            tweetwords.append(nitem)
            for tag in relevant_hashtags:
                if tag not in self.data[status_date]['tag_words']:
                    self.data[status_date]['tag_words'][tag] = {}
                if nitem in self.data[status_date]['tag_words'][tag]:
                    self.data[status_date]['tag_words'][tag][nitem] += 1
                else:
                    self.data[status_date]['tag_words'][tag][nitem] = 1
            # Tweeter words
            if tweeter.lower() in self.db.rated_tweeters:
                if tweeter not in self.data[status_date]['tweeter_words']:
                    self.data[status_date]['tweeter_words'][tweeter] = {}
                if nitem in self.data[status_date]['tweeter_words'][tweeter]:
                    self.data[status_date]['tweeter_words'][tweeter][nitem] += 1
                else:
                    self.data[status_date]['tweeter_words'][tweeter][nitem] = 1
        tweet_words_text = u'~' + u'~'.join([
            self.db.get_word_skey(x, self.date)[1]
            for x in sorted(set(tweetwords))
        ]) + u'~'
        self.db.update_tweet_words(status_id, tweet_words_text)

    def get_words(self, statuses, trend=None, source=None):
        # max_id = 0
        # min_id = MAX_STATUS_ID
        for status in statuses:
            # max_id = max(status.id, max_id)
            # min_id = min(status.id, min_id)
            tweeter = status.user.screen_name
            tweeter_name = status.user.name
            tweeter_created_at = self.env.get_local_date(status.user.created_at)
            tweeter_skey = self.db.get_tweeter_skey(
                screen_name=tweeter,
                name=tweeter_name,
                followers_count=status.user.followers_count,
                friends_count=status.user.friends_count,
                lang=status.user.lang,
                time_zone=status.user.time_zone,
                verified=status.user.verified,
                statuses_count=status.user.statuses_count,
                profile_image_url=status.user.profile_image_url,
                created_at=tweeter_created_at,
                location=status.user.location)
            tweet_text = status.text
            retweet_id = (status.retweeted_status.id
                          if status.retweeted_status is not None else 0)
            if retweet_id != 0:
                tweet_text = ("RT " + status.retweeted_status.user.screen_name +
                              ": " + status.retweeted_status.text)
            if self.score_names:
                self.calculate_name_score(status, trend)
            if (status.user.followers_count > 0
                    and tweeter.lower() in self.db.rated_tweeters):
                self.db.write_daily_followers(tweeter_skey, self.today_skey,
                                              status.user.followers_count)
            retweet_created_at = ''
            retweet_screen_name = ''
            retweet_count = status.retweet_count
            if status.retweeted_status is not None:
                self.retweets.append(status.retweeted_status)
                retweet_created_at = self.env.get_local_timestamp(
                    status.retweeted_status.created_at)
                retweet_screen_name = status.retweeted_status.user.screen_name
            # check if duplicate and insert if not duplicate
            status_date = self.env.get_local_date(status.created_at)
            status_created_at = self.env.get_local_timestamp(status.created_at)
            date_skey = self.db.get_date_skey(status_date)
            if self.db.tweet_is_duplicate(
                    id_=status.id,
                    created_at=status_created_at,
                    screen_name=status.user.screen_name,
                    text=tweet_text,
                    tweeter_skey=tweeter_skey,
                    retweet_count=retweet_count,
                    in_reply_to_status_id=status.in_reply_to_status_id,
                    date_skey=date_skey,
                    retweet_id=retweet_id,
                    retweet_created_at=retweet_created_at,
                    retweet_screen_name=retweet_screen_name,
                    batch_id=self.batch_id,
                    source=source):
                self.t_skip += 1
                continue
            self.process_status_words(status_id=status.id,
                                      status_date=status_date,
                                      status_text=tweet_text,
                                      tweeter=tweeter)

    # @timeout(7)
    def pull_data(self, list_name):
        since_id = self.db.get_baseline_tweet_id()
        max_id = None
        all_statuses = []
        statuses = None
        while statuses is None or len(statuses) > 0:
            if list_name == 'home_timeline':
                statuses = self.twitters[
                    self.db_summary.default_token_index].GetHomeTimeline(
                        count=200, since_id=since_id, max_id=max_id,
                        include_entities=False)
            else:
                statuses = self.twitters[
                    self.db_summary.polling_token_index].GetListTimeline(
                        owner_screen_name=self.env.polling_account,
                        slug=list_name, count=200, since_id=since_id,
                        max_id=max_id, include_entities=False)
            if len(statuses) > 0:
                self.get_words(statuses, source=list_name)
                all_statuses.extend(statuses)
                max_id = statuses[-1].id - 1
                logger.info('{} {}'.format(statuses[-1].id, len(statuses)))
        if len(all_statuses) > 0:
            max_id = max([status.id for status in all_statuses])
            min_id = min([status.id for status in all_statuses])
            self.db.write_list_max_id(list_name, max_id, min_id)
        return len(all_statuses)
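

# Illustrative only, not part of the pipeline: a standalone check of the
# hashtag pattern compiled above as hash_tags_re. It matches '#' followed by
# word characters or Arabic diacritics (U+064B-U+0657) when the '#' is not
# preceded by a word character, so mid-word hashes are ignored. The sample
# text is made up.
#
#   >>> import re
#   >>> hash_tags_re = re.compile(r'(?i)(?<!\w)#[\w\u064b-\u0657]+', re.UNICODE)
#   >>> hash_tags_re.findall('see #Trends and x#NotATag')
#   ['#Trends']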
def main():
    env = defaults.get_environment()
    db = DB(env, today())
    # db = DB(env, '2018-12-25')
    promotion = Promotion()
    demotedate_c = (datetime.date.today() - timedelta(days=30)).strftime('%Y-%m-%d')
    demotedate_d = (datetime.date.today() - timedelta(days=90)).strftime('%Y-%m-%d')

    # Promote to C
    logger.info("Tweeter Promotion %s" % today())
    rows = db.get_tweeter_promotion_stats()
    # If a person has more than POWER_TWEEP followers, mark them as F if they
    # are negative. We cannot do this for all tweeps because we would get too
    # many category F's, and we don't want to waste resources storing tweeps
    # we may never encounter in future.
    logger.info(f'Starting loop of {len(rows)} records.')
    row_count = len(rows)
    current_row = 0
    for (screen_name, pos, neg, blocked, category, relevance_score,
         followers_count, name, location, time_zone) in rows:
        tweeter = promotion.add(screen_name=screen_name,
                                name=name,
                                category=category,
                                relevance_score=relevance_score,
                                location=location,
                                time_zone=time_zone,
                                followers_count=followers_count)
        # if relevance_score is None:
        #     relevance_score = 0
        # adjustment = 0
        if blocked > 3 and blocked > pos and relevance_score <= -10:
            tweeter.new_category = 'B'
        elif neg > pos and (category is not None or relevance_score != 0
                            or followers_count >= POWER_TWEEP):
            if neg > 3:
                tweeter.adjust_score(-2)
            else:
                tweeter.adjust_score(-1)
        else:  # pos >= neg
            if pos > 3:
                tweeter.adjust_score(2)
            elif pos > 1:
                tweeter.adjust_score(1)
        current_row += 1
        if current_row % 100 == 0:
            logger.info(f'{current_row:4d}/{row_count} {category} {screen_name}')

    # Promote top tweeps
    logger.info('Promoting top tweeps.')
    db.c.execute('select screen_name from dim_tweeter where category <= ?', ('C', ))
    rows = db.c.fetchall()
    famous = [row[0] for row in rows]
    trenders = rank_words(f'{env.bot_data_directory}/trenders_published_%s.txt', 7)
    non_famous = [trender for trender in trenders if trender not in famous]
    for screen_name in non_famous[:50]:
        tweeter = promotion.add(screen_name=screen_name)
        tweeter.adjust_score(1)
    logger.info('Saving changes.')
    promotion.save_all()

    # Demote from D
    db.set_tweeter_category_by_date(date_category_was_set=demotedate_d,
                                    current_category='D',
                                    new_category='E')
    # Demote from C
    db.set_tweeter_category_by_date(date_category_was_set=demotedate_c,
                                    current_category='C',
                                    new_category='D')
    db.disconnect()
def main():
    # parser = argparse.ArgumentParser(description='Draft stats for the given day and push to cloud for approval.')
    # parser.add_argument('date', metavar='yyyy-mm-dd',
    #                     help='the date to process')
    #
    # args = parser.parse_args()
    environment = defaults.get_environment()
    db = DB(environment, today())
    db_summary = DBSummary(environment)
    jdata = Publisher.get_pending(environment)
    # c = db_connect(env.summary_database)
    trenders_published = list()
    trenders_all = list()
    already_processed = list()
    if 'tweets' in jdata:
        for tweet in jdata['tweets']:
            tweet_status = db_summary.get_tweet_status(tweet['t_id'])
            if tweet_status is None:
                db_summary.save_tweet(tweet)
                for item in tweet['items']:
                    db_summary.save_tweet_item(tweet, item)
                    if tweet['type'] == 'trenders' and item['selected'] == 'Y':
                        trenders_all.append(item['tweet_text'][1:])
                        if tweet['status'] == 'pend-post':
                            trenders_published.append(item['tweet_text'][1:])
            elif tweet_status in ['posted', 'rejected']:
                tweet['status'] = tweet_status
                already_processed.append(tweet)
    if len(trenders_published) > 0:
        with open(f'{environment.bot_data_directory}/trenders_all_{yesterday_file()}.txt',
                  'a') as f:
            for sn in trenders_all:
                f.write("%s\n" % sn)
        with open(f'{environment.bot_data_directory}/trenders_published_{yesterday_file()}.txt',
                  'a') as f:
            for sn in trenders_published:
                f.write("%s\n" % sn)
    db_summary.disconnect()
    trend_date = now()
    # now = now()
    # yesterday = (datetime.datetime.now() - timedelta(days=2)).strftime('%Y-%m-%d')
    if 'trends' in jdata:
        if len(jdata['trends']) > 0:
            # c = db_connect()
            # t = (yesterday,)
            # c.execute('SELECT max(max_id) max_id FROM tag_history where date <= ?', t)
            # row = c.fetchone()
            # max_id = 0
            # if row != None:
            #     max_id = row[0]
            relevant_words = db.get_relevant_words()
            generic_words = db.get_generic_words()
            trs = list()
            for trend in jdata['trends']:
                tag = '#' + trend['hashtag'].lower()
                tr = {'hashtag': trend['hashtag'],
                      'status': 'posted',
                      'trend_at': trend_date}
                trs.append(tr)
                tag_discovery_result = db.get_tag_discovery_result(tag)
                status = nvl(tag_discovery_result, 'NONE')
                if trend['status'] == 'pend-post' and status in ('NONE', 'AUTO_DEL', 'MAN_DEL'):
                    logger.info('Adding: ' + tag)
                    db.save_tag_discovery(tag, 'MAN_ADD')
                elif trend['status'] == 'pend-del' and status in ('AUTO_ADD', 'MAN_ADD'):
                    logger.info('Deleting: ' + tag)
                    db.save_tag_discovery(tag, 'MAN_DEL')
                # Trend relevance
                if 'relevance' in trend:
                    relevance = (relevant_words[trend['hashtag'].lower()]
                                 if trend['hashtag'].lower() in relevant_words
                                 else 'neutral')
                    if trend['relevance'] != relevance:
                        new_relevance = (None if trend['relevance'] == 'neutral'
                                         else trend['relevance'])
                        db.set_word_relevance(trend['hashtag'], new_relevance)
                # Trend generic
                if 'generic' in trend:
                    generic = (generic_words[trend['hashtag'].lower()]
                               if trend['hashtag'].lower() in generic_words
                               else '')
                    if trend['generic'] != generic:
                        new_relevance = (None if trend['generic'] == 'neutral'
                                         else trend['generic'])
                        db.set_word_generic(trend['hashtag'], new_relevance)
            data = {'trends': trs}
            Publisher.publish(environment, data, 'trends')
            db.commit()
            open(f'{environment.temp_file_directory}/compute_daily', 'a').close()
    if 'categories' in jdata:
        if len(jdata['categories']) > 0:
            for cat in jdata['categories']:
                db.set_tweeter_category(cat['screen_name'], cat['category'])
                logger.info(f"Category for {cat['screen_name']} changed to {cat['category']}")
            db.commit()
    if 'words' in jdata:
        if len(jdata['words']) > 0:
            for word in jdata['words']:
                category = word['category']
                if category == '':
                    category = None
                db.set_word_generic(word['word'], category)
                logger.info(f"Generic for {word['word']} changed to {category}")
    db.disconnect()
    if len(already_processed) > 0:
        data = {'tweets': already_processed}
        Publisher.publish(environment, data, 'posted')
def main():
    parser = argparse.ArgumentParser(
        description='Draft stats for the given day and push to cloud for approval.')
    parser.add_argument('date', metavar='yyyy-mm-dd',
                        help='the date to process')
    args = parser.parse_args()
    environment = defaults.get_environment()
    db = DB(environment, args.date)
    db_summary = DBSummary(environment)
    date_skey = db.get_date_skey(args.date)
    actions = list()
    action = {'type': 'trends'}
    actions.append(action)
    action = {'type': 'mentions'}
    actions.append(action)
    action_ind = 0
    tweets = list()
    stat_tweet_count = 0
    while action_ind < len(actions):
        action = actions[action_ind]['type']
        # tweeters = None
        stats = Stats(args.date, action, db, actions, environment,
                      actions[action_ind]['trend'] if action == 'trenders' else None)
        i = 100
        is_tweetable = True
        if action == "trenders":
            tweet = stats.write_tweet(i)
            if (not stats.is_trenders_tweet_postable(tweet)
                    or stat_tweet_count >= DAILY_STAT_TWEET_LIMIT):
                is_tweetable = False
        elif action == "trends":
            tweet = stats.write_tweet(i)
        elif action == "mentions":
            tweet = stats.write_tweet(i)
        if is_tweetable:
            db_summary.save_tweet(tweet)
            stat_tweet_count += 1
            if tweet is not None:
                tweets.append(tweet)
        if len(tweets) >= 2:
            data = {'tweets': tweets, 'date': args.date}
            Publisher.publish(environment, data, 'draft')
            tweets = list()
            time.sleep(10)
        action_ind += 1
    db_summary.disconnect()

    # Now get app metrics
    rows = db.get_tweeter_category_counts()
    metric_dict = {'date': args.date, 'other': 0}
    for cat, count in rows:
        if cat is None:
            cat = ' '
        if cat in ('A', 'B', 'C', 'D', 'E', 'F', 'R', ' '):
            metric_dict[cat] = count
        else:
            metric_dict['other'] += count

    # Get count of total tweets and tweets by category
    rows = db.get_tweeter_category_tweet_counts(date_skey)
    metric_dict['tweets_total'] = 0
    metric_dict['tweets_other'] = 0
    for cat, count in rows:
        metric_dict['tweets_total'] += count
        if cat is None:
            cat = ' '
        if cat in ('A', 'B', 'C', 'D', 'E', 'F', 'R', ' '):
            metric_dict['tweets_' + cat] = count
        else:
            metric_dict['tweets_other'] += count

    # Add file sizes
    metric_dict['fact_db_size'] = os.path.getsize(environment.database)
    metric_dict['dim_db_size'] = os.path.getsize(environment.dimension_database)
    metric_dict['summ_db_size'] = os.path.getsize(environment.summary_database)

    followers_count = db.get_tweeter_followers_count('pakpolstats')
    metric_dict['account_followers'] = followers_count

    data = {'tweets': tweets, 'metrics': metric_dict, 'date': args.date}
    Publisher.publish(environment, data, 'draft')
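

# Illustrative shape of the metrics payload assembled above; every value here
# is made up. Per-category tweeter counts sit under the bare category letters,
# per-category tweet counts under 'tweets_<cat>' keys, plus database file
# sizes and the publishing account's follower count.
#
#   metric_dict = {'date': '2019-01-01',
#                  'A': 12, 'B': 40, 'C': 310, ' ': 1500, 'other': 3,
#                  'tweets_total': 52000, 'tweets_A': 900, 'tweets_other': 120,
#                  'fact_db_size': 104857600, 'dim_db_size': 52428800,
#                  'summ_db_size': 1048576, 'account_followers': 1234}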
def __init__(self, environment, old_date):
    self.environment = environment
    archive_file = environment.dimension_database_old.format(
        last_month().replace('-', '_'))
    if os.path.isfile(archive_file):
        logger.warning('Archive file already exists.')
        exit(0)
        # raise ArchiveFileExistsError()
    self.old_db = sqlite3.connect(environment.dimension_database)
    self.old_c = self.old_db.cursor()
    self.old_c.execute('ATTACH DATABASE ? AS se', (environment.database, ))
    date = None
    try:
        self.old_c.execute('select min_date from db_baseline')
        date = self.old_c.fetchone()[0]
    except sqlite3.OperationalError:
        logger.critical('Could not read date.')
        exit(1)
    logger.info('Date: %s', date)
    self.old_db = DB(environment, old_date)
    # don't connect to the new db via the regular function; the global
    # connection in db.py should stay on the main db
    conn2 = sqlite3.connect(environment.dimension_database_temp)
    self.new_c = conn2.cursor()
    db_copy = DBCopy(self.old_c, self.new_c)
    if date == 'empty':
        self.old_db.disconnect()
        conn2.commit()
        conn2.close()
        return
    date_dt = datetime.strptime(date, '%Y-%m-%d')
    recent_date = (date_dt - timedelta(days=30)).strftime('%Y-%m-%d')
    twitter_dt = datetime.strptime('2006-03-21 09:00:00', '%Y-%m-%d %H:%M:%S')
    future_date = (date_dt + timedelta(days=2000)).strftime('%Y-%m-%d')
    logger.info('Recent date: %s', recent_date)

    logger.info("DIM_DATE")
    cnt = 0
    dt = twitter_dt.strftime('%Y-%m-%d')
    i = 1
    while dt < future_date:
        cnt += 1
        # print "Date: ", i, dt
        t = (i, dt)
        self.new_c.execute(
            'insert into dim_date (date_skey, date) values (?, ?)', t)
        dt = (twitter_dt + timedelta(days=i)).strftime('%Y-%m-%d')
        i += 1
    logger.info('%d rows', cnt)

    # DIM_TWEETER
    t = (recent_date, )
    sql = """select {} from dim_tweeter
              where category is not null
                 or ifnull(relevance_score, 0) != 0
                 or ifnull(bot_date, '2000-01-01') >= ?
                 or tweeter_skey in (select tweeter_skey from fact_daily_followers
                                     union
                                     select tweeter_skey from fact_daily_hashtag_tweeter
                                     union
                                     select tweeter_skey from fact_daily_tweeter_mention
                                     union
                                     select mentioned_tweeter_skey from fact_daily_tweeter_mention
                                     union
                                     select tweeter_skey from fact_daily_tweeter_word)"""
    db_copy.copy_table('dim_tweeter', sql, t)

    # DIM_WORD
    # Word skeys were getting very large because we only moved the new day's
    # words forward, so many words that weren't used in the few hours of the
    # new day were being created with new skeys. Using yesterday will slow
    # the skey generation.
    sql = """select {} from dim_word
              where generic is not null
                 or relevance is not null
                 or word_skey in (select word_skey from fact_daily_hashtag
                                  union
                                  select tag_skey from fact_daily_hashtag_hashtag
                                  union
                                  select other_tag_skey from fact_daily_hashtag_hashtag
                                  union
                                  select tag_skey from fact_daily_hashtag_tweeter
                                  union
                                  select tag_skey from fact_daily_hashtag_word
                                  union
                                  select word_skey from fact_daily_hashtag_word
                                  union
                                  select word_skey from fact_daily_tweeter_word)"""
    db_copy.copy_table('dim_word', sql, None)

    self.old_db.disconnect()
    conn2.commit()
    conn2.close()
    db_copy.switch_files(current_file=environment.dimension_database,
                         archive_file=archive_file,
                         new_file=environment.dimension_database_temp)
def __init__(self, environment, date):
    self.environment = environment
    old_date = yesterday(date)
    archive_file = environment.database_old.format(old_date.replace('-', '_'))
    if os.path.isfile(archive_file):
        raise ArchiveFileExistsError()
    self.old_db = DB(environment, old_date)
    # don't connect to the new db via the regular function; the global
    # connection in db.py should stay on the main db
    conn2 = sqlite3.connect(environment.database_temp)
    self.new_c = conn2.cursor()
    db_copy = DBCopy(self.old_db.c, self.new_c)
    if date == 'empty':
        self.old_db.disconnect()
        conn2.commit()
        conn2.close()
        return
    date_dt = datetime.strptime(date, '%Y-%m-%d')
    three_days_ago = (date_dt - timedelta(days=3)).strftime('%Y-%m-%d')
    recent_date = (date_dt - timedelta(days=7)).strftime('%Y-%m-%d')
    logger.info('Recent date: [' + recent_date + ']')
    date_skey = self.old_db.get_date_skey(date)

    # Special case for db_baseline
    baseline_id = 0
    logger.info('db_baseline')
    cnt = 0
    sql = """select max(id) from fact_status where date_skey < ?"""
    t = (date_skey, )
    self.old_db.c.execute(sql, t)
    row = self.old_db.c.fetchone()
    if row is not None:
        cnt += 1
        baseline_id = row[0]
        t = (row[0], date)
        self.new_c.execute(
            'insert into db_baseline (min_tweet_id, min_date) values (?, ?)', t)
    logger.info(f'{cnt} rows')

    # The remaining tables
    sql = """select {} from tag_history th where max_id >= ?"""
    t = (baseline_id, )
    db_copy.copy_table('tag_history', sql, t)

    sql = """select {} from (select d.*
                               from tag_discovery d
                               left join tag_score s on d.tag = s.tag
                              group by d.tag, d.result, d.discovery_time
                             having discovery_time >= ? or sum(s.tweet_count) > ?)"""
    t = (three_days_ago, 50)
    db_copy.copy_table('tag_discovery', sql, t)

    sql = """select {} from tag_score where score_time >= ?"""
    t = (three_days_ago, )
    db_copy.copy_table('tag_score', sql, t)

    # FACT_DAILY_FOLLOWERS
    sql = 'select {} from fact_daily_followers where date_skey >= ?'
    t = (date_skey, )
    db_copy.copy_table('fact_daily_followers', sql, t)

    # FACT_DAILY_HASHTAG
    sql = 'select {} from fact_daily_hashtag where date_skey >= ?'
    t = (date_skey, )
    db_copy.copy_table('fact_daily_hashtag', sql, t)

    sql = 'select {} from fact_daily_hashtag_hashtag where date_skey >= ?'
    t = (date_skey, )
    db_copy.copy_table('fact_daily_hashtag_hashtag', sql, t)

    sql = 'select {} from fact_daily_hashtag_tweeter where date_skey >= ?'
    t = (date_skey, )
    db_copy.copy_table('fact_daily_hashtag_tweeter', sql, t)

    sql = 'select {} from fact_daily_hashtag_word where date_skey >= ?'
    t = (date_skey, )
    db_copy.copy_table('fact_daily_hashtag_word', sql, t)

    sql = """select {} from fact_daily_tweeter_mention where date_skey >= ?"""
    t = (date_skey, )
    db_copy.copy_table('fact_daily_tweeter_mention', sql, t)

    sql = 'select {} from fact_daily_tweeter_word where date_skey >= ?'
    t = (date_skey, )
    db_copy.copy_table('fact_daily_tweeter_word', sql, t)

    # FACT_STATUS RETWEETED
    sql = 'select {} from fact_status where date_skey = ? and retweeted is not null'
    t = (date_skey - 1, )
    db_copy.copy_table('fact_status', sql, t)

    # FACT_STATUS
    sql = 'select {} from fact_status where date_skey >= ?'
    t = (date_skey, )
    db_copy.copy_table('fact_status', sql, t)

    self.old_db.disconnect()
    conn2.commit()
    conn2.close()
    db_copy.switch_files(current_file=environment.database,
                         archive_file=archive_file,
                         new_file=environment.database_temp)
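

# A minimal sketch of the copy_table contract implied by the call sites above;
# DBCopy's real implementation is not shown here and may differ. Each SQL
# template carries a '{}' placeholder that stands for the target table's
# column list, and the rows selected from the old database are bulk-inserted
# into the new one. _copy_table_sketch is a hypothetical name used only for
# this illustration.
def _copy_table_sketch(old_c, new_c, table, sql_template, params):
    # Read the target table's column names from the new database schema.
    cols = [row[1] for row in new_c.execute(f'PRAGMA table_info({table})')]
    col_list = ', '.join(cols)
    # Run the filtered select against the old database (params may be None).
    old_c.execute(sql_template.format(col_list), params or ())
    rows = old_c.fetchall()
    # Bulk-insert the surviving rows into the new database.
    placeholders = ', '.join('?' * len(cols))
    new_c.executemany(
        f'insert into {table} ({col_list}) values ({placeholders})', rows)
    logger.info(f'{table}: {len(rows)} rows copied')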