def snapshot_mailchimp(verbose=False):
    """Record today's member count for every Mailchimp list.

    Reads the API key from the ``MAILCHIMP_API_KEY`` environment variable,
    pings the service, then stores one ``SnapshotOfMailchimp`` row per list
    unless that list already has a snapshot dated today or later.

    A failure while processing one list is printed and skipped so that the
    remaining lists are still processed.

    :param verbose: when true, print progress messages.
    :raises AssertionError: if the API key is missing or the ping reply is
        not the expected greeting.
    :raises ValueError: if the lists response has no ``'data'`` key.
    """
    api_key = os.environ.get('MAILCHIMP_API_KEY')
    # Raised explicitly (rather than via ``assert``) so the check still runs
    # under ``python -O``; the exception type and message are unchanged.
    if not api_key:
        raise AssertionError('No MAILCHIMP_API_KEY defined in environment.')
    pm = PostMonkey(api_key, timeout=10)
    ping_string = pm.ping()
    expected = u'Everything\'s Chimpy!'
    if ping_string != expected:
        raise AssertionError(
            'Bad handshake, got "%s", expected "%s"' % (ping_string, expected))
    if verbose:
        print('handshake ok')
    lists = pm.lists()
    if 'data' not in lists:
        print('Got bad lists object from server.')
        pprint(lists)
        raise ValueError('Bad lists object from server')
    # Snapshot creation code...
    today = datetime.now().date()
    for l in lists['data']:
        try:
            if verbose:
                print('Scraping %s...' % l['name'])
            latest = Session.query(model.SnapshotOfMailchimp)\
                .filter(model.SnapshotOfMailchimp.name == l['name'])\
                .order_by(model.SnapshotOfMailchimp.timestamp.desc())\
                .first()
            if latest and latest.timestamp >= today:
                if verbose:
                    print(' -> most recent snapshots have already been processed.')
                continue
            snapshot = model.SnapshotOfMailchimp(
                name=l['name'],
                members=l['stats']['member_count'],
                timestamp=today)
            if verbose:
                # Two spaces after the arrow keep the historical output format.
                print(' ->  %s' % (snapshot.toJson(),))
            Session.add(snapshot)
            Session.commit()
        except Exception as e:
            pprint({'list': l, 'exception': str(e)})
            # A failed flush/commit leaves the session unusable until it is
            # rolled back; without this every later list would fail as well.
            Session.rollback()
def get_activity(verbose=False):
    """Store new Mailman archive messages as ``ActivityInMailman`` rows.

    For every list returned by ``util.list_mailman_lists`` the highest stored
    ``message_id`` is looked up and handed to ``_yield_messages`` together
    with the list's pipermail archive URL; each message it yields is added to
    the session. Work is committed once per list.

    :param verbose: when true, print progress messages; also forwarded to
        ``util.list_mailman_lists`` and ``_yield_messages``.
    """
    for mailing_list in util.list_mailman_lists(verbose):
        list_name = mailing_list["name"]
        if verbose:
            print("Processing activity for %s..." % list_name)
        latest = (
            Session.query(ActivityInMailman)
            .filter(ActivityInMailman.list_name == list_name)
            .order_by(ActivityInMailman.message_id.desc())
            .first()
        )
        # Walk through message history from the web front-end
        archive_url = mailing_list["link"].replace("mailman/listinfo", "pipermail")
        # -1 means nothing is stored for this list yet.
        # NOTE(review): presumably _yield_messages only yields messages newer
        # than latest_id -- confirm against its implementation.
        latest_id = latest.message_id if latest else -1
        for msg in _yield_messages(archive_url, latest_id, verbose=verbose):
            if verbose:
                print(' -> got msg #%d (%s: "%s")'
                      % (msg["id"], msg["email"], msg["subject"]))
            Session.add(
                ActivityInMailman(
                    list_name=list_name,
                    message_id=msg["id"],
                    subject=msg["subject"],
                    author=msg["author"],
                    email=msg["email"],
                    link=msg["link"],
                    timestamp=msg["date"],
                )
            )
        Session.commit()
def snapshot_twitteraccounts(verbose=False):
    """Store today's Twitter counters for every account returned by GetFriends.

    One ``SnapshotOfTwitterAccount`` (followers, following, tweets) is saved
    per account unless its newest stored snapshot is dated today or later.

    :param verbose: when true, print progress messages.
    """
    api = open_api()
    for friend in api.GetFriends():
        if verbose:
            print('Scraping %s...' % friend.screen_name)
        # Names are stored lower-cased, except for one legacy account that
        # keeps its original capitalisation.
        handle = friend.screen_name.lower()
        if handle == 'theannotator':
            handle = 'TheAnnotator'
        counters = dict(
            followers=friend.followers_count,
            following=friend.friends_count,
            tweets=friend.statuses_count,
        )
        today = datetime.now().date()
        # Newest stored snapshot for this account, if there is one.
        history = Session.query(SnapshotOfTwitterAccount).filter(
            SnapshotOfTwitterAccount.screen_name == handle)
        latest = history.order_by(
            SnapshotOfTwitterAccount.timestamp.desc()).first()
        if latest and latest.timestamp >= today:
            if verbose:
                print(' -> most recent snapshot for %s has already been processed.' % handle)
            continue
        record = SnapshotOfTwitterAccount(
            timestamp=today, screen_name=handle, **counters)
        Session.add(record)
        if verbose:
            # Two spaces after the arrow keep the historical output format.
            print(' ->  %s' % (record.toJson(),))
        Session.commit()
def snapshot_twitteraccounts(verbose=False):
    """Store today's Twitter counters for every name in ``TRACKED_ACCOUNTS``.

    Each account is fetched with ``tweepy``'s ``get_user`` and one
    ``SnapshotOfTwitterAccount`` (followers, following, tweets) is saved
    unless its newest stored snapshot is dated today or later.

    :param verbose: when true, print progress messages.
    """
    api = tweepy.API()
    for screen_name in TRACKED_ACCOUNTS:
        if verbose:
            print('Scraping %s...' % screen_name)
        user = api.get_user(screen_name)
        counters = dict(
            followers=user.followers_count,
            following=user.friends_count,
            tweets=user.statuses_count,
        )
        today = datetime.now().date()
        # Newest stored snapshot for this account, if there is one.
        history = Session.query(SnapshotOfTwitterAccount).filter(
            SnapshotOfTwitterAccount.screen_name == screen_name)
        latest = history.order_by(
            SnapshotOfTwitterAccount.timestamp.desc()).first()
        if latest and latest.timestamp >= today:
            if verbose:
                print(' -> most recent snapshot for %s has already been processed.' % screen_name)
            continue
        record = SnapshotOfTwitterAccount(
            timestamp=today, screen_name=screen_name, **counters)
        Session.add(record)
        if verbose:
            # Two spaces after the arrow keep the historical output format.
            print(' ->  %s' % (record.toJson(),))
        Session.commit()
def snapshot_mailman(verbose=False):
    """Back-fill one ``SnapshotOfMailman`` per list for each missing day.

    For every list returned by ``util.list_mailman_lists`` a snapshot is
    created for each day from the day after the newest stored snapshot (or
    180 days ago when none exists) up to, but not including, today. Each
    snapshot holds the subscriber count scraped now from the list's roster
    page and the number of ``ActivityInMailman`` rows stamped on that day.
    Work is committed once per list.

    :param verbose: when true, print progress messages; also forwarded to
        ``util.list_mailman_lists`` and ``_scrape_subscribers``.
    """
    one_day = timedelta(days=1)
    lists = util.list_mailman_lists(verbose)
    today = datetime.now().date()
    for l in lists:
        if verbose:
            print('Processing snapshots for %s...' % l['name'])
        latest = Session.query(SnapshotOfMailman)\
            .filter(SnapshotOfMailman.list_name == l['name'])\
            .order_by(SnapshotOfMailman.timestamp.desc())\
            .first()
        if latest:
            since = latest.timestamp + one_day
        else:
            # By default, gather 180 days of snapshots
            since = today - timedelta(days=180)
        # Snapshots are only written for days before today, so an up-to-date
        # list has its newest snapshot dated yesterday. Testing ``since``
        # (not ``latest.timestamp``) catches that case and avoids scraping
        # the roster when there is nothing to write.
        if since >= today:
            if verbose:
                print(' -> most recent snapshots have already been processed.')
            continue
        # Download subscriber list
        roster_url = l['link'].replace('listinfo', 'roster')
        num_subscribers = len(_scrape_subscribers(roster_url, verbose=verbose))
        # Create a snapshot of each day
        while since < today:
            next_day = since + one_day
            # Half-open range [since, next_day): SQL BETWEEN is inclusive at
            # both ends and would count a post on the boundary in two days.
            posts_today = Session.query(ActivityInMailman)\
                .filter(ActivityInMailman.list_name == l['name'])\
                .filter(ActivityInMailman.timestamp >= since)\
                .filter(ActivityInMailman.timestamp < next_day)\
                .count()
            sn = SnapshotOfMailman(
                list_name=l['name'],
                timestamp=since,
                subscribers=num_subscribers,
                posts_today=posts_today)
            Session.add(sn)
            if verbose:
                # Two spaces after the arrow keep the historical output format.
                print(' ->  %s' % (sn.toJson(),))
            since = next_day
        Session.commit()
def snapshot_googleanalytics(verbose=False):
    """Store yesterday's hit count for every Google Analytics profile.

    Credentials are read from the ``GOOGLEANALYTICS_AUTH`` environment
    variable. For each profile yielded by ``iterate_profiles`` one
    ``SnapshotOfAnalytics`` dated yesterday is saved and committed, unless
    that website already has a snapshot dated yesterday or later.

    A failure on one profile is printed and skipped so that the remaining
    profiles are still processed.

    :param verbose: when true, print progress messages.
    """
    googleanalytics_auth_json = os.environ.get('GOOGLEANALYTICS_AUTH')
    # Authenticate and construct service.
    service = initialize_service(googleanalytics_auth_json)
    # The snapshot always covers the previous day; compute it once so the
    # banner below reports the date that is actually stored.
    day = (datetime.now() - timedelta(days=1)).date()
    if verbose:
        print('Snapshotting for ' + day.isoformat())
    for x in iterate_profiles(service):
        try:
            profile_id = x['id']
            # How long since we scraped this account?
            latest = Session.query(SnapshotOfAnalytics)\
                .filter(SnapshotOfAnalytics.website == x['name'])\
                .order_by(SnapshotOfAnalytics.timestamp.desc())\
                .first()
            if latest and latest.timestamp >= day:
                if verbose:
                    print(' -> most recent snapshot for %s has already been processed.' % x['name'])
                continue
            hits = get_hits(service, profile_id, day.isoformat())
            sn = SnapshotOfAnalytics(timestamp=day, website=x['name'], hits=hits)
            Session.add(sn)
            # Persist per profile, as the other snapshot functions do, so a
            # later failure cannot discard rows that were already gathered.
            Session.commit()
            if verbose:
                print('%s: %d' % (x['name'], hits))
        except Exception as e:
            print(e)
            # A failed query/flush leaves the session unusable until it is
            # rolled back; without this every later profile would fail too.
            Session.rollback()
def snapshot_repos(verbose=False):
    """Store one ``SnapshotOfGithub`` dated today for every known repository.

    Repositories come from ``_get_repo_list``. A repository whose newest
    stored snapshot is dated today or later is skipped; otherwise its open
    issue count, size, watchers and forks are saved and committed.

    :param verbose: when true, print progress messages; also forwarded to
        ``_get_repo_list``.
    """
    repositories = _get_repo_list(verbose)
    today = datetime.now().date()
    for repo_name, repo in repositories.items():
        if verbose:
            print('Processing snapshots for %s...' % repo_name)
        # Newest stored snapshot for this repository, if there is one.
        history = Session.query(SnapshotOfGithub).filter(
            SnapshotOfGithub.repo_name == repo_name)
        latest = history.order_by(SnapshotOfGithub.timestamp.desc()).first()
        if latest and latest.timestamp >= today:
            if verbose:
                print(' -> most recent snapshots have already been processed.')
            continue
        metrics = dict(
            open_issues=repo.open_issues,
            size=repo.size,
            watchers=repo.watchers,
            forks=repo.forks,
        )
        snapshot = SnapshotOfGithub(
            timestamp=today, repo_name=repo_name, **metrics)
        if verbose:
            # Two spaces after the arrow keep the historical output format.
            print(' ->  %s' % (snapshot.toJson(),))
        Session.add(snapshot)
        Session.commit()
def snapshot_facebook(verbose=False):
    """Store today's like count of the ``/OKFNetwork`` Facebook object.

    The object is fetched through ``facebook.GraphAPI``; a single
    ``SnapshotOfFacebook`` dated today is saved and committed unless the
    newest stored snapshot is already dated today or later.

    :param verbose: when true, print progress messages.
    :raises ValueError: if the fetched object has no ``'likes'`` key.
    """
    graph = facebook.GraphAPI()
    page = graph.get_object('/OKFNetwork')
    if 'likes' not in page:
        print('Got bad object from server.')
        pprint(page)
        raise ValueError('Bad object from server')
    likes = page['likes']
    if verbose:
        print('Likes today: %d' % likes)
    # Snapshot creation code...
    today = datetime.now().date()
    newest_first = model.SnapshotOfFacebook.timestamp.desc()
    latest = Session.query(model.SnapshotOfFacebook).order_by(newest_first).first()
    if latest and latest.timestamp >= today:
        if verbose:
            print(' -> most recent snapshots have already been processed.')
        return
    snapshot = model.SnapshotOfFacebook(likes=likes, timestamp=today)
    if verbose:
        # Two spaces after the arrow keep the historical output format.
        print(' ->  %s' % (snapshot.toJson(),))
    Session.add(snapshot)
    Session.commit()