def scrape_instagram_account_by_id(upstream_id, stub=False):
    """
    Scrape Instagram bio data for the given upstream ID and update (or
    create) the corresponding profile.

    :param upstream_id: Instagram's numeric ID for the user.
    :param stub: when True, only the profile row is saved; avatar, post,
        and relation followup jobs are not scheduled.
    :returns: the profile serialized as a dict.
    """
    db_session = worker.get_session()
    proxies = _get_proxies(db_session)

    # Instagram API request.
    api_url = 'https://api.instagram.com/v1/users/{}'.format(upstream_id)
    response = requests.get(
        api_url,
        proxies=proxies,
        verify=False
    )
    response.raise_for_status()

    # The user payload lives under the 'data' key of the JSON response.
    # (Fixed: the old `data = response.json()[0]` re-read was a bug — the
    # top-level JSON value is a dict, so indexing it with 0 raises KeyError.)
    data = response.json()['data']

    # Fixed: Instagram calls the handle 'username'; 'screen_name' is
    # Twitter's field name and is absent from this payload (compare the
    # sibling scrape_instagram_account(), which reads data['username']).
    profile = Profile('instagram', upstream_id, data['username'])
    db_session.add(profile)

    try:
        db_session.commit()
    except IntegrityError:
        # Already exists: use the existing profile.
        db_session.rollback()
        profile = db_session.query(Profile) \
                            .filter(Profile.site == 'instagram') \
                            .filter(Profile.upstream_id == upstream_id) \
                            .one()

    # Refresh the profile with the latest bio data.
    profile.last_update = datetime.now()
    profile.description = data['bio']
    profile.follower_count = int(data['counts']['followed_by'])
    profile.friend_count = int(data['counts']['follows'])
    profile.homepage = data['website']
    profile.name = data['full_name']
    profile.post_count = int(data['counts']['media'])
    profile.is_stub = stub
    db_session.commit()

    # Schedule followup jobs.
    app.queue.schedule_index_profile(profile)  # Index all profiles, inc stubs.
    if not stub:
        app.queue.schedule_avatar(profile, data['profile_picture'])
        app.queue.schedule_posts(profile, recent=True)
        app.queue.schedule_relations(profile)

    return profile.as_dict()
def _create_sample_profiles(self, config):
    ''' Populate the database with a hand-crafted set of sample profiles. '''
    db = app.database.get_session(self._db)
    fixtures = os.path.join(os.path.dirname(__file__), 'sample-data')

    # Maurice Moss
    moss = Profile(
        site='twitter',
        upstream_id='12345',
        username=ProfileUsername('maurice.moss', start_date='2014-04-01')
    )
    for name, start, end in (
        ('maurice', '2013-06-01', '2014-03-31'),
        ('maurie', '2013-02-15', '2013-05-30'),
    ):
        moss.usernames.append(
            ProfileUsername(name, start_date=start, end_date=end)
        )

    # This post attaches to the profile through its `author` relationship;
    # no explicit append is needed.
    Post(
        author=moss,
        content='Going to the grocery store.',
        upstream_id='1234',
        upstream_created='2015-02-04 12:34:50'
    )

    band_post = Post(
        author=moss,
        content='Love this band!.',
        upstream_id='2345',
        upstream_created='2015-03-01'
    )
    band_post.attachments.append(File(
        name='helloworld.txt',
        mime='text/plain',
        content='Hello world!\n\n'.encode('utf8')
    ))
    moss.posts.append(band_post)

    with open(os.path.join(fixtures, 'moss.jpg'), 'rb') as image:
        moss.avatars.append(Avatar(
            url='http://foobar.com/moss-avatar.jpg',
            mime='image/jpeg',
            image=image.read()
        ))

    moss.description = "I do IT at Reynholm Industries."
    moss.post_count = 1205
    moss.friend_count = 1
    moss.follower_count = 3
    moss.join_date = dateutil.parser.parse('2013-06-01')
    moss.join_date_is_exact = False
    db.add(moss)

    # Jen Barber
    jen = Profile(
        site='twitter',
        upstream_id='23456',
        username=ProfileUsername('jen.barber', start_date='2013-11-12')
    )
    for name, start, end in (
        ('jenb', '2013-06-14', '2013-11-12'),
        ('jenny', '2013-03-15', '2013-06-14'),
    ):
        jen.usernames.append(
            ProfileUsername(name, start_date=start, end_date=end)
        )

    with open(os.path.join(fixtures, 'jen.jpg'), 'rb') as image:
        jen.avatars.append(Avatar(
            url='http://foobar.com/jen-avatar.jpg',
            mime='image/jpeg',
            image=image.read()
        ))

    jen.description = "Relationship Manager for the IT department."
    jen.post_count = 1543
    jen.friend_count = 1
    jen.follower_count = 1
    jen.join_date = dateutil.parser.parse('2013-03-15')
    jen.join_date_is_exact = True

    moss.followers.append(jen)
    db.add(jen)

    # A couple of randos.
    for rando_id in ('345678', '456789'):
        moss.followers.append(Profile(
            site='twitter', upstream_id=rando_id, username='******'
        ))
    jen.followers.append(Profile(
        site='twitter', upstream_id='567890', username='******'
    ))
    db.commit()
def _create_sample_profiles(self, config):
    ''' Insert the fixed set of sample profiles used for demos and tests. '''
    orm = app.database.get_session(self._db)
    asset_dir = os.path.join(os.path.dirname(__file__), 'sample-data')

    def read_asset(filename):
        # Load a binary fixture from the sample-data directory.
        with open(os.path.join(asset_dir, filename), 'rb') as handle:
            return handle.read()

    # Maurice Moss
    moss_twitter = Profile(
        site='twitter',
        upstream_id='12345',
        username=ProfileUsername('maurice.moss', start_date='2014-04-01'))
    moss_twitter.usernames.append(ProfileUsername(
        'maurice', start_date='2013-06-01', end_date='2014-03-31'))
    moss_twitter.usernames.append(ProfileUsername(
        'maurie', start_date='2013-02-15', end_date='2013-05-30'))

    # Attached to the profile solely through the author= relationship.
    Post(
        author=moss_twitter,
        content='Going to the grocery store.',
        upstream_id='1234',
        upstream_created='2015-02-04 12:34:50')

    second_post = Post(
        author=moss_twitter,
        content='Love this band!.',
        upstream_id='2345',
        upstream_created='2015-03-01')
    second_post.attachments.append(File(
        name='helloworld.txt',
        mime='text/plain',
        content='Hello world!\n\n'.encode('utf8')))
    moss_twitter.posts.append(second_post)

    moss_twitter.avatars.append(Avatar(
        url='http://foobar.com/moss-avatar.jpg',
        mime='image/jpeg',
        image=read_asset('moss.jpg')))
    moss_twitter.description = "I do IT at Reynholm Industries."
    moss_twitter.post_count = 1205
    moss_twitter.friend_count = 1
    moss_twitter.follower_count = 3
    moss_twitter.join_date = dateutil.parser.parse('2013-06-01')
    moss_twitter.join_date_is_exact = False
    orm.add(moss_twitter)

    # Jen Barber
    jen_twitter = Profile(
        site='twitter',
        upstream_id='23456',
        username=ProfileUsername('jen.barber', start_date='2013-11-12'))
    jen_twitter.usernames.append(ProfileUsername(
        'jenb', start_date='2013-06-14', end_date='2013-11-12'))
    jen_twitter.usernames.append(ProfileUsername(
        'jenny', start_date='2013-03-15', end_date='2013-06-14'))
    jen_twitter.avatars.append(Avatar(
        url='http://foobar.com/jen-avatar.jpg',
        mime='image/jpeg',
        image=read_asset('jen.jpg')))
    jen_twitter.description = "Relationship Manager for the IT department."
    jen_twitter.post_count = 1543
    jen_twitter.friend_count = 1
    jen_twitter.follower_count = 1
    jen_twitter.join_date = dateutil.parser.parse('2013-03-15')
    jen_twitter.join_date_is_exact = True
    moss_twitter.followers.append(jen_twitter)
    orm.add(jen_twitter)

    # A couple of randos.
    moss_twitter.followers.append(Profile(
        site='twitter', upstream_id='345678', username='******'))
    moss_twitter.followers.append(Profile(
        site='twitter', upstream_id='456789', username='******'))
    jen_twitter.followers.append(Profile(
        site='twitter', upstream_id='567890', username='******'))
    orm.commit()
def scrape_instagram_account(username, stub=False):
    """
    Scrape Instagram bio data and create (or update) a profile.

    :param username: the Instagram handle to look up (matched
        case-insensitively against search results).
    :param stub: when True, only the profile row is saved; avatar, post,
        and relation followup jobs are not scheduled.
    :returns: the profile serialized as a dict.
    :raises ScrapeException: if no user with that exact name is found.
    """
    # Instagram has no direct lookup by username: search for the name, then
    # pick the exact (case-insensitive) match out of the result list.
    db_session = worker.get_session()
    proxies = _get_proxies(db_session)

    search_response = requests.get(
        'https://api.instagram.com/v1/users/search',
        params={'q': username},
        proxies=proxies,
        verify=False
    )
    search_response.raise_for_status()

    target = username.lower()
    user_id = next(
        (
            result['id']
            for result in search_response.json()['data']
            if result['username'].lower() == target
        ),
        None
    )
    if user_id is None:
        raise ScrapeException('Can\'t find Instagram user named {}.'
                              .format(username))

    # Second request: fetch the matched user's full profile record.
    detail_response = requests.get(
        'https://api.instagram.com/v1/users/{}'.format(user_id),
        proxies=proxies,
        verify=False
    )
    detail_response.raise_for_status()
    data = detail_response.json()['data']

    profile = Profile('instagram', user_id, data['username'])
    db_session.add(profile)
    try:
        db_session.commit()
    except IntegrityError:
        # Already exists: use the existing profile.
        db_session.rollback()
        profile = (
            db_session.query(Profile)
            .filter(Profile.site == 'instagram')
            .filter(Profile.upstream_id == user_id)
            .one()
        )

    # Refresh the profile with the latest bio data.
    profile.last_update = datetime.now()
    profile.description = data['bio']
    profile.follower_count = int(data['counts']['followed_by'])
    profile.friend_count = int(data['counts']['follows'])
    profile.homepage = data['website']
    profile.name = data['full_name']
    profile.post_count = int(data['counts']['media'])
    profile.is_stub = stub
    db_session.commit()

    # Schedule followup jobs.
    app.queue.schedule_index_profile(profile)  # Index all profiles, inc stubs.
    if not stub:
        app.queue.schedule_avatar(profile, data['profile_picture'])
        app.queue.schedule_posts(profile, recent=True)
        app.queue.schedule_relations(profile)

    return profile.as_dict()
def scrape_instagram_account(username, stub=False):
    '''
    Scrape Instagram bio data and create (or update) a profile.

    :param username: the Instagram handle to look up (matched
        case-insensitively against search results).
    :param stub: when True, only the profile row is saved and avatar, post,
        and relation followup jobs are skipped. New keyword-with-default, so
        existing callers keep the old full-scrape behavior.
    :returns: the profile serialized as a dict.
    :raises ScrapeException: if no user with that exact name is found.
    '''
    # Getting a user ID is more difficult than it ought to be: you need to
    # search for the username and iterate through the search results to
    # find an exact match.
    db_session = worker.get_session()
    proxies = _get_proxies(db_session)
    api_url = 'https://api.instagram.com/v1/users/search'
    params = {'q': username}
    response = requests.get(
        api_url,
        params=params,
        proxies=proxies,
        verify=False
    )
    response.raise_for_status()
    search_results = response.json()

    username_lower = username.lower()
    user_id = None
    for user_result in search_results['data']:
        if user_result['username'].lower() == username_lower:
            user_id = user_result['id']
            break
    if user_id is None:
        raise ScrapeException('Can\'t find Instagram user named {}.'
                              .format(username))

    # Now make another request to get this user's profile data.
    api_url = 'https://api.instagram.com/v1/users/{}'.format(user_id)
    response = requests.get(
        api_url,
        proxies=proxies,
        verify=False
    )
    response.raise_for_status()
    data = response.json()['data']

    profile = Profile('instagram', user_id, data['username'])
    db_session.add(profile)
    try:
        db_session.commit()
    except IntegrityError:
        # Already exists: use the existing profile.
        db_session.rollback()
        # PEP8: spaces around == in the filter expressions.
        profile = db_session.query(Profile) \
                            .filter(Profile.site == 'instagram') \
                            .filter(Profile.upstream_id == user_id) \
                            .one()

    # Record when this profile was last refreshed — previously never set
    # here, unlike the by-ID scraper, so staleness could not be detected.
    profile.last_update = datetime.now()
    profile.description = data['bio']
    profile.follower_count = int(data['counts']['followed_by'])
    profile.friend_count = int(data['counts']['follows'])
    profile.homepage = data['website']
    profile.name = data['full_name']
    profile.post_count = int(data['counts']['media'])
    # Was hard-coded False; now records the caller's intent.
    profile.is_stub = stub
    db_session.commit()

    # Schedule followup jobs.
    app.queue.schedule_index_profile(profile)  # Index all profiles, inc stubs.
    if not stub:
        app.queue.schedule_avatar(profile, data['profile_picture'])
        app.queue.schedule_posts(profile, recent=True)
        app.queue.schedule_relations(profile)

    return profile.as_dict()