def splash_request(target_url, headers=None, request_timeout=10):
    '''
    Ask splash to render a page.

    :param target_url: URL for Splash to render.
    :param headers: optional dict of HTTP headers to forward to the target;
        a user-agent is added if the caller did not supply one.
    :param request_timeout: overall Splash render timeout in seconds.
    :returns: the raw ``requests`` response from the Splash server.
    '''
    # BUG FIX: `headers={}` was a mutable default argument — headers added
    # below leaked into subsequent calls. Default to None and copy so the
    # caller's dict is never mutated either.
    headers = {} if headers is None else dict(headers)
    db_session = worker.get_session()
    splash_url = get_config(db_session, 'splash_url', required=True).value
    splash_user = get_config(db_session, 'splash_user', required=True).value
    splash_pass = get_config(db_session, 'splash_password', required=True).value
    auth = (splash_user, splash_pass)
    splash_headers = {'content-type': 'application/json'}
    # Header names are case-insensitive, so compare lower-cased.
    if 'user-agent' not in {header.lower() for header in headers}:
        headers['user-agent'] = USER_AGENT
    payload = {
        'url': target_url,
        'html': 1,
        'jpeg': 1,
        'har': 1,
        'history': 1,
        'timeout': request_timeout,
        'resource_timeout': 5,
        'headers': headers
    }
    splash_response = requests.post(urljoin(splash_url, 'render.json'),
                                    headers=splash_headers,
                                    json=payload,
                                    auth=auth)
    return splash_response
def splash_request(target_url, headers=None, request_timeout=None, wait=1,
                   use_proxy=False):
    '''
    Ask splash to render a page.

    :param target_url: URL for Splash to render.
    :param headers: optional dict of HTTP headers to forward to the target;
        the configured Splash user agent is added if none is supplied.
    :param request_timeout: overall render timeout in seconds; when None it
        is read from the ``splash_request_timeout`` configuration key.
    :param wait: seconds Splash waits after the page loads before rendering.
    :param use_proxy: when True, route the request through a random proxy.
    :returns: the raw ``requests`` response from the Splash server.
    :raises ScrapeException: if the configured timeout is not an integer.
    '''
    # BUG FIX: `headers={}` was a mutable default argument — headers added
    # below leaked into subsequent calls. Default to None and copy so the
    # caller's dict is never mutated either.
    headers = {} if headers is None else dict(headers)
    db_session = worker.get_session()
    splash_url = get_config(db_session, 'splash_url', required=True).value
    splash_user = get_config(db_session, 'splash_user', required=True).value
    splash_pass = get_config(db_session, 'splash_password', required=True).value
    splash_user_agent = get_config(db_session, 'splash_user_agent',
                                   required=True).value
    proxy = None
    if request_timeout is None:
        # BUG FIX: was a bare `except:` and the message placeholder was never
        # formatted (the value was passed as a second constructor argument).
        timeout_config = get_config(db_session, 'splash_request_timeout',
                                    required=True).value
        try:
            request_timeout = int(timeout_config)
        except (TypeError, ValueError):
            raise ScrapeException(
                'Request timeout must be an integer: {}'.format(timeout_config))
    auth = (splash_user, splash_pass)
    splash_headers = {'content-type': 'application/json'}
    # Header names are case-insensitive, so compare lower-cased.
    if 'user-agent' not in {header.lower() for header in headers}:
        headers['user-agent'] = splash_user_agent
    payload = {
        'url': target_url,
        'html': 1,
        'jpeg': 1,
        'har': 1,
        'history': 1,
        'wait': wait,
        'render_all': 1,
        'width': 1024,
        'height': 768,
        'timeout': request_timeout,
        'resource_timeout': 5,
        'headers': headers
    }
    # Use proxy if enabled.
    if use_proxy:
        proxy = random_proxy(db_session)
        if proxy:
            payload['proxy'] = proxy
    splash_response = requests.post(urljoin(splash_url, 'render.json'),
                                    headers=splash_headers,
                                    json=payload,
                                    auth=auth)
    return splash_response
def index(self):
    '''
    Get QCR intents data.

    **Example Response**

    .. sourcecode:: json

        {
            "Google Maps":{
                "intents":{
                    "geoloc":"@{{geoloc.lat}},{{geoloc.long}},12z",
                    "geobounds":"@{{geobounds.lat0}},{{geobounds.long0}},12z"
                },
                "hide":true,
                "name":"Google Maps",
                "url":"https://www.google.com/maps",
                "desc":"Interactive maps",
                "thumbnail":"googlemaps.png",
                "icon":"googlemaps.png"
            },
            ...
        }

    :status 200: ok
    :status 401: authentication required
    :status 404: intents not found
    '''
    url = get_config(g.db, 'intents_url', required=True)
    username = get_config(g.db, 'intents_username', required=True)
    password = get_config(g.db, 'intents_password', required=True)
    if url is None or url.value.strip() == '':
        raise NotFound('Intents url is not configured.')
    if username is None or password is None:
        raise NotFound('Intents credentials not configured.')
    # BUG FIX: `password.value.strip` was missing the call parentheses, so it
    # compared the bound method to '' (always False) and an empty password
    # was silently accepted. Also removed a leftover debug print of the
    # username config object.
    if username.value.strip() == '' or password.value.strip() == '':
        raise NotFound('Intents credentials not configured.')
    try:
        # NOTE(review): verify=False disables TLS certificate validation —
        # confirm this is intentional for the intents server.
        response = requests.get(
            url.value,
            auth=HTTPBasicAuth(username.value, password.value),
            verify=False,
            timeout=5
        )
    except requests.exceptions.RequestException:
        # Narrowed from a bare `except:`, which also swallowed
        # KeyboardInterrupt/SystemExit and programming errors.
        raise NotFound('Intents data could not be retrieved from server.')
    return jsonify(response.json())
def index(self):
    '''
    Get QCR intents data.

    **Example Response**

    .. sourcecode:: json

        {
            "Google Maps":{
                "intents":{
                    "geoloc":"@{{geoloc.lat}},{{geoloc.long}},12z",
                    "geobounds":"@{{geobounds.lat0}},{{geobounds.long0}},12z"
                },
                "hide":true,
                "name":"Google Maps",
                "url":"https://www.google.com/maps",
                "desc":"Interactive maps",
                "thumbnail":"googlemaps.png",
                "icon":"googlemaps.png"
            },
            ...
        }

    :status 200: ok
    :status 401: authentication required
    :status 404: intents not found
    '''
    url = get_config(g.db, 'intents_url', required=True)
    username = get_config(g.db, 'intents_username', required=True)
    password = get_config(g.db, 'intents_password', required=True)
    if url is None or url.value.strip() == '':
        raise NotFound('Intents url is not configured.')
    if username is None or password is None:
        raise NotFound('Intents credentials not configured.')
    # BUG FIX: `password.value.strip` was missing the call parentheses, so it
    # compared the bound method to '' (always False) and an empty password
    # was silently accepted. Also removed a leftover debug print of the
    # username config object.
    if username.value.strip() == '' or password.value.strip() == '':
        raise NotFound('Intents credentials not configured.')
    try:
        # NOTE(review): verify=False disables TLS certificate validation —
        # confirm this is intentional for the intents server.
        response = requests.get(url.value,
                                auth=HTTPBasicAuth(username.value,
                                                   password.value),
                                verify=False,
                                timeout=5)
    except requests.exceptions.RequestException:
        # Narrowed from a bare `except:`, which also swallowed
        # KeyboardInterrupt/SystemExit and programming errors.
        raise NotFound('Intents data could not be retrieved from server.')
    return jsonify(response.json())
def handle_invitation_response(rsvp_uuid, response):
    """Process an incoming reply from the form generated from a mailed
    invitation UUID."""
    global server_conf
    server_conf = configuration.get_config('server')
    page_pieces.set_server_conf()
    responder = person.Person.find(database.find_rsvp(rsvp_uuid))
    # Set up viewing as though the user has actually logged in.
    access_permissions.Access_Permissions.setup_access_permissions(
        responder.link_id)
    # todo: also tell django that they are effectively logged in?
    responding_event = event.Event.find_by_id(
        responder.invitations[rsvp_uuid])
    # Record the reply on the event, then render the matching result page.
    if response == 'accept':
        responding_event.add_invitation_accepted([responder])
        page_name = 'accepted'
    elif response == 'decline':
        responding_event.add_invitation_declined([responder])
        page_name = 'declined'
    elif response == 'drop':
        # Declining and withdrawing the related training request.
        responding_event.add_invitation_declined([responder])
        responder.remove_training_request(
            responding_event.training_for_role(),
            responding_event.equipment_type)
        page_name = 'dropped'
    elif response == '':
        page_name = 'rsvp_choices'
    else:
        page_name = 'rsvp_error'
    makers_server.generate_page(page_name,
                                responder.name(),
                                responding_event.title)
def update_controls(self, params):
    """Update this profile's privacy, style and notification settings from
    submitted form parameters, then save the profile.

    Missing visibility parameters fall back to the configured
    ``privacy_defaults``; missing boolean toggles default to False.
    """
    # Removed a leftover debug print of the raw form parameters.
    config = model.configuration.get_config()
    default_visibilities = config['privacy_defaults']
    self.visibility['host'] = to_bool_or_other(
        params.get('visibility_as_host',
                   default_visibilities['visibility_as_host']))
    self.visibility['attendee'] = to_bool_or_other(
        params.get('visibility_as_attendee',
                   default_visibilities['visibility_as_attendee']))
    self.visibility['general'] = to_bool_or_other(
        params.get('visibility_in_general',
                   default_visibilities['visibility_in_general']))
    stylesheet = os.path.basename(params.get('stylesheet', "makers"))
    if stylesheet in model.configuration.get_stylesheets():
        # use basename so the user can't pick unvetted styles (in case
        # of malicious stuff in them, in case you can do that in css)
        self.stylesheet = stylesheet
    else:
        # use the default if the specified one doesn't exist
        # NOTE(review): this calls `configuration.get_config` while the top
        # of the function uses `model.configuration.get_config` — confirm
        # both names are in scope here.
        self.stylesheet = configuration.get_config('page', 'stylesheet')
    self.show_help = to_bool_or_other(params.get('display_help', False))
    self.notify_by_email = to_bool_or_other(
        params.get('notify_by_email', False))
    self.notify_in_site = to_bool_or_other(
        params.get('notify_in_site', False))
    self.save()
def main():
    """Program to remove person entries completely.

    Originally meant for removing accidental duplicates."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-d", "--deletions",
        help="""File containing the link_ids to delete, one per line""")
    parser.add_argument("-f", "--for-real", action='store_true',
                        help="""Without this flag, only do a dummy run.""")
    args = parser.parse_args()
    for_real = args.for_real
    config = configuration.get_config()
    database.database_init(config)
    with open(args.deletions) as deletions_file:
        for line in deletions_file:
            # BUG FIX: lines from the file carry a trailing newline, which
            # was previously passed straight to delete_by_link_id; strip it
            # and skip blank lines.
            del_link_id = line.strip()
            if not del_link_id:
                continue
            if for_real:
                print("Deleting", del_link_id)
                print("Result:", database.delete_by_link_id(del_link_id))
            else:
                print("Would delete", del_link_id)
def user_list_section(django_request,
                      include_non_members=False,
                      filter_fn=None,
                      filter_opaque=None):
    """Return the users list, if the viewing person is allowed to see it.

    Otherwise, just how many people there are.

    The optional first argument is a flag for whether to include
    non-members.

    The optional second argument is a boolean function taking a person
    object, returning whether to include them in the list.  This could be
    used for things like listing people whose fobs are ready for enabling,
    or who have missed paying their latest subscription.  A third argument
    is passed through to that function."""
    global serverconf
    if serverconf is None:
        serverconf = configuration.get_config()['server']
    viewing_user = model.person.Person.find(django_request.user.link_id)
    people = person.Person.list_all_people(
    ) if include_non_members else person.Person.list_all_members()
    if filter_fn:
        people = [
            someone for someone in people
            if filter_fn(someone, filter_opaque)
        ]
    people_dict = {whoever.name(): whoever for whoever in people}
    # Only auditors and admins see the full table; everyone else gets a
    # head-count.
    if viewing_user.is_auditor() or viewing_user.is_admin():
        return T.table[[
            T.tr[T.th(class_='mem_num')["Mem #"],
                 T.th(class_='username')["Name"],
                 T.th(class_='loginh')["Login"],
                 T.th(class_='flagsh')["Flags"],
                 T.th(class_='email')["Email"],
                 T.th(class_='user')["User"],
                 T.th(class_='owner')["Owner"],
                 T.th(class_='trainer')["Trainer"],
                 T.th(class_='note')["Notes"]]
        ], [
            T.tr[T.td(class_='mem_num')[str(who.membership_number)],
                 T.th(class_='username')[T.a(
                     href=django.urls.reverse('dashboard:user_dashboard',
                                              args=([who.link_id])))[whoname]],
                 T.td(class_='login')[who.get_login_name() or ""],
                 T.td(class_='flags')[flagstring(who)],
                 # BUG FIX: was `href="mailto:" + who.get_email() or ""`,
                 # which raises TypeError when get_email() returns None
                 # (the `+` binds tighter than `or`).
                 T.td(class_='email')[T.a(
                     href=("mailto:" + who.get_email())
                     if who.get_email() else "")[who.get_email() or ""]],
                 T.td(class_='user'
                      )[equipment_type_role_name_list(who, 'user')],
                 T.td(class_='owner'
                      )[equipment_type_role_name_list(who, 'owner')],
                 T.td(class_='trainer'
                      )[equipment_type_role_name_list(who, 'trainer')],
                 T.td(class_='note')[T.form()[who.get_admin_note() or ""]]]
            # sorted(items()) sorts by name, matching the old
            # sorted-keys-then-lookup dance.
            for (whoname, who) in sorted(people_dict.items())
        ]]
    else:
        return T.p["There are " + str(len(people)) +
                   (" people" if include_non_members else " members") +
                   " in the database."]
def _get_proxies(db):
    """ Get a dictionary of proxy information from the app configuration. """
    piscina = get_config(db, 'piscina_proxy_url', required=True)
    if piscina is None or not piscina.value.strip():
        raise ScrapeException('No Piscina server configured.')
    proxy_url = piscina.value
    # Same proxy endpoint handles both schemes.
    return {'http': proxy_url, 'https': proxy_url}
def _get_proxies(db):
    ''' Get a dictionary of proxy information from the app configuration. '''
    config_row = get_config(db, 'piscina_proxy_url', required=True)
    if config_row is None or config_row.value.strip() == '':
        raise ScrapeException('No Piscina server configured.')
    # Route both plain and TLS traffic through the same Piscina endpoint.
    return {scheme: config_row.value for scheme in ('http', 'https')}
def equipment_type_list_section(training_category):
    """Render the list of equipment types for a training category as a
    section: a heading linking to the category page, then for each type its
    machines and outstanding training requests."""
    global serverconf
    # BUG FIX: this was `global org_conf`, which never matched the
    # `orgconf` variable actually read and assigned below, so the read of
    # `orgconf` raised UnboundLocalError on first call. Also removed a
    # leftover debug print.
    global orgconf
    if serverconf is None:
        serverconf = configuration.get_config('server')
    if orgconf is None:
        orgconf = configuration.get_config('organization')
    eqtys = equipment_type.Equipment_type.list_equipment_types(
        training_category)
    return [T.h2[(T.a(href=orgconf['categories'] + training_category.upper())
                  [training_category.capitalize()] or "All")
                 + " equipment types"],
            [T.dl[[[T.dt[T.a(href=serverconf['types'] + eqty.name)
                         [eqty.pretty_name()]],
                    T.dd[T.dl[T.dt["Machines"],
                              [T.ul(class_="compactlist")
                               [[T.li[T.a(href=serverconf['machines'] + m.name)
                                      [m.name]]
                                 for m in eqty.get_machines()]]],
                              T.dt["Training requests"],
                              T.dd[
                                  # todo: no training requests are visible
                                  # (check whether they are even created)
                                  T.ul(class_="compactlist")
                                  [[T.li[r.name()]
                                    for r in eqty.get_training_requests('user')]]
                              ]]]]
              for eqty in eqtys]]]]
def help_for_topic(
        help_name,
        default_text="<p>Help text not available for topic %(topic)s</p>",
        substitutions=None):
    """Return the HTML help text for a topic, with %-substitutions applied.

    :param help_name: basename (without extension) of the help file under
        the configured ``help_texts`` directory.
    :param default_text: fallback HTML when no help file exists; may use a
        ``%(topic)s`` marker.
    :param substitutions: mapping applied to the help file's %-markers.
    """
    # BUG FIX: `substitutions={}` was a mutable default argument; use None
    # as the sentinel instead.
    if substitutions is None:
        substitutions = {}
    help_file = os.path.join(configuration.get_config('page', 'help_texts'),
                             help_name + ".html")
    if os.path.isfile(help_file):
        with open(help_file) as helpstream:
            return helpstream.read() % substitutions
    # this has to be a dictionary substitution because otherwise
    # default_text must contain a substitution marker:
    return default_text % {'topic': help_name}
def export0(args):
    """Run the exports selected on the parsed command line: either a full
    database backup tarball, or per-role CSV exports."""
    config = configuration.get_config()
    db_config = config['database']
    collection_names = db_config['collections']
    if args.verbose:
        print("collection names are", collection_names)
    database.database_init(config)
    if args.all:
        model.backup_to_csv.make_database_backup(tarballfilename=args.all)
        return
    # Each role is exported only when its output filename was given.
    for role, outfile in (('user', args.users),
                          ('owner', args.owners),
                          ('trainer', args.trainers)):
        if outfile:
            export_role(role, outfile)
def _splash_request(db_session, username, site, request_timeout):
    ''' Ask splash to render a page for us. '''
    target_url = site.get_url(username)
    splash_url = get_config(db_session, 'splash_url', required=True).value
    splash_response = requests.get(
        urljoin(splash_url, 'render.json'),
        headers={'User-Agent': USER_AGENT},
        params={
            'url': target_url,
            'html': 1,
            'jpeg': 1,
            'history': 1,
            'timeout': request_timeout,
            'resource_timeout': 5,
        }
    )
    result = {
        'code': splash_response.status_code,
        'error': None,
        'image': None,
        'site': site.as_dict(),
        'url': target_url,
    }
    # Parsed before raise_for_status(), so a non-JSON error body will raise
    # here rather than inside the try below.
    splash_data = splash_response.json()
    try:
        splash_response.raise_for_status()
        # 'f' = found, 'n' = not found; the screenshot is kept either way.
        if _check_splash_response(site, splash_response, splash_data):
            result['status'] = 'f'
        else:
            result['status'] = 'n'
        result['image'] = splash_data['jpeg']
    except Exception as e:
        result['status'] = 'e'
        result['error'] = str(e)
    return result
def invitation_response_form_page(rsvp_uuid):
    """From an invitation UUID that was mailed to someone, produce a
    response form."""
    global server_conf
    server_conf = configuration.get_config('server')
    page_pieces.set_server_conf()
    # Look up who this RSVP token was mailed to.
    person_responding = person.Person.find(database.find_rsvp(rsvp_uuid))
    # set up viewing as though the user has actually logged in
    access_permissions.Access_Permissions.setup_access_permissions(person_responding.link_id)
    # todo: also tell django that they are effectively logged in?
    # The invitations mapping goes from RSVP UUID to event id.
    event_responding = event.Event.find_by_id(person_responding.invitations[rsvp_uuid])
    form_act = django.urls.reverse("events:rsvp_form", args=[rsvp_uuid])
    # Build the form page: heading, event summary, then a three-way radio
    # choice (accept / decline / decline-and-drop-training) that posts back
    # to the rsvp_form view with the UUID in a hidden field.
    return T.div(class_="invitresp")[
        T.h1["RSVP for " + person_responding.name(access_permissions_event=event_responding)],
        T.p["This is a " + event_responding.event_type
            + " event starting at " + str(event_responding.start)
            + ". The event will be hosted by "
            # Hosts may contain None entries, which are skipped.
            + ". and ".join([obj.name(access_permissions_role='host')
                             for obj in event_responding.hosts
                             if obj is not None])
            + "."],
        T.form(action=form_act, method='POST')[
            T.input(type="hidden", name="rsvp_uuid", value=rsvp_uuid),
            T.table[T.tr[T.td[T.input(type='radio', name='rsvp', value='accept')],
                         T.td["Accept invitation"]],
                    T.tr[T.td[T.input(type='radio', name='rsvp', value='decline')],
                         T.td["Decline invitation"]],
                    T.tr[T.td[T.input(type='radio', name='rsvp', value='drop')],
                         T.td["Decline invitation and cancel training request"]]],
            T.input(type="submit", value="Send response")]]
def main():
    """Dump one database collection to a CSV file, one column per key seen
    in any row."""
    parser = argparse.ArgumentParser()
    parser.add_argument("-c", "--collection", default='profiles')
    parser.add_argument("-o", "--output", default=None)
    args = parser.parse_args()
    config = configuration.get_config()
    db_config = config['database']
    collection_names = db_config['collections']
    database.database_init(config)
    collection = args.collection
    output_name = args.output or (collection + ".csv")
    rows = database.get_collection_rows(collection)
    # Gather the union of keys, preserving first-seen order (a plain set
    # would lose the column ordering).
    keys = []
    seen = set()
    for row in rows:
        for key in row.keys():
            if key not in seen:
                seen.add(key)
                keys.append(key)
    print("keys are", keys)
    with open(output_name, 'w') as csvfile:
        writer = csv.DictWriter(csvfile, keys)
        writer.writeheader()
        writer.writerows(rows)
def get_slots_conf():
    """Get the timeslots information from the configuration file.

    It is cached once found, as it shouldn't change during a run.

    The results are a list of days of the week in order, a dictionary
    defining the named time periods in each day as a list of the start and
    end times, and a list of the named periods in order of their starting
    times."""
    global day_order, periods, period_order
    slotconf = configuration.get_config('timeslots')
    if day_order is None:
        day_order = slotconf['days']
    if periods is None:
        # Each period is configured as "HH:MM--HH:MM"; parse it into a
        # [start_time, end_time] pair of datetime.time objects.
        periods = {}
        for pname, pdescr in slotconf['periods'].items():
            times = []
            for slot in pdescr.split('--'):
                parts = [int(x) for x in slot.strip().split(':')]
                times.append(datetime.time(parts[0], parts[1]))
            periods[pname] = times
    if period_order is None:
        # Order the period names by their start times.
        by_start = {startend[0]: name for name, startend in periods.items()}
        period_order = [by_start[start] for start in sorted(by_start.keys())]
        if len(period_order) < 4:
            period_order.append('Other')
    return day_order, periods, period_order
def post(self):
    '''
    Process user payment.

    **Example Request**

    .. sourcecode:: json

        {
            "user_id": 1,
            "stripe_token": "tok_1A9VDuL25MRJTn0APWrFQrN6",
            "credits": 400
            "currency": "usd",
            "description": "200 credits for $20",
        }

    **Example Response**

    .. sourcecode:: json

        {
            "message": "200 credits added."
        }

    :<header Content-Type: application/json
    :<header X-Auth: the client's auth token
    :<json int user_id: the user ID
    :<json str stripe_token: the stripe payment token
    :<json int credits: the purchase credits

    :>header Content-Type: application/json
    :>json string message: API response message

    :status 200: ok
    :status 400: invalid request body
    :status 401: authentication required
    :status 403: not authorized to make the requested changes
    '''
    # Validate json input
    request_json = request.get_json()
    validate_request_json(request_json, _payment_attrs)
    user = g.db.query(User).filter(
        User.id == request_json['user_id']).first()
    if g.user.id != user.id:
        raise Forbidden('You may only purchase credits for '
                        'your own account.')
    # Configure stripe client
    try:
        stripe.api_key = get_config(session=g.db,
                                    key='stripe_secret_key',
                                    required=True).value
    except Exception as e:
        raise ServiceUnavailable(e)
    key = 'credit_cost'
    credit_cost = g.db.query(Configuration) \
        .filter(Configuration.key == key) \
        .first()
    if credit_cost is None:
        raise NotFound(
            'There is no configuration item named "{}".'.format(key))
    # Stripe token is created client-side using Stripe.js
    token = request_json['stripe_token']
    # Get payment parameters
    credits = int(request_json['credits'])
    description = request_json['description']
    currency = request_json['currency']
    costs = self._get_costs(credit_cost.value)
    # Calculate the charge amount for the requested credit package.
    try:
        amount = costs[credits]
    except (KeyError, IndexError):
        # BUG FIX: only IndexError was caught before, but a missing key in
        # a mapping raises KeyError, so invalid credit amounts produced a
        # 500 instead of a 400.
        raise BadRequest('Invalid credit amount.')
    try:
        # Charge the user's card:
        charge = stripe.Charge.create(amount=amount,
                                      currency=currency,
                                      description=description,
                                      source=token)
    except stripe.error.CardError as e:
        # Since it's a decline, stripe.error.CardError will be caught
        body = e.json_body
        err = body['error']
        raise BadRequest('Card error: {}'.format(err['message']))
    except stripe.error.RateLimitError as e:
        # Too many requests made to the API too quickly
        body = e.json_body
        err = body['error']
        raise BadRequest('Rate limit error: {}'.format(err['message']))
    except stripe.error.InvalidRequestError as e:
        # Invalid parameters were supplied to Stripe's API
        body = e.json_body
        err = body['error']
        raise BadRequest('Invalid parameters: {}'.format(err['message']))
    except stripe.error.AuthenticationError as e:
        # Authentication with Stripe's API failed
        # (maybe API keys changed recently)
        body = e.json_body
        err = body['error']
        raise ServiceUnavailable('Stripe authentication error: {}'.format(
            err['message']))
    except stripe.error.APIConnectionError as e:
        # Network communication with Stripe failed
        body = e.json_body
        err = body['error']
        raise ServiceUnavailable(
            'Stripe API communication failed: {}'.format(err['message']))
    except stripe.error.StripeError as e:
        # Generic error
        body = e.json_body
        err = body['error']
        raise ServiceUnavailable('Stripe error: {}'.format(err['message']))
    except Exception as e:
        # Something else happened, completely unrelated to Stripe
        raise ServiceUnavailable('Error: {}'.format(e))
    user.credits += credits
    g.db.commit()
    g.redis.publish('user', json.dumps(user.as_dict()))
    # BUG FIX: the message previously reported `amount` (the monetary
    # charge) instead of the number of credits, contradicting the
    # docstring example above.
    message = '{} credits added.'.format(credits)
    response = jsonify(message=message)
    response.status_code = 202
    return response
def scrape_twitter_posts(id_, recent):
    '''
    Fetch tweets for the user identified by id_.

    Checks tweets already stored in db, and will only fetch older or newer
    tweets depending on value of the boolean argument 'recent',
    e.g. recent=True will return recent tweets not already stored in the
    db. The number of tweets to fetch is configured in the Admin.
    '''
    db = worker.get_session()
    max_results = get_config(db, 'max_posts_twitter', required=True).value
    try:
        max_results = int(max_results)
    except (TypeError, ValueError):
        # Narrowed from a bare `except:`.
        raise ScrapeException('Value of max_posts_twitter must be an integer')
    worker.start_job(total=max_results)
    redis = worker.get_redis()
    author = db.query(Profile).filter(Profile.id == id_).first()
    proxies = _get_proxies(db)
    results = 0
    max_id = None
    more_results = True
    count = 200
    if author is None:
        raise ValueError('No profile exists with id={}'.format(id_))
    # Get posts currently stored in db for this profile.
    post_query = db.query(Post) \
        .filter(Post.author_id == id_) \
        .order_by(Post.upstream_created.desc())
    url = 'https://api.twitter.com/1.1/statuses/user_timeline.json'
    params = {'count': count, 'user_id': author.upstream_id}
    if post_query.count() > 0:
        # Only fetch posts newer than those already stored in db
        if recent:
            since_id = post_query[0].upstream_id
            params['since_id'] = str(since_id)
        # Only fetch posts older than those already stored in db
        else:
            max_id = post_query[post_query.count() - 1].upstream_id
            params['max_id'] = str(max_id)
    # BUG FIX: post_ids was re-initialised on every page of results, so
    # only the final page was passed to schedule_index_posts(). Collect
    # ids across all pages instead.
    post_ids = list()
    while more_results:
        response = requests.get(
            url,
            params=params,
            proxies=proxies,
            verify=False
        )
        response.raise_for_status()
        tweets = response.json()
        # A short (or empty) page means we have reached the end.
        if len(tweets) < count:
            more_results = False
        for tweet in tweets:
            # Twitter API result set includes the tweet with the
            # max_id/since_id so ignore it.
            if tweet['id_str'] == max_id:
                continue
            post = Post(
                author,
                tweet['id_str'],
                dateutil.parser.parse(tweet['created_at']),
                tweet['text']
            )
            if tweet['lang'] is not None:
                post.language = tweet['lang']
            if tweet['coordinates'] is not None:
                # BUG FIX: tweet['coordinates'] is a GeoJSON object; the
                # old code unpacked the dict itself (yielding its KEYS).
                # The actual point lives under its 'coordinates' key as
                # [longitude, latitude].
                post.longitude, post.latitude = \
                    tweet['coordinates']['coordinates']
            place = tweet['place']
            if place is not None:
                # Set longitude/latitude to the center of the bounding
                # polygon.
                total_lon = 0
                total_lat = 0
                num_coords = 0
                for lon, lat in place['bounding_box']['coordinates'][0]:
                    total_lon += lon
                    total_lat += lat
                    num_coords += 1
                post.longitude = total_lon / num_coords
                post.latitude = total_lat / num_coords
                # Set location to string identifying the place.
                post.location = '{}, {}'.format(
                    place['full_name'],
                    place['country']
                )
            db.add(post)
            db.flush()
            post_ids.append(post.id)
            # Set the max_id to the last tweet to get the next set of
            # results
            max_id = tweet['id_str']
            params['max_id'] = max_id
            results += 1
            worker.update_job(current=results)
            if results == max_results:
                more_results = False
                break
    db.commit()
    worker.finish_job()
    redis.publish('profile_posts', json.dumps({'id': id_}))
    app.queue.schedule_index_posts(post_ids)
def scrape_twitter_relations(id_):
    '''
    Fetch friends and followers for the Twitter user identified by `id_`.

    The number of friends and followers to fetch is configured in Admin.
    '''
    redis = worker.get_redis()
    db = worker.get_session()
    profile = db.query(Profile).filter(Profile.id == id_).first()
    proxies = _get_proxies(db)
    max_results = get_config(db, 'max_relations_twitter', required=True).value
    try:
        max_results = int(max_results)
    except (TypeError, ValueError):
        # Narrowed from a bare `except:`.
        raise ScrapeException(
            'Value of max_relations_twitter must be an integer'
        )
    friends_results = 0
    friends_ids = []
    followers_results = 0
    followers_ids = []
    friends_cursor = -1
    followers_cursor = -1
    if profile is None:
        raise ValueError('No profile exists with id={}'.format(id_))
    params = {
        'count': 5000,
        'user_id': profile.upstream_id,
        'stringify_ids': True,
    }
    # Get friends currently stored in db for this profile. Kept as sets:
    # membership is tested once per fetched id below.
    friends_query = \
        db.query(Profile.upstream_id) \
        .join(
            profile_join_self,
            (profile_join_self.c.friend_id == Profile.id)
        ) \
        .filter(profile_join_self.c.follower_id == id_)
    current_friends_ids = {friend.upstream_id for friend in friends_query}
    # Get followers currently stored in db for this profile.
    followers_query = \
        db.query(Profile.upstream_id) \
        .join(
            profile_join_self,
            (profile_join_self.c.follower_id == Profile.id)
        ) \
        .filter(profile_join_self.c.friend_id == id_)
    current_followers_ids = {follower.upstream_id
                             for follower in followers_query}
    # Get friend IDs, following Twitter's cursor-based pagination.
    friends_url = 'https://api.twitter.com/1.1/friends/ids.json'
    params['cursor'] = friends_cursor
    while friends_results < max_results:
        friends_response = requests.get(
            friends_url,
            params=params,
            proxies=proxies,
            verify=False
        )
        friends_response.raise_for_status()
        # Ignore friends already in the db
        for friend_id in friends_response.json()['ids']:
            if friend_id not in current_friends_ids:
                friends_ids.append(friend_id)
                friends_results += 1
                if friends_results == max_results:
                    break
        friends_cursor = friends_response.json()['next_cursor']
        if friends_cursor == 0:
            break  # No more results
        else:
            params['cursor'] = friends_cursor
    # Get follower IDs.
    followers_url = 'https://api.twitter.com/1.1/followers/ids.json'
    params['cursor'] = followers_cursor
    while followers_results < max_results:
        followers_response = requests.get(
            followers_url,
            params=params,
            proxies=proxies,
            verify=False
        )
        followers_response.raise_for_status()
        # Ignore followers already in the db
        for follower_id in followers_response.json()['ids']:
            if follower_id not in current_followers_ids:
                followers_ids.append(follower_id)
                followers_results += 1
                if followers_results == max_results:
                    break
        followers_cursor = followers_response.json()['next_cursor']
        if followers_cursor == 0:
            break  # No more results
        else:
            params['cursor'] = followers_cursor
    # Get username for each of the friend/follower IDs and create
    # a relationship in QuickPin. The lookup endpoint accepts at most 100
    # ids per call, so work in chunks.
    user_ids = [(uid, 'friend') for uid in friends_ids] + \
               [(uid, 'follower') for uid in followers_ids]
    worker.start_job(total=len(user_ids))
    chunk_size = 100
    for chunk_start in range(0, len(user_ids), chunk_size):
        chunk_end = chunk_start + chunk_size
        chunk = user_ids[chunk_start:chunk_end]
        # Renamed the comprehension variable: it previously shadowed the
        # `id_` parameter.
        chunk_lookup = {upstream_id: relation
                        for upstream_id, relation in chunk}
        lookup_url = 'https://api.twitter.com/1.1/users/lookup.json'
        lookup_response = requests.post(
            lookup_url,
            proxies=_get_proxies(db),
            verify=False,
            data={'user_id': ','.join(chunk_lookup.keys())}
        )
        lookup_response.raise_for_status()
        relations = lookup_response.json()
        for related_dict in relations:
            uid = related_dict['id_str']
            username = related_dict['screen_name']
            related_profile = Profile('twitter', uid, username, is_stub=True)
            db.add(related_profile)
            try:
                db.commit()
            except IntegrityError:
                # Already exists: use the existing profile.
                db.rollback()
                related_profile = db \
                    .query(Profile) \
                    .filter(Profile.site == 'twitter') \
                    .filter(Profile.upstream_id == uid) \
                    .one()
            _twitter_populate_profile(related_dict, related_profile)
            relation = chunk_lookup[uid]
            if relation == 'friend':
                profile.friends.append(related_profile)
            else:  # relation == 'follower':
                profile.followers.append(related_profile)
            db.commit()
        worker.update_job(current=chunk_end)
    db.commit()
    worker.finish_job()
    redis.publish('profile_relations', json.dumps({'id': id_}))
def scrape_instagram_posts(id_, recent):
    """
    Fetch instagram posts for the user identified by id_.

    Checks posts already stored in db, and will only fetch older or newer
    posts depending on value of the boolean argument 'recent',
    e.g. recent=True will return recent posts not already stored in the db.
    The number of posts to fetch is configured in the Admin.
    """
    redis = worker.get_redis()
    db = worker.get_session()
    author = db.query(Profile).filter(Profile.id == id_).first()
    proxies = _get_proxies(db)
    max_results = get_config(db, 'max_posts_instagram', required=True).value
    try:
        max_results = int(max_results)
    except (TypeError, ValueError):
        # Narrowed from a bare `except:`.
        raise ScrapeException('Value of max_posts_instagram must be an integer')
    min_id = None
    results = 0
    params = {}
    if author is None:
        raise ValueError('No profile exists with id={}'.format(id_))
    url = 'https://api.instagram.com/v1/users/{}/media/recent' \
        .format(author.upstream_id)
    # Get last post currently stored in db for this profile.
    post_query = db.query(Post) \
        .filter(Post.author_id == id_) \
        .order_by(Post.upstream_created.desc())
    if post_query.count() > 0:
        # Only fetch posts newer than those already stored in db
        if recent:
            min_id = post_query[0].upstream_id
            params['min_id'] = str(min_id)
        # Only fetch posts older than those already stored in db
        else:
            max_id = post_query[post_query.count() - 1].upstream_id
            params['max_id'] = str(max_id)
    worker.start_job(total=max_results)
    # BUG FIX: post_ids was re-initialised on every page of results, so
    # only the final page was passed to schedule_index_posts(). Collect
    # ids across all pages instead.
    post_ids = list()
    while results < max_results:
        response = requests.get(
            url,
            params=params,
            proxies=proxies,
            verify=False
        )
        response.raise_for_status()
        response_json = response.json()['data']
        pagination = response.json()['pagination']
        # Instagram API result includes post with min_id so remove it
        response_json[:] = [d for d in response_json
                            if d.get('id') != min_id]
        for gram in response_json:
            if gram['caption'] is not None:
                text = gram['caption']['text']
            else:
                text = None
            post = Post(
                author,
                gram['id'],
                datetime.fromtimestamp(int(gram['created_time'])),
                text
            )
            if gram['location'] is not None:
                location = gram['location']
                if 'latitude' in location:
                    post.latitude = location['latitude']
                    post.longitude = location['longitude']
                if 'name' in location:
                    post.location = location['name']
                    # street_address only appended when a name set
                    # post.location first (otherwise += would fail on an
                    # unset location).
                    if 'street_address' in location:
                        post.location += ' ' + location['street_address']
            if 'images' in gram:
                image_url = gram['images']['standard_resolution']['url']
                name = os.path.basename(urlparse(image_url).path)
                img_response = requests.get(image_url, verify=False)
                mime = img_response.headers['Content-type']
                image = img_response.content
                post.attachments.append(File(name, mime, image))
            db.add(post)
            db.flush()
            post_ids.append(post.id)
            worker.update_job(current=results)
            results += 1
            if results == max_results:
                break
        # If there are more results, set the max_id param, otherwise finish
        if 'next_max_id' in pagination:
            params['max_id'] = pagination['next_max_id']
        else:
            break
    db.commit()
    worker.finish_job()
    redis.publish('profile_posts', json.dumps({'id': id_}))
    app.queue.schedule_index_posts(post_ids)
def scrape_twitter_relations(id_):
    """
    Fetch friends and followers for the Twitter user identified by `id_`.

    The number of friends and followers to fetch is configured in Admin
    ('max_relations_twitter').

    :raises ScrapeException: if max_relations_twitter is not an integer.
    :raises ValueError: if no profile exists with the given id.
    """
    redis = worker.get_redis()
    db = worker.get_session()
    profile = db.query(Profile).filter(Profile.id == id_).first()
    proxies = _get_proxies(db)
    max_results = get_config(db, 'max_relations_twitter', required=True).value

    try:
        max_results = int(max_results)
    except (TypeError, ValueError):
        raise ScrapeException(
            'Value of max_relations_twitter must be an integer'
        )

    friends_results = 0
    friends_ids = []
    followers_results = 0
    followers_ids = []
    friends_cursor = -1
    followers_cursor = -1

    if profile is None:
        raise ValueError('No profile exists with id={}'.format(id_))

    params = {
        'count': 5000,
        'user_id': profile.upstream_id,
        'stringify_ids': True,
    }

    # Get friends currently stored in db for this profile. Use a set:
    # membership is tested for every id in every 5000-id API page, and a
    # list made this accidentally O(n^2).
    friends_query = \
        db.query(Profile.upstream_id) \
        .join(
            profile_join_self,
            (profile_join_self.c.friend_id == Profile.id)
        ) \
        .filter(profile_join_self.c.follower_id == id_)
    current_friends_ids = {friend.upstream_id for friend in friends_query}

    # Get followers currently stored in db for this profile.
    followers_query = \
        db.query(Profile.upstream_id) \
        .join(
            profile_join_self,
            (profile_join_self.c.follower_id == Profile.id)
        ) \
        .filter(profile_join_self.c.friend_id == id_)
    current_followers_ids = {follower.upstream_id
                             for follower in followers_query}

    # Get friend IDs.
    friends_url = 'https://api.twitter.com/1.1/friends/ids.json'
    params['cursor'] = friends_cursor

    while friends_results < max_results:
        friends_response = requests.get(
            friends_url,
            params=params,
            proxies=proxies,
            verify=False,
            headers=TWITTER_HEADERS
        )
        friends_response.raise_for_status()

        # Ignore friends already in the db.
        for friend_id in friends_response.json()['ids']:
            if friend_id not in current_friends_ids:
                friends_ids.append(friend_id)
                friends_results += 1
                if friends_results == max_results:
                    break

        friends_cursor = friends_response.json()['next_cursor']
        if friends_cursor == 0:
            break  # No more results
        else:
            params['cursor'] = friends_cursor

    # Get follower IDs.
    followers_url = 'https://api.twitter.com/1.1/followers/ids.json'
    params['cursor'] = followers_cursor

    while followers_results < max_results:
        followers_response = requests.get(
            followers_url,
            params=params,
            proxies=proxies,
            verify=False,
            headers=TWITTER_HEADERS,
        )
        followers_response.raise_for_status()

        # Ignore followers already in the db.
        for follower_id in followers_response.json()['ids']:
            if follower_id not in current_followers_ids:
                followers_ids.append(follower_id)
                followers_results += 1
                if followers_results == max_results:
                    break

        followers_cursor = followers_response.json()['next_cursor']
        if followers_cursor == 0:
            break  # No more results
        else:
            params['cursor'] = followers_cursor

    # Get username for each of the friend/follower IDs and create a
    # relationship in QuickPin.
    user_ids = [(uid, 'friend') for uid in friends_ids] + \
               [(uid, 'follower') for uid in followers_ids]

    worker.start_job(total=len(user_ids))
    chunk_size = 100

    for chunk_start in range(0, len(user_ids), chunk_size):
        chunk_end = chunk_start + chunk_size
        chunk = user_ids[chunk_start:chunk_end]
        # uid -> 'friend' | 'follower' for this chunk. (Renamed from `id_`
        # to avoid confusion with the function parameter.)
        chunk_lookup = {uid: relation for uid, relation in chunk}

        lookup_url = 'https://api.twitter.com/1.1/users/lookup.json'
        lookup_response = requests.post(
            lookup_url,
            proxies=_get_proxies(db),
            verify=False,
            headers=TWITTER_HEADERS,
            data={'user_id': ','.join(chunk_lookup.keys())}
        )
        lookup_response.raise_for_status()
        relations = lookup_response.json()

        for related_dict in relations:
            uid = related_dict['id_str']
            username = related_dict['screen_name']
            related_profile = Profile('twitter', uid, username, is_stub=True)
            db.add(related_profile)

            try:
                db.commit()
            except IntegrityError:
                # Already exists: use the existing profile.
                db.rollback()
                related_profile = db \
                    .query(Profile) \
                    .filter(Profile.site == 'twitter') \
                    .filter(Profile.upstream_id == uid) \
                    .one()

            _twitter_populate_profile(related_dict, related_profile)
            relation = chunk_lookup[uid]

            if relation == 'friend':
                profile.friends.append(related_profile)
            else:  # relation == 'follower'
                profile.followers.append(related_profile)

            db.commit()

        worker.update_job(current=chunk_end)

    db.commit()
    worker.finish_job()
    redis.publish('profile_relations', json.dumps({'id': id_}))
def page_string(page_title, content, user=None, initial_tab=None,
                needs_jquery=False):
    """Make up a complete page as a string.

    :param page_title: used for both the <title> and the page heading.
    :param content: pre-built body content (untemplate/T nodes).
    :param user: optional user; if they have a stylesheet preference and
        the matching .css file exists, it overrides the configured one.
    :param initial_tab: if given, a trailing script selects this tab.
    :param needs_jquery: include the jQuery CDN script tag.
    """
    conf = configuration.get_config()
    page_conf = conf['page']
    org_conf = conf['organization']
    preamble = page_conf.get('preamble', '')

    script_file = page_conf['script_file']
    script_body = ""
    if os.path.exists(script_file):
        with open(script_file) as mfile:
            script_body = mfile.read()
    script_text = ("""<script type="text/javascript">"""
                   + script_body
                   + """</script>\n""")
    if needs_jquery:
        script_text += """<script src="https://ajax.googleapis.com/ajax/libs/jquery/3.3.1/jquery.min.js"></script>\n"""

    motd = ""
    motd_file = page_conf['motd_file']
    if os.path.exists(motd_file):
        with open(motd_file) as mfile:
            motd = mfile.read()

    stylesheet_name = page_conf['stylesheet']
    if user and user.stylesheet:
        user_stylesheet_name = os.path.join(
            os.path.dirname(stylesheet_name),
            user.stylesheet + ".css")
        if os.path.exists(user_stylesheet_name):
            stylesheet_name = user_stylesheet_name

    # Default to no style: previously style_text was only assigned inside
    # the os.path.exists branch, raising UnboundLocalError when the
    # stylesheet file was missing.
    style_text = ''
    if os.path.exists(stylesheet_name):
        if page_conf['style_inline']:
            with open(stylesheet_name) as sf:
                style_text = ('<style type="text/css">'
                              + sf.read()
                              + '</style>')
        else:
            style_text = ('<link rel="stylesheet" type="text/css" href="'
                          + stylesheet_name + '">')

    # todo: put the motd into the preamble
    postamble = page_conf.get('postamble', '')
    final_setup = ("""<script type="text/javascript">selectTab('"""
                   + initial_tab
                   + """')</script>""") if initial_tab else ""

    page_heading = page_title
    logo = page_conf.get('heading_logo', None)
    if logo:
        logo_height = int(page_conf.get('logo_height', "32"))
        page_heading = T.span[
            page_heading,
            T.a(href=org_conf['home_page'])[
                T.img(align="right",
                      alt=org_conf['title'],
                      height=logo_height,
                      src=logo)]]

    footer = T.footer[
        T.hr,
        T.p(class_="the_small_print")
        ["Produced by the ",
         T.a(href="https://github.com/hillwithsmallfields/makers/")["makers"],
         " system. ",
         "We use ",
         T.a(href="https://www.djangoproject.com/")["django"],
         " to handle login and sessions, and that uses a ",
         T.a(href="https://docs.djangoproject.com/en/2.1/topics/http/sessions/#using-cookie-based-sessions")["session cookie"],
         " and a ",
         T.a(href="https://docs.djangoproject.com/en/2.1/ref/csrf/")["CSRF protection cookie"],
         ". ",
         "We don't use any other cookies that we are aware of, and we neither sell your data nor give it away."]]

    return RawHtmlPage(
        page_title,
        untemplate.HTML5Doc([
            untemplate.safe_unicode(style_text + script_text + preamble),
            T.body[T.h1[page_heading],
                   content,
                   footer],
            untemplate.safe_unicode(postamble),
            untemplate.safe_unicode(final_setup)
        ], head=T.head[T.title[page_title]])).to_string()
def scrape_instagram_relations(id_):
    """
    Fetch friends and followers for the Instagram user identified by `id_`.

    The number of friends and followers to fetch is configured in Admin
    ('max_relations_instagram').

    :raises ScrapeException: if max_relations_instagram is not an integer.
    :raises ValueError: if no profile exists with the given id.
    """
    redis = worker.get_redis()
    db = worker.get_session()
    profile = db.query(Profile).filter(Profile.id == id_).first()
    proxies = _get_proxies(db)
    friends_results = 0
    followers_results = 0
    max_results = get_config(db, 'max_relations_instagram',
                             required=True).value

    try:
        max_results = int(max_results)
    except (TypeError, ValueError):
        raise ScrapeException(
            'Value of max_relations_instagram must be an integer'
        )

    friends_params = {}
    followers_params = {}
    total_results = max_results * 2

    if profile is None:
        raise ValueError('No profile exists with id={}'.format(id_))

    # Get friends currently stored in db for this profile. Use sets for
    # O(1) membership tests while paging through API results.
    friends_query = \
        db.query(Profile.upstream_id) \
        .join(
            profile_join_self,
            (profile_join_self.c.friend_id == Profile.id)
        ) \
        .filter(profile_join_self.c.follower_id == id_)
    current_friends_ids = {friend.upstream_id for friend in friends_query}

    # Get followers currently stored in db for this profile.
    followers_query = \
        db.query(Profile.upstream_id) \
        .join(
            profile_join_self,
            (profile_join_self.c.follower_id == Profile.id)
        ) \
        .filter(profile_join_self.c.friend_id == id_)
    current_followers_ids = {follower.upstream_id
                             for follower in followers_query}

    worker.start_job(total=total_results)

    # Get friend IDs.
    friends_url = 'https://api.instagram.com/v1/users/{}/follows' \
                  .format(profile.upstream_id)

    while friends_results < max_results:
        # Get friends from Instagram API.
        friends_response = requests.get(
            friends_url,
            params=friends_params,
            proxies=proxies,
            verify=False
        )
        friends_response.raise_for_status()
        pagination = friends_response.json()['pagination']

        for friend in friends_response.json()['data']:
            # Only store friends that are not already in db.
            if friend['id'] not in current_friends_ids:
                related_profile = Profile(
                    'instagram',
                    friend['id'],
                    friend['username'],
                    is_stub=True
                )
                db.add(related_profile)
                try:
                    db.commit()
                except IntegrityError:
                    # Already exists: use the existing profile.
                    db.rollback()
                    related_profile = db \
                        .query(Profile) \
                        .filter(Profile.site == 'instagram') \
                        .filter(Profile.upstream_id == friend['id']) \
                        .one()
                related_profile.name = friend['full_name']
                profile.friends.append(related_profile)
                friends_results += 1
                worker.update_job(current=friends_results)
                if friends_results == max_results:
                    break

        # If there are more results, set the cursor parameter,
        # otherwise finish.
        if 'next_cursor' in pagination:
            friends_params['cursor'] = pagination['next_cursor']
        else:
            break  # No more results

    # Get follower IDs.
    followers_url = 'https://api.instagram.com/v1/users/{}/followed-by' \
                    .format(profile.upstream_id)

    while followers_results < max_results:
        # Get followers from Instagram API.
        followers_response = requests.get(
            followers_url,
            params=followers_params,
            proxies=proxies,
            verify=False
        )
        followers_response.raise_for_status()
        pagination = followers_response.json()['pagination']

        for follower in followers_response.json()['data']:
            # Only store followers that are not already in db.
            if follower['id'] not in current_followers_ids:
                related_profile = Profile(
                    'instagram',
                    follower['id'],
                    follower['username'],
                    is_stub=True
                )
                db.add(related_profile)
                try:
                    db.commit()
                except IntegrityError:
                    # Already exists: use the existing profile.
                    db.rollback()
                    related_profile = db \
                        .query(Profile) \
                        .filter(Profile.site == 'instagram') \
                        .filter(Profile.upstream_id == follower['id']) \
                        .one()
                related_profile.name = follower['full_name']
                profile.followers.append(related_profile)
                followers_results += 1
                worker.update_job(
                    current=friends_results + followers_results)
                if followers_results == max_results:
                    break

        if 'next_cursor' in pagination:
            followers_params['cursor'] = pagination['next_cursor']
        else:
            break  # No more results

    # Commit relations appended after the last per-profile commit (the
    # twitter sibling does this; previously those appends could be lost).
    db.commit()
    worker.finish_job()
    redis.publish('profile_relations', json.dumps({'id': id_}))
def set_access_permissions_as_admin(access_permissions):
    """Grant the 'owner' role on the organization's configured database."""
    database_name = configuration.get_config()['organization']['database']
    access_permissions.add_role('owner', database_name)
def main():
    """Build a demo database: import spreadsheet data (or reuse an existing
    db), simulate random user activity, create some current and future
    events, and render member/equipment pages."""
    parser = argparse.ArgumentParser()
    parser.add_argument("-y", "--equipment-types",
                        default="equipment-types.csv")
    parser.add_argument("-e", "--equipment", default="equipment.csv")
    parser.add_argument("-m", "--members", default="members.csv")
    parser.add_argument("-u", "--users", default="users.csv")
    parser.add_argument("-o", "--owners", default="owners.csv")
    parser.add_argument("-t", "--trainers", default="trainers.csv")
    parser.add_argument("-b", "--templates", default="event_templates")
    parser.add_argument("--delete-existing", action='store_true')
    parser.add_argument("-v", "--verbose", action='store_true')
    parser.add_argument("-q", "--quick", action='store_true')
    parser.add_argument("-x", "--existing", "--no-import",
                        action='store_true')
    args = parser.parse_args()
    start_time = time.time()
    config = configuration.get_config()
    access_permissions.Access_Permissions.change_access_permissions(
        set_access_permissions_as_admin)
    days, slots, order = timeslots.get_slots_conf()
    print("periods are", slots, "in order", order)
    if not args.existing:
        print("importing from spreadsheet files")
        importer.import0(args)
    else:
        database.database_init(config, args.delete_existing)
    stage_time = time.time()
    print("import complete, running random user behaviour at",
          int(stage_time - start_time), "seconds")
    all_types = equipment_type.Equipment_type.list_equipment_types()
    green_equipment = equipment_type.Equipment_type.list_equipment_types(
        'green')
    green_templates = [make_training_event_template(eqty)
                       for eqty in green_equipment]
    print("green templates are", green_templates)
    if not args.existing:
        random_user_activities(all_types, green_templates, args.verbose)
    this_time = time.time()
    print("Completed main random behaviour in",
          int(this_time - stage_time), "seconds")
    stage_time = this_time
    # Make sure there are some events going on right now.
    # todo: find why it's failing to create these events, then fix it,
    # then see whether the "current event" code is working
    everybody = person.Person.list_all_people()
    n_current = random.randrange(3, 7)
    print("Creating", n_current, "current events")
    # range(n_current), not range(1, n_current): the latter made one event
    # fewer than announced.
    for _ in range(n_current):
        # Start the event 0-1 hours ago; subtracting a timedelta (rather
        # than decrementing the hour field) avoids a ValueError at midnight.
        event_datetime = (datetime.now()
                          - timedelta(hours=random.randrange(0, 2))) \
            .replace(minute=0, second=0, microsecond=0)
        print("Making current event starting at", event_datetime)
        setup_random_event(green_templates,
                           event_datetime,
                           [random.choice(green_equipment)._id],
                           [random.choice(everybody)._id],
                           verbose=True)
    print("There are now",
          len(timeline.Timeline.present_events().events),
          "present events")
    # Make sure there are some future events.
    # todo: find why it's failing to create these events, then fix it,
    # then see whether the "future event" code is working
    n_future = random.randrange(24, 48)
    print("Creating", n_future, "future events")
    for _ in range(n_future):
        # 19:00 on a random day 1-20 days from now.
        event_datetime = datetime.now().replace(
            hour=19, minute=0, second=0,
            microsecond=0) + timedelta(random.randrange(1, 21))
        print("Making future event starting at", event_datetime)
        setup_random_event(green_templates,
                           event_datetime,
                           [random.choice(green_equipment)._id],
                           [random.choice(everybody)._id],
                           verbose=True)
    print("There are now",
          len(timeline.Timeline.future_events().events),
          "future events and",
          len(timeline.Timeline.past_events().events),
          "past events")
    print("present events are", timeline.Timeline.present_events().events)
    print("future events are", timeline.Timeline.future_events().events)
    if not args.quick:
        print("listing members")
        all_members = person.Person.list_all_members()
        for whoever in all_members:
            show_person("member-pages", whoever)
        show_equipment_types()
def scrape_instagram_relations(id_):
    '''
    Fetch friends and followers for the Instagram user identified by `id_`.

    The number of friends and followers to fetch is configured in Admin
    ('max_relations_instagram').

    :raises ScrapeException: if max_relations_instagram is not an integer.
    :raises ValueError: if no profile exists with the given id.
    '''
    redis = worker.get_redis()
    db = worker.get_session()
    profile = db.query(Profile).filter(Profile.id == id_).first()
    proxies = _get_proxies(db)
    friends_results = 0
    followers_results = 0
    max_results = get_config(db, 'max_relations_instagram',
                             required=True).value

    try:
        max_results = int(max_results)
    except (TypeError, ValueError):
        raise ScrapeException(
            'Value of max_relations_instagram must be an integer'
        )

    friends_params = {}
    followers_params = {}
    total_results = max_results * 2

    if profile is None:
        raise ValueError('No profile exists with id={}'.format(id_))

    # Get friends currently stored in db for this profile. Use sets for
    # O(1) membership tests while paging through API results.
    friends_query = \
        db.query(Profile.upstream_id) \
        .join(
            profile_join_self,
            (profile_join_self.c.friend_id == Profile.id)
        ) \
        .filter(profile_join_self.c.follower_id == id_)
    current_friends_ids = {friend.upstream_id for friend in friends_query}

    # Get followers currently stored in db for this profile.
    followers_query = \
        db.query(Profile.upstream_id) \
        .join(
            profile_join_self,
            (profile_join_self.c.follower_id == Profile.id)
        ) \
        .filter(profile_join_self.c.friend_id == id_)
    current_followers_ids = {follower.upstream_id
                             for follower in followers_query}

    worker.start_job(total=total_results)

    # Get friend IDs.
    friends_url = 'https://api.instagram.com/v1/users/{}/follows' \
                  .format(profile.upstream_id)

    while friends_results < max_results:
        # Get friends from Instagram API.
        friends_response = requests.get(
            friends_url,
            params=friends_params,
            proxies=proxies,
            verify=False
        )
        friends_response.raise_for_status()
        pagination = friends_response.json()['pagination']

        for friend in friends_response.json()['data']:
            # Only store friends that are not already in db.
            if friend['id'] not in current_friends_ids:
                related_profile = Profile(
                    'instagram',
                    friend['id'],
                    friend['username'],
                    is_stub=True
                )
                db.add(related_profile)
                try:
                    db.commit()
                except IntegrityError:
                    # Already exists: use the existing profile.
                    db.rollback()
                    related_profile = db \
                        .query(Profile) \
                        .filter(Profile.site == 'instagram') \
                        .filter(Profile.upstream_id == friend['id']) \
                        .one()
                related_profile.name = friend['full_name']
                profile.friends.append(related_profile)
                friends_results += 1
                worker.update_job(current=friends_results)
                if friends_results == max_results:
                    break

        # If there are more results, set the cursor parameter,
        # otherwise finish.
        if 'next_cursor' in pagination:
            friends_params['cursor'] = pagination['next_cursor']
        else:
            break  # No more results

    # Get follower IDs.
    followers_url = 'https://api.instagram.com/v1/users/{}/followed-by' \
                    .format(profile.upstream_id)

    while followers_results < max_results:
        # Get followers from Instagram API.
        followers_response = requests.get(
            followers_url,
            params=followers_params,
            proxies=proxies,
            verify=False
        )
        followers_response.raise_for_status()
        pagination = followers_response.json()['pagination']

        for follower in followers_response.json()['data']:
            # Only store followers that are not already in db.
            if follower['id'] not in current_followers_ids:
                related_profile = Profile(
                    'instagram',
                    follower['id'],
                    follower['username'],
                    is_stub=True
                )
                db.add(related_profile)
                try:
                    db.commit()
                except IntegrityError:
                    # Already exists: use the existing profile.
                    db.rollback()
                    related_profile = db \
                        .query(Profile) \
                        .filter(Profile.site == 'instagram') \
                        .filter(Profile.upstream_id == follower['id']) \
                        .one()
                related_profile.name = follower['full_name']
                profile.followers.append(related_profile)
                followers_results += 1
                worker.update_job(
                    current=friends_results + followers_results)
                if followers_results == max_results:
                    break

        if 'next_cursor' in pagination:
            followers_params['cursor'] = pagination['next_cursor']
        else:
            break  # No more results

    # Commit relations appended after the last per-profile commit (the
    # twitter sibling does this; previously those appends could be lost).
    db.commit()
    worker.finish_job()
    redis.publish('profile_relations', json.dumps({'id': id_}))
def scrape_twitter_posts(id_, recent):
    """
    Fetch tweets for the profile identified by `id_`.

    Checks tweets already stored in the db and only fetches tweets newer
    (recent=True) or older (recent=False) than those. The number of tweets
    to fetch is configured in the Admin ('max_posts_twitter').

    :raises ScrapeException: if max_posts_twitter is not an integer.
    :raises ValueError: if no profile exists with the given id.
    """
    db = worker.get_session()
    max_results = get_config(db, 'max_posts_twitter', required=True).value

    try:
        max_results = int(max_results)
    except (TypeError, ValueError):
        raise ScrapeException('Value of max_posts_twitter must be an integer')

    worker.start_job(total=max_results)
    redis = worker.get_redis()
    author = db.query(Profile).filter(Profile.id == id_).first()
    proxies = _get_proxies(db)
    results = 0
    max_id = None
    more_results = True
    count = 200
    # Collected across ALL pages; previously re-initialized inside the
    # pagination loop, so only the last page's posts were indexed.
    post_ids = []

    if author is None:
        raise ValueError('No profile exists with id={}'.format(id_))

    # Get posts currently stored in db for this profile, newest first.
    post_query = db.query(Post) \
        .filter(Post.author_id == id_) \
        .order_by(Post.upstream_created.desc())

    url = 'https://api.twitter.com/1.1/statuses/user_timeline.json'
    params = {'count': count, 'user_id': author.upstream_id}

    if post_query.count() > 0:
        if recent:
            # Only fetch posts newer than those already stored in db.
            since_id = post_query[0].upstream_id
            params['since_id'] = str(since_id)
        else:
            # Only fetch posts older than those already stored in db.
            max_id = post_query[post_query.count() - 1].upstream_id
            params['max_id'] = str(max_id)

    while more_results:
        response = requests.get(
            url,
            params=params,
            proxies=proxies,
            verify=False,
            headers=TWITTER_HEADERS,
        )
        response.raise_for_status()
        tweets = response.json()

        if len(tweets) < count:
            # A short (or empty) page means this is the last one.
            more_results = False

        for tweet in tweets:
            # Twitter API result set includes the tweet with the
            # max_id/since_id, so ignore it.
            if tweet['id_str'] != max_id:
                post = Post(
                    author,
                    tweet['id_str'],
                    dateutil.parser.parse(tweet['created_at']),
                    tweet['text']
                )

                if tweet['lang'] is not None:
                    post.language = tweet['lang']

                if tweet['coordinates'] is not None:
                    # `coordinates` is a GeoJSON point whose 'coordinates'
                    # list is [longitude, latitude]. Unpacking the dict
                    # directly (as before) yielded its keys, not values.
                    post.longitude, post.latitude = \
                        tweet['coordinates']['coordinates']

                place = tweet['place']
                if place is not None:
                    # Set longitude/latitude to the center of the
                    # bounding polygon.
                    total_lon = 0
                    total_lat = 0
                    num_coords = 0
                    for lon, lat in place['bounding_box']['coordinates'][0]:
                        total_lon += lon
                        total_lat += lat
                        num_coords += 1
                    post.longitude = total_lon / num_coords
                    post.latitude = total_lat / num_coords

                    # Set location to a string identifying the place.
                    post.location = '{}, {}'.format(
                        place['full_name'],
                        place['country']
                    )

                db.add(post)
                db.flush()
                post_ids.append(post.id)

            # Set the max_id to the last tweet to get the next set of
            # results.
            max_id = tweet['id_str']
            params['max_id'] = max_id
            results += 1
            worker.update_job(current=results)

            if results == max_results:
                more_results = False
                break

    db.commit()
    worker.finish_job()
    redis.publish('profile_posts', json.dumps({'id': id_}))
    app.queue.schedule_index_posts(post_ids)
# Model and page modules used by the demo-setup script.
import model.database
import model.equipment_type
import model.event
import model.machine
import model.pages
import model.person
import model.timeline
import model.timeslots
import pages.equipment_type_list_page
import pages.equipment_type_page
import pages.person_page
import utils.importer

# Global configuration, loaded once at import time.
genconf = configuration.get_config()

# Skill areas configured for the organization.
interest_areas = genconf['skill_areas']

# Timeslot bitmasks: the third slot (presumably evening — TODO confirm
# against timeslots config) on all 7 days; all three slots on the two
# weekend days; and their union.
evening_timeslots = timeslots.timeslots_to_int([[False, False, True]] * 7)
weekend_timeslots = timeslots.timeslots_to_int(
    [[False, False, False]] * 5 + [[True, True, True]] * 2)
evening_and_weekend_timeslots = evening_timeslots | weekend_timeslots
print("evening_timeslots:", timeslots.timeslots_from_int(evening_timeslots))
print("weekend_timeslots:", timeslots.timeslots_from_int(weekend_timeslots))
print("evening_and_weekend_timeslots:",
      timeslots.timeslots_from_int(evening_and_weekend_timeslots))

# NOTE(review): definition continues beyond this chunk; body unchanged.
def set_access_permissions_as_admin(access_permissions):
    access_permissions.add_role(
def scrape_instagram_posts(id_, recent):
    '''
    Fetch Instagram posts for the profile identified by id_.

    Checks posts already stored in the db and only fetches posts newer
    (recent=True) or older (recent=False) than those. The number of posts
    to fetch is configured in the Admin ('max_posts_instagram').

    :raises ScrapeException: if max_posts_instagram is not an integer.
    :raises ValueError: if no profile exists with the given id.
    '''
    redis = worker.get_redis()
    db = worker.get_session()
    author = db.query(Profile).filter(Profile.id == id_).first()
    proxies = _get_proxies(db)
    max_results = get_config(db, 'max_posts_instagram', required=True).value

    try:
        max_results = int(max_results)
    except (TypeError, ValueError):
        raise ScrapeException('Value of max_posts_instagram must be an integer')

    min_id = None
    results = 0
    params = {}
    # Collected across ALL pages; previously re-initialized inside the
    # pagination loop, so only the last page's posts were indexed.
    # (Also dropped the unused `more_results` local.)
    post_ids = []

    if author is None:
        raise ValueError('No profile exists with id={}'.format(id_))

    url = 'https://api.instagram.com/v1/users/{}/media/recent' \
          .format(author.upstream_id)

    # Get posts currently stored in db for this profile, newest first.
    post_query = db.query(Post) \
        .filter(Post.author_id == id_) \
        .order_by(Post.upstream_created.desc())

    if post_query.count() > 0:
        if recent:
            # Only fetch posts newer than those already stored in db.
            min_id = post_query[0].upstream_id
            params['min_id'] = str(min_id)
        else:
            # Only fetch posts older than those already stored in db.
            max_id = post_query[post_query.count() - 1].upstream_id
            params['max_id'] = str(max_id)

    worker.start_job(total=max_results)
    logging.warning('WORKER max results: {}'.format(max_results))

    while results < max_results:
        response = requests.get(
            url,
            params=params,
            proxies=proxies,
            verify=False
        )
        response.raise_for_status()
        response_json = response.json()['data']
        pagination = response.json()['pagination']

        # Instagram API result includes the post with min_id, so remove it.
        response_json[:] = [d for d in response_json if d.get('id') != min_id]

        for gram in response_json:
            if gram['caption'] is not None:
                text = gram['caption']['text']
            else:
                text = None

            post = Post(
                author,
                gram['id'],
                datetime.fromtimestamp(int(gram['created_time'])),
                text
            )

            if gram['location'] is not None:
                if 'latitude' in gram['location']:
                    post.latitude = gram['location']['latitude']
                    post.longitude = gram['location']['longitude']
                if 'name' in gram['location']:
                    post.location = gram['location']['name']
                    # Only append the street address once a base location
                    # string exists, otherwise `+=` would hit an unset value.
                    if 'street_address' in gram['location']:
                        post.location += ' ' + \
                            gram['location']['street_address']

            if 'images' in gram:
                image_url = gram['images']['standard_resolution']['url']
                name = os.path.basename(urlparse(image_url).path)
                img_response = requests.get(image_url, verify=False)
                mime = img_response.headers['Content-type']
                image = img_response.content
                post.attachments.append(File(name, mime, image))

            db.add(post)
            db.flush()
            post_ids.append(post.id)
            worker.update_job(current=results)
            results += 1

            if results == max_results:
                break

        # If there are more results, set the max_id param, otherwise finish.
        if 'next_max_id' in pagination:
            params['max_id'] = pagination['next_max_id']
        else:
            break

    db.commit()
    worker.finish_job()
    redis.publish('profile_posts', json.dumps({'id': id_}))
    app.queue.schedule_index_posts(post_ids)