def test_datetime_to_json_date(self):
    d1 = datetime(2014, 1, 2, 3, 4, 5, tzinfo=pytz.utc)
    self.assertEqual(datetime_to_json_date(d1), '2014-01-02T03:04:05.000Z')
    self.assertEqual(json_date_to_datetime('2014-01-02T03:04:05.000Z'), d1)
    self.assertEqual(json_date_to_datetime('2014-01-02T03:04:05.000'), d1)

    tz = pytz.timezone("Africa/Kigali")
    d2 = tz.localize(datetime(2014, 1, 2, 3, 4, 5))
    self.assertEqual(datetime_to_json_date(d2), '2014-01-02T01:04:05.000Z')
    self.assertEqual(json_date_to_datetime('2014-01-02T01:04:05.000Z'), d2.astimezone(pytz.utc))
    self.assertEqual(json_date_to_datetime('2014-01-02T01:04:05.000'), d2.astimezone(pytz.utc))
def test_datetime_to_json_date(self):
    d1 = datetime(2014, 1, 2, 3, 4, 5, tzinfo=pytz.utc)
    self.assertEqual(datetime_to_json_date(d1), "2014-01-02T03:04:05.000Z")
    self.assertEqual(json_date_to_datetime("2014-01-02T03:04:05.000+00:00"), d1)
    self.assertEqual(json_date_to_datetime("2014-01-02T03:04:05.000Z"), d1)
    self.assertEqual(json_date_to_datetime("2014-01-02T03:04:05.000"), d1)

    tz = pytz.timezone("Africa/Kigali")
    d2 = tz.localize(datetime(2014, 1, 2, 3, 4, 5))
    self.assertEqual(datetime_to_json_date(d2), "2014-01-02T01:04:05.000Z")
    self.assertEqual(json_date_to_datetime("2014-01-02T03:04:05+02:00"), d2)
    self.assertEqual(json_date_to_datetime("2014-01-02T01:04:05.000Z"), d2)
    self.assertEqual(json_date_to_datetime("2014-01-02T01:04:05.000"), d2)
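# A minimal sketch, not the project's actual implementation, of the two helpers
# the tests above exercise, assuming pytz and the iso8601 parser are available:
# datetime_to_json_date normalizes to UTC and truncates microseconds to
# milliseconds with a literal "Z" suffix; json_date_to_datetime assumes UTC
# when the string carries no explicit offset.
import pytz
import iso8601


def datetime_to_json_date(dt):
    # normalize to UTC, then format as e.g. "2014-01-02T03:04:05.000Z"
    as_utc = dt.astimezone(pytz.utc)
    return as_utc.strftime("%Y-%m-%dT%H:%M:%S.%f")[:-3] + "Z"


def json_date_to_datetime(date_str):
    # parse ISO 8601, defaulting to UTC when no offset is given
    parsed = iso8601.parse_date(date_str, default_timezone=pytz.utc)
    return parsed.astimezone(pytz.utc)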
def _mark_poll_results_sync_completed(poll, org, latest_synced_obj_time):
    # update the time for this poll from which we fetch next time
    cache.set(Poll.POLL_RESULTS_LAST_PULL_CACHE_KEY % (org.pk, poll.flow_uuid), latest_synced_obj_time, None)

    # update the last time the sync happened
    cache.set(Poll.POLL_RESULTS_LAST_SYNC_TIME_CACHE_KEY % (org.pk, poll.flow_uuid),
              datetime_to_json_date(timezone.now()), None)

    # clear the saved cursor
    cache.delete(Poll.POLL_RESULTS_LAST_PULL_CURSOR % (org.pk, poll.flow_uuid))

    # Use redis cache with expiring (in 48 hrs) key to allow other polls task
    # to sync all polls without hitting the API rate limit
    cache.set(Poll.POLL_RESULTS_LAST_OTHER_POLLS_SYNCED_CACHE_KEY % (org.id, poll.flow_uuid),
              datetime_to_json_date(timezone.now()),
              Poll.POLL_RESULTS_LAST_OTHER_POLLS_SYNCED_CACHE_TIMEOUT)
def populate_poll_poll_date(apps, schema_editor):
    Poll = apps.get_model("polls", "Poll")
    Org = apps.get_model("orgs", "Org")

    agent = getattr(settings, "SITE_API_USER_AGENT", None)
    host = settings.SITE_API_HOST

    for org in Org.objects.all():
        temba_client = TembaClient(host, org.api_token, user_agent=agent)
        api_flows = temba_client.get_flows()
        flows_date = dict()
        for flow in api_flows:
            flows_date[flow.uuid] = datetime_to_json_date(flow.created_on)

        for poll in Poll.objects.filter(org=org):
            json_date = flows_date.get(poll.flow_uuid, None)
            if json_date:
                date = json_date_to_datetime(json_date)
            else:
                logger.info("using created_on for flow_date on poll with id %s" % poll.pk)
                date = poll.created_on

            poll.poll_date = date
            poll.save()
def pull_refresh_task(self):
    from ureport.utils import datetime_to_json_date

    now = timezone.now()
    cache.set(Poll.POLL_PULL_ALL_RESULTS_AFTER_DELETE_FLAG % (self.org_id, self.pk),
              datetime_to_json_date(now.replace(tzinfo=pytz.utc)), None)

    Poll.pull_poll_results_task(self)
def pull_refresh_task(self):
    from ureport.utils import datetime_to_json_date
    from ureport.polls.tasks import pull_refresh

    now = timezone.now()
    cache.set(Poll.POLL_PULL_ALL_RESULTS_AFTER_DELETE_FLAG % (self.org_id, self.pk),
              datetime_to_json_date(now.replace(tzinfo=pytz.utc)), None)

    pull_refresh.apply_async((self.pk,), queue='sync')
def _mark_poll_results_sync_completed(poll, org, latest_synced_obj_time):
    # update the time for this poll from which we fetch next time
    cache.set(Poll.POLL_RESULTS_LAST_PULL_CACHE_KEY % (org.pk, poll.flow_uuid), latest_synced_obj_time, None)

    # update the last time the sync happened
    cache.set(
        Poll.POLL_RESULTS_LAST_SYNC_TIME_CACHE_KEY % (org.pk, poll.flow_uuid),
        datetime_to_json_date(timezone.now()),
        None,
    )

    # clear the saved cursor
    cache.delete(Poll.POLL_RESULTS_LAST_PULL_CURSOR % (org.pk, poll.flow_uuid))

    # Use redis cache with expiring (in 48 hrs) key to allow other polls task
    # to sync all polls without hitting the API rate limit
    cache.set(
        Poll.POLL_RESULTS_LAST_OTHER_POLLS_SYNCED_CACHE_KEY % (org.id, poll.flow_uuid),
        datetime_to_json_date(timezone.now()),
        Poll.POLL_RESULTS_LAST_OTHER_POLLS_SYNCED_CACHE_TIMEOUT,
    )
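# Hedged reconstruction of the paired _mark_poll_results_sync_paused helper that
# the refactored pull_results variants below call; this body is inferred from the
# inline cache.set calls in the pre-refactor versions further down, not taken
# verbatim from the project source.
def _mark_poll_results_sync_paused(org, poll, cursor, after, before, batches_latest):
    # save the resume cursor and the window bounds so the next run can pick up
    # where this one stopped
    cache.set(Poll.POLL_RESULTS_LAST_PULL_CURSOR % (org.pk, poll.flow_uuid), cursor, None)
    cache.set(Poll.POLL_RESULTS_CURSOR_AFTER_CACHE_KEY % (org.pk, poll.flow_uuid), after, None)
    cache.set(Poll.POLL_RESULTS_CURSOR_BEFORE_CACHE_KEY % (org.pk, poll.flow_uuid), before, None)
    cache.set(Poll.POLL_RESULTS_BATCHES_LATEST_CACHE_KEY % (org.pk, poll.flow_uuid), batches_latest, None)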
def pull_refresh_task(self):
    from ureport.utils import datetime_to_json_date
    from ureport.polls.tasks import pull_refresh

    now = timezone.now()
    cache.set(
        Poll.POLL_PULL_ALL_RESULTS_AFTER_DELETE_FLAG % (self.org_id, self.pk),
        datetime_to_json_date(now.replace(tzinfo=pytz.utc)),
        None)

    pull_refresh.apply_async((self.pk,), queue='sync')
def fetch_contacts(cls, org, after=None):
    print "START== Fetching contacts for %s" % org.name

    reporter_group = org.get_config('reporter_group')
    temba_client = org.get_temba_client()
    api_groups = temba_client.get_groups(name=reporter_group)

    if not api_groups:
        return

    seen_uuids = []
    group_uuid = None

    for grp in api_groups:
        if grp.name.lower() == reporter_group.lower():
            group_uuid = grp.uuid
            break

    now = timezone.now().replace(tzinfo=pytz.utc)
    before = now

    if not after:
        # consider the after year 2013
        after = json_date_to_datetime("2013-01-01T00:00:00.000")

    while before > after:
        pager = temba_client.pager()
        api_contacts = temba_client.get_contacts(before=before, after=after, pager=pager)

        last_contact_index = len(api_contacts) - 1

        for i, contact in enumerate(api_contacts):
            if i == last_contact_index:
                before = contact.modified_on

            if group_uuid in contact.groups:
                cls.update_or_create_from_temba(org, contact)
                seen_uuids.append(contact.uuid)

        if not pager.has_more():
            cache.set(cls.CONTACT_LAST_FETCHED_CACHE_KEY % org.pk,
                      datetime_to_json_date(now.replace(tzinfo=pytz.utc)),
                      cls.CONTACT_LAST_FETCHED_CACHE_TIMEOUT)
            break

    return seen_uuids
def fetch_flows(self, org):
    client = self._get_client(org, 2)
    flows = client.get_flows().all()

    all_flows = dict()
    for flow in flows:
        flow_json = dict()
        flow_json['uuid'] = flow.uuid
        flow_json['date_hint'] = flow.created_on.strftime('%Y-%m-%d')
        flow_json['created_on'] = datetime_to_json_date(flow.created_on)
        flow_json['name'] = flow.name
        flow_json['archived'] = flow.archived
        flow_json['runs'] = flow.runs.active + flow.runs.expired + flow.runs.completed + flow.runs.interrupted
        flow_json['completed_runs'] = flow.runs.completed

        all_flows[flow.uuid] = flow_json

    return all_flows
def fetch_flows(self, org):
    client = self._get_client(org, 2)
    flows = client.get_flows().all()

    all_flows = dict()
    for flow in flows:
        flow_json = dict()
        flow_json["uuid"] = flow.uuid
        flow_json["date_hint"] = flow.created_on.strftime("%Y-%m-%d")
        flow_json["created_on"] = datetime_to_json_date(flow.created_on)
        flow_json["name"] = flow.name
        flow_json["archived"] = flow.archived
        flow_json["runs"] = flow.runs.active + flow.runs.expired + flow.runs.completed + flow.runs.interrupted
        flow_json["completed_runs"] = flow.runs.completed

        all_flows[flow.uuid] = flow_json

    return all_flows
def fetch_flows(self, org):
    client = self._get_client(org, 2)
    flows = client.get_flows().all()

    all_flows = dict()
    for flow in flows:
        flow_json = dict()
        flow_json["uuid"] = flow.uuid
        flow_json["date_hint"] = flow.created_on.strftime("%Y-%m-%d")
        flow_json["created_on"] = datetime_to_json_date(flow.created_on)
        flow_json["name"] = flow.name
        flow_json["archived"] = flow.archived
        flow_json["runs"] = flow.runs.active + flow.runs.expired + flow.runs.completed + flow.runs.interrupted
        flow_json["completed_runs"] = flow.runs.completed
        flow_json["results"] = [
            {"key": elt.key, "name": elt.name, "categories": elt.categories, "node_uuids": elt.node_uuids}
            for elt in flow.results
        ]

        all_flows[flow.uuid] = flow_json

    return all_flows
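# Illustrative shape of the mapping fetch_flows returns, keyed by flow UUID.
# The keys mirror the code above; the concrete values are made up:
# {
#     "2b5b4072-...": {
#         "uuid": "2b5b4072-...",
#         "date_hint": "2014-01-02",
#         "created_on": "2014-01-02T03:04:05.000Z",
#         "name": "Registration",
#         "archived": False,
#         "runs": 42,
#         "completed_runs": 30,
#         "results": [
#             {"key": "gender", "name": "Gender", "categories": ["Male", "Female"], "node_uuids": ["..."]},
#         ],
#     },
# }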
def populate_poll_poll_date(apps, schema_editor):
    Poll = apps.get_model('polls', "Poll")
    Org = apps.get_model('orgs', "Org")

    agent = getattr(settings, 'SITE_API_USER_AGENT', None)
    host = settings.SITE_API_HOST

    for org in Org.objects.all():
        temba_client = TembaClient(host, org.api_token, user_agent=agent)
        api_flows = temba_client.get_flows()
        flows_date = dict()
        for flow in api_flows:
            flows_date[flow.uuid] = datetime_to_json_date(flow.created_on)

        for poll in Poll.objects.filter(org=org):
            json_date = flows_date.get(poll.flow_uuid, None)
            if json_date:
                date = json_date_to_datetime(json_date)
            else:
                print "using created_on for flow_date on poll with id %s" % poll.pk
                date = poll.created_on

            poll.poll_date = date
            poll.save()
def pull_results(self, poll, modified_after, modified_before, progress_callback=None):
    org = poll.org
    r = get_redis_connection()
    key = Poll.POLL_PULL_RESULTS_TASK_LOCK % (org.pk, poll.flow_uuid)

    num_val_created = 0
    num_val_updated = 0
    num_val_ignored = 0

    num_path_created = 0
    num_path_updated = 0
    num_path_ignored = 0

    num_synced = 0

    if r.get(key):
        print "Skipping pulling results for poll #%d on org #%d as it is still running" % (poll.pk, org.pk)
    else:
        with r.lock(key, timeout=Poll.POLL_SYNC_LOCK_TIMEOUT):
            client = self._get_client(org, 2)

            questions_uuids = poll.get_question_uuids()

            # ignore the TaskState time and use the time we stored in redis
            (after, before, latest_synced_obj_time, batches_latest,
             resume_cursor, pull_after_delete) = poll.get_pull_cached_params()

            if resume_cursor is None:
                before = datetime_to_json_date(timezone.now())
                after = latest_synced_obj_time

            if pull_after_delete is not None:
                after = None
                poll.delete_poll_results()

            start = time.time()
            print "Start fetching runs for poll #%d on org #%d" % (poll.pk, org.pk)

            poll_runs_query = client.get_runs(flow=poll.flow_uuid, after=after, before=before)
            fetches = poll_runs_query.iterfetches(retry_on_rate_exceed=True, resume_cursor=resume_cursor)

            fetch_start = time.time()
            for fetch in fetches:
                print "RapidPro API fetch for poll #%d on org #%d %d - %d took %ds" % (
                    poll.pk, org.pk, num_synced, num_synced + len(fetch), time.time() - fetch_start)

                contact_uuids = [run.contact.uuid for run in fetch]
                contacts = Contact.objects.filter(org=org, uuid__in=contact_uuids)
                contacts_map = {c.uuid: c for c in contacts}

                existing_poll_results = PollResult.objects.filter(flow=poll.flow_uuid, org=poll.org_id,
                                                                  contact__in=contact_uuids)

                poll_results_map = defaultdict(dict)
                for res in existing_poll_results:
                    poll_results_map[res.contact][res.ruleset] = res

                poll_results_to_save_map = defaultdict(dict)

                for temba_run in fetch:
                    if batches_latest is None or temba_run.modified_on > json_date_to_datetime(batches_latest):
                        batches_latest = datetime_to_json_date(temba_run.modified_on.replace(tzinfo=pytz.utc))

                    flow_uuid = temba_run.flow.uuid
                    contact_uuid = temba_run.contact.uuid
                    completed = temba_run.exit_type == 'completed'

                    contact_obj = contacts_map.get(contact_uuid, None)

                    state = ''
                    district = ''
                    ward = ''
                    born = None
                    gender = None
                    if contact_obj is not None:
                        state = contact_obj.state
                        district = contact_obj.district
                        ward = contact_obj.ward
                        born = contact_obj.born
                        gender = contact_obj.gender

                    temba_values = temba_run.values.values()
                    temba_values.sort(key=lambda val: val.time)

                    for temba_value in temba_values:
                        ruleset_uuid = temba_value.node
                        category = temba_value.category
                        text = temba_value.value
                        value_date = temba_value.time

                        existing_poll_result = poll_results_map.get(contact_uuid, dict()).get(ruleset_uuid, None)
                        poll_result_to_save = poll_results_to_save_map.get(contact_uuid, dict()).get(ruleset_uuid, None)

                        if existing_poll_result is not None:
                            update_required = existing_poll_result.category != category or existing_poll_result.text != text
                            update_required = update_required or existing_poll_result.state != state
                            update_required = update_required or existing_poll_result.district != district
                            update_required = update_required or existing_poll_result.ward != ward
                            update_required = update_required or existing_poll_result.born != born
                            update_required = update_required or existing_poll_result.gender != gender
                            update_required = update_required or existing_poll_result.completed != completed

                            # if the reporter answered the step, check if this is a newer run
                            if existing_poll_result.date is not None:
                                update_required = update_required and (value_date > existing_poll_result.date)
                            else:
                                update_required = True

                            if update_required:
                                # update the db object
                                PollResult.objects.filter(pk=existing_poll_result.pk).update(
                                    category=category, text=text, state=state, district=district,
                                    ward=ward, date=value_date, born=born, gender=gender, completed=completed)

                                # update the map object as well
                                existing_poll_result.category = category
                                existing_poll_result.text = text
                                existing_poll_result.state = state
                                existing_poll_result.district = district
                                existing_poll_result.ward = ward
                                existing_poll_result.date = value_date
                                existing_poll_result.born = born
                                existing_poll_result.gender = gender
                                existing_poll_result.completed = completed

                                poll_results_map[contact_uuid][ruleset_uuid] = existing_poll_result

                                num_val_updated += 1
                            else:
                                num_val_ignored += 1

                        elif poll_result_to_save is not None:
                            replace_save_map = poll_result_to_save.category != category or poll_result_to_save.text != text
                            replace_save_map = replace_save_map or poll_result_to_save.state != state
                            replace_save_map = replace_save_map or poll_result_to_save.district != district
                            replace_save_map = replace_save_map or poll_result_to_save.ward != ward
                            replace_save_map = replace_save_map or poll_result_to_save.born != born
                            replace_save_map = replace_save_map or poll_result_to_save.gender != gender
                            replace_save_map = replace_save_map or poll_result_to_save.completed != completed

                            # replace if the step is newer
                            if poll_result_to_save.date is not None:
                                replace_save_map = replace_save_map and (value_date > poll_result_to_save.date)

                            if replace_save_map:
                                result_obj = PollResult(org=org, flow=flow_uuid, ruleset=ruleset_uuid,
                                                        contact=contact_uuid, category=category, text=text,
                                                        state=state, district=district, ward=ward, born=born,
                                                        gender=gender, date=value_date, completed=completed)

                                poll_results_to_save_map[contact_uuid][ruleset_uuid] = result_obj

                            num_val_ignored += 1
                        else:
                            result_obj = PollResult(org=org, flow=flow_uuid, ruleset=ruleset_uuid,
                                                    contact=contact_uuid, category=category, text=text,
                                                    state=state, district=district, ward=ward, born=born,
                                                    gender=gender, date=value_date, completed=completed)

                            poll_results_to_save_map[contact_uuid][ruleset_uuid] = result_obj

                            num_val_created += 1

                    for temba_path in temba_run.path:
                        ruleset_uuid = temba_path.node
                        category = None
                        text = ""
                        value_date = temba_path.time

                        if ruleset_uuid in questions_uuids:
                            existing_poll_result = poll_results_map.get(contact_uuid, dict()).get(ruleset_uuid, None)
                            poll_result_to_save = poll_results_to_save_map.get(contact_uuid, dict()).get(ruleset_uuid, None)

                            if existing_poll_result is not None:
                                if existing_poll_result.date is None or value_date > existing_poll_result.date:
                                    # update the db object
                                    PollResult.objects.filter(pk=existing_poll_result.pk).update(
                                        category=category, text=text, state=state, district=district,
                                        ward=ward, date=value_date, born=born, gender=gender, completed=completed)

                                    # update the map object as well
                                    existing_poll_result.category = category
                                    existing_poll_result.text = text
                                    existing_poll_result.state = state
                                    existing_poll_result.district = district
                                    existing_poll_result.ward = ward
                                    existing_poll_result.date = value_date
                                    existing_poll_result.born = born
                                    existing_poll_result.gender = gender
                                    existing_poll_result.completed = completed

                                    poll_results_map[contact_uuid][ruleset_uuid] = existing_poll_result

                                    num_path_updated += 1
                                else:
                                    num_path_ignored += 1

                            elif poll_result_to_save is not None:
                                if value_date > poll_result_to_save.date:
                                    result_obj = PollResult(org=org, flow=flow_uuid, ruleset=ruleset_uuid,
                                                            contact=contact_uuid, category=category, text=text,
                                                            state=state, district=district, ward=ward, born=born,
                                                            gender=gender, date=value_date, completed=completed)

                                    poll_results_to_save_map[contact_uuid][ruleset_uuid] = result_obj

                                num_path_ignored += 1
                            else:
                                result_obj = PollResult(org=org, flow=flow_uuid, ruleset=ruleset_uuid,
                                                        contact=contact_uuid, category=category, text=text,
                                                        state=state, district=district, ward=ward, born=born,
                                                        gender=gender, date=value_date, completed=completed)

                                poll_results_to_save_map[contact_uuid][ruleset_uuid] = result_obj

                                num_path_created += 1
                        else:
                            num_path_ignored += 1

                num_synced += len(fetch)
                if progress_callback:
                    progress_callback(num_synced)

                new_poll_results = []
                for c_key in poll_results_to_save_map.keys():
                    for r_key in poll_results_to_save_map.get(c_key, dict()):
                        obj_to_create = poll_results_to_save_map.get(c_key, dict()).get(r_key, None)
                        if obj_to_create is not None:
                            new_poll_results.append(obj_to_create)
                PollResult.objects.bulk_create(new_poll_results)

                print "Processed fetch of %d - %d runs for poll #%d on org #%d" % (
                    num_synced - len(fetch), num_synced, poll.pk, org.pk)
                fetch_start = time.time()
                print "=" * 40

                if num_synced >= Poll.POLL_RESULTS_MAX_SYNC_RUNS:
                    poll.rebuild_poll_results_counts()

                    cursor = fetches.get_cursor()
                    cache.set(Poll.POLL_RESULTS_LAST_PULL_CURSOR % (org.pk, poll.flow_uuid), cursor, None)
                    cache.set(Poll.POLL_RESULTS_CURSOR_AFTER_CACHE_KEY % (org.pk, poll.flow_uuid), after, None)
                    cache.set(Poll.POLL_RESULTS_CURSOR_BEFORE_CACHE_KEY % (org.pk, poll.flow_uuid), before, None)
                    cache.set(Poll.POLL_RESULTS_BATCHES_LATEST_CACHE_KEY % (org.pk, poll.flow_uuid), batches_latest, None)

                    print "Break pull results for poll #%d on org #%d in %ds, "\
                          " Times: after= %s, before= %s, batch_latest= %s, sync_latest= %s"\
                          " Objects: created %d, updated %d, ignored %d. " \
                          "Before cursor %s" % (poll.pk, org.pk, time.time() - start, after, before,
                                                batches_latest, latest_synced_obj_time, num_val_created,
                                                num_val_updated, num_val_ignored, cursor)

                    return (num_val_created, num_val_updated, num_val_ignored,
                            num_path_created, num_path_updated, num_path_ignored)

            if batches_latest is not None and (latest_synced_obj_time is None or
                                               json_date_to_datetime(latest_synced_obj_time) <= json_date_to_datetime(batches_latest)):
                latest_synced_obj_time = batches_latest

            # update the time for this poll from which we fetch next time
            cache.set(Poll.POLL_RESULTS_LAST_PULL_CACHE_KEY % (org.pk, poll.flow_uuid), latest_synced_obj_time, None)

            # clear the saved cursor
            cache.delete(Poll.POLL_RESULTS_LAST_PULL_CURSOR % (org.pk, poll.flow_uuid))

            # from django.db import connection as db_connection, reset_queries
            # slowest_queries = sorted(db_connection.queries, key=lambda q: q['time'], reverse=True)[:10]
            # for q in slowest_queries:
            #     print "=" * 60
            #     print "\n\n\n"
            #     print "%s -- %s" % (q['time'], q['sql'])
            # reset_queries()

            print "Finished pulling results for poll #%d on org #%d runs in %ds, " \
                  "Times: sync_latest= %s," \
                  "Objects: created %d, updated %d, ignored %d" % (poll.pk, org.pk, time.time() - start,
                                                                   latest_synced_obj_time, num_val_created,
                                                                   num_val_updated, num_val_ignored)

    return num_val_created, num_val_updated, num_val_ignored, num_path_created, num_path_updated, num_path_ignored
def pull_results(self, poll, modified_after, modified_before, progress_callback=None):
    org = poll.org
    r = get_redis_connection()
    key = Poll.POLL_PULL_RESULTS_TASK_LOCK % (org.pk, poll.flow_uuid)

    stats_dict = dict(
        num_val_created=0,
        num_val_updated=0,
        num_val_ignored=0,
        num_path_created=0,
        num_path_updated=0,
        num_path_ignored=0,
        num_synced=0,
    )

    if r.get(key):
        logger.info("Skipping pulling results for poll #%d on org #%d as it is still running" % (poll.pk, org.pk))
    else:
        with r.lock(key, timeout=Poll.POLL_SYNC_LOCK_TIMEOUT):
            lock_expiration = time.time() + 0.8 * Poll.POLL_SYNC_LOCK_TIMEOUT

            poll_results_url = "https://go.votomobile.org/flow-results/packages/%s/responses" % poll.flow_uuid
            headers = {
                "Content-type": "application/json",
                "Accept": "application/json",
                "Authorization": "Token %s" % self.backend.api_token,
            }

            results = []

            questions_uuids = poll.get_question_uuids()

            # ignore the TaskState time and use the time we stored in redis
            (
                after,
                before,
                latest_synced_obj_time,
                batches_latest,
                resume_cursor,
                pull_after_delete,
            ) = poll.get_pull_cached_params()

            if pull_after_delete is not None:
                after = None
                latest_synced_obj_time = None
                batches_latest = None
                resume_cursor = None
                poll.delete_poll_results()

            if resume_cursor is None:
                before = datetime_to_json_date(timezone.now())
                after = latest_synced_obj_time

            start = time.time()
            logger.info("Start fetching runs for poll #%d on org #%d" % (poll.pk, org.pk))

            params = dict(
                filter={"end-timestamp": before, "start-timestamp": after}, page={"beforeCursor": resume_cursor}
            )

            while poll_results_url:
                response = requests.request("GET", poll_results_url, headers=headers, params=params)
                response_json = response.json()

                results = response_json["data"]["attributes"]["responses"]
                poll_results_url = response_json["data"]["relationships"]["links"]["next"]

                contacts_map, poll_results_map, poll_results_to_save_map = self._initiate_lookup_maps(
                    results, org, poll
                )

                for result in results:
                    if batches_latest is None or json_date_to_datetime(result[0]) > json_date_to_datetime(
                        batches_latest
                    ):
                        batches_latest = result[0]

                    contact_obj = contacts_map.get(result[2], None)
                    self._process_run_poll_results(
                        org,
                        poll.flow_uuid,
                        questions_uuids,
                        result,
                        contact_obj,
                        poll_results_map,
                        poll_results_to_save_map,
                        stats_dict,
                    )

                stats_dict["num_synced"] += len(results)
                if progress_callback:
                    progress_callback(stats_dict["num_synced"])

                self._save_new_poll_results_to_database(poll_results_to_save_map)

                logger.info(
                    "Processed fetch of %d - %d "
                    "runs for poll #%d on org #%d"
                    % (stats_dict["num_synced"] - len(results), stats_dict["num_synced"], poll.pk, org.pk)
                )
                # fetch_start = time.time()
                logger.info("=" * 40)

                if stats_dict["num_synced"] >= Poll.POLL_RESULTS_MAX_SYNC_RUNS or time.time() > lock_expiration:
                    poll.rebuild_poll_results_counts()

                    cursor = result[1]
                    self._mark_poll_results_sync_paused(org, poll, cursor, after, before, batches_latest)

                    logger.info(
                        "Break pull results for poll #%d on org #%d in %ds, "
                        " Times: after= %s, before= %s, batch_latest= %s, sync_latest= %s"
                        " Objects: created %d, updated %d, ignored %d. "
                        "Before cursor %s"
                        % (
                            poll.pk,
                            org.pk,
                            time.time() - start,
                            after,
                            before,
                            batches_latest,
                            latest_synced_obj_time,
                            stats_dict["num_val_created"],
                            stats_dict["num_val_updated"],
                            stats_dict["num_val_ignored"],
                            cursor,
                        )
                    )

                    return (
                        stats_dict["num_val_created"],
                        stats_dict["num_val_updated"],
                        stats_dict["num_val_ignored"],
                        stats_dict["num_path_created"],
                        stats_dict["num_path_updated"],
                        stats_dict["num_path_ignored"],
                    )

            if batches_latest is not None and (
                latest_synced_obj_time is None
                or json_date_to_datetime(latest_synced_obj_time) <= json_date_to_datetime(batches_latest)
            ):
                latest_synced_obj_time = batches_latest

            self._mark_poll_results_sync_completed(poll, org, latest_synced_obj_time)

            # from django.db import connection as db_connection, reset_queries
            # slowest_queries = sorted(db_connection.queries, key=lambda q: q['time'], reverse=True)[:10]
            # for q in slowest_queries:
            #     print "=" * 60
            #     print "\n\n\n"
            #     print "%s -- %s" % (q['time'], q['sql'])
            # reset_queries()

            logger.info(
                "Finished pulling results for poll #%d on org #%d runs in %ds, "
                "Times: sync_latest= %s,"
                "Objects: created %d, updated %d, ignored %d"
                % (
                    poll.pk,
                    org.pk,
                    time.time() - start,
                    latest_synced_obj_time,
                    stats_dict["num_val_created"],
                    stats_dict["num_val_updated"],
                    stats_dict["num_val_ignored"],
                )
            )

    return (
        stats_dict["num_val_created"],
        stats_dict["num_val_updated"],
        stats_dict["num_val_ignored"],
        stats_dict["num_path_created"],
        stats_dict["num_path_updated"],
        stats_dict["num_path_ignored"],
    )
def pull_results(self, poll, modified_after, modified_before, progress_callback=None):
    org = poll.org
    r = get_redis_connection()
    key = Poll.POLL_PULL_RESULTS_TASK_LOCK % (org.pk, poll.flow_uuid)

    num_val_created = 0
    num_val_updated = 0
    num_val_ignored = 0

    num_path_created = 0
    num_path_updated = 0
    num_path_ignored = 0

    num_synced = 0

    if r.get(key):
        print "Skipping pulling results for poll #%d on org #%d as it is still running" % (
            poll.pk, org.pk)
    else:
        with r.lock(key, timeout=Poll.POLL_SYNC_LOCK_TIMEOUT):
            client = self._get_client(org, 2)

            questions_uuids = poll.get_question_uuids()

            # ignore the TaskState time and use the time we stored in redis
            (after, before, latest_synced_obj_time, batches_latest, resume_cursor,
             pull_after_delete) = poll.get_pull_cached_params()

            if resume_cursor is None:
                before = datetime_to_json_date(timezone.now())
                after = latest_synced_obj_time

            if pull_after_delete is not None:
                after = None
                poll.delete_poll_results()

            start = time.time()
            print "Start fetching runs for poll #%d on org #%d" % (poll.pk, org.pk)

            poll_runs_query = client.get_runs(flow=poll.flow_uuid, after=after, before=before)
            fetches = poll_runs_query.iterfetches(
                retry_on_rate_exceed=True, resume_cursor=resume_cursor)

            fetch_start = time.time()
            for fetch in fetches:
                print "RapidPro API fetch for poll #%d on org #%d %d - %d took %ds" % (
                    poll.pk, org.pk, num_synced, num_synced + len(fetch),
                    time.time() - fetch_start)

                contact_uuids = [run.contact.uuid for run in fetch]
                contacts = Contact.objects.filter(org=org, uuid__in=contact_uuids)
                contacts_map = {c.uuid: c for c in contacts}

                existing_poll_results = PollResult.objects.filter(
                    flow=poll.flow_uuid, org=poll.org_id, contact__in=contact_uuids)

                poll_results_map = defaultdict(dict)
                for res in existing_poll_results:
                    poll_results_map[res.contact][res.ruleset] = res

                poll_results_to_save_map = defaultdict(dict)

                for temba_run in fetch:
                    if batches_latest is None or temba_run.modified_on > json_date_to_datetime(
                            batches_latest):
                        batches_latest = datetime_to_json_date(
                            temba_run.modified_on.replace(tzinfo=pytz.utc))

                    flow_uuid = temba_run.flow.uuid
                    contact_uuid = temba_run.contact.uuid
                    completed = temba_run.exit_type == 'completed'

                    contact_obj = contacts_map.get(contact_uuid, None)

                    state = ''
                    district = ''
                    ward = ''
                    born = None
                    gender = None
                    if contact_obj is not None:
                        state = contact_obj.state
                        district = contact_obj.district
                        ward = contact_obj.ward
                        born = contact_obj.born
                        gender = contact_obj.gender

                    temba_values = temba_run.values.values()
                    temba_values.sort(key=lambda val: val.time)

                    for temba_value in temba_values:
                        ruleset_uuid = temba_value.node
                        category = temba_value.category
                        text = temba_value.value
                        value_date = temba_value.time

                        existing_poll_result = poll_results_map.get(
                            contact_uuid, dict()).get(ruleset_uuid, None)
                        poll_result_to_save = poll_results_to_save_map.get(
                            contact_uuid, dict()).get(ruleset_uuid, None)

                        if existing_poll_result is not None:
                            update_required = existing_poll_result.category != category or existing_poll_result.text != text
                            update_required = update_required or existing_poll_result.state != state
                            update_required = update_required or existing_poll_result.district != district
                            update_required = update_required or existing_poll_result.ward != ward
                            update_required = update_required or existing_poll_result.born != born
                            update_required = update_required or existing_poll_result.gender != gender
                            update_required = update_required or existing_poll_result.completed != completed

                            # if the reporter answered the step, check if this is a newer run
                            if existing_poll_result.date is not None:
                                update_required = update_required and (
                                    value_date > existing_poll_result.date)
                            else:
                                update_required = True

                            if update_required:
                                # update the db object
                                PollResult.objects.filter(
                                    pk=existing_poll_result.pk).update(
                                        category=category, text=text, state=state,
                                        district=district, ward=ward, date=value_date,
                                        born=born, gender=gender, completed=completed)

                                # update the map object as well
                                existing_poll_result.category = category
                                existing_poll_result.text = text
                                existing_poll_result.state = state
                                existing_poll_result.district = district
                                existing_poll_result.ward = ward
                                existing_poll_result.date = value_date
                                existing_poll_result.born = born
                                existing_poll_result.gender = gender
                                existing_poll_result.completed = completed

                                poll_results_map[contact_uuid][
                                    ruleset_uuid] = existing_poll_result

                                num_val_updated += 1
                            else:
                                num_val_ignored += 1

                        elif poll_result_to_save is not None:
                            replace_save_map = poll_result_to_save.category != category or poll_result_to_save.text != text
                            replace_save_map = replace_save_map or poll_result_to_save.state != state
                            replace_save_map = replace_save_map or poll_result_to_save.district != district
                            replace_save_map = replace_save_map or poll_result_to_save.ward != ward
                            replace_save_map = replace_save_map or poll_result_to_save.born != born
                            replace_save_map = replace_save_map or poll_result_to_save.gender != gender
                            replace_save_map = replace_save_map or poll_result_to_save.completed != completed

                            # replace if the step is newer
                            if poll_result_to_save.date is not None:
                                replace_save_map = replace_save_map and (
                                    value_date > poll_result_to_save.date)

                            if replace_save_map:
                                result_obj = PollResult(
                                    org=org, flow=flow_uuid, ruleset=ruleset_uuid,
                                    contact=contact_uuid, category=category, text=text,
                                    state=state, district=district, ward=ward,
                                    born=born, gender=gender, date=value_date,
                                    completed=completed)

                                poll_results_to_save_map[contact_uuid][
                                    ruleset_uuid] = result_obj

                            num_val_ignored += 1
                        else:
                            result_obj = PollResult(
                                org=org, flow=flow_uuid, ruleset=ruleset_uuid,
                                contact=contact_uuid, category=category, text=text,
                                state=state, district=district, ward=ward, born=born,
                                gender=gender, date=value_date, completed=completed)

                            poll_results_to_save_map[contact_uuid][
                                ruleset_uuid] = result_obj

                            num_val_created += 1

                    for temba_path in temba_run.path:
                        ruleset_uuid = temba_path.node
                        category = None
                        text = ""
                        value_date = temba_path.time

                        if ruleset_uuid in questions_uuids:
                            existing_poll_result = poll_results_map.get(
                                contact_uuid, dict()).get(ruleset_uuid, None)
                            poll_result_to_save = poll_results_to_save_map.get(
                                contact_uuid, dict()).get(ruleset_uuid, None)

                            if existing_poll_result is not None:
                                if existing_poll_result.date is None or value_date > existing_poll_result.date:
                                    # update the db object
                                    PollResult.objects.filter(
                                        pk=existing_poll_result.pk).update(
                                            category=category, text=text, state=state,
                                            district=district, ward=ward,
                                            date=value_date, born=born, gender=gender,
                                            completed=completed)

                                    # update the map object as well
                                    existing_poll_result.category = category
                                    existing_poll_result.text = text
                                    existing_poll_result.state = state
                                    existing_poll_result.district = district
                                    existing_poll_result.ward = ward
                                    existing_poll_result.date = value_date
                                    existing_poll_result.born = born
                                    existing_poll_result.gender = gender
                                    existing_poll_result.completed = completed

                                    poll_results_map[contact_uuid][
                                        ruleset_uuid] = existing_poll_result

                                    num_path_updated += 1
                                else:
                                    num_path_ignored += 1

                            elif poll_result_to_save is not None:
                                if value_date > poll_result_to_save.date:
                                    result_obj = PollResult(
                                        org=org, flow=flow_uuid, ruleset=ruleset_uuid,
                                        contact=contact_uuid, category=category,
                                        text=text, state=state, district=district,
                                        ward=ward, born=born, gender=gender,
                                        date=value_date, completed=completed)

                                    poll_results_to_save_map[contact_uuid][
                                        ruleset_uuid] = result_obj

                                num_path_ignored += 1
                            else:
                                result_obj = PollResult(
                                    org=org, flow=flow_uuid, ruleset=ruleset_uuid,
                                    contact=contact_uuid, category=category, text=text,
                                    state=state, district=district, ward=ward,
                                    born=born, gender=gender, date=value_date,
                                    completed=completed)

                                poll_results_to_save_map[contact_uuid][
                                    ruleset_uuid] = result_obj

                                num_path_created += 1
                        else:
                            num_path_ignored += 1

                num_synced += len(fetch)
                if progress_callback:
                    progress_callback(num_synced)

                new_poll_results = []
                for c_key in poll_results_to_save_map.keys():
                    for r_key in poll_results_to_save_map.get(c_key, dict()):
                        obj_to_create = poll_results_to_save_map.get(
                            c_key, dict()).get(r_key, None)
                        if obj_to_create is not None:
                            new_poll_results.append(obj_to_create)
                PollResult.objects.bulk_create(new_poll_results)

                print "Processed fetch of %d - %d runs for poll #%d on org #%d" % (
                    num_synced - len(fetch), num_synced, poll.pk, org.pk)
                fetch_start = time.time()
                print "=" * 40

                if num_synced >= Poll.POLL_RESULTS_MAX_SYNC_RUNS:
                    poll.rebuild_poll_results_counts()

                    cursor = fetches.get_cursor()
                    cache.set(
                        Poll.POLL_RESULTS_LAST_PULL_CURSOR % (org.pk, poll.flow_uuid),
                        cursor, None)
                    cache.set(
                        Poll.POLL_RESULTS_CURSOR_AFTER_CACHE_KEY % (org.pk, poll.flow_uuid),
                        after, None)
                    cache.set(
                        Poll.POLL_RESULTS_CURSOR_BEFORE_CACHE_KEY % (org.pk, poll.flow_uuid),
                        before, None)
                    cache.set(
                        Poll.POLL_RESULTS_BATCHES_LATEST_CACHE_KEY % (org.pk, poll.flow_uuid),
                        batches_latest, None)

                    print "Break pull results for poll #%d on org #%d in %ds, "\
                          " Times: after= %s, before= %s, batch_latest= %s, sync_latest= %s"\
                          " Objects: created %d, updated %d, ignored %d. " \
                          "Before cursor %s" % (poll.pk, org.pk, time.time() - start, after,
                                                before, batches_latest, latest_synced_obj_time,
                                                num_val_created, num_val_updated,
                                                num_val_ignored, cursor)

                    return (num_val_created, num_val_updated, num_val_ignored,
                            num_path_created, num_path_updated, num_path_ignored)

            if batches_latest is not None and (
                    latest_synced_obj_time is None or
                    json_date_to_datetime(latest_synced_obj_time) <= json_date_to_datetime(batches_latest)):
                latest_synced_obj_time = batches_latest

            # update the time for this poll from which we fetch next time
            cache.set(
                Poll.POLL_RESULTS_LAST_PULL_CACHE_KEY % (org.pk, poll.flow_uuid),
                latest_synced_obj_time, None)

            # clear the saved cursor
            cache.delete(Poll.POLL_RESULTS_LAST_PULL_CURSOR % (org.pk, poll.flow_uuid))

            # from django.db import connection as db_connection, reset_queries
            # slowest_queries = sorted(db_connection.queries, key=lambda q: q['time'], reverse=True)[:10]
            # for q in slowest_queries:
            #     print "=" * 60
            #     print "\n\n\n"
            #     print "%s -- %s" % (q['time'], q['sql'])
            # reset_queries()

            print "Finished pulling results for poll #%d on org #%d runs in %ds, " \
                  "Times: sync_latest= %s," \
                  "Objects: created %d, updated %d, ignored %d" % (poll.pk, org.pk,
                                                                   time.time() - start,
                                                                   latest_synced_obj_time,
                                                                   num_val_created,
                                                                   num_val_updated,
                                                                   num_val_ignored)

    return num_val_created, num_val_updated, num_val_ignored, num_path_created, num_path_updated, num_path_ignored
def pull_contacts(org, ignored_since, ignored_until):
    """
    Fetches updated contacts from RapidPro and updates local contacts accordingly
    """
    from ureport.contacts.models import ReportersCounter

    results = dict()

    backends = org.backends.filter(is_active=True)
    for backend_obj in backends:
        backend = org.get_backend(backend_slug=backend_obj.slug)

        last_fetch_date_key = Contact.CONTACT_LAST_FETCHED_CACHE_KEY % (org.pk, backend_obj.slug)

        until = datetime_to_json_date(timezone.now())
        since = cache.get(last_fetch_date_key, None)

        if not since:
            logger.info("First time run for org #%d. Will sync all contacts" % org.pk)

        start = time.time()

        fields_created, fields_updated, fields_deleted, ignored = backend.pull_fields(org)

        logger.info(
            "Fetched contact fields for org #%d. "
            "Created %s, Updated %s, Deleted %d, Ignored %d"
            % (org.pk, fields_created, fields_updated, fields_deleted, ignored)
        )
        logger.info("Fetch fields for org #%d took %ss" % (org.pk, time.time() - start))

        start_boundaries = time.time()

        boundaries_created, boundaries_updated, boundaries_deleted, ignored = backend.pull_boundaries(org)

        logger.info(
            "Fetched boundaries for org #%d. "
            "Created %s, Updated %s, Deleted %d, Ignored %d"
            % (org.pk, boundaries_created, boundaries_updated, boundaries_deleted, ignored)
        )
        logger.info("Fetch boundaries for org #%d took %ss" % (org.pk, time.time() - start_boundaries))

        start_contacts = time.time()

        contacts_created, contacts_updated, contacts_deleted, ignored = backend.pull_contacts(org, since, until)

        cache.set(last_fetch_date_key, until, None)

        logger.info(
            "Fetched contacts for org #%d. "
            "Created %s, Updated %s, Deleted %d, Ignored %d"
            % (org.pk, contacts_created, contacts_updated, contacts_deleted, ignored)
        )
        logger.info("Fetch contacts for org #%d took %ss" % (org.pk, time.time() - start_contacts))

        # Squash reporters counts
        ReportersCounter.squash_counts()

        results[backend_obj.slug] = {
            "fields": {"created": fields_created, "updated": fields_updated, "deleted": fields_deleted},
            "boundaries": {
                "created": boundaries_created,
                "updated": boundaries_updated,
                "deleted": boundaries_deleted,
            },
            "contacts": {"created": contacts_created, "updated": contacts_updated, "deleted": contacts_deleted},
        }

    return results
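# A minimal sketch of the incremental-sync window pattern pull_contacts uses:
# the "until" of a successful run is cached and becomes the "since" of the next
# run, so only contacts modified in between are re-fetched. The key_template
# default here is a hypothetical stand-in for Contact.CONTACT_LAST_FETCHED_CACHE_KEY.
def next_sync_window(org_pk, backend_slug, key_template="contacts:last_fetched:%d:%s"):
    key = key_template % (org_pk, backend_slug)
    until = datetime_to_json_date(timezone.now())
    since = cache.get(key, None)  # None on the first run -> full sync
    return since, until, key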
def pull_results(self, poll, modified_after, modified_before, progress_callback=None):
    org = poll.org
    r = get_redis_connection()
    key = Poll.POLL_PULL_RESULTS_TASK_LOCK % (org.pk, poll.flow_uuid)

    stats_dict = dict(
        num_val_created=0,
        num_val_updated=0,
        num_val_ignored=0,
        num_path_created=0,
        num_path_updated=0,
        num_path_ignored=0,
        num_synced=0,
    )

    if r.get(key):
        logger.info("Skipping pulling results for poll #%d on org #%d as it is still running" % (poll.pk, org.pk))
    else:
        with r.lock(key, timeout=Poll.POLL_SYNC_LOCK_TIMEOUT):
            lock_expiration = time.time() + 0.8 * Poll.POLL_SYNC_LOCK_TIMEOUT

            client = self._get_client(org, 2)

            questions_uuids = poll.get_question_uuids()

            # ignore the TaskState time and use the time we stored in redis
            (
                after,
                before,
                latest_synced_obj_time,
                batches_latest,
                resume_cursor,
                pull_after_delete,
            ) = poll.get_pull_cached_params()

            if pull_after_delete is not None:
                after = None
                latest_synced_obj_time = None
                batches_latest = None
                resume_cursor = None
                poll.delete_poll_results()
                pull_refresh_from_archives.apply_async((poll.pk,), queue="sync")

            if resume_cursor is None:
                before = datetime_to_json_date(timezone.now())
                after = latest_synced_obj_time

            start = time.time()
            logger.info("Start fetching runs for poll #%d on org #%d" % (poll.pk, org.pk))

            poll_runs_query = client.get_runs(flow=poll.flow_uuid, after=after, before=before)
            fetches = poll_runs_query.iterfetches(retry_on_rate_exceed=True, resume_cursor=resume_cursor)

            try:
                fetch_start = time.time()
                for fetch in fetches:
                    logger.info(
                        "RapidPro API fetch for poll #%d "
                        "on org #%d %d - %d took %ds"
                        % (
                            poll.pk,
                            org.pk,
                            stats_dict["num_synced"],
                            stats_dict["num_synced"] + len(fetch),
                            time.time() - fetch_start,
                        )
                    )

                    contacts_map, poll_results_map, poll_results_to_save_map = self._initiate_lookup_maps(
                        fetch, org, poll
                    )

                    for temba_run in fetch:
                        if batches_latest is None or temba_run.modified_on > json_date_to_datetime(batches_latest):
                            batches_latest = datetime_to_json_date(temba_run.modified_on.replace(tzinfo=pytz.utc))

                        contact_obj = contacts_map.get(temba_run.contact.uuid, None)
                        self._process_run_poll_results(
                            org,
                            questions_uuids,
                            temba_run,
                            contact_obj,
                            poll_results_map,
                            poll_results_to_save_map,
                            stats_dict,
                        )

                    stats_dict["num_synced"] += len(fetch)
                    if progress_callback:
                        progress_callback(stats_dict["num_synced"])

                    self._save_new_poll_results_to_database(poll_results_to_save_map)

                    logger.info(
                        "Processed fetch of %d - %d "
                        "runs for poll #%d on org #%d"
                        % (stats_dict["num_synced"] - len(fetch), stats_dict["num_synced"], poll.pk, org.pk)
                    )
                    fetch_start = time.time()
                    logger.info("=" * 40)

                    if (
                        stats_dict["num_synced"] >= Poll.POLL_RESULTS_MAX_SYNC_RUNS
                        or time.time() > lock_expiration
                    ):
                        poll.rebuild_poll_results_counts()

                        cursor = fetches.get_cursor()
                        self._mark_poll_results_sync_paused(org, poll, cursor, after, before, batches_latest)

                        logger.info(
                            "Break pull results for poll #%d on org #%d in %ds, "
                            " Times: after= %s, before= %s, batch_latest= %s, sync_latest= %s"
                            " Objects: created %d, updated %d, ignored %d. "
                            "Before cursor %s"
                            % (
                                poll.pk,
                                org.pk,
                                time.time() - start,
                                after,
                                before,
                                batches_latest,
                                latest_synced_obj_time,
                                stats_dict["num_val_created"],
                                stats_dict["num_val_updated"],
                                stats_dict["num_val_ignored"],
                                cursor,
                            )
                        )

                        return (
                            stats_dict["num_val_created"],
                            stats_dict["num_val_updated"],
                            stats_dict["num_val_ignored"],
                            stats_dict["num_path_created"],
                            stats_dict["num_path_updated"],
                            stats_dict["num_path_ignored"],
                        )
            except TembaRateExceededError:
                poll.rebuild_poll_results_counts()

                cursor = fetches.get_cursor()
                self._mark_poll_results_sync_paused(org, poll, cursor, after, before, batches_latest)

                logger.info(
                    "Break pull results for poll #%d on org #%d in %ds, "
                    " Times: after= %s, before= %s, batch_latest= %s, sync_latest= %s"
                    " Objects: created %d, updated %d, ignored %d. "
                    "Before cursor %s"
                    % (
                        poll.pk,
                        org.pk,
                        time.time() - start,
                        after,
                        before,
                        batches_latest,
                        latest_synced_obj_time,
                        stats_dict["num_val_created"],
                        stats_dict["num_val_updated"],
                        stats_dict["num_val_ignored"],
                        cursor,
                    )
                )

                return (
                    stats_dict["num_val_created"],
                    stats_dict["num_val_updated"],
                    stats_dict["num_val_ignored"],
                    stats_dict["num_path_created"],
                    stats_dict["num_path_updated"],
                    stats_dict["num_path_ignored"],
                )

            if batches_latest is not None and (
                latest_synced_obj_time is None
                or json_date_to_datetime(latest_synced_obj_time) <= json_date_to_datetime(batches_latest)
            ):
                latest_synced_obj_time = batches_latest

            self._mark_poll_results_sync_completed(poll, org, latest_synced_obj_time)

            # from django.db import connection as db_connection, reset_queries
            # slowest_queries = sorted(db_connection.queries, key=lambda q: q['time'], reverse=True)[:10]
            # for q in slowest_queries:
            #     print "=" * 60
            #     print "\n\n\n"
            #     print "%s -- %s" % (q['time'], q['sql'])
            # reset_queries()

            logger.info(
                "Finished pulling results for poll #%d on org #%d runs in %ds, "
                "Times: sync_latest= %s,"
                "Objects: created %d, updated %d, ignored %d"
                % (
                    poll.pk,
                    org.pk,
                    time.time() - start,
                    latest_synced_obj_time,
                    stats_dict["num_val_created"],
                    stats_dict["num_val_updated"],
                    stats_dict["num_val_ignored"],
                )
            )

    return (
        stats_dict["num_val_created"],
        stats_dict["num_val_updated"],
        stats_dict["num_val_ignored"],
        stats_dict["num_path_created"],
        stats_dict["num_path_updated"],
        stats_dict["num_path_ignored"],
    )
def pull_results(self, poll, modified_after, modified_before, progress_callback=None):
    org = poll.org
    r = get_redis_connection()
    key = Poll.POLL_PULL_RESULTS_TASK_LOCK % (org.pk, poll.flow_uuid)

    stats_dict = dict(
        num_val_created=0,
        num_val_updated=0,
        num_val_ignored=0,
        num_path_created=0,
        num_path_updated=0,
        num_path_ignored=0,
        num_synced=0,
    )

    if poll.stopped_syncing:
        return (
            stats_dict["num_val_created"],
            stats_dict["num_val_updated"],
            stats_dict["num_val_ignored"],
            stats_dict["num_path_created"],
            stats_dict["num_path_updated"],
            stats_dict["num_path_ignored"],
        )

    if r.get(key):
        logger.info(
            "Skipping pulling results for poll #%d on org #%d as it is still running" % (poll.pk, org.pk))
    else:
        with r.lock(key, timeout=Poll.POLL_SYNC_LOCK_TIMEOUT):
            lock_expiration = time.time() + 0.8 * Poll.POLL_SYNC_LOCK_TIMEOUT

            client = self._get_client(org, 2)

            questions_uuids = poll.get_question_uuids()

            # ignore the TaskState time and use the time we stored in redis
            (
                after,
                before,
                latest_synced_obj_time,
                batches_latest,
                resume_cursor,
                pull_after_delete,
            ) = poll.get_pull_cached_params()

            if pull_after_delete is not None:
                after = None
                latest_synced_obj_time = None
                batches_latest = None
                resume_cursor = None
                poll.delete_poll_results()
                pull_refresh_from_archives.apply_async((poll.pk,), queue="sync")

            if resume_cursor is None:
                before = datetime_to_json_date(timezone.now())
                after = latest_synced_obj_time

            start = time.time()
            logger.info("Start fetching runs for poll #%d on org #%d" % (poll.pk, org.pk))

            poll_runs_query = client.get_runs(flow=poll.flow_uuid, after=after, before=before)
            fetches = poll_runs_query.iterfetches(
                retry_on_rate_exceed=True, resume_cursor=resume_cursor)

            try:
                fetch_start = time.time()
                for fetch in fetches:
                    logger.info("RapidPro API fetch for poll #%d "
                                "on org #%d %d - %d took %ds" % (
                                    poll.pk,
                                    org.pk,
                                    stats_dict["num_synced"],
                                    stats_dict["num_synced"] + len(fetch),
                                    time.time() - fetch_start,
                                ))

                    contacts_map, poll_results_map, poll_results_to_save_map = self._initiate_lookup_maps(
                        fetch, org, poll)

                    for temba_run in fetch:
                        if batches_latest is None or temba_run.modified_on > json_date_to_datetime(
                                batches_latest):
                            batches_latest = datetime_to_json_date(
                                temba_run.modified_on.replace(tzinfo=pytz.utc))

                        contact_obj = contacts_map.get(temba_run.contact.uuid, None)
                        self._process_run_poll_results(
                            org,
                            questions_uuids,
                            temba_run,
                            contact_obj,
                            poll_results_map,
                            poll_results_to_save_map,
                            stats_dict,
                        )

                    stats_dict["num_synced"] += len(fetch)
                    if progress_callback:
                        progress_callback(stats_dict["num_synced"])

                    self._save_new_poll_results_to_database(poll_results_to_save_map)

                    logger.info("Processed fetch of %d - %d "
                                "runs for poll #%d on org #%d" % (
                                    stats_dict["num_synced"] - len(fetch),
                                    stats_dict["num_synced"], poll.pk, org.pk))
                    fetch_start = time.time()
                    logger.info("=" * 40)

                    if (stats_dict["num_synced"] >= Poll.POLL_RESULTS_MAX_SYNC_RUNS
                            or time.time() > lock_expiration):
                        poll.rebuild_poll_results_counts()

                        cursor = fetches.get_cursor()
                        self._mark_poll_results_sync_paused(
                            org, poll, cursor, after, before, batches_latest)

                        logger.info(
                            "Break pull results for poll #%d on org #%d in %ds, "
                            " Times: after= %s, before= %s, batch_latest= %s, sync_latest= %s"
                            " Objects: created %d, updated %d, ignored %d. "
                            "Before cursor %s" % (
                                poll.pk,
                                org.pk,
                                time.time() - start,
                                after,
                                before,
                                batches_latest,
                                latest_synced_obj_time,
                                stats_dict["num_val_created"],
                                stats_dict["num_val_updated"],
                                stats_dict["num_val_ignored"],
                                cursor,
                            ))

                        return (
                            stats_dict["num_val_created"],
                            stats_dict["num_val_updated"],
                            stats_dict["num_val_ignored"],
                            stats_dict["num_path_created"],
                            stats_dict["num_path_updated"],
                            stats_dict["num_path_ignored"],
                        )
            except TembaRateExceededError:
                poll.rebuild_poll_results_counts()

                cursor = fetches.get_cursor()
                self._mark_poll_results_sync_paused(
                    org, poll, cursor, after, before, batches_latest)

                logger.info(
                    "Break pull results for poll #%d on org #%d in %ds, "
                    " Times: after= %s, before= %s, batch_latest= %s, sync_latest= %s"
                    " Objects: created %d, updated %d, ignored %d. "
                    "Before cursor %s" % (
                        poll.pk,
                        org.pk,
                        time.time() - start,
                        after,
                        before,
                        batches_latest,
                        latest_synced_obj_time,
                        stats_dict["num_val_created"],
                        stats_dict["num_val_updated"],
                        stats_dict["num_val_ignored"],
                        cursor,
                    ))

                return (
                    stats_dict["num_val_created"],
                    stats_dict["num_val_updated"],
                    stats_dict["num_val_ignored"],
                    stats_dict["num_path_created"],
                    stats_dict["num_path_updated"],
                    stats_dict["num_path_ignored"],
                )

            if batches_latest is not None and (
                    latest_synced_obj_time is None
                    or json_date_to_datetime(latest_synced_obj_time) <= json_date_to_datetime(batches_latest)):
                latest_synced_obj_time = batches_latest

            self._mark_poll_results_sync_completed(poll, org, latest_synced_obj_time)

            # from django.db import connection as db_connection, reset_queries
            # slowest_queries = sorted(db_connection.queries, key=lambda q: q['time'], reverse=True)[:10]
            # for q in slowest_queries:
            #     print "=" * 60
            #     print "\n\n\n"
            #     print "%s -- %s" % (q['time'], q['sql'])
            # reset_queries()

            logger.info(
                "Finished pulling results for poll #%d on org #%d runs in %ds, "
                "Times: sync_latest= %s,"
                "Objects: created %d, updated %d, ignored %d" % (
                    poll.pk,
                    org.pk,
                    time.time() - start,
                    latest_synced_obj_time,
                    stats_dict["num_val_created"],
                    stats_dict["num_val_updated"],
                    stats_dict["num_val_ignored"],
                ))

    return (
        stats_dict["num_val_created"],
        stats_dict["num_val_updated"],
        stats_dict["num_val_ignored"],
        stats_dict["num_path_created"],
        stats_dict["num_path_updated"],
        stats_dict["num_path_ignored"],
    )
def pull_results(self, poll, modified_after, modified_before, progress_callback=None):
    org = poll.org
    r = get_redis_connection()
    key = Poll.POLL_PULL_RESULTS_TASK_LOCK % (org.pk, poll.pk)

    num_created = 0
    num_updated = 0
    num_ignored = 0

    num_synced = 0

    if r.get(key):
        print "Skipping pulling results for poll #%d on org #%d as it is still running" % (
            poll.pk, org.pk)
    else:
        with r.lock(key):
            client = self._get_client(org, 2)

            # ignore the TaskState time and use the time we stored in redis
            now = timezone.now()
            after = cache.get(
                PollResult.POLL_RESULTS_LAST_PULL_CACHE_KEY % (org.pk, poll.pk), None)

            pull_after_delete = cache.get(
                Poll.POLL_PULL_ALL_RESULTS_AFTER_DELETE_FLAG % (org.pk, poll.pk), None)
            if pull_after_delete is not None:
                after = None
                poll.delete_poll_results()

            start = time.time()
            print "Start fetching runs for poll #%d on org #%d" % (poll.pk, org.pk)

            poll_runs_query = client.get_runs(
                flow=poll.flow_uuid, responded=True, after=after, before=now)
            fetches = poll_runs_query.iterfetches(retry_on_rate_exceed=True)

            existing_poll_results = PollResult.objects.filter(
                flow=poll.flow_uuid, org=poll.org_id)

            poll_results_map = defaultdict(dict)
            for res in existing_poll_results:
                poll_results_map[res.contact][res.ruleset] = res

            poll_results_to_save_map = defaultdict(dict)

            fetch_start = time.time()
            for fetch in fetches:
                print "RapidPro API fetch for poll #%d on org #%d %d - %d took %ds" % (
                    poll.pk, org.pk, num_synced, num_synced + len(fetch),
                    time.time() - fetch_start)

                contact_uuids = [run.contact.uuid for run in fetch]
                contacts = Contact.objects.filter(org=org, uuid__in=contact_uuids)
                contacts_map = {c.uuid: c for c in contacts}

                for temba_run in fetch:
                    flow_uuid = temba_run.flow.uuid
                    contact_uuid = temba_run.contact.uuid
                    completed = temba_run.exit_type == 'completed'

                    contact_obj = contacts_map.get(contact_uuid, None)

                    state = ''
                    district = ''
                    ward = ''
                    if contact_obj is not None:
                        state = contact_obj.state
                        district = contact_obj.district
                        ward = contact_obj.ward

                    for temba_step in temba_run.steps:
                        ruleset_uuid = temba_step.node
                        category = temba_step.category
                        text = temba_step.text

                        existing_poll_result = poll_results_map.get(
                            contact_uuid, dict()).get(ruleset_uuid, None)
                        poll_result_to_save = poll_results_to_save_map.get(
                            contact_uuid, dict()).get(ruleset_uuid, None)

                        if existing_poll_result is not None:
                            update_required = existing_poll_result.category != category or existing_poll_result.text != text
                            update_required = update_required or existing_poll_result.state != state
                            update_required = update_required or existing_poll_result.district != district
                            update_required = update_required or existing_poll_result.ward != ward
                            update_required = update_required or existing_poll_result.completed != completed

                            # if the reporter answered the step, check if this is a newer run
                            if existing_poll_result.date is not None:
                                update_required = update_required and (
                                    temba_step.left_on is None or
                                    temba_step.arrived_on > existing_poll_result.date)

                            if update_required:
                                PollResult.objects.filter(pk=existing_poll_result.pk).update(
                                    category=category, text=text, state=state,
                                    district=district, ward=ward,
                                    date=temba_step.left_on, completed=completed)
                                num_updated += 1
                            else:
                                num_ignored += 1

                        elif poll_result_to_save is not None:
                            replace_save_map = poll_result_to_save.category != category or poll_result_to_save.text != text
                            replace_save_map = replace_save_map or poll_result_to_save.state != state
                            replace_save_map = replace_save_map or poll_result_to_save.district != district
                            replace_save_map = replace_save_map or poll_result_to_save.ward != ward
                            replace_save_map = replace_save_map or poll_result_to_save.completed != completed

                            # replace if the step is newer
                            if poll_result_to_save.date is not None:
                                replace_save_map = replace_save_map and (
                                    temba_step.left_on is None or
                                    temba_step.arrived_on > poll_result_to_save.date)

                            if replace_save_map:
                                result_obj = PollResult(
                                    org=org, flow=flow_uuid, ruleset=ruleset_uuid,
                                    contact=contact_uuid, category=category, text=text,
                                    state=state, district=district, ward=ward,
                                    date=temba_step.left_on, completed=completed)

                                poll_results_to_save_map[contact_uuid][ruleset_uuid] = result_obj

                            num_ignored += 1
                        else:
                            result_obj = PollResult(
                                org=org, flow=flow_uuid, ruleset=ruleset_uuid,
                                contact=contact_uuid, category=category, text=text,
                                state=state, district=district, ward=ward,
                                date=temba_step.left_on, completed=completed)

                            poll_results_to_save_map[contact_uuid][ruleset_uuid] = result_obj

                            num_created += 1

                num_synced += len(fetch)
                if progress_callback:
                    progress_callback(num_synced)

                print "Processed fetch of %d - %d runs for poll #%d on org #%d" % (
                    num_synced - len(fetch), num_synced, poll.pk, org.pk)
                fetch_start = time.time()
                print "=" * 40

            new_poll_results = []
            for c_key in poll_results_to_save_map.keys():
                for r_key in poll_results_to_save_map.get(c_key, dict()):
                    obj_to_create = poll_results_to_save_map.get(c_key, dict()).get(r_key, None)
                    if obj_to_create is not None:
                        new_poll_results.append(obj_to_create)

            PollResult.objects.bulk_create(new_poll_results)

            # update the time for this poll from which we fetch next time
            cache.set(
                PollResult.POLL_RESULTS_LAST_PULL_CACHE_KEY % (org.pk, poll.pk),
                datetime_to_json_date(now.replace(tzinfo=pytz.utc)), None)

            # from django.db import connection as db_connection, reset_queries
            # slowest_queries = sorted(db_connection.queries, key=lambda q: q['time'], reverse=True)[:10]
            # for q in slowest_queries:
            #     print "=" * 60
            #     print "\n\n\n"
            #     print "%s -- %s" % (q['time'], q['sql'])
            # reset_queries()

            print "Finished pulling results for poll #%d on org #%d runs in %ds, " \
                  "created %d, updated %d, ignored %d" % (poll.pk, org.pk, time.time() - start,
                                                          num_created, num_updated, num_ignored)

    return num_created, num_updated, num_ignored
def pull_results(self, poll, modified_after, modified_before, progress_callback=None):
    org = poll.org
    r = get_redis_connection()
    key = Poll.POLL_PULL_RESULTS_TASK_LOCK % (org.pk, poll.pk)

    num_created = 0
    num_updated = 0
    num_ignored = 0

    num_synced = 0

    if r.get(key):
        print "Skipping pulling results for poll #%d on org #%d as it is still running" % (poll.pk, org.pk)
    else:
        with r.lock(key):
            client = self._get_client(org, 2)

            # ignore the TaskState time and use the time we stored in redis
            now = timezone.now()
            after = cache.get(PollResult.POLL_RESULTS_LAST_PULL_CACHE_KEY % (org.pk, poll.pk), None)

            pull_after_delete = cache.get(Poll.POLL_PULL_ALL_RESULTS_AFTER_DELETE_FLAG % (org.pk, poll.pk), None)
            if pull_after_delete is not None:
                after = None
                poll.delete_poll_results()

            start = time.time()
            print "Start fetching runs for poll #%d on org #%d" % (poll.pk, org.pk)

            poll_runs_query = client.get_runs(flow=poll.flow_uuid, responded=True, after=after, before=now)
            fetches = poll_runs_query.iterfetches(retry_on_rate_exceed=True)

            existing_poll_results = PollResult.objects.filter(flow=poll.flow_uuid, org=poll.org_id)

            poll_results_map = defaultdict(dict)
            for res in existing_poll_results:
                poll_results_map[res.contact][res.ruleset] = res

            poll_results_to_save_map = defaultdict(dict)

            fetch_start = time.time()
            for fetch in fetches:
                print "RapidPro API fetch for poll #%d on org #%d %d - %d took %ds" % (
                    poll.pk,
                    org.pk,
                    num_synced,
                    num_synced + len(fetch),
                    time.time() - fetch_start,
                )

                contact_uuids = [run.contact.uuid for run in fetch]
                contacts = Contact.objects.filter(org=org, uuid__in=contact_uuids)
                contacts_map = {c.uuid: c for c in contacts}

                for temba_run in fetch:
                    flow_uuid = temba_run.flow.uuid
                    contact_uuid = temba_run.contact.uuid
                    completed = temba_run.exit_type == "completed"

                    contact_obj = contacts_map.get(contact_uuid, None)

                    state = ""
                    district = ""
                    ward = ""
                    if contact_obj is not None:
                        state = contact_obj.state
                        district = contact_obj.district
                        ward = contact_obj.ward

                    for temba_step in temba_run.steps:
                        ruleset_uuid = temba_step.node
                        category = temba_step.category
                        text = temba_step.text

                        existing_poll_result = poll_results_map.get(contact_uuid, dict()).get(ruleset_uuid, None)
                        poll_result_to_save = poll_results_to_save_map.get(contact_uuid, dict()).get(
                            ruleset_uuid, None
                        )

                        if existing_poll_result is not None:
                            update_required = (
                                existing_poll_result.category != category or existing_poll_result.text != text
                            )
                            update_required = update_required or existing_poll_result.state != state
                            update_required = update_required or existing_poll_result.district != district
                            update_required = update_required or existing_poll_result.ward != ward
                            update_required = update_required or existing_poll_result.completed != completed

                            # if the reporter answered the step, check if this is a newer run
                            if existing_poll_result.date is not None:
                                update_required = update_required and (
                                    temba_step.left_on is None
                                    or temba_step.arrived_on > existing_poll_result.date
                                )

                            if update_required:
                                PollResult.objects.filter(pk=existing_poll_result.pk).update(
                                    category=category,
                                    text=text,
                                    state=state,
                                    district=district,
                                    ward=ward,
                                    date=temba_step.left_on,
                                    completed=completed,
                                )
                                num_updated += 1
                            else:
                                num_ignored += 1

                        elif poll_result_to_save is not None:
                            replace_save_map = (
                                poll_result_to_save.category != category or poll_result_to_save.text != text
                            )
                            replace_save_map = replace_save_map or poll_result_to_save.state != state
                            replace_save_map = replace_save_map or poll_result_to_save.district != district
                            replace_save_map = replace_save_map or poll_result_to_save.ward != ward
                            replace_save_map = replace_save_map or poll_result_to_save.completed != completed

                            # replace if the step is newer
                            if poll_result_to_save.date is not None:
                                replace_save_map = replace_save_map and (
                                    temba_step.left_on is None
                                    or temba_step.arrived_on > poll_result_to_save.date
                                )

                            if replace_save_map:
                                result_obj = PollResult(
                                    org=org,
                                    flow=flow_uuid,
                                    ruleset=ruleset_uuid,
                                    contact=contact_uuid,
                                    category=category,
                                    text=text,
                                    state=state,
                                    district=district,
                                    ward=ward,
                                    date=temba_step.left_on,
                                    completed=completed,
                                )

                                poll_results_to_save_map[contact_uuid][ruleset_uuid] = result_obj

                            num_ignored += 1
                        else:
                            result_obj = PollResult(
                                org=org,
                                flow=flow_uuid,
                                ruleset=ruleset_uuid,
                                contact=contact_uuid,
                                category=category,
                                text=text,
                                state=state,
                                district=district,
                                ward=ward,
                                date=temba_step.left_on,
                                completed=completed,
                            )

                            poll_results_to_save_map[contact_uuid][ruleset_uuid] = result_obj

                            num_created += 1

                num_synced += len(fetch)
                if progress_callback:
                    progress_callback(num_synced)

                print "Processed fetch of %d - %d runs for poll #%d on org #%d" % (
                    num_synced - len(fetch),
                    num_synced,
                    poll.pk,
                    org.pk,
                )
                fetch_start = time.time()
                print "=" * 40

            new_poll_results = []
            for c_key in poll_results_to_save_map.keys():
                for r_key in poll_results_to_save_map.get(c_key, dict()):
                    obj_to_create = poll_results_to_save_map.get(c_key, dict()).get(r_key, None)
                    if obj_to_create is not None:
                        new_poll_results.append(obj_to_create)

            PollResult.objects.bulk_create(new_poll_results)

            # update the time for this poll from which we fetch next time
            cache.set(
                PollResult.POLL_RESULTS_LAST_PULL_CACHE_KEY % (org.pk, poll.pk),
                datetime_to_json_date(now.replace(tzinfo=pytz.utc)),
                None,
            )

            # from django.db import connection as db_connection, reset_queries
            # slowest_queries = sorted(db_connection.queries, key=lambda q: q['time'], reverse=True)[:10]
            # for q in slowest_queries:
            #     print "=" * 60
            #     print "\n\n\n"
            #     print "%s -- %s" % (q['time'], q['sql'])
            # reset_queries()

            print "Finished pulling results for poll #%d on org #%d runs in %ds, " \
                  "created %d, updated %d, ignored %d" % (
                      poll.pk,
                      org.pk,
                      time.time() - start,
                      num_created,
                      num_updated,
                      num_ignored,
                  )

    return num_created, num_updated, num_ignored