def get_agency_ids(self):
    # return the memoized agency id list if it is less than a day old
    if self.memoized_agency_ids_for_search is not None and \
            self.last_memoized_time > dt.datetime.now() - dt.timedelta(days=1):
        return self.memoized_agency_ids_for_search
    else:
        self.last_memoized_time = dt.datetime.now()
        query = {
            "query": {
                "bool": {
                    "must": {
                        "term": {
                            "type": "federal_executive"
                        }
                    }
                }
            },
            # n.b. 1000 is arbitrary but needs to be set higher than the total
            # number of federal agencies (currently ~400)
            "size": 1000,
            "_source": {
                "include": ["id"]
            }
        }
        agency_ids = [a['id'] for a in jsearch.query_records(query, 'agencies')]
        self.memoized_agency_ids_for_search = agency_ids
        return self.memoized_agency_ids_for_search

def get_entity_by_ids_and_type(entity_type, entity_ids):
    if entity_type not in ALLOWED_ENTITY_TYPES:
        return {"errors": "invalid type"}
    if not isinstance(entity_ids, list):
        return {"errors": "invalid type"}
    query = {"query": {"bool": {"must": {"terms": {"id": entity_ids}}}}}
    return jsearch.query_records(query, entity_type)

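
# Illustrative sketch, not part of the original module: demonstrates the validation behavior
# of get_entity_by_ids_and_type above. The entity ids are hypothetical, and the example assumes
# ALLOWED_ENTITY_TYPES contains 'agencies' and a seeded Elasticsearch index.
def _example_get_entity_by_ids_and_type():
    # a well-formed call: a whitelisted type and a list of ids
    hits = get_entity_by_ids_and_type('agencies', [466, 188])

    # both guard clauses return the same error payload rather than raising
    assert get_entity_by_ids_and_type('not_a_type', [1]) == {"errors": "invalid type"}
    assert get_entity_by_ids_and_type('agencies', 466) == {"errors": "invalid type"}

    return hits
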
def get_entity_by_name(type, name):
    full_request = filtered_request_template()
    full_request["query"] = {
        "match": {
            "name": name,
        }
    }
    return jsearch.query_records(full_request, type)[0]

def load_named_clusters(self):
    self.named_clusters = {}
    act_query = {
        "query": {
            "ids": {
                "values": self.cited_records('acts')
            }
        },
        "size": 1000
    }
    act_hits = jsearch.query_records(act_query, 'acts')
    self.named_clusters.update({"acts_{}".format(x['id']): x for x in act_hits})

    reg_query = {"query": {"ids": {"values": self.cited_records('named')}}}
    reg_hits = jsearch.query_records(reg_query, 'named_regulations')
    self.named_clusters.update(
        {"named_regulations_{}".format(x['id']): x for x in reg_hits})

def load_documents_map(self):
    query = {
        "query": {
            "ids": {
                "values": self.cited_records('documents')
            }
        },
        "size": 1000
    }
    self.docs_map = {
        "documents_{}".format(x['id']): x
        for x in jsearch.query_records(query, 'documents')
    }

def get_state_by_short_name(type, short_name):
    full_request = {
        "query": {
            "bool": {
                "must": {
                    "match": {
                        "short_name": short_name,
                    }
                }
            }
        }
    }
    return jsearch.query_records(full_request, type)[0]

def get_most_popular_docs(params):
    num_queries = params.get("num_queries", 5)  # n.b. 5 is an arbitrary default

    # use thirty days ago as the lower bound of the time range for popularity
    # n.b. bookmarking could potentially impact this - but that is another potential way to
    # gauge popularity, so that should be ok
    thirty_days_ago = dt.datetime.now() - dt.timedelta(days=30)
    most_popular_docs = db_session_users.query(UserDocument.doc_id,
                                               func.count(UserDocument.doc_id).label('total'))\
        .filter_by(read=True).filter(UserDocument.user_id.in_(get_external_user_id_subquery()))\
        .filter(UserDocument.updated_at > thirty_days_ago).group_by(UserDocument.doc_id)\
        .order_by(text('total desc')).limit(num_queries).all()

    # retrieve the document titles so we can include them in the payload with just one extra ES call
    query = {
        "query": {
            "bool": {
                "must": {
                    "terms": {
                        "id": [d[0] for d in most_popular_docs]
                    }
                }
            }
        },
        "_source": {
            "include": ["id", "title"]
        }
    }
    docs_with_titles = jsearch.query_records(query, 'documents')

    # create a map of the doc titles so we can easily tack them onto the id/count below
    doc_id_title_map = {d['id']: d['title'] for d in docs_with_titles}

    return {
        "popular_docs": [{
            "doc_id": d[0],
            "title": doc_id_title_map[d[0]],
            "count": d[1]
        } for d in most_popular_docs]
    }

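
# Illustrative sketch, not part of the original module: the shape of the payload returned by
# get_most_popular_docs, with hypothetical ids, titles, and counts. Entries arrive sorted by
# read count descending, capped at params["num_queries"] (default 5).
EXAMPLE_POPULAR_DOCS_PAYLOAD = {
    "popular_docs": [
        {"doc_id": 240394, "title": "Hypothetical Final Rule", "count": 42},
        {"doc_id": 118201, "title": "Hypothetical Proposed Rule", "count": 17},
    ]
}
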
def decorate_documents_with_filter_mentions(docs, act_id_list, regulation_id_list, concept_id_list,
                                            bank_id_list, limit):
    filter_term_hash = None
    if len(act_id_list) > 0:
        filter_term_hash = {"act_id": act_id_list[0]}
    elif len(regulation_id_list) > 0:
        filter_term_hash = {"named_regulation_id": regulation_id_list[0]}
    elif len(concept_id_list) > 0:
        filter_term_hash = {"concept_id": concept_id_list[0]}
    elif len(bank_id_list) > 0:
        filter_term_hash = {"bank_id": bank_id_list[0]}
    # n.b. agency is skipped here because it is less about resolving mentions and more about
    # recording where the data came from

    if filter_term_hash is not None:
        doc_ids = [d['id'] for d in docs]
        doc_id_index_map = {v: i for i, v in enumerate(doc_ids)}

        doc_cite_request = filtered_request_template()
        doc_cite_request["size"] = limit
        doc_cite_request["_source"] = {"include": ["doc_id", "mentions"]}
        doc_cite_request["query"]["bool"] = {
            "must": [
                {
                    "terms": {
                        "doc_id": doc_ids,
                    }
                },
                {
                    "term": filter_term_hash
                }
            ]
        }
        doc_citations_results = jsearch.query_records(doc_cite_request, doc_type='document_citations')

        for dc_result in doc_citations_results:
            result_array_index = doc_id_index_map[dc_result['doc_id']]
            docs[result_array_index]['mentions_for_filter'] = dc_result['mentions']

    return docs

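
# Illustrative sketch, not part of the original module: the effect of
# decorate_documents_with_filter_mentions when an act filter is present. The ids and mention
# text below are hypothetical.
def _example_decorate_documents_with_filter_mentions():
    docs = [{'id': 101, 'title': 'Some Doc'}, {'id': 102, 'title': 'Another Doc'}]
    # with act_id_list=[7], each doc with a matching document_citations record gains a
    # 'mentions_for_filter' key, e.g.:
    #   {'id': 101, 'title': 'Some Doc', 'mentions_for_filter': ['...cited passage...']}
    return decorate_documents_with_filter_mentions(docs, [7], [], [], [], limit=20)
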
def get_filtered_agencies(params):
    query = {}
    following = params.get('following', None)
    search_filter = params.get('search_filter', None)

    if following:
        followed = get_followed_agency_ids_with_backoff(g.user_id)
        following = str_to_bool(following)
        if following:
            query = {"query": {"bool": {"must": es_filter(followed, "id")}}}
        else:
            query = {"query": {"bool": {"must_not": es_filter(followed, "id")}}}
    elif search_filter:
        query = {"query": {"bool": {"must": es_filter("true", "active")}}}

    query["size"] = 500
    query["_source"] = {"include": INCLUDED_AGENCY_FIELDS}
    return jsearch.query_records(query, 'agencies')

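
# Illustrative sketch, not part of the original module: the three query modes of
# get_filtered_agencies. Assumes a Flask request context where g.user_id is set.
def _example_get_filtered_agencies():
    followed = get_filtered_agencies({'following': 'true'})      # only agencies the user follows
    unfollowed = get_filtered_agencies({'following': 'false'})   # only agencies the user does not follow
    active = get_filtered_agencies({'search_filter': 'active'})  # only active agencies
    return followed, unfollowed, active
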
def activate_user(params):
    email = params.get('email')
    token = params.get('token')
    new_password = params.get('new_password')
    first_name = params.get('first_name')
    last_name = params.get('last_name')
    is_contributor = params.get('is_contributor')
    dry_run = params.get('dry_run', False)  # validate token, email, enabled state only
    linkedin_id = params.get('linkedin_id')
    google_id = params.get('google_id')
    enabled = params.get('enabled')

    # params to go into json field in db
    json_params = [
        'agencies', 'state_agencies', 'other_agencies', 'other_state_agencies',
        'other_topics', 'user_style'
    ]

    def error_response(msg='Invalid email or token', code=400):
        response = jsonify({
            'error': msg,
        })
        response.status_code = code
        return response

    # the confirmation_required variable tracks whether this activation is sourced from a
    # marketing campaign, a signup without a token, or the invite -> activate flow.
    # use confirmation_required to indicate we need to send a confirmation email later on
    confirmation_required = False
    marketing_campaign = db_session_users.query(MarketingCampaign).filter_by(token=token).first()
    if marketing_campaign is not None or token is None:
        confirmation_required = True
    else:
        if email is None:
            return error_response()
        else:
            email = email.lower()
            g.user_email = email

        user = db_session_users.query(User).filter_by(email=email).scalar()

        if user is None:
            return error_response()

        if dry_run and user.enabled:
            return error_response('User is already enabled')

        enabled_at_start = user.enabled

        if not user.reset_token or user.reset_token != token:
            # send an email to support, but only if the user is in the db to prevent spamming
            if dry_run:
                template_vars = {
                    'email': email,
                }
                email_helper.send_email(
                    '*****@*****.**',
                    '*****@*****.**',
                    'A user attempted to use an invalid token during activation',
                    template='activate-fail',
                    vars=template_vars,
                )
            return error_response()

        if dry_run:
            return jsonify({'marketing_campaign': marketing_campaign is not None})

    if not new_password:
        return error_response('Missing fields')

    # for the marketing campaign approach, create an entry in the users table;
    # for the invite-based registration approach, mark the user enabled
    if confirmation_required:
        email = email.lower()
        g.user_email = email

        # check if this user exists in the database (the invite use-case), so we can use the
        # existing entry if so and create a new entry if not
        user = db_session_users.query(User).filter_by(email=email).first()

        # this is for when a user comes to our site without being invited through the admin tool
        if user is None:
            user = User({
                'email': email,
                'first_name': first_name,
                'last_name': last_name,
                'password': new_password,
                'enabled': False,
            })
        # this is for when the user is instead invited to our site, but then instead of trying
        # to enter via the invitation link, they use the regular user signup flow. they will now
        # get the confirmation email and have to fully activate their account there
        else:
            # triple check to prevent any shenanigans for enabled users, or user accounts
            # that somehow exist but were not invited, and also if the invite has already been
            # skipped and we have successfully moved onto the confirmation step
            # n.b. relying on hash values is a little funky here, but it seems to work
            if user.enabled or "invited_by" not in user.properties or "invite_skipped" in user.properties:
                return error_response()

            user.properties["invite_skipped"] = True  # n.b. record that the invite workflow was skipped
            user.first_name = first_name
            user.last_name = last_name
            user.update_password(new_password)

        if linkedin_id:
            user.linkedin = linkedin_id
            user.industry = params.get('industry')
            user.company = params.get('company')
            user.properties['linkedin_data'] = params.get('linkedin_data')
            user.enabled = enabled
            user.properties['confirmed_date'] = datetime.datetime.utcnow().isoformat()

        if google_id:
            user.google_id = google_id
            user.enabled = enabled
            user.properties['confirmed_date'] = datetime.datetime.utcnow().isoformat()

        # mark internal users with the internal user flag so we can differentiate user types
        # when calculating various stats
        if email.endswith("@jurispect.com") or email.endswith("@compliance.ai"):
            user.is_internal_user = True

        if marketing_campaign is not None:
            user.marketing_campaigns.append(marketing_campaign)

        user.gen_reset_token()
        enabled_at_start = False

        try:
            _send_activation_email('confirm', user)
        except SMTPException:
            db_session_users.rollback()
            return error_response('Could not send email', code=500)
    else:
        user.enabled = True
        user.update_password(new_password)
        if first_name:
            user.first_name = first_name
        if last_name:
            user.last_name = last_name
        # only allow the token to be used once:
        user.reset_token = None

    new_props = {p: params[p] for p in json_params if params.get(p)}

    # n.b. since this route is shared with password resets, we need to skip updating the
    # activation time when it is a password reset action
    if not enabled_at_start:
        new_props['activation_time'] = datetime.datetime.utcnow().isoformat()

    if not params.get('user_style') and email.endswith('@firstrepublic.com'):
        new_props['user_style'] = 'first-republic'

    if len(new_props) > 0:
        user.properties = merge_two_dicts(user.properties, new_props)

    if is_contributor:
        user.roles = ['contributor']

    # FIXME: this is needed for marketing-campaign sourced users but yields a double commit
    # probably not super noticeable, but should fix if we have the time
    db_session_users.add(user)
    try:
        db_session_users.commit()
    except IntegrityError:
        return error_response()
    db_session_users.refresh(user)

    topic_ids = []
    topic_ids.extend(params.get('topics', AggregatedAnnotations.topic_id_name_mapping.keys()))
    for topic_id in topic_ids:
        userTopic = UserTopic({
            'user_id': user.id,
            'topic_id': topic_id,
            'following': True
        })
        db_session_users.add(userTopic)

    news_ids = [x['id'] for x in jsearch.query_records({'size': 1000}, doc_type='news_sources')]
    for news_id in news_ids:
        userFollowedEntity = UserFollowedEntity({
            'user_id': user.id,
            'entity_id': news_id,
            'entity_type': 'news_sources',
            'following': True
        })
        db_session_users.add(userFollowedEntity)

    agency_ids = []
    agency_ids.extend(params.get('agencies', []))
    new_ids = []
    # verify that the agency ids are correct
    # using DefaultAgenciesToFollowAtSignup since users now skip onboarding
    for agency_id in DefaultAgenciesToFollowAtSignup:
        try:
            agency = jsearch.get_record(agency_id, 'agencies')
            new_ids.append(agency['id'])
        except NotFoundError:
            pass

    for agency_id in new_ids:
        user_agency = UserAgency({'user_id': user.id, 'agency_id': agency_id, 'following': True})
        db_session_users.add(user_agency)

    state_jurisdictions = []
    state_jurisdictions.extend(params.get('state_agencies', []))
    state_ids = []
    # get selected state jurisdiction ids and add them to the followed entity table
    for state_jurisdiction in state_jurisdictions:
        try:
            state = get_state_by_short_name('jurisdictions', state_jurisdiction)
            state_ids.append(state['id'])
        except NotFoundError:
            pass
    updated_followed_entity(user.id, {
        'entities': [{
            'entity_id': state_id,
            'entity_type': 'jurisdictions',
            'following': True
        } for state_id in state_ids]
    })

    # send a support mail if the user requests a new source
    other_agencies = params.get('other_agencies', '')
    other_state_agencies = params.get('other_state_agencies', '')
    other_topics = params.get('other_topics', '')
    if other_agencies or other_state_agencies or other_topics:
        template_vars = {
            'other_agencies': other_agencies,
            'other_state_agencies': other_state_agencies,
            'other_topics': other_topics,
            'name': '%s %s' % (first_name, last_name),
            'email': email,
        }
        email_helper.send_email(
            '*****@*****.**',
            '*****@*****.**',
            'A new user has requested additional sources or topics',
            template='additional-sources',
            vars=template_vars,
        )

    try:
        db_session_users.commit()
    except IntegrityError:
        return error_response()

    # start free trials
    user = db_session_users.query(User).filter_by(email=email.lower()).first()
    latest_subscription = db_session_users.query(Subscription).filter_by(user_id=user.id, latest=True).first()
    if latest_subscription is None:
        # new users with a .edu email get a 120 month free trial
        if user.email.endswith('.edu'):
            subscribe_users_to_plan([user.id], 'free_trial_120months')
        # all other users get a 1 month free trial
        else:
            start_free_trial(user.id)

    create_folder(user.id, {'name': 'Read'})
    create_folder(user.id, {'name': 'Bookmarked'})

    if confirmation_required:
        # special case login for unenabled marketing campaign users: allow access for 7 days only
        expiration_datetime = datetime.datetime.utcnow() + datetime.timedelta(days=7)
        token = jwt.encode({'user_id': user.id, 'exp': expiration_datetime}, SECRET_JWT)
        # the 'is_new' flag indicates whether the user is returning or just registered;
        # True means just registered
        return jsonify({"jwt_token": token, "is_new": True, 'email': email.lower()})
    else:
        # return empty if the user is not from a marketing campaign
        return jsonify({})

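
# Illustrative sketch, not part of the original module: two hypothetical payloads for
# activate_user. A dry run only validates the email/token/enabled state; a full call needs at
# least a new_password (plus a token, unless the signup comes from a marketing campaign).
EXAMPLE_ACTIVATE_DRY_RUN_PARAMS = {
    'email': 'user@example.com',
    'token': 'hypothetical-reset-token',
    'dry_run': True,
}
EXAMPLE_ACTIVATE_PARAMS = {
    'email': 'user@example.com',
    'token': 'hypothetical-reset-token',
    'new_password': 'hypothetical-password',
    'first_name': 'Jane',
    'last_name': 'Doe',
    'agencies': [466, 188],  # hypothetical agency ids, stored in the properties json field
}
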
def get_aggregated_annotations(topic_id, params):
    """
    For each row in the metadata table view, build a dict that gives the immediate row AND the
    expanded view that comes from clicking on that row. All of this data is returned in
    paginated batches (default 20 at a time).

    INPUT:
    topic_id: id of topic (REQUIRED)
        see the list of available topic names and their ids at
        AggregatedAnnotations.topic_id_name_mapping (dict of {id: topic_name} pairs)
    params (dict):
        There is a specific key for each type of filter/sort supported.
        For sorting, the value of the key gives the direction ('ascending' or 'descending').
        For filtering, the key names the column and the value is the value to filter on.
        Available keys can be found in the "filtering and sorting" section in the code below.
        params["limit"]: pagination size (optional)
        params["count_only"]: return only the size of the query (optional)
        params["offset"]: offset for pagination (optional)

    OUTPUT:
    return_dict (dict), which looks like this:
        {
            'aggregated_annotations': agg_annotations,
            'total': job_count_total
        }
    job_count_total is the integer count of how many aggregated_annotations would be returned
    with this filtering BEFORE taking pagination into account.
    agg_annotations is a list of aggregated_annotation dictionaries. Each aggregated_annotation
    dict has all keys from aggregated_annotations.to_dict(), as well as:
        "annotation_task_topic_group_name": name of the corresponding annotation task topic group
        "annotation_task_topic_group_description": description of the corresponding annotation
            task topic group
        "doc_title": title of the document (as pulled from Elasticsearch)
        "judges": list of email strings of topic judges on the document
        "topic_annotations": list of topic_annotation dicts, each with an additional
            "annotation_job" key for the corresponding annotation_job dict
    """
    QUERY_SIZE_LIMIT = 20  # size limit on query for postgres pagination

    topic_dict = AggregatedAnnotations.topic_id_name_mapping

    # get base query - all aggregated annotations under a specific topic
    # for now topic_id is required - to make it optional, later querying will have to be modified
    if topic_id:
        base_query = db_session_users.query(AggregatedAnnotations).filter_by(topic_id=topic_id)
    else:
        return jsonify({'error': "get_aggregated_annotations route requires valid topic_id"}), 400

    ##########################
    # filtering and sorting
    ##########################

    # filtering
    # top-level keys whose values give the value to filter on
    if 'filter_doc_id' in params:
        base_query = base_query.filter_by(doc_id=params['filter_doc_id'])
    if 'filter_is_in_agreement' in params:
        base_query = base_query.filter_by(is_in_agreement=params['filter_is_in_agreement'])
    if 'filter_is_gold_standard' in params:
        base_query = base_query.filter_by(is_gold_standard=params['filter_is_gold_standard'])
    if 'filter_is_active_for_gold_annotation' in params:
        base_query = base_query.filter_by(
            is_active_for_gold_annotation=params['filter_is_active_for_gold_annotation'])
    if 'filter_gold_difficulty' in params:
        base_query = base_query.filter_by(gold_difficulty=params['filter_gold_difficulty'])

    # sorting
    # n.b. the value of each sorting key gives the direction
    if 'sorting_doc_id' in params:
        if params['sorting_doc_id'] == 'ascending':
            base_query = base_query.order_by(AggregatedAnnotations.doc_id.asc())
        if params['sorting_doc_id'] == 'descending':
            base_query = base_query.order_by(AggregatedAnnotations.doc_id.desc())
    if 'sorting_is_gold_standard' in params:
        if params['sorting_is_gold_standard'] == 'ascending':
            base_query = base_query.order_by(AggregatedAnnotations.is_gold_standard.asc())
        if params['sorting_is_gold_standard'] == 'descending':
            base_query = base_query.order_by(AggregatedAnnotations.is_gold_standard.desc())
    if 'sorting_gold_difficulty' in params:
        if params['sorting_gold_difficulty'] == 'ascending':
            base_query = base_query.order_by(AggregatedAnnotations.gold_difficulty.asc())
        if params['sorting_gold_difficulty'] == 'descending':
            base_query = base_query.order_by(AggregatedAnnotations.gold_difficulty.desc())
    if 'sorting_is_in_agreement' in params:
        if params['sorting_is_in_agreement'] == 'ascending':
            base_query = base_query.order_by(AggregatedAnnotations.is_in_agreement.asc())
        if params['sorting_is_in_agreement'] == 'descending':
            base_query = base_query.order_by(AggregatedAnnotations.is_in_agreement.desc())
    if 'sorting_arbitrary_tags' in params:
        if params['sorting_arbitrary_tags'] == 'ascending':
            base_query = base_query.order_by(AggregatedAnnotations.arbitrary_tags.asc())
        if params['sorting_arbitrary_tags'] == 'descending':
            base_query = base_query.order_by(AggregatedAnnotations.arbitrary_tags.desc())
    if 'sorting_notes' in params:
        if params['sorting_notes'] == 'ascending':
            base_query = base_query.order_by(AggregatedAnnotations.notes.asc())
        if params['sorting_notes'] == 'descending':
            base_query = base_query.order_by(AggregatedAnnotations.notes.desc())

    #####################################################################
    # do pagination with offset (see annotation_job_helper.py, line 89)
    #####################################################################

    # get the total number of aggregated annotations found BEFORE the pagination limit+offset
    job_count_total = base_query.count()

    # return only the count if requested
    if 'count_only' in params:
        return jsonify({'total': job_count_total})

    # n.b. allows pagination
    if 'offset' in params:
        base_query = base_query.offset(params['offset'])

    # n.b. 20 seems a reasonable default limit - can be changed depending on needs
    limit = params.get('limit', QUERY_SIZE_LIMIT)
    base_query = base_query.limit(limit)

    ############################################
    # create list of dicts that will be returned
    ############################################
    agg_annotations = [agg.to_dict() for agg in base_query]

    ################################################################
    # add name and description of each annotation task topic group
    ################################################################
    for agg in agg_annotations:
        group = db_session_users.query(AnnotationTaskTopicGroup)\
            .filter_by(id=agg['annotation_task_group_id']).first()
        agg['annotation_task_topic_group_name'] = group.name
        agg['annotation_task_topic_group_description'] = group.description

    ####################################
    # query Elasticsearch and postgres
    ####################################

    # get doc_ids for queries in ES and postgres
    doc_ids = [agg["doc_id"] for agg in agg_annotations]

    # Elasticsearch query to get document titles
    query = {
        "size": len(doc_ids),
        "query": {
            "bool": {
                "must": {
                    "terms": {"id": doc_ids}
                }
            }
        },
        "_source": {"include": ["id", "title"]}
    }
    # make doc_title_dict keyed by doc_id for fast lookup
    # n.b. jsearch.query_records(query, 'documents') returns an array with elements of the form
    # {'id': 240394, 'title': "Blah blah blah"}
    doc_title_dict = {d["id"]: d["title"] for d in jsearch.query_records(query, 'documents')}

    # get all topic_annotations with this topic and these doc_ids
    # subqueryload used to load AnnotationJob objects explicitly
    topic_name = topic_dict[topic_id] if topic_id is not None else None
    ta_query_result = db_session_users.query(TopicAnnotation)\
        .options(subqueryload(TopicAnnotation.annotation_job))\
        .options(subqueryload(TopicAnnotation.user))\
        .filter(TopicAnnotation.doc_id.in_(doc_ids))
    if topic_name is not None:
        ta_query_result = ta_query_result.filter_by(topic_name=topic_name)
    ta_query_result = ta_query_result.all()

    ############################################################
    # aggregate desired fields from ES/postgres query results
    ############################################################

    # make doc_id-keyed dict of selected agg_ants for efficiently aggregating results
    agg_annot_dict = {agg["doc_id"]: agg for agg in agg_annotations}

    # set up empty lists to store topic_annotations and judges in each agg dict;
    # collect document titles into each agg dict
    for agg in agg_annotations:
        agg["topic_annotations"] = []
        agg["judges"] = []
        if agg["doc_id"] in doc_title_dict:
            agg["doc_title"] = doc_title_dict[agg["doc_id"]]

    # collect TopicAnnotations, AnnotationJobs and judges into agg dictionaries
    for ta in ta_query_result:
        matching_agg = agg_annot_dict[ta.doc_id]  # get relevant aggregated_annotation
        ta_dict = ta.to_dict()  # get topic_annotation dict
        ta_dict["annotation_job"] = ta.annotation_job.to_dict()  # using subqueryload of annotation_job
        matching_agg["topic_annotations"].append(ta_dict)
        matching_agg["judges"].append(ta.user.email)  # append to list of judges (subqueryload here as well)
        # n.b. in case we want to include first_name and last_name at a later date:
        # matching_agg["judges"].append(ta.user.first_name + ta.user.last_name)

    # return result
    return jsonify({'aggregated_annotations': agg_annotations, 'total': job_count_total})

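
# Illustrative sketch, not part of the original module: a hypothetical params dict for
# get_aggregated_annotations combining one filter, one sort, and pagination. The topic_id
# passed alongside it must be a key of AggregatedAnnotations.topic_id_name_mapping.
EXAMPLE_AGGREGATED_ANNOTATIONS_PARAMS = {
    'filter_is_gold_standard': True,  # filter: only gold-standard rows
    'sorting_doc_id': 'ascending',    # sort: by doc_id, ascending
    'offset': 40,                     # pagination: skip the first 40 rows
    'limit': 20,                      # pagination: return at most 20 rows
}
# usage: get_aggregated_annotations(topic_id, EXAMPLE_AGGREGATED_ANNOTATIONS_PARAMS)
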