Example #1
0
    def get_agency_ids(self):
        """Return the list of federal executive agency ids, memoized for up to one day.

        Reads/writes ``self.memoized_agency_ids_for_search`` and
        ``self.last_memoized_time`` to avoid re-querying ES on every call.
        """
        one_day_ago = dt.datetime.now() - dt.timedelta(days=1)
        cache_is_fresh = (
            self.memoized_agency_ids_for_search is not None
            and self.last_memoized_time > one_day_ago
        )
        if cache_is_fresh:
            return self.memoized_agency_ids_for_search

        self.last_memoized_time = dt.datetime.now()

        search_body = {
            "query": {
                "bool": {
                    "must": {
                        "term": {"type": "federal_executive"}
                    }
                }
            },
            # n.b. 1000 is arbitrary but needs to be set higher than the total # of fed agencies (currently ~400)
            "size": 1000,
            "_source": {"include": ["id"]},
        }
        self.memoized_agency_ids_for_search = [
            record['id'] for record in jsearch.query_records(search_body, 'agencies')
        ]
        return self.memoized_agency_ids_for_search
def get_entity_by_ids_and_type(entity_type, entity_ids):
    """Fetch all records of *entity_type* whose ids appear in *entity_ids*.

    Returns ``{"errors": "invalid type"}`` when the type is not allowed or
    when *entity_ids* is not a list (message kept identical to preserve the
    existing API contract for both failure modes).
    """
    if entity_type not in ALLOWED_ENTITY_TYPES:
        return {"errors": "invalid type"}
    # idiomatic negation instead of `isinstance(...) is not True`
    if not isinstance(entity_ids, list):
        return {"errors": "invalid type"}
    query = {"query": {"bool": {"must": {"terms": {"id": entity_ids}}}}}
    return jsearch.query_records(query, entity_type)
Example #3
0
def get_entity_by_name(type, name):
    """Return the first record of the given *type* whose name matches *name*.

    NOTE(review): presumably callers expect at least one match — an empty
    result raises IndexError; confirm upstream handling.
    """
    request_body = filtered_request_template()
    request_body["query"] = {
        "match": {
            "name": name,
        }
    }
    hits = jsearch.query_records(request_body, type)
    return hits[0]
Example #4
0
    def load_named_clusters(self):
        """Populate ``self.named_clusters`` with cited acts and named
        regulations, keyed as ``"acts_<id>"`` / ``"named_regulations_<id>"``."""
        self.named_clusters = {}

        act_query = {
            "query": {
                "ids": {
                    "values": self.cited_records('acts')
                }
            },
            # n.b. without an explicit size ES returns only 10 hits by default
            "size": 1000
        }
        act_hits = jsearch.query_records(act_query, 'acts')
        self.named_clusters.update(
            {"acts_{}".format(x['id']): x
             for x in act_hits})

        # bugfix: this query previously omitted "size", so ES silently
        # truncated named-regulation results to its default of 10 hits
        reg_query = {
            "query": {
                "ids": {
                    "values": self.cited_records('named')
                }
            },
            "size": 1000
        }
        reg_hits = jsearch.query_records(reg_query, 'named_regulations')
        self.named_clusters.update(
            {"named_regulations_{}".format(x['id']): x
             for x in reg_hits})
Example #5
0
 def load_documents_map(self):
     """Build ``self.docs_map``: cited documents keyed by ``"documents_<id>"``."""
     # n.b. size must exceed the number of cited documents; ES defaults to 10
     doc_query = {
         "query": {
             "ids": {"values": self.cited_records('documents')}
         },
         "size": 1000,
     }
     hits = jsearch.query_records(doc_query, 'documents')
     self.docs_map = {
         "documents_{}".format(hit['id']): hit for hit in hits
     }
def get_state_by_short_name(type, short_name):
    """Return the first record of *type* matching *short_name*.

    NOTE(review): an empty result raises IndexError, which is not the
    NotFoundError some callers catch — confirm this is intended.
    """
    request_body = {
        "query": {
            "bool": {
                "must": {
                    "match": {"short_name": short_name}
                }
            }
        }
    }

    hits = jsearch.query_records(request_body, type)
    return hits[0]
Example #7
0
def get_most_popular_docs(params):
    """Return the most-read documents over the past thirty days.

    params["num_queries"] caps how many documents are returned (default 5).
    Result: {"popular_docs": [{"doc_id", "title", "count"}, ...]} ordered by
    read count descending.
    """
    num_queries = params.get("num_queries",
                             5)  # n.b. 5 is an arbitrary default

    # use thirty days ago as the limit of the time range for popularity
    # n.b. bookmarkings could potentially impact this - but that is another potential way to gauge popularity so
    # that should be ok
    thirty_days_ago = dt.datetime.now() - dt.timedelta(days=30)

    most_popular_docs = db_session_users.query(UserDocument.doc_id, func.count(UserDocument.doc_id).label('total'))\
        .filter_by(read=True).filter(UserDocument.user_id.in_(get_external_user_id_subquery()))\
        .filter(UserDocument.updated_at > thirty_days_ago).group_by(UserDocument.doc_id)\
        .order_by(text('total desc')).limit(num_queries).all()

    # retrieve the document titles so we can include them in the payload with just one extra ES call
    query = {
        "query": {
            "bool": {
                "must": {
                    "terms": {
                        "id": [d[0] for d in most_popular_docs]
                    }
                }
            }
        },
        "_source": {
            "include": ["id", "title"]
        }
    }
    docs_with_titles = jsearch.query_records(query, 'documents')

    # create a map of the doc titles so we can easily tack this on to the id/count below
    doc_id_title_map = {d['id']: d['title'] for d in docs_with_titles}

    return {
        "popular_docs": [{
            "doc_id": d[0],
            # bugfix: .get() avoids a KeyError when a doc id counted in postgres
            # is missing from the ES index (e.g. deleted or not yet indexed)
            "title": doc_id_title_map.get(d[0]),
            "count": d[1]
        } for d in most_popular_docs]
    }
Example #8
0
def decorate_documents_with_filter_mentions(docs, act_id_list, regulation_id_list, concept_id_list, bank_id_list, limit):
    """Attach a 'mentions_for_filter' entry to each doc for the first
    non-empty filter id list, using one ES document_citations query.

    Returns *docs* (mutated in place)."""
    # n.b. agency is skipped here because it is less resolve-y and more that's where we got the data from-y
    candidate_filters = [
        ("act_id", act_id_list),
        ("named_regulation_id", regulation_id_list),
        ("concept_id", concept_id_list),
        ("bank_id", bank_id_list),
    ]
    filter_term_hash = None
    for field_name, id_list in candidate_filters:
        if len(id_list) > 0:
            filter_term_hash = {field_name: id_list[0]}
            break

    if filter_term_hash is None:
        return docs

    doc_ids = [doc['id'] for doc in docs]
    doc_id_index_map = {doc_id: position for position, doc_id in enumerate(doc_ids)}

    doc_cite_request = filtered_request_template()
    doc_cite_request["size"] = limit
    doc_cite_request["_source"] = {"include": ["doc_id", "mentions"]}
    doc_cite_request["query"]["bool"] = {
        "must": [
            {"terms": {"doc_id": doc_ids}},
            {"term": filter_term_hash},
        ]
    }

    for citation in jsearch.query_records(doc_cite_request, doc_type='document_citations'):
        position = doc_id_index_map[citation['doc_id']]
        docs[position]['mentions_for_filter'] = citation['mentions']

    return docs
def get_filtered_agencies(params):
    """Query ES for agencies, optionally restricted by the current user's
    follow state ('following' param) or to active agencies ('search_filter')."""
    following_param = params.get('following', None)
    search_filter = params.get('search_filter', None)

    query = {}
    if following_param:
        followed = get_followed_agency_ids_with_backoff(g.user_id)
        if str_to_bool(following_param):
            # only the agencies this user follows
            query = {"query": {"bool": {"must": es_filter(followed, "id")}}}
        else:
            # only the agencies this user does NOT follow
            query = {"query": {"bool": {"must_not": es_filter(followed, "id")}}}
    elif search_filter:
        query = {"query": {"bool": {"must": es_filter("true", "active")}}}

    query["size"] = 500
    query["_source"] = {"include": INCLUDED_AGENCY_FIELDS}

    return jsearch.query_records(query, 'agencies')
def activate_user(params):
    """Activate (or register) a user account; also shared with password resets.

    Three entry flows share this route:
      - invite flow: a reset token previously emailed to the user validates activation
      - marketing-campaign signup: the token maps to a MarketingCampaign row
      - plain signup with no token at all
    The latter two set ``confirmation_required`` and send a confirmation email.

    Returns a Flask JSON response: a jwt_token payload for confirmation-required
    signups, ``{}`` for invite activations, or a 400/500 error payload.
    """
    email = params.get('email')
    token = params.get('token')
    new_password = params.get('new_password')
    first_name = params.get('first_name')
    last_name = params.get('last_name')

    is_contributor = params.get('is_contributor')
    dry_run = params.get('dry_run', False)  # validate token, email, enabled state only

    linkedin_id = params.get('linkedin_id')
    google_id = params.get('google_id')
    enabled = params.get('enabled')

    # params to go into json field in db
    json_params = [
        'agencies', 'state_agencies',
        'other_agencies', 'other_state_agencies', 'other_topics',
        'user_style'
    ]

    # bugfix: accept an http status code — the SMTP failure path below passes
    # code=500, which previously raised a TypeError because this helper had no
    # such parameter
    def error_response(msg='Invalid email or token', code=400):
        response = jsonify({
            'error': msg,
        })
        response.status_code = code
        return response

    # confirmation_required variable tracks whether this is an activation sourced from a marketing campaign,
    # a signup without a token, or from the invite -> activate flow.
    # use confirmation_required to indicate we need to send a confirmation email later on
    confirmation_required = False
    marketing_campaign = db_session_users.query(MarketingCampaign).filter_by(token=token).first()
    if marketing_campaign is not None or token is None:
        confirmation_required = True
    else:
        if email is None:
            return error_response()
        else:
            email = email.lower()
            g.user_email = email

        user = db_session_users.query(User).filter_by(email=email).scalar()

        if user is None:
            return error_response()

        if dry_run and user.enabled:
            return error_response('User is already enabled')

        enabled_at_start = user.enabled

        if not user.reset_token or user.reset_token != token:
            # send an email to support, but only if the user is in the db to prevent spamming
            if dry_run:
                template_vars = {
                    'email': email,
                }
                email_helper.send_email(
                    '*****@*****.**',
                    '*****@*****.**',
                    'A user attempted to use an invalid token during activation',
                    template='activate-fail',
                    vars=template_vars,
                )

            return error_response()

    if dry_run:
        return jsonify({'marketing_campaign': marketing_campaign is not None})

    if not new_password:
        return error_response('Missing fields')

    # for the marketing campaign approach, create an entry in the users table,
    # for the invite-based registration approach, mark the user enabled
    if confirmation_required:
        # NOTE(review): email may still be None on this path (no token, no email
        # param) — .lower() would raise AttributeError; confirm upstream validation
        email = email.lower()
        g.user_email = email

        # check if this user exists in the database (the invite use-case), so we can use the existing entry if so
        # and create a new entry if not
        user = db_session_users.query(User).filter_by(email=email).first()

        # this is for when a user comes to our site without being invited through the admin tool
        if user is None:
            user = User({
                'email': email,
                'first_name': first_name,
                'last_name': last_name,
                'password': new_password,
                'enabled': False,
            })


        # this is for when the user is instead invited to our site, but then instead of trying to enter via the
        # invitation link, they use the regular user signup flow. they will now get the confirmation email
        # and have to fully activate their account there
        else:
            # triple check to prevent any shenanigans for enabled users, or user accounts
            # that somehow exist but were not invited, and also if the invite has already been skipped
            # and we have successfully moved onto the confirmation step
            # n.b. relying on hash values is a little funky here, but it seems to work
            if user.enabled or "invited_by" not in user.properties or "invite_skipped" in user.properties:
                return error_response()

            user.properties["invite_skipped"] = True  # n.b. record that the invite workflow was skipped
            user.first_name = first_name
            user.last_name = last_name
            user.update_password(new_password)

        if linkedin_id:
            user.linkedin = linkedin_id
            user.industry = params.get('industry')
            user.company = params.get('company')
            user.properties['linkedin_data'] = params.get('linkedin_data')
            user.enabled = enabled
            user.properties['confirmed_date'] = datetime.datetime.utcnow().isoformat()

        if google_id:
            user.google_id = google_id
            user.enabled = enabled
            user.properties['confirmed_date'] = datetime.datetime.utcnow().isoformat()

        # mark internal users with the internal user flag so we can differentiate user types when
        # calculating various stats
        if email.endswith("@jurispect.com") or email.endswith("@compliance.ai"):
            user.is_internal_user = True

        if marketing_campaign is not None:
            user.marketing_campaigns.append(marketing_campaign)
        user.gen_reset_token()

        enabled_at_start = False

        try:
            _send_activation_email('confirm', user)
        except SMTPException:
            db_session_users.rollback()
            return error_response('Could not send email', code=500)

    else:
        user.enabled = True

        user.update_password(new_password)
        if first_name:
            user.first_name = first_name
        if last_name:
            user.last_name = last_name

        # only allow the token to be used once:
        user.reset_token = None

    new_props = {p: params[p] for p in json_params if params.get(p)}

    # n.b. since this route is shared with password resets, we need to skip updating the activation time
    # when it is a password reset action
    if not enabled_at_start:
        new_props['activation_time'] = datetime.datetime.utcnow().isoformat()

    if not params.get('user_style') and email.endswith('@firstrepublic.com'):
        new_props['user_style'] = 'first-republic'

    if len(new_props) > 0:
        user.properties = merge_two_dicts(user.properties, new_props)

    if is_contributor:
        user.roles = ['contributor']

    # FIXME: this is needed for marketing-campaign sourced users but yields a double commit
    # probably not super noticeable, but should fix if we have the time
    db_session_users.add(user)
    try:
        db_session_users.commit()
    except IntegrityError:
        return error_response()
    db_session_users.refresh(user)

    # follow every topic (or the explicitly requested subset) for the new user
    topic_ids = []
    topic_ids.extend(params.get('topics', AggregatedAnnotations.topic_id_name_mapping.keys()))
    for topic_id in topic_ids:
        userTopic = UserTopic({
            'user_id': user.id,
            'topic_id': topic_id,
            'following': True
        })
        db_session_users.add(userTopic)

    # follow every news source by default
    news_ids = [x['id'] for x in jsearch.query_records({'size': 1000}, doc_type='news_sources')]
    for news_id in news_ids:
        userFollowedEntity = UserFollowedEntity({
            'user_id': user.id,
            'entity_id': news_id,
            'entity_type': 'news_sources',
            'following': True
        })
        db_session_users.add(userFollowedEntity)

    agency_ids = []
    agency_ids.extend(params.get('agencies', []))

    new_ids = []

    # verify that the agency ids are correct
    # using DefaultAgenciesToFollowAtSignup since users now skip onboarding
    for agency_id in DefaultAgenciesToFollowAtSignup:
        try:
            agency = jsearch.get_record(agency_id, 'agencies')
            new_ids.append(agency['id'])
        except NotFoundError:
            pass

    for agency_id in new_ids:
        user_agency = UserAgency({'user_id': user.id, 'agency_id': agency_id, 'following': True})
        db_session_users.add(user_agency)

    state_jurisdictions = []
    state_jurisdictions.extend(params.get('state_agencies', []))
    state_ids = []

    # get selected state jurisdiction ids and add them to follow entity table
    for state_jurisdiction in state_jurisdictions:
        try:
            state = get_state_by_short_name('jurisdictions', state_jurisdiction)
            state_ids.append(state['id'])
        except NotFoundError:
            pass

    updated_followed_entity(user.id, {'entities': [{ 'entity_id': state_id, 'entity_type': 'jurisdictions', 'following': True } for state_id in state_ids]})

    # send a support mail if the user requests a new source
    other_agencies = params.get('other_agencies', '')
    other_state_agencies = params.get('other_state_agencies', '')
    other_topics = params.get('other_topics', '')

    if other_agencies or other_state_agencies or other_topics:
        template_vars = {
            'other_agencies': other_agencies,
            'other_state_agencies': other_state_agencies,
            'other_topics': other_topics,
            'name': '%s %s' % (first_name, last_name),
            'email': email,
        }
        email_helper.send_email(
            '*****@*****.**',
            '*****@*****.**',
            'A new user has requested additional sources or topics',
            template='additional-sources',
            vars=template_vars,
        )

    try:
        db_session_users.commit()
    except IntegrityError:
        return error_response()

    # start free trials.
    user = db_session_users.query(User).filter_by(email=email.lower()).first()
    latest_subscription = db_session_users.query(Subscription).filter_by(user_id=user.id, latest=True).first()
    if latest_subscription is None:
        # new users with .edu email get a 120 month free trial.
        if user.email.endswith('.edu'):
            subscribe_users_to_plan([user.id],'free_trial_120months')

        # all other users get a 1 month free trial
        else:
            start_free_trial(user.id)

    create_folder(user.id, {'name': 'Read'})
    create_folder(user.id, {'name': 'Bookmarked'})
    if confirmation_required:
        # special case login for unenabled marketing campaign users allow access for 7 days only
        expiration_datetime = datetime.datetime.utcnow() + datetime.timedelta(days=7)
        token = jwt.encode({'user_id': user.id, 'exp': expiration_datetime}, SECRET_JWT)
        # Flag 'is_new' defines if user is returning or just registered. True means just registered user
        return jsonify({"jwt_token": token, "is_new": True, 'email': email.lower()})
    else:
        # return empty if user not from marketing campaign
        return jsonify({})
def get_aggregated_annotations(topic_id, params):

    """
    For each row in metadata table view, have dict that gives immediate row AND the expanded view that comes
    from clicking on that row. All this data is returned in paginated batches (default 20 at a time).


    INPUT:

        topic_id: id name of topic (REQUIRED)
                  see list of available topic names and their ids at
                  AggregatedAnnotations.topic_id_name_mapping (dict of {id, topic_name} pairs)

        Params (dict)

        For filter/sort, have specific key for each type of filter/sort required
        For sorting, the value of the key doesn't matter (can be None).
        For filtering, the key is the column name and the value is the value to filter on.
        Available keys can be found in the "filtering and sorting" section in code below.

        params["limit"]: pagination size (optional)
        params["count_only"]: return only size of query (optional)
        params["offset"]: offset for pagination (optional)


    OUTPUT: return_dict (dict)

        return_dict looks like this:

        {
        'aggregated_annotations': agg_annotations,
        'total': job_count_total
        }

        job_count_total is integer count of how many aggregated_annotations would be returned with this filtering
                in the query BEFORE taking into account pagination

        agg_annotations is a list of aggregated_annotation dictionaries
        Each aggregated_annotation dict has all keys from aggregated_annotations.to_dict(), as well as:

        "annotation_task_topic_group_name": name of corresponding annotation task topic group
        "annotation_task_topic_group_description": description of corresponding annotation task topic group
        "doc_title": title of document (as pulled from elasticsearch)
        "judges": list of email strings of topic judges on document
        "topic_annotations": topic_annotation dict, with additional "annotation_job" key
                             for corresponding annotation_job dict
    """

    QUERY_SIZE_LIMIT = 20  # size limit on query for postgres pagination

    topic_dict = AggregatedAnnotations.topic_id_name_mapping

    # get base query - all aggregated annotations under a specific topic
    # for now topic_id is required - to make it optional, later querying will have to be modified
    if topic_id:
        base_query = db_session_users.query(AggregatedAnnotations).filter_by(topic_id=topic_id)
    else:
        return jsonify({'error': "get_aggregated_annotations route requires valid topic_id"}), 400

    ##########################
    # filtering and sorting
    ##########################

    # filtering
    # top-level keys whose values give the value to filter on
    # (table-driven to avoid repeating the same if-block per column)
    for param_key, column_name in (
        ('filter_doc_id', 'doc_id'),
        ('filter_is_in_agreement', 'is_in_agreement'),
        ('filter_is_gold_standard', 'is_gold_standard'),
        ('filter_is_active_for_gold_annotation', 'is_active_for_gold_annotation'),
        ('filter_gold_difficulty', 'gold_difficulty'),
    ):
        if param_key in params:
            base_query = base_query.filter_by(**{column_name: params[param_key]})

    # sorting
    # NB: the values here don't matter beyond 'ascending'/'descending' - just the presence of the key
    # (tuple order preserves the original order in which order_by clauses were applied)
    for param_key, column in (
        ('sorting_doc_id', AggregatedAnnotations.doc_id),
        ('sorting_is_gold_standard', AggregatedAnnotations.is_gold_standard),
        ('sorting_gold_difficulty', AggregatedAnnotations.gold_difficulty),
        ('sorting_is_in_agreement', AggregatedAnnotations.is_in_agreement),
        ('sorting_arbitrary_tags', AggregatedAnnotations.arbitrary_tags),
        ('sorting_notes', AggregatedAnnotations.notes),
    ):
        if param_key in params:
            if params[param_key] == 'ascending':
                base_query = base_query.order_by(column.asc())
            if params[param_key] == 'descending':
                base_query = base_query.order_by(column.desc())

    #####################################################################
    # do pagination with offset (see annotation_job_helper.py, line 89)
    #####################################################################

    # get the total number of aggregated annotations found BEFORE the pagination limit+offset breakdown
    job_count_total = base_query.count()

    # return only count if necessary
    if 'count_only' in params:
        return jsonify({'total': job_count_total})

    # n.b. allows pagination
    if 'offset' in params:
        base_query = base_query.offset(params['offset'])

    # n.b. 20 seems a reasonable default limit - can be changed depending on needs
    limit = params.get('limit', QUERY_SIZE_LIMIT)
    base_query = base_query.limit(limit)

    ############################################
    # create list of dicts that will be returned
    ############################################
    agg_annotations = [agg.to_dict() for agg in base_query]

    ################################################################
    # add name and description of each annotation task topic group
    ################################################################
    for agg in agg_annotations:
        group = db_session_users.query(AnnotationTaskTopicGroup).filter_by(id=agg['annotation_task_group_id']).first()
        agg['annotation_task_topic_group_name'] = group.name
        agg['annotation_task_topic_group_description'] = group.description

    ####################################
    # query Elasticsearch and postgres
    ####################################

    # get doc_ids for queries in ES and postgres
    doc_ids = [agg["doc_id"] for agg in agg_annotations]

    # Elasticsearch query to get document titles
    query = {
        "size": len(doc_ids),
        "query": {
            "bool": {
                "must": {
                    "terms": {"id": doc_ids}
                }
            }
        },
        "_source": {"include": ["id", "title"]}
    }
    # make doc_title_dict keyed by doc_id for fast lookup
    doc_title_dict = {d["id"]: d["title"] for d in jsearch.query_records(query, 'documents')}
    # N.B.: jsearch.query_records(query, 'documents') returns array with elements of form
    #                                             {'id': 240394, 'title': "Blah blah blah"}

    # get all topic_annotations with this topic and these doc_ids
    # subqueryload used to load AnnotationJob and User objects explicitly
    # (removed an unused `topic_name` local and a dangerous trailing
    # line-continuation backslash that used to end the query chain below)
    ta_query_result = db_session_users.query(TopicAnnotation)\
                                      .options(subqueryload(TopicAnnotation.annotation_job))\
                                      .options(subqueryload(TopicAnnotation.user))\
                                      .filter(TopicAnnotation.doc_id.in_(doc_ids))

    if topic_id is not None:
        ta_query_result = ta_query_result.filter_by(topic_name=topic_dict[topic_id])

    ta_query_result = ta_query_result.all()
    ############################################################
    # aggregated desired fields from ES/postgres query results
    ############################################################

    # make doc_id-keyed dict of selected agg_ants for efficiently aggregating results
    agg_annot_dict = {agg["doc_id"]: agg for agg in agg_annotations}

    # set up empty lists to store topic_annotations and judges in each agg dict;
    # collect document titles into each agg dict
    for agg in agg_annotations:
        agg["topic_annotations"] = []
        agg["judges"] = []
        if agg["doc_id"] in doc_title_dict:
            agg["doc_title"] = doc_title_dict[agg["doc_id"]]
    # collect TopicAnnotations, AnnotationJobs and judges into agg dictionaries
    for ta in ta_query_result:
        matching_agg = agg_annot_dict[ta.doc_id]  # get relevant aggregated_annotation
        ta_dict = ta.to_dict()  # get topic_annotation dict
        ta_dict["annotation_job"] = ta.annotation_job.to_dict()  # using subqueryload of annotation_job
        matching_agg["topic_annotations"].append(ta_dict)  # TopicAnnotation
        matching_agg["judges"].append(ta.user.email)  # append to list of judges (used subquery here as well)

        # NB: in case we want to include first_name and last_name at a later date:
        # matching_agg["judges"].append(ta.user.first_name + ta.user.last_name)  # append to list of judges
                                                                               # (used subqueryload here as well)

    # return result
    return jsonify({'aggregated_annotations': agg_annotations, 'total': job_count_total})