Exemplo n.º 1
0
    def expand_doc_node(self, doc_id):
        """Add the citation neighborhood of *doc_id* to self.graph.

        Adds child documents, cited acts/named regulations, and (up to 25)
        incoming citations as nodes, with typed edges back to this document.
        """
        doc = jsearch.get_record(doc_id)
        parent = doc_name(doc_id)

        # child documents -> "is_part_of" edges pointing at this document
        for child in doc['children']:
            child_node = doc_name(child['id'])
            self.graph['nodes'].add(child_node)
            self.graph['edges'].add((child_node, parent, 'is_part_of'))

        associations = doc['cited_associations']

        # cited acts -> "references" edges from this document
        for act_id in associations['act_ids']:
            act_node = node_name(act_id, 'acts')
            self.graph['nodes'].add(act_node)
            self.graph['edges'].add((parent, act_node, 'references'))

        # cited named regulations -> "references" edges from this document
        for reg_id in associations['named_regulation_ids']:
            reg_node = node_name(reg_id, 'named_regulations')
            self.graph['nodes'].add(reg_node)
            self.graph['edges'].add((parent, reg_node, 'references'))

        citing_ids = doc['incoming_citation_ids']
        if not citing_ids:
            return None

        # cap incoming citations at 25, matching the original behavior
        for citing_id in citing_ids[:25]:
            citing_node = doc_name(citing_id)
            self.graph['nodes'].add(citing_node)
            self.graph['edges'].add((citing_node, parent, 'is_cited_by'))
Exemplo n.º 2
0
def get_document_by_id(doc_id, params):
    """Fetch one document from ES, decorate it, and wrap it for the API.

    Returns jsonify({'document': ...}) on success, or a 404 error payload
    when the document is not found in the index.
    """
    try:
        decorate_children = params.get('decorate_children', False)

        es_params = {}
        # skip the unused fields if the request told us to do so
        if params.get('skip_unused_fields', False):
            if params.get('skip_fields_for_state_code', False):
                excluded = UNUSED_FIELDS_FOR_STATE_CODE
            elif params.get('skip_fields_for_right_panel', False):
                excluded = UNUSED_FIELDS_FOR_RIGHT_PANEL
            else:
                excluded = UNUSED_FIELDS_BY_ALL

            # n.b. yes this needs to be a comma separated string for some reason
            es_params["_source_exclude"] = ",".join(excluded)

        doc = jsearch.get_record(doc_id, params=es_params)

        # hack to re-use same methods as get all documents code
        decorated = decorate_documents([doc], g.user_id,
                                       decorate_children=decorate_children)

        # TODO: Allow turning this off to return all topics with their probabilities by an optional flag
        filtered = apply_threshold_topics_in_document_response(decorated)

        return jsonify({'document': filtered[0]})
    except elasticsearch.NotFoundError:
        return jsonify({"errors": "Not found"}), 404
Exemplo n.º 3
0
 def set_seed_docids(self):
     """Load the seed document ids for this act from ES.

     Prints a warning and returns False when the act has no documents;
     otherwise stores the ids on self.seed_docids (and returns None).
     """
     data = jsearch.get_record(self.act_id, 'acts')
     if not data['doc_ids']:
         # was a Python 2-only print statement; print() works on both 2 and 3
         print("Seed Neighborhood not found for {} ...".format(
             self.root_key))
         return False
     self.seed_docids = data['doc_ids']
def pop_topic_judgment():
    """Claim the next topic judgment for the current user and return it.

    Judgments whose document is missing from the index are marked
    'bad_doc' and skipped; returns {'queue': 'empty'} when none remain.
    """
    doc = None
    while True:
        topic_judgment = get_topic_judgment_for_user()
        if not topic_judgment:
            break
        try:
            doc = jsearch.get_record(topic_judgment.doc_id)
            break
        except NotFoundError:
            # the index no longer has this document; flag it and try the next
            topic_judgment.status = 'bad_doc'
            db_session_users.add(topic_judgment)
            db_session_users.commit()

    if not topic_judgment:
        return {'queue': 'empty'}

    # assign the judgment to the requesting user
    topic_judgment.status = 'assigned'
    topic_judgment.user_id = g.user_id
    db_session_users.add(topic_judgment)
    db_session_users.commit()

    user = db_session_users.query(User).filter_by(id=g.user_id).first()

    return {
        'id': topic_judgment.id,
        'status': topic_judgment.status,
        'judgment': topic_judgment.judgment,
        'document': doc,
        'user': user.to_dict(),
        'topic_name': topic_judgment.topic_name
    }
Exemplo n.º 5
0
def document_timelines(doc_id, params, user_id):
    """Build a docket_id -> timeline mapping for every docket on a document."""
    # n.b. only need the dockets themselves for this query
    doc = jsearch.get_record(doc_id, params={"_source_include": "dockets"})
    return {
        entry["docket_id"]: docket_timeline(entry["docket_id"], params, user_id)
        for entry in doc["dockets"]
    }
Exemplo n.º 6
0
def get_annotation_job_by_id(annotation_task_id, annotation_job_id, user_id):
    """Return one annotation job with its topic annotations and ES document.

    n.b. user_id is redundant but this should prevent shenanigans here.
    """
    annotation_job = db_session_users.query(AnnotationJob).\
        filter_by(id=annotation_job_id, user_id=user_id).first()

    # first() returns None for a missing job or one owned by another user;
    # previously that crashed with AttributeError on .to_dict() below.
    # Mirror the {"annotation_job": None} shape used by
    # pop_annotation_job_from_queue instead.
    if annotation_job is None:
        return {'annotation_job': None, 'document': None}

    annotation_job_dict = annotation_job.to_dict()
    # n.b. i deliberately left the user_id restriction here in case future QA tasks might allow super annotators
    # to edit user annotations
    topic_annotations = db_session_users.query(TopicAnnotation).filter_by(annotation_job_id=annotation_job_id)
    annotation_job_dict['topic_annotations'] = [t.to_dict() for t in topic_annotations]

    doc_dict = jsearch.get_record(annotation_job.doc_id)
    return {'annotation_job': annotation_job_dict, 'document': doc_dict}
Exemplo n.º 7
0
def find_or_return_new_search_query(search_args,
                                    save_and_refresh_if_new=False):
    """Look up a SearchQuery row by its args hash, creating one if absent.

    When save_and_refresh_if_new is set, a newly created entry is committed
    and refreshed before being returned.
    """
    search_args = sanitize_search_args(search_args)
    # NOTE(review): json.dumps without sort_keys=True means dict key order
    # affects the hash -- confirm callers always produce a stable ordering
    search_args_hash = hashlib.sha1(json.dumps(search_args)).hexdigest()
    search_entry = db_session_users.query(SearchQuery).filter_by(
        search_args_hash=search_args_hash).first()

    if search_entry is None:
        # for proposed filters, this is false, as we've curated it, but if there is no filter, and this is a query
        # search, then we mark it as true
        is_arbitrary_query = False

        # For the very first time only, we need to figure out the display name - and avoid extra API queries
        # per update or per fetch of the top 5 options.
        # The first matching entity id wins, in the order listed below.
        display_name = None
        entity_lookups = (
            ("agency_id", "agencies"),
            ("act_id", "acts"),
            ("regulation_id", "named_regulations"),
            ("citation_id", "citations"),
            ("concept_id", "concepts"),
            ("bank_id", "banks"),
            ("topic_id", "topics"),
        )
        for arg_key, doc_type in entity_lookups:
            if arg_key in search_args:
                record = jsearch.get_record(search_args[arg_key], doc_type)
                display_name = record["name"]
                break
        else:
            if "query" in search_args:
                is_arbitrary_query = True

        # TODO should we have a rollback case for possible race conditions on the create call
        search_entry = SearchQuery(search_args=search_args,
                                   display_name=display_name,
                                   is_arbitrary_query=is_arbitrary_query)

        # in the new case, when this flag is set, save and refresh the value
        if save_and_refresh_if_new:
            db_session_users.add(search_entry)
            db_session_users.commit()
            db_session_users.refresh(search_entry)

    return search_entry
Exemplo n.º 8
0
    def test_cover_page(self):
        """Smoke-test cover page generation against a real indexed document."""
        es_doc = jsearch.get_record(1)  # XXX real id

        # NOTE(review): `doc` is never used below -- kept in case the
        # Document constructor has side effects; confirm and remove
        doc = Document(es_doc)

        # context manager guarantees the handle is closed even if
        # create_cover_page raises (the original leaked it on failure)
        with open('/tmp/cover.pdf', 'w') as cover_file:
            create_cover_page(
                title="Check out this cool document",
                table_contents=table_contents,
                text_para=summary,
                file_obj=cover_file
            )
def get_agency_info_by_id(agency_id):
    """Return a trimmed agency record for the given id.

    Responds with a 400 payload when no id is supplied, and a 404 payload
    when the agency cannot be found (or the lookup fails) or the record
    comes back empty.
    """
    if agency_id is None:
        return jsonify({"errors": "No agency_id param"}), 400

    # only fetch the fields the client actually needs
    es_params = {"_source_include": ",".join(INCLUDED_AGENCY_FIELDS)}
    try:
        ret_agency = jsearch.get_record(agency_id,
                                        doc_type='agencies',
                                        params=es_params)
    except Exception:
        # NOTE(review): broad catch deliberately maps any lookup failure to
        # 404 -- consider narrowing to the ES NotFoundError. The unused
        # `as e` binding and the dead `ret_agency = {}` initializer from the
        # original were removed.
        return jsonify({"errors": "Not found"}), 404

    if ret_agency:
        return ret_agency, 200
    return jsonify({"errors": "Not found"}), 404
def get_entity_by_type_and_id(entity_type, entity_id):
    """Fetch one entity record from ES, rejecting unknown entity types."""
    if entity_type in ALLOWED_ENTITY_TYPES:
        return jsearch.get_record(entity_id, entity_type)
    return {"errors": "invalid type"}
def get_entity_from_es(user_followed_entity):
    """Wrap the ES record for a followed entity in an {'entity': ...} dict."""
    record = jsearch.get_record(user_followed_entity.entity_id,
                                user_followed_entity.entity_type)
    return {'entity': record}
Exemplo n.º 12
0
def pop_annotation_job_from_queue(annotation_task_id, user_id):
    """Assign the next queued annotation job for this task to *user_id*.

    Returns {'annotation_job': None} when the queue is empty, an error dict
    when the job's document is missing from the index, and otherwise the
    job dict plus its ES document (with gold-judgment info for training
    jobs).
    """
    time_now = datetime.datetime.now()

    # grabs queued annotation jobs for this task that are assigned to the user (or nobody),
    # ordered first by whether they are have a user assignment, next by highest priority,
    # and finally falling back on the oldest created
    annotation_job = db_session_users.query(AnnotationJob).filter_by(annotation_task_id=annotation_task_id)\
        .filter_by(status=AnnotationJob.QUEUED_STATUS)\
        .filter(or_(AnnotationJob.user_id == user_id, AnnotationJob.user_id == None))\
        .order_by(AnnotationJob.user_id.nullslast(), AnnotationJob.priority.desc(), AnnotationJob.created_at.asc()).first()

    # if by chance, we are in the period of time between when a task was updated, but before the next queuing run
    # came around, we want to make sure to look up annotation jobs for older annotation tasks too
    if annotation_job is None:
        old_annotation_task_ids = db_session_users.query(AnnotationTask.id).filter_by(active_task_id=annotation_task_id).subquery()
        annotation_job = db_session_users.query(AnnotationJob)\
            .filter(AnnotationJob.annotation_task_id.in_(old_annotation_task_ids)) \
            .filter_by(status=AnnotationJob.QUEUED_STATUS) \
            .filter(or_(AnnotationJob.user_id == user_id, AnnotationJob.user_id == None)) \
            .order_by(AnnotationJob.user_id.nullslast(), AnnotationJob.priority.desc(), AnnotationJob.created_at.asc()).first()

    # nothing queued for this task (or its predecessors): empty-queue signal
    if annotation_job is None:
        return {"annotation_job": None}

    # claim the job for this user before doing anything slow
    annotation_job.status = AnnotationJob.ASSIGNED_STATUS
    annotation_job.user_id = user_id
    annotation_job.assigned_at = time_now

    db_session_users.add(annotation_job)
    db_session_users.commit()
    db_session_users.refresh(annotation_job)

    # n.b. mitigation strategy for race condition would look like: while the assigned user_id is not me -> query again
    # change status to error status if document is not found in index
    try:
        doc_dict = jsearch.get_record(annotation_job.doc_id)
    except NotFoundError:
        annotation_job.status = AnnotationJob.ERROR_STATUS
        annotation_job.notes = "Document is not found"
        db_session_users.add(annotation_job)
        db_session_users.commit()
        db_session_users.refresh(annotation_job)
        return {"errors": "Document is not found. Doc ID: " + str(annotation_job.doc_id)}

    # if this is training job, return info about correct judgment
    if annotation_job.is_gold_evaluation:
        # get gold judgment info to return with annotation_job object
        topic_group_id_subquery = db_session_users.query(AnnotationTask.annotation_task_topic_group_id)\
                                                  .filter_by(id=annotation_job.annotation_task_id)\
                                                  .subquery()  # should contain just one result
        gold_judgment_id_subquery = db_session_users.query(AggregatedAnnotations.gold_topic_annotation_id)\
                                                    .filter_by(doc_id=annotation_job.doc_id)\
                              .filter(AggregatedAnnotations.annotation_task_group_id.in_(topic_group_id_subquery))\
                                                    .subquery()
        gold_judgment_object = db_session_users.query(TopicAnnotation.is_positive,
                                                      TopicAnnotation.admin_notes)\
                                               .filter(TopicAnnotation.id.in_(gold_judgment_id_subquery))\
                                               .first()  # this query should return just one object anyway
        return {'annotation_job': annotation_job.to_dict(),
                'document': doc_dict,
                'correct_judgment': gold_judgment_object.is_positive,
                'correct_judgment_notes': gold_judgment_object.admin_notes}

    return {'annotation_job': annotation_job.to_dict(), 'document': doc_dict}
def activate_user(params):
    """Activate (or register) a user account and set up their defaults.

    Handles three flows: invite-token activation, marketing-campaign /
    token-less signup (which requires a later confirmation email), and
    password resets that share this route. On success the user is enabled
    (or queued for confirmation), given default topic/news/agency follows,
    folders, and a free trial. Returns a Flask JSON response.
    """
    email = params.get('email')
    token = params.get('token')
    new_password = params.get('new_password')
    first_name = params.get('first_name')
    last_name = params.get('last_name')

    is_contributor = params.get('is_contributor')
    dry_run = params.get('dry_run', False)  # validate token, email, enabled state only

    linkedin_id = params.get('linkedin_id')
    google_id = params.get('google_id')
    enabled = params.get('enabled')

    # params to go into json field in db
    json_params = [
        'agencies', 'state_agencies',
        'other_agencies', 'other_state_agencies', 'other_topics',
        'user_style'
    ]

    # FIX: error_response previously took no `code` parameter, so the
    # SMTP-failure call below (code=500) raised TypeError instead of
    # returning a response. Default keeps the original 400 behavior.
    def error_response(msg='Invalid email or token', code=400):
        response = jsonify({
            'error': msg,
        })
        response.status_code = code
        return response

    # confirmation_required variable tracks whether this is an activation sourced from a marketing campaign,
    # a signup without a token, or from the invite -> activate flow.
    # use confirmation_required to indicate we need to send a confirmation email later on
    confirmation_required = False
    marketing_campaign = db_session_users.query(MarketingCampaign).filter_by(token=token).first()
    if marketing_campaign is not None or token is None:
        confirmation_required = True
    else:
        if email is None:
            return error_response()
        else:
            email = email.lower()
            g.user_email = email

        user = db_session_users.query(User).filter_by(email=email).scalar()

        if user is None:
            return error_response()

        if dry_run and user.enabled:
            return error_response('User is already enabled')

        enabled_at_start = user.enabled

        if not user.reset_token or user.reset_token != token:
            # send an email to support, but only if the user is in the db to prevent spamming
            if dry_run:
                template_vars = {
                    'email': email,
                }
                email_helper.send_email(
                    '*****@*****.**',
                    '*****@*****.**',
                    'A user attempted to use an invalid token during activation',
                    template='activate-fail',
                    vars=template_vars,
                )

            return error_response()

    if dry_run:
        return jsonify({'marketing_campaign': marketing_campaign is not None})

    if not new_password:
        return error_response('Missing fields')

    # for the marketing campaign approach, create an entry in the users table,
    # for the invite-based registration approach, mark the user enabled
    if confirmation_required:
        # NOTE(review): email can still be None on this path (token-less
        # signup with no email param) and would raise here -- confirm the
        # route validates email upstream
        email = email.lower()
        g.user_email = email

        # check if this user exists in the database (the invite use-case), so we can use the existing entry if so
        # and create a new entry if not
        user = db_session_users.query(User).filter_by(email=email).first()

        # this is for when a user comes to our site without being invited through the admin tool
        if user is None:
            user = User({
                'email': email,
                'first_name': first_name,
                'last_name': last_name,
                'password': new_password,
                'enabled': False,
            })

        # this is for when the user is instead invited to our site, but then instead of trying to enter via the
        # invitation link, they use the regular user signup flow. they will now get the confirmation email
        # and have to fully activate their account there
        else:
            # triple check to prevent any shenanigans for enabled users, or user accounts
            # that somehow exist but were not invited, and also if the invite has already been skipped
            # and we have successfully moved onto the confirmation step
            # n.b. relying on hash values is a little funky here, but it seems to work
            if user.enabled or "invited_by" not in user.properties or "invite_skipped" in user.properties:
                return error_response()

            user.properties["invite_skipped"] = True  # n.b. record that the invite workflow was skipped
            user.first_name = first_name
            user.last_name = last_name
            user.update_password(new_password)

        if linkedin_id:
            user.linkedin = linkedin_id
            user.industry = params.get('industry')
            user.company = params.get('company')
            user.properties['linkedin_data'] = params.get('linkedin_data')
            user.enabled = enabled
            user.properties['confirmed_date'] = datetime.datetime.utcnow().isoformat()

        if google_id:
            user.google_id = google_id
            user.enabled = enabled
            user.properties['confirmed_date'] = datetime.datetime.utcnow().isoformat()

        # mark internal users with the internal user flag so we can differentiate user types when
        # calculating various stats
        if email.endswith("@jurispect.com") or email.endswith("@compliance.ai"):
            user.is_internal_user = True

        if marketing_campaign is not None:
            user.marketing_campaigns.append(marketing_campaign)
        user.gen_reset_token()

        enabled_at_start = False

        try:
            _send_activation_email('confirm', user)
        except SMTPException:
            db_session_users.rollback()
            return error_response('Could not send email', code=500)

    else:
        user.enabled = True

        user.update_password(new_password)
        if first_name:
            user.first_name = first_name
        if last_name:
            user.last_name = last_name

        # only allow the token to be used once:
        user.reset_token = None

    new_props = {p: params[p] for p in json_params if params.get(p)}

    # n.b. since this route is shared with password resets, we need to skip updating the activation time
    # when it is a password reset action
    if not enabled_at_start:
        new_props['activation_time'] = datetime.datetime.utcnow().isoformat()

    if not params.get('user_style') and email.endswith('@firstrepublic.com'):
        new_props['user_style'] = 'first-republic'

    if len(new_props) > 0:
        user.properties = merge_two_dicts(user.properties, new_props)

    if is_contributor:
        user.roles = ['contributor']

    # FIXME: this is needed for marketing-campaign sourced users but yields a double commit
    # probably not super noticeable, but should fix if we have the time
    db_session_users.add(user)
    try:
        db_session_users.commit()
    except IntegrityError:
        return error_response()
    db_session_users.refresh(user)

    # follow either the requested topics or every known topic by default
    topic_ids = []
    topic_ids.extend(params.get('topics', AggregatedAnnotations.topic_id_name_mapping.keys()))
    for topic_id in topic_ids:
        userTopic = UserTopic({
            'user_id': user.id,
            'topic_id': topic_id,
            'following': True
        })
        db_session_users.add(userTopic)

    # follow every news source by default
    news_ids = [x['id'] for x in jsearch.query_records({'size': 1000}, doc_type='news_sources')]
    for news_id in news_ids:
        userFollowedEntity = UserFollowedEntity({
            'user_id': user.id,
            'entity_id': news_id,
            'entity_type': 'news_sources',
            'following': True
        })
        db_session_users.add(userFollowedEntity)

    agency_ids = []
    agency_ids.extend(params.get('agencies', []))

    new_ids = []

    # verify that the agency ids are correct
    # using DefaultAgenciesToFollowAtSignup since users now skip onboarding
    for agency_id in DefaultAgenciesToFollowAtSignup:
        try:
            agency = jsearch.get_record(agency_id, 'agencies')
            new_ids.append(agency['id'])
        except NotFoundError:
            pass

    for agency_id in new_ids:
        user_agency = UserAgency({'user_id': user.id, 'agency_id': agency_id, 'following': True})
        db_session_users.add(user_agency)

    state_jurisdictions = []
    state_jurisdictions.extend(params.get('state_agencies', []))
    state_ids = []

    # get selected state jurisdiction ids and add them to follow entity table
    for state_jurisdiction in state_jurisdictions:
        try:
            state = get_state_by_short_name('jurisdictions', state_jurisdiction)
            state_ids.append(state['id'])
        except NotFoundError:
            pass

    updated_followed_entity(user.id, {'entities': [{ 'entity_id': state_id, 'entity_type': 'jurisdictions', 'following': True } for state_id in state_ids]})

    # send a support mail if the user requests a new source
    other_agencies = params.get('other_agencies', '')
    other_state_agencies = params.get('other_state_agencies', '')
    other_topics = params.get('other_topics', '')

    if other_agencies or other_state_agencies or other_topics:
        template_vars = {
            'other_agencies': other_agencies,
            'other_state_agencies': other_state_agencies,
            'other_topics': other_topics,
            'name': '%s %s' % (first_name, last_name),
            'email': email,
        }
        email_helper.send_email(
            '*****@*****.**',
            '*****@*****.**',
            'A new user has requested additional sources or topics',
            template='additional-sources',
            vars=template_vars,
        )

    try:
        db_session_users.commit()
    except IntegrityError:
        return error_response()

    # start free trials.
    user = db_session_users.query(User).filter_by(email=email.lower()).first()
    latest_subscription = db_session_users.query(Subscription).filter_by(user_id=user.id, latest=True).first()
    if latest_subscription is None:
        # new users with .edu email get a 120 month free trial.
        if user.email.endswith('.edu'):
            subscribe_users_to_plan([user.id],'free_trial_120months')

        # all other users get a 1 month free trial
        else:
            start_free_trial(user.id)

    create_folder(user.id, {'name': 'Read'})
    create_folder(user.id, {'name': 'Bookmarked'})
    if confirmation_required:
        # special case login for unenabled marketing campaign users allow access for 7 days only
        expiration_datetime = datetime.datetime.utcnow() + datetime.timedelta(days=7)
        token = jwt.encode({'user_id': user.id, 'exp': expiration_datetime}, SECRET_JWT)
        # Flag 'is_new' defines if user is returning or just registered. True means just registered user
        return jsonify({"jwt_token": token, "is_new": True, 'email': email.lower()})
    else:
        # return empty if user not from marketing campaign
        return jsonify({})
Exemplo n.º 14
0
    if not user_interval_preference[opts.job_interval] and not opts.force:
        print(user.email + " is not configured for " + opts.job_interval)
        continue

    followed_agency_ids = get_followed_agency_ids_with_backoff(user.id)

    agency_overview = {}

    for followed_agency_id in followed_agency_ids:
        # n.b. need to figure out a better way to filter out state code
        if followed_agency_id > 999:
            continue
        # n.b. we need to rescue elasticsearch not found errors here because we don't guarantee consistency
        # between the followed agency ids stored in the user db and any updates in
        try:
            agency_name = jsearch.get_record(str(followed_agency_id),
                                             'agencies')['short_name']
        except elasticsearch.exceptions.NotFoundError:
            continue

        agency_overview[agency_name] = {
            "types": {},
            "agency_id": followed_agency_id,
            "published_from": overview_from_date,
            "published_to": today_str
        }

        for doc_type in distinct_document_types:
            if doc_type == 'Enforcement Metadata' or doc_type == 'Mainstream News':
                continue

            params = MultiDict({
Exemplo n.º 15
0
from lxml import etree
from dateutil import parser
from werkzeug.datastructures import MultiDict

this_folder = os.path.dirname(os.path.realpath(__file__))
sys.path.append(this_folder + '/..')

import schemas.jurasticsearch as jsearch
from helpers.document_helper import get_filtered_documents

# pull the set of document categories currently present in the index
distinct_document_types = jsearch.get_distinct_attribute_values('category')

# hard-coded sample document used to exercise the XML export below
document_id = 1705526
comments_date = "01/17/2017"  # NOTE(review): unused in the visible code -- presumably consumed further down
effective_date = None
doc = jsearch.get_record(document_id)

#json.dump(doc, open('socgen.json', 'wb'))

# root element of the XML document being assembled
document = etree.Element("document")

def create_basic_doc(document, doc_dict):
    basic_fields = ['title', 'category', 'pdf_url', 'web_url']
    for field_name in basic_fields:
        field_entry = etree.SubElement(document, field_name)
        field_entry.text = doc_dict[field_name]
    publication_date = etree.SubElement(document, "publication_date")
    publication_date.text = parser.parse(doc_dict['publication_date']).strftime("%m/%d/%Y")
    sources = etree.SubElement(document, "sources")
    for a in doc_dict['agencies']:
        source = etree.SubElement(sources, "source")