Example #1
def create_docs_index():
    """
    Initialize Elasticsearch for storing legal documents.
    Create the `docs` index, and set up the aliases `docs_index` and `docs_search`
    to point to the `docs` index. If the `docs` index already exists, delete it.
    """

    es = utils.get_elasticsearch_connection()
    try:
        logger.info("Delete index 'docs'")
        es.indices.delete('docs')
    except elasticsearch.exceptions.NotFoundError:
        pass

    try:
        logger.info("Delete index 'docs_index'")
        es.indices.delete('docs_index')
    except elasticsearch.exceptions.NotFoundError:
        pass

    logger.info("Create index 'docs'")
    es.indices.create(
        'docs', {
            "mappings": MAPPINGS,
            "settings": ANALYZER_SETTINGS,
            "aliases": {
                'docs_index': {},
                'docs_search': {}
            }
        })
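The examples on this page assume a `utils.get_elasticsearch_connection()` helper that returns an `elasticsearch.Elasticsearch` client. A minimal sketch of what such a helper could look like, assuming the cluster URL comes from an environment variable (the variable name and default are illustrative, not the project's actual configuration):

import os

import elasticsearch


def get_elasticsearch_connection():
    # Hypothetical helper for illustration only: build a client from a URL
    # taken from the environment. The real project may read credentials
    # differently and may cache the client.
    url = os.environ.get('ES_URL', 'http://localhost:9200')
    return elasticsearch.Elasticsearch([url])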
Example #2
def move_archived_murs():
    '''
    Move archived MURs from the `docs` index to `archived_murs_index`.
    This should only need to be run once.
    Once archived MURs are on their own index, we will be able to
    re-index current legal docs after a schema change much more quickly.
    '''
    es = utils.get_elasticsearch_connection()

    body = {
          "source": {
            "index": "docs",
            "type": "murs",
            "query": {
              "match": {
                "mur_type": "archived"
              }
            }
          },
          "dest": {
            "index": "archived_murs"
          }
        }

    logger.info("Copy archived MURs from 'docs' index to 'archived_murs' index")
    es.reindex(body=body, wait_for_completion=True, request_timeout=1500)
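With a very large `docs` index the synchronous reindex above can run into the request timeout. A hedged alternative, sketched with the standard Elasticsearch task API rather than anything shown in this example, is to start the reindex in the background and poll the returned task (same `body` as above):

result = es.reindex(body=body, wait_for_completion=False)
task_id = result.get('task')
logger.info("Reindex running as task %s", task_id)
# Progress can later be checked with: es.tasks.get(task_id=task_id)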
Example #3
def get_title_26_statutes():
    es = utils.get_elasticsearch_connection()

    title_parsed = get_xml_tree_from_url('http://uscode.house.gov/download/' +
                    'releasepoints/us/pl/114/219/xml_usc26@114-219.zip')
    tag_name = '{{http://xml.house.gov/schemas/uslm/1.0}}{0}'
    for subtitle in title_parsed.iter(tag_name.format('subtitle')):
        if subtitle.attrib['identifier'] == '/us/usc/t26/stH':
            for chapter in subtitle.iter(tag_name.format('chapter')):
                match = re.match("/us/usc/t26/stH/ch([0-9]+)",
                                    chapter.attrib['identifier'])
                chapter_no = match.group(1)
                for section in chapter.iter(tag_name.format('section')):
                    text = ''
                    for child in section.iter():
                        if child.text:
                            text += ' %s ' % child.text.strip()
                    heading = section.find(tag_name.format('heading')).text.strip()
                    section_no = re.match('/us/usc/t26/s([0-9]+)',
                             section.attrib['identifier']).group(1)
                    pdf_url = 'http://api.fdsys.gov/link?collection=uscode&' +\
                              'title=26&year=mostrecent&section=%s'\
                              % section_no
                    doc = {"doc_id": section.attrib['identifier'],
                           "text": text,
                           "name": heading,
                           "no": section_no,
                           "title": "26",
                           "chapter": chapter_no,
                           "url": pdf_url}
                    es.index('docs', 'statutes', doc, id=doc['doc_id'])
Example #4
def load_current_murs():
    es = get_elasticsearch_connection()
    bucket = get_bucket()
    bucket_name = env.get_credential('bucket')
    with db.engine.connect() as conn:
        rs = conn.execute(ALL_MURS)
        for row in rs:
            case_id = row['case_id']
            mur = {
                'doc_id': 'mur_%s' % row['case_no'],
                'no': row['case_no'],
                'name': row['name'],
                'mur_type': 'current',
            }
            mur['subject'] = {"text": get_subjects(case_id)}

            participants = get_participants(case_id)
            mur['participants'] = list(participants.values())
            mur['disposition'] = get_disposition(case_id)
            mur['text'], mur['documents'] = get_documents(
                case_id, bucket, bucket_name)
            mur['open_date'], mur['close_date'] = get_open_and_close_dates(
                case_id)
            mur['url'] = '/legal/matter-under-review/%s/' % row['case_no']
            es.index('docs', 'murs', mur, id=mur['doc_id'])
Example #5
def create_docs_index():
    """
    Initialize Elasticsearch for storing legal documents.
    Create the `docs` index, and set up the aliases `docs_index` and `docs_search`
    to point to the `docs` index. If the `docs` index already exists, delete it.
    """

    es = utils.get_elasticsearch_connection()
    try:
        logger.info("Delete index 'docs'")
        es.indices.delete('docs')
    except elasticsearch.exceptions.NotFoundError:
        pass

    try:
        logger.info("Delete index 'docs_index'")
        es.indices.delete('docs_index')
    except elasticsearch.exceptions.NotFoundError:
        pass

    logger.info("Create index 'docs'")
    es.indices.create('docs', {
        "mappings": MAPPINGS,
        "settings": ANALYZER_SETTINGS,
        "aliases": {
            'docs_index': {},
            'docs_search': {}
        }
    })
Example #6
def create_archived_murs_index():
    """
    Initialize Elasticsearch for storing archived MURs.
    If the `archived_murs` index already exists, delete it.
    Create the `archived_murs` index.
    Set up the alias `archived_murs_index` to point to the `archived_murs` index.
    Set up the alias `docs_search` to point to the `archived_murs` index, allowing
    legal search to work across current and archived MURs.
    """

    es = utils.get_elasticsearch_connection()

    try:
        logger.info("Delete index 'archived_murs'")
        es.indices.delete('archived_murs')
    except elasticsearch.exceptions.NotFoundError:
        pass

    logger.info("Create index 'archived_murs' with aliases 'docs_search' and 'archived_murs_index'")
    es.indices.create('archived_murs', {
        "mappings": MAPPINGS,
        "settings": ANALYZER_SETTINGS,
        "aliases": {
            'archived_murs_index': {},
            'docs_search': {}
        }
    })
Example #7
def create_staging_index():
    """
    Create the index `docs_staging`.
    Move the alias docs_index to point to `docs_staging` instead of `docs`.
    """
    es = utils.get_elasticsearch_connection()
    try:
        logger.info("Delete index 'docs_staging'")
        es.indices.delete('docs_staging')
    except:
        pass

    logger.info("Create index 'docs_staging'")
    es.indices.create('docs_staging', {
        "mappings": MAPPINGS,
        "settings": ANALYZER_SETTINGS,
    })

    logger.info("Move alias 'docs_index' to point to 'docs_staging'")
    es.indices.update_aliases(
        body={
            "actions": [{
                "remove": {
                    "index": 'docs',
                    "alias": 'docs_index'
                }
            }, {
                "add": {
                    "index": 'docs_staging',
                    "alias": 'docs_index'
                }
            }]
        })
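A quick way to confirm the alias actually moved, shown here as a sketch using the standard indices API (the example itself does not do this):

# Sketch: after the swap, 'docs_index' should resolve only to 'docs_staging'.
aliases = es.indices.get_alias(name='docs_index')
logger.info("'docs_index' now points to: %s", list(aliases.keys()))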
Example #8
def index_regulations():
    eregs_api = env.get_credential('FEC_EREGS_API', '')

    if(eregs_api):
        reg_versions = requests.get(eregs_api + 'regulation').json()['versions']
        es = utils.get_elasticsearch_connection()
        reg_count = 0
        for reg in reg_versions:
            url = '%sregulation/%s/%s' % (eregs_api, reg['regulation'],
                                          reg['version'])
            regulation = requests.get(url).json()
            sections = get_sections(regulation)

            print("Loading part %s" % reg['regulation'])
            for section_label in sections:
                doc_id = '%s_%s' % (section_label[0], section_label[1])
                section_formatted = '%s-%s' % (section_label[0], section_label[1])
                reg_url = '/regulations/{0}/{1}#{0}'.format(section_formatted,
                                                            reg['version'])
                no = '%s.%s' % (section_label[0], section_label[1])
                name = sections[section_label]['title'].split(no)[1].strip()
                doc = {"doc_id": doc_id, "name": name,
                       "text": sections[section_label]['text'], 'url': reg_url,
                       "no": no}

                es.index('docs', 'regulations', doc, id=doc['doc_id'])
            reg_count += 1
        print("%d regulation parts indexed." % reg_count)
    else:
        print("Regs could not be indexed, environment variable not set.")
Example #9
def move_archived_murs():
    '''
    Move archived MURs from the `docs` index to `archived_murs_index`.
    This should only need to be run once.
    Once archived MURs are on their own index, we will be able to
    re-index current legal docs after a schema change much more quickly.
    '''
    es = utils.get_elasticsearch_connection()

    body = {
        "source": {
            "index": "docs",
            "type": "murs",
            "query": {
                "match": {
                    "mur_type": "archived"
                }
            }
        },
        "dest": {
            "index": "archived_murs"
        }
    }

    logger.info(
        "Copy archived MURs from 'docs' index to 'archived_murs' index")
    es.reindex(body=body, wait_for_completion=True, request_timeout=1500)
Example #10
def get_citations(ao_names):
    ao_component_to_name_map = {tuple(map(int, a.split('-'))): a for a in ao_names}

    logger.info("Getting citations...")

    rs = db.engine.execute("""SELECT ao_no, ocrtext FROM aouser.document
                                INNER JOIN aouser.ao USING (ao_id)
                              WHERE category = 'Final Opinion'""")

    all_regulatory_citations = set()
    all_statutory_citations = set()
    raw_citations = defaultdict(lambda: defaultdict(set))
    for row in rs:
        logger.debug("Getting citations for AO %s" % row["ao_no"])

        ao_citations_in_doc = parse_ao_citations(row["ocrtext"], ao_component_to_name_map)
        ao_citations_in_doc.discard(row["ao_no"])  # Remove self

        raw_citations[row["ao_no"]]["ao"].update(ao_citations_in_doc)

        for citation in ao_citations_in_doc:
            raw_citations[citation]["aos_cited_by"].add(row["ao_no"])

        statutory_citations = parse_statutory_citations(row["ocrtext"])
        regulatory_citations = parse_regulatory_citations(row["ocrtext"])
        all_statutory_citations.update(statutory_citations)
        all_regulatory_citations.update(regulatory_citations)
        raw_citations[row["ao_no"]]["statutes"].update(statutory_citations)
        raw_citations[row["ao_no"]]["regulations"].update(regulatory_citations)

    citations = defaultdict(lambda: defaultdict(list))
    for ao in raw_citations:
        citations[ao]["ao"] = sorted([
            {"no": c, "name": ao_names[c]}
            for c in raw_citations[ao]["ao"]], key=lambda d: d["no"])
        citations[ao]["aos_cited_by"] = sorted([
            {"no": c, "name": ao_names[c]}
            for c in raw_citations[ao]["aos_cited_by"]], key=lambda d: d["no"])
        citations[ao]["statutes"] = sorted([
            {"title": c[0], "section": c[1]}
            for c in raw_citations[ao]["statutes"]], key=lambda d: (d["title"], d["section"]))
        citations[ao]["regulations"] = sorted([
            {"title": c[0], "part": c[1], "section": c[2]}
            for c in raw_citations[ao]["regulations"]], key=lambda d: (d["title"], d["part"], d["section"]))

    es = get_elasticsearch_connection()

    for citation in all_regulatory_citations:
        entry = {'citation_text': '%d CFR §%d.%d'
                 % (citation[0], citation[1], citation[2]),
                 'citation_type': 'regulation'}
        es.index('docs_index', 'citations', entry, id=entry['citation_text'])

    for citation in all_statutory_citations:
        entry = {'citation_text': '%d U.S.C. §%d'
                 % (citation[0], citation[1]), 'citation_type': 'statute'}
        es.index('docs_index', 'citations', entry, id=entry['citation_text'])

    logger.info("Citations loaded.")

    return citations
Example #11
def restore_from_staging_index():
    """
    A 4-step process:
    1. Move the alias docs_search to point to `docs_staging` instead of `docs`.
    2. Reinitialize the index `docs`.
    3. Reindex `docs_staging` to `docs`.
    4. Move `docs_index` and `docs_search` aliases to point to the `docs` index.
       Delete index `docs_staging`.
    """
    es = utils.get_elasticsearch_connection()

    logger.info("Move alias 'docs_search' to point to 'docs_staging'")
    es.indices.update_aliases(
        body={
            "actions": [
                {"remove": {"index": 'docs', "alias": 'docs_search'}},
                {"add": {"index": 'docs_staging', "alias": 'docs_search'}},
            ]
        }
    )

    logger.info("Delete and re-create index 'docs'")
    es.indices.delete('docs')
    es.indices.create('docs', {"mappings": MAPPINGS, "settings": ANALYZER_SETTINGS})

    logger.info("Reindex all documents from index 'docs_staging' to index 'docs'")

    body = {"source": {"index": "docs_staging", }, "dest": {"index": "docs"}}
    es.reindex(body=body, wait_for_completion=True, request_timeout=1500)

    move_aliases_to_docs_index()
Example #12
def create_archived_murs_index():
    """
    Initialize Elasticsearch for storing archived MURs.
    If the `archived_murs` index already exists, delete it.
    Create the `archived_murs` index.
    Set up the alias `archived_murs_index` to point to the `archived_murs` index.
    Set up the alias `docs_search` to point to the `archived_murs` index, allowing
    legal search to work across current and archived MURs.
    """

    es = utils.get_elasticsearch_connection()

    try:
        logger.info("Delete index 'archived_murs'")
        es.indices.delete('archived_murs')
    except elasticsearch.exceptions.NotFoundError:
        pass

    logger.info(
        "Create index 'archived_murs' with aliases 'docs_search' and 'archived_murs_index'"
    )
    es.indices.create(
        'archived_murs', {
            "mappings": MAPPINGS,
            "settings": ANALYZER_SETTINGS,
            "aliases": {
                'archived_murs_index': {},
                'docs_search': {}
            }
        })
Example #13
def delete_from_es(index, doc_type):
    """
    Deletes all documents with the given `doc_type` from Elasticsearch
    """
    es = utils.get_elasticsearch_connection()
    es.delete_by_query(
        index=index, body={'query': {'match_all': {}}}, doc_type=doc_type
    )
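A usage sketch: clearing all current MURs before a full reload might look like this, with the index and doc_type names taken from the other examples on this page:

delete_from_es('docs_index', 'murs')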
Example #14
def process_mur(mur):
    logger.info("processing mur %d of %d" % (mur[0], mur[1]))
    es = utils.get_elasticsearch_connection()
    bucket = get_bucket()
    bucket_name = env.get_credential('bucket')
    mur_names = get_mur_names()
    (mur_no_td, open_date_td, close_date_td, parties_td, subject_td, citations_td)\
        = re.findall("<td[^>]*>(.*?)</td>", mur[2], re.S)
    mur_no = re.search("/disclosure_data/mur/([0-9_A-Z]+)\.pdf",
                       mur_no_td).group(1)
    logger.info("processing mur %s" % mur_no)
    pdf_key = 'legal/murs/%s.pdf' % mur_no
    if [k for k in bucket.objects.filter(Prefix=pdf_key)]:
        logger.info('already processed %s' % pdf_key)
        return
    text, pdf_size, pdf_pages = process_mur_pdf(mur_no, pdf_key, bucket)
    pdf_url = generate_aws_s3_url(bucket_name, pdf_key)
    open_date, close_date = (None, None)
    if open_date_td:
        open_date = datetime.strptime(open_date_td, '%m/%d/%Y').isoformat()
    if close_date_td:
        close_date = datetime.strptime(close_date_td, '%m/%d/%Y').isoformat()
    parties = re.findall("(.*?)<br>", parties_td)
    complainants = []
    respondents = []
    for party in parties:
        match = re.match("\(([RC])\) - (.*)", party)
        name = match.group(2).strip().title()
        if match.group(1) == 'C':
            complainants.append(name)
        if match.group(1) == 'R':
            respondents.append(name)

    subject = get_subject_tree(subject_td)
    citations = get_citations(re.findall("(.*?)<br>", citations_td))

    mur_digits = re.match("([0-9]+)", mur_no).group(1)
    name = mur_names[mur_digits] if mur_digits in mur_names else ''
    doc = {
        'doc_id': 'mur_%s' % mur_no,
        'no': mur_no,
        'name': name,
        'text': text,
        'mur_type': 'archived',
        'pdf_size': pdf_size,
        'pdf_pages': pdf_pages,
        'open_date': open_date,
        'close_date': close_date,
        'complainants': complainants,
        'respondents': respondents,
        'subject': subject,
        'citations': citations,
        'url': pdf_url
    }
    es.index(DOCS_INDEX, 'murs', doc, id=doc['doc_id'])
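This example assumes a `generate_aws_s3_url` helper. Judging by how Example #31 below builds its PDF URLs, a minimal sketch might be (an assumption, not necessarily the project's implementation):

def generate_aws_s3_url(bucket_name, key):
    # Hypothetical: virtual-hosted-style S3 URL, mirroring the pattern used
    # for advisory opinion PDFs in Example #31.
    return "https://%s.s3.amazonaws.com/%s" % (bucket_name, key)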
Example #15
def remove_legal_docs():
    es = utils.get_elasticsearch_connection()
    es.delete_index('docs')
    es.create_index('docs', {"mappings": {
                             "_default_": {
                                "properties": {
                                        "no": {
                                            "type": "string",
                                            "index": "not_analyzed"
                                        }
                                    }
                                }}})
Example #16
def delete_docs_index():
    """
    Delete index `docs`.
    This is usually done in preparation for restoring indexes from a snapshot backup.
    """

    es = utils.get_elasticsearch_connection()
    try:
        logger.info("Delete index 'docs'")
        es.indices.delete('docs')
    except elasticsearch.exceptions.NotFoundError:
        pass
Example #17
def delete_docs_index():
    """
    Delete index `docs`.
    This is usually done in preparation for restoring indexes from a snapshot backup.
    """

    es = utils.get_elasticsearch_connection()
    try:
        logger.info("Delete index 'docs'")
        es.indices.delete('docs')
    except elasticsearch.exceptions.NotFoundError:
        pass
Example #18
def get_most_recent_snapshot(repository_name=None):
    '''
    Get the list of snapshots (sorted by date, ascending) and
    return the name of the most recent snapshot.
    '''
    es = utils.get_elasticsearch_connection()

    repository_name = repository_name or BACKUP_REPOSITORY_NAME
    logger.info("Retreiving most recent snapshot")
    snapshot_list = es.snapshot.get(repository=repository_name,
                                    snapshot="*").get('snapshots')

    return snapshot_list.pop().get('snapshot')
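The final `pop()` relies on Elasticsearch returning the snapshots oldest-first. To make the ordering explicit, the list could be sorted before taking the last entry; a sketch assuming each snapshot entry carries the standard `start_time_in_millis` field:

# Sketch: sort explicitly by snapshot start time, then take the newest.
snapshot_list.sort(key=lambda s: s.get('start_time_in_millis', 0))
most_recent = snapshot_list[-1].get('snapshot')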
Example #19
def process_mur(mur):
    es = utils.get_elasticsearch_connection()
    bucket = get_bucket()
    mur_names = get_mur_names()
    (mur_no_td, open_date_td, close_date_td, parties_td, subject_td, citations_td)\
        = re.findall("<td[^>]*>(.*?)</td>", mur[2], re.S)
    mur_no = re.search("/disclosure_data/mur/([0-9_A-Z]+)\.pdf",
                       mur_no_td).group(1)
    logger.info("Loading archived MUR %s: %s of %s", mur_no, mur[0] + 1,
                mur[1])
    pdf_key = 'legal/murs/%s.pdf' % mur_no
    text, pdf_size, pdf_pages = process_mur_pdf(mur_no, pdf_key, bucket)
    pdf_url = '/files/' + pdf_key
    open_date, close_date = (None, None)
    if open_date_td:
        open_date = datetime.strptime(open_date_td, '%m/%d/%Y').isoformat()
    if close_date_td:
        close_date = datetime.strptime(close_date_td, '%m/%d/%Y').isoformat()
    parties = re.findall("(.*?)<br>", parties_td)
    complainants = []
    respondents = []
    for party in parties:
        match = re.match("\(([RC])\) - (.*)", party)
        name = match.group(2).strip().title()
        if match.group(1) == 'C':
            complainants.append(name)
        if match.group(1) == 'R':
            respondents.append(name)

    subject = get_subject_tree(subject_td)
    citations = get_citations(re.findall("(.*?)<br>", citations_td))

    mur_digits = re.match("([0-9]+)", mur_no).group(1)
    name = mur_names[mur_digits] if mur_digits in mur_names else ''
    doc = {
        'doc_id': 'mur_%s' % mur_no,
        'no': mur_no,
        'name': name,
        'text': text,
        'mur_type': 'archived',
        'pdf_size': pdf_size,
        'pdf_pages': pdf_pages,
        'open_date': open_date,
        'close_date': close_date,
        'complainants': complainants,
        'respondents': respondents,
        'subject': subject,
        'citations': citations,
        'url': pdf_url
    }
    es.index(DOCS_INDEX, 'murs', doc, id=doc['doc_id'])
Example #20
def process_murs(raw_mur_tr_element_list):
    es = utils.get_elasticsearch_connection()
    bucket = get_bucket()
    mur_names = get_mur_names()

    for index, raw_mur_tr_element in enumerate(raw_mur_tr_element_list):
        (mur_no_td, open_date_td, close_date_td, parties_td, subject_td, citations_td)\
            = re.findall("<td[^>]*>(.*?)</td>", raw_mur_tr_element, re.S)
        mur_no = re.search("/disclosure_data/mur/([0-9]+)(?:_[A-H])*\.pdf",
                           mur_no_td).group(1)

        logger.info("Loading archived MUR %s: %s of %s", mur_no, index + 1,
                    len(raw_mur_tr_element_list))

        open_date, close_date = (None, None)
        if open_date_td:
            open_date = datetime.strptime(open_date_td, '%m/%d/%Y').isoformat()
        if close_date_td:
            close_date = datetime.strptime(close_date_td,
                                           '%m/%d/%Y').isoformat()

        parties = re.findall("(.*?)<br>", parties_td)
        complainants = []
        respondents = []
        for party in parties:
            match = re.match("\(([RC])\) - (.*)", party)
            name = match.group(2).strip().title()
            if match.group(1) == 'C':
                complainants.append(name)
            if match.group(1) == 'R':
                respondents.append(name)

        mur_name = mur_names.get(mur_no, '')
        mur = {
            'doc_id': 'mur_%s' % mur_no,
            'no': mur_no,
            'name': mur_name,
            'mur_type': 'archived',
            'open_date': open_date,
            'close_date': close_date,
            'complainants': complainants,
            'respondents': respondents,
            'url': '/legal/matter-under-review/{0}/'.format(mur_no)
        }
        mur['subject'] = get_subject_tree(subject_td)
        mur['citations'] = get_citations(re.findall("(.*?)<br>", citations_td))
        mur['documents'] = get_documents(mur_no_td, bucket)

        es.index('archived_murs_index', 'murs', mur, id=mur['doc_id'])
Example #21
def process_mur(mur):
    es = utils.get_elasticsearch_connection()
    bucket = get_bucket()
    mur_names = get_mur_names()
    (mur_no_td, open_date_td, close_date_td, parties_td, subject_td, citations_td)\
        = re.findall("<td[^>]*>(.*?)</td>", mur[2], re.S)
    mur_no = re.search("/disclosure_data/mur/([0-9_A-Z]+)\.pdf", mur_no_td).group(1)
    logger.info("Loading archived MUR %s: %s of %s", mur_no, mur[0] + 1, mur[1])
    pdf_key = 'legal/murs/%s.pdf' % mur_no
    text, pdf_size, pdf_pages = process_mur_pdf(mur_no, pdf_key, bucket)
    pdf_url = '/files/' + pdf_key
    open_date, close_date = (None, None)
    if open_date_td:
        open_date = datetime.strptime(open_date_td, '%m/%d/%Y').isoformat()
    if close_date_td:
        close_date = datetime.strptime(close_date_td, '%m/%d/%Y').isoformat()
    parties = re.findall("(.*?)<br>", parties_td)
    complainants = []
    respondents = []
    for party in parties:
        match = re.match("\(([RC])\) - (.*)", party)
        name = match.group(2).strip().title()
        if match.group(1) == 'C':
            complainants.append(name)
        if match.group(1) == 'R':
            respondents.append(name)

    subject = get_subject_tree(subject_td)
    citations = get_citations(re.findall("(.*?)<br>", citations_td))

    mur_digits = re.match("([0-9]+)", mur_no).group(1)
    name = mur_names[mur_digits] if mur_digits in mur_names else ''
    doc = {
        'doc_id': 'mur_%s' % mur_no,
        'no': mur_no,
        'name': name,
        'text': text,
        'mur_type': 'archived',
        'pdf_size': pdf_size,
        'pdf_pages': pdf_pages,
        'open_date': open_date,
        'close_date': close_date,
        'complainants': complainants,
        'respondents': respondents,
        'subject': subject,
        'citations': citations,
        'url': pdf_url
    }
    es.index('archived_murs_index', 'murs', doc, id=doc['doc_id'])
Example #22
def load_current_murs(mur_no=None):
    """
    Reads data for current MURs from a Postgres database, assembles a JSON document
    corresponding to the MUR and indexes this document in Elasticsearch in the index
    `docs_index` with a doc_type of `murs`. In addition, all documents attached to
    the MUR are uploaded to an S3 bucket under the _directory_ `legal/murs/current/`.
    """
    es = get_elasticsearch_connection()
    logger.info("Loading current MUR(s)")
    mur_count = 0
    for mur in get_murs(mur_no):
        logger.info("Loading current MUR: %s", mur['no'])
        es.index('docs_index', 'murs', mur, id=mur['doc_id'])
        mur_count += 1
    logger.info("%d current MUR(s) loaded", mur_count)
Example #23
def load_advisory_opinions(from_ao_no=None):
    """
    Reads data for advisory opinions from a Postgres database, assembles a JSON document
    corresponding to the advisory opinion and indexes this document in Elasticsearch in
    the index `docs_index` with a doc_type of `advisory_opinions`. In addition, all documents
    attached to the advisory opinion are uploaded to an S3 bucket under the _directory_
    `legal/aos/`.
    """
    es = get_elasticsearch_connection()

    logger.info("Loading advisory opinions")
    ao_count = 0
    for ao in get_advisory_opinions(from_ao_no):
        logger.info("Loading AO: %s", ao['no'])
        es.index(DOCS_INDEX, 'advisory_opinions', ao, id=ao['no'])
        ao_count += 1
    logger.info("%d advisory opinions loaded", ao_count)
Example #24
def restore_from_staging_index():
    """
    A 4-step process:
    1. Move the alias docs_search to point to `docs_staging` instead of `docs`.
    2. Reinitialize the index `docs`.
    3. Reindex `docs_staging` to `docs`.
    4. Move `docs_index` and `docs_search` aliases to point to the `docs` index.
       Delete index `docs_staging`.
    """
    es = utils.get_elasticsearch_connection()

    logger.info("Move alias 'docs_search' to point to 'docs_staging'")
    es.indices.update_aliases(body={"actions": [
        {"remove": {"index": 'docs', "alias": 'docs_search'}},
        {"add": {"index": 'docs_staging', "alias": 'docs_search'}}
    ]})

    logger.info("Delete and re-create index 'docs'")
    es.indices.delete('docs')
    es.indices.create('docs', {
        "mappings": MAPPINGS,
        "settings": ANALYZER_SETTINGS
    })

    logger.info("Reindex all documents from index 'docs_staging' to index 'docs'")

    body = {
      "source": {
        "index": "docs_staging",
      },
      "dest": {
        "index": "docs"
      }
    }
    es.reindex(body=body, wait_for_completion=True, request_timeout=1500)

    logger.info("Move aliases 'docs_index' and 'docs_search' to point to 'docs'")
    es.indices.update_aliases(body={"actions": [
        {"remove": {"index": 'docs_staging', "alias": 'docs_index'}},
        {"remove": {"index": 'docs_staging', "alias": 'docs_search'}},
        {"add": {"index": 'docs', "alias": 'docs_index'}},
        {"add": {"index": 'docs', "alias": 'docs_search'}}
    ]})
    logger.info("Delete index 'docs_staging'")
    es.indices.delete('docs_staging')
Example #25
def configure_backup_repository(repository=BACKUP_REPOSITORY_NAME):
    '''
    Configure the S3 backup repository using API credentials.
    This needs to be re-run whenever the S3 credentials change for an API deployment.
    '''
    es = utils.get_elasticsearch_connection()
    logger.info("Configuring backup repository: {0}".format(repository))
    body = {
        'type': 's3',
        'settings': {
            'bucket': env.get_credential("bucket"),
            'region': env.get_credential("region"),
            'access_key': env.get_credential("access_key_id"),
            'secret_key': env.get_credential("secret_access_key"),
            'base_path': BACKUP_DIRECTORY,
        },
    }
    es.snapshot.create_repository(repository=repository, body=body)
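To verify that the repository was registered, its definition can be read back; a sketch using the standard snapshot API (not part of the example above):

# Sketch: fetch the registered repositories and log their names.
repositories = es.snapshot.get_repository(repository=repository)
logger.info("Registered backup repositories: %s", list(repositories.keys()))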
Example #26
def remap_archived_murs_citations():
    """Re-map citations for archived MURs. To extract the MUR
    information from the archived PDFs, use load_archived_murs"""

    es = utils.get_elasticsearch_connection()

    # Fetch archived murs from ES
    query = Search() \
            .query(Q('term', mur_type='archived') &  Q('term', _type='murs')) \
            .source(include='citations')
    archived_murs = elasticsearch.helpers.scan(es, query.to_dict(), scroll='1m', index='docs', doc_type='murs', size=500)

    # Re-map the citations
    update_murs = (dict(_op_type='update', _id=mur.meta.id, doc=mur.to_dict()) for mur in remap_citations(archived_murs))

    # Save MURs to ES
    count, _ = elasticsearch.helpers.bulk(es, update_murs, index='docs', doc_type='murs', chunk_size=100, request_timeout=30)
    logger.info("Re-mapped %d archived MURs" % count)
Example #27
def create_elasticsearch_backup(repository_name=None, snapshot_name="auto_backup"):
    '''
    Create an Elasticsearch snapshot in the `legal_s3_repository` or a specified repository.
    '''
    es = utils.get_elasticsearch_connection()

    repository_name = repository_name or BACKUP_REPOSITORY_NAME
    configure_backup_repository(repository_name)

    snapshot_name = "{0}_{1}".format(
        datetime.datetime.today().strftime('%Y%m%d'), snapshot_name
    )
    logger.info("Creating snapshot {0}".format(snapshot_name))
    result = es.snapshot.create(repository=repository_name, snapshot=snapshot_name)
    if result.get('accepted'):
        logger.info("Successfully created snapshot: {0}".format(snapshot_name))
    else:
        logger.error("Unable to create snapshot: {0}".format(snapshot_name))
Example #28
def load_advisory_opinions(from_ao_no=None):
    """
    Reads data for advisory opinions from a Postgres database,
    assembles a JSON document corresponding to the advisory opinion
    and indexes this document in Elasticsearch in the index `docs_index`
    with a doc_type of `advisory_opinions`.
    In addition, all documents attached to the advisory opinion
    are uploaded to an S3 bucket under the _directory_ `legal/aos/`.
    """
    es = get_elasticsearch_connection()

    logger.info("Loading advisory opinions")
    ao_count = 0
    for ao in get_advisory_opinions(from_ao_no):
        logger.info("Loading AO: %s", ao['no'])
        es.index('docs_index', 'advisory_opinions', ao, id=ao['no'])
        ao_count += 1
    logger.info("%d advisory opinions loaded", ao_count)
Example #29
def index_regulations():
    """
        Indexes the regulations relevant to the FEC in Elasticsearch.
        The regulations are accessed from FEC_EREGS_API.
    """
    eregs_api = env.get_credential('FEC_EREGS_API', '')
    if not eregs_api:
        logger.error(
            "Regs could not be indexed, environment variable FEC_EREGS_API not set."
        )
        return

    logger.info("Indexing regulations")
    reg_versions = requests.get(eregs_api + 'regulation').json()['versions']
    es = utils.get_elasticsearch_connection()
    reg_count = 0
    for reg in reg_versions:
        url = '%sregulation/%s/%s' % (eregs_api, reg['regulation'],
                                      reg['version'])
        regulation = requests.get(url).json()
        sections = get_sections(regulation)

        logger.debug("Loading part %s" % reg['regulation'])
        for section_label in sections:
            doc_id = '%s_%s' % (section_label[0], section_label[1])
            section_formatted = '%s-%s' % (section_label[0], section_label[1])
            reg_url = '/regulations/{0}/{1}#{0}'.format(
                section_formatted, reg['version'])
            no = '%s.%s' % (section_label[0], section_label[1])
            name = sections[section_label]['title'].split(no)[1].strip()
            doc = {
                "doc_id": doc_id,
                "name": name,
                "text": sections[section_label]['text'],
                "url": reg_url,
                "no": no,
                "sort1": int(section_label[0]),
                "sort2": int(section_label[1])
            }

            es.index(DOCS_INDEX, 'regulations', doc, id=doc['doc_id'])
        reg_count += 1
    logger.info("%d regulation parts indexed", reg_count)
Example #30
def get_title_52_statutes():
    es = utils.get_elasticsearch_connection()

    title_parsed = get_xml_tree_from_url(
        'http://uscode.house.gov/download/' +
        'releasepoints/us/pl/114/219/xml_usc52@114-219.zip')
    tag_name = '{{http://xml.house.gov/schemas/uslm/1.0}}{0}'
    section_count = 0
    for subtitle in title_parsed.iter(tag_name.format('subtitle')):
        if subtitle.attrib['identifier'] == '/us/usc/t52/stIII':
            for subchapter in subtitle.iter(tag_name.format('subchapter')):
                match = re.match("/us/usc/t52/stIII/ch([0-9]+)/sch([IVX]+)",
                                 subchapter.attrib['identifier'])
                chapter = match.group(1)
                subchapter_no = match.group(2)
                for section in subchapter.iter(tag_name.format('section')):
                    text = ''
                    for child in section.iter():
                        if child.text:
                            text += ' %s ' % child.text.strip()
                    heading = section.find(
                        tag_name.format('heading')).text.strip()
                    section_no = re.match(
                        '/us/usc/t52/s([0-9]+)',
                        section.attrib['identifier']).group(1)
                    pdf_url = 'http://api.fdsys.gov/link?collection=uscode&' +\
                              'title=52&year=mostrecent&section=%s'\
                              % section_no
                    doc = {
                        "doc_id": section.attrib['identifier'],
                        "text": text,
                        "name": heading,
                        "no": section_no,
                        "title": "52",
                        "chapter": chapter,
                        "subchapter": subchapter_no,
                        "url": pdf_url,
                        "sort1": 52,
                        "sort2": int(section_no)
                    }
                    es.index(DOCS_INDEX, 'statutes', doc, id=doc['doc_id'])
                    section_count += 1
    return section_count
Example #31
def index_advisory_opinions():
    print('Indexing advisory opinions...')

    if legal_loaded():
        count = db.engine.execute('select count(*) from AO').fetchone()[0]
        print('AO count: %d' % count)
        count = db.engine.execute(
            'select count(*) from DOCUMENT').fetchone()[0]
        print('DOC count: %d' % count)

        es = utils.get_elasticsearch_connection()

        result = db.engine.execute("""select DOCUMENT_ID, OCRTEXT, DESCRIPTION,
                                CATEGORY, DOCUMENT.AO_ID, NAME, SUMMARY,
                                TAGS, AO_NO, DOCUMENT_DATE FROM DOCUMENT INNER JOIN
                                AO on AO.AO_ID = DOCUMENT.AO_ID""")

        docs_loaded = 0
        bucket_name = env.get_credential('bucket')
        for row in result:
            key = "legal/aos/%s.pdf" % row[0]
            pdf_url = "https://%s.s3.amazonaws.com/%s" % (bucket_name, key)
            doc = {
                "doc_id": row[0],
                "text": row[1],
                "description": row[2],
                "category": row[3],
                "id": row[4],
                "name": row[5],
                "summary": row[6],
                "tags": row[7],
                "no": row[8],
                "date": row[9],
                "url": pdf_url
            }

            es.index('docs', 'advisory_opinions', doc, id=doc['doc_id'])
            docs_loaded += 1

            if docs_loaded % 500 == 0:
                print("%d docs loaded" % docs_loaded)
        print("%d docs loaded" % docs_loaded)
Example #32
def index_regulations():
    """
        Indexes the regulations relevant to the FEC in Elasticsearch.
        The regulations are accessed from FEC_EREGS_API.
    """
    eregs_api = env.get_credential('FEC_EREGS_API', '')
    if not eregs_api:
        logger.error("Regs could not be indexed, environment variable FEC_EREGS_API not set.")
        return

    logger.info("Indexing regulations")
    reg_versions = requests.get(eregs_api + 'regulation').json()['versions']
    es = utils.get_elasticsearch_connection()
    reg_count = 0
    for reg in reg_versions:
        url = '%sregulation/%s/%s' % (eregs_api, reg['regulation'],
                                        reg['version'])
        regulation = requests.get(url).json()
        sections = get_sections(regulation)

        logger.debug("Loading part %s" % reg['regulation'])
        for section_label in sections:
            doc_id = '%s_%s' % (section_label[0], section_label[1])
            section_formatted = '%s-%s' % (section_label[0], section_label[1])
            reg_url = '/regulations/{0}/{1}#{0}'.format(section_formatted,
                                                        reg['version'])
            no = '%s.%s' % (section_label[0], section_label[1])
            name = sections[section_label]['title'].split(no)[1].strip()
            doc = {
                "doc_id": doc_id,
                "name": name,
                "text": sections[section_label]['text'],
                "url": reg_url,
                "no": no,
                "sort1": int(section_label[0]),
                "sort2": int(section_label[1])
            }

            es.index('docs_index', 'regulations', doc, id=doc['doc_id'])
        reg_count += 1
    logger.info("%d regulation parts indexed", reg_count)
Example #33
def load_cases(case_type, case_no=None):
    """
    Reads data for current MURs, AFs, and ADRs from a Postgres database,
    assembles a JSON document corresponding to the case, and indexes this document
    in Elasticsearch in the index `docs_index` with a doc_type of `murs`, `adrs`, or `admin_fines`.
    In addition, all documents attached to the case are uploaded to an
    S3 bucket under the _directory_ `legal/<doc_type>/<id>/`.
    """
    if case_type in ('MUR', 'ADR', 'AF'):
        es = get_elasticsearch_connection()
        logger.info("Loading {0}(s)".format(case_type))
        case_count = 0
        for case in get_cases(case_type, case_no):
            if case is not None:
                logger.info("Loading {0}: {1}".format(case_type, case['no']))
                es.index('docs_index',
                         get_es_type(case_type),
                         case,
                         id=case['doc_id'])
                case_count += 1
        logger.info("{0} {1}(s) loaded".format(case_count, case_type))
Example #34
def load_archived_murs(mur_no=None):
    """
    Reads data for archived MURs from a Postgres database (under schema `mur_arch`),
    assembles a JSON document corresponding to the mur, and indexes this document
    in Elasticsearch in the index `archived_murs` with a doc_type of `murs`.
    """
    es = utils.get_elasticsearch_connection()
    mur_count = 0
    for mur in get_murs(mur_no):
        if mur is not None:
            logger.info("Loading archived MUR No: {0}".format(mur["no"]))
            es.index("archived_murs", get_es_type(), mur, id=mur["doc_id"])
            mur_count += 1

            logger.info("{0} Archived Mur(s) loaded".format(mur_count))
        else:
            logger.error("Invalid archived MUR")

        # For debug use: display the JSON representation of the "mur" object
        logger.debug("mur_json_data =" +
                     json.dumps(mur, indent=4, cls=DateTimeEncoder))
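The debug line assumes a `DateTimeEncoder` that makes `date`/`datetime` values JSON-serializable. A minimal sketch of such an encoder (an assumption about its shape, not necessarily the project's actual class):

import datetime
import json


class DateTimeEncoder(json.JSONEncoder):
    def default(self, obj):
        # Render dates and datetimes as ISO-8601 strings; everything else
        # falls through to the base encoder (which raises TypeError).
        if isinstance(obj, (datetime.datetime, datetime.date)):
            return obj.isoformat()
        return super(DateTimeEncoder, self).default(obj)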
Example #35
def move_aliases_to_docs_index():
    """
    Move `docs_index` and `docs_search` aliases to point to the `docs` index.
    Delete index `docs_staging`.
    """

    es = utils.get_elasticsearch_connection()

    logger.info("Move aliases 'docs_index' and 'docs_search' to point to 'docs'")
    es.indices.update_aliases(
        body={
            "actions": [
                {"remove": {"index": 'docs_staging', "alias": 'docs_index'}},
                {"remove": {"index": 'docs_staging', "alias": 'docs_search'}},
                {"add": {"index": 'docs', "alias": 'docs_index'}},
                {"add": {"index": 'docs', "alias": 'docs_search'}},
            ]
        }
    )
    logger.info("Delete index 'docs_staging'")
    es.indices.delete('docs_staging')
Example #36
def get_title_52_statutes():
    es = utils.get_elasticsearch_connection()

    title_parsed = get_xml_tree_from_url('http://uscode.house.gov/download/' +
                    'releasepoints/us/pl/114/219/xml_usc52@114-219.zip')
    tag_name = '{{http://xml.house.gov/schemas/uslm/1.0}}{0}'
    section_count = 0
    for subtitle in title_parsed.iter(tag_name.format('subtitle')):
        if subtitle.attrib['identifier'] == '/us/usc/t52/stIII':
            for subchapter in subtitle.iter(tag_name.format('subchapter')):
                match = re.match("/us/usc/t52/stIII/ch([0-9]+)/sch([IVX]+)",
                                    subchapter.attrib['identifier'])
                chapter = match.group(1)
                subchapter_no = match.group(2)
                for section in subchapter.iter(tag_name.format('section')):
                    text = ''
                    for child in section.iter():
                        if child.text:
                            text += ' %s ' % child.text.strip()
                    heading = section.find(tag_name.format('heading')).text.strip()
                    section_no = re.match('/us/usc/t52/s([0-9]+)',
                             section.attrib['identifier']).group(1)
                    pdf_url = 'http://api.fdsys.gov/link?collection=uscode&' +\
                              'title=52&year=mostrecent&section=%s'\
                              % section_no
                    doc = {
                        "doc_id": section.attrib['identifier'],
                        "text": text,
                        "name": heading,
                        "no": section_no,
                        "title": "52",
                        "chapter": chapter,
                        "subchapter": subchapter_no,
                        "url": pdf_url,
                        "sort1": 52,
                        "sort2": int(section_no)
                    }
                    es.index('docs_index', 'statutes', doc, id=doc['doc_id'])
                    section_count += 1
    return section_count
Example #37
def restore_elasticsearch_backup(repository_name=None, snapshot_name=None):
    '''
    Restore Elasticsearch from a backup in the event of catastrophic failure at the infrastructure layer or user error.

    -Delete docs index
    -Restore from elasticsearch snapshot
    -Default to most recent snapshot, optionally specify `snapshot_name`
    '''
    es = utils.get_elasticsearch_connection()

    repository_name = repository_name or BACKUP_REPOSITORY_NAME
    configure_backup_repository(repository_name)

    most_recent_snapshot_name = get_most_recent_snapshot(repository_name)
    snapshot_name = snapshot_name or most_recent_snapshot_name

    if es.indices.exists('docs'):
        logger.info(
            'Found docs index. Creating staging index for zero-downtime restore'
        )
        create_staging_index()

    delete_all_indices()

    logger.info("Retrieving snapshot: {0}".format(snapshot_name))
    body = {"indices": "docs,archived_murs"}
    result = es.snapshot.restore(
        repository=BACKUP_REPOSITORY_NAME, snapshot=snapshot_name, body=body
    )
    if result.get('accepted'):
        logger.info("Successfully restored snapshot: {0}".format(snapshot_name))
        if es.indices.exists('docs_staging'):
            move_aliases_to_docs_index()
    else:
        logger.error("Unable to restore snapshot: {0}".format(snapshot_name))
        logger.info(
            "You may want to try the most recent snapshot: {0}".format(
                most_recent_snapshot_name
            )
        )
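Because the restore request only reports that it was accepted, recovery progress has to be checked separately. One hedged way to wait for the restored indexes to become usable, sketched with the standard cluster health API (not shown in the example above):

# Sketch: block until the restored indexes report at least yellow health.
health = es.cluster.health(index="docs,archived_murs",
                           wait_for_status="yellow",
                           request_timeout=300)
logger.info("Cluster status after restore: %s", health.get("status"))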
Example #38
def load_cases(case_type, case_no=None):
    """
    Reads data for current MURs, AFs, and ADRs from a Postgres database,
    assembles a JSON document corresponding to the case, and indexes this document
    in Elasticsearch in the index `docs_index` with a doc_type of `murs`, `adrs`, or `admin_fines`.
    In addition, all documents attached to the case are uploaded to an
    S3 bucket under the _directory_ `legal/<doc_type>/<id>/`.
    """
    if case_type in ('MUR', 'ADR', 'AF'):
        es = get_elasticsearch_connection()
        logger.info("Loading {0}(s)".format(case_type))
        case_count = 0
        for case in get_cases(case_type, case_no):
            if case is not None:
                if case.get('published_flg'):
                    logger.info("Loading {0}: {1}".format(
                        case_type, case['no']))
                    es.index('docs_index',
                             get_es_type(case_type),
                             case,
                             id=case['doc_id'])
                    case_count += 1
                    logger.info("{0} {1}(s) loaded".format(
                        case_count, case_type))
                else:
                    logger.info(
                        "Found an unpublished case - deleting {0}: {1} from ES"
                        .format(case_type, case['no']))
                    es.delete_by_query(
                        index='docs_index',
                        body={'query': {
                            "term": {
                                "no": case['no']
                            }
                        }},
                        doc_type=get_es_type(case_type))
                    logger.info('Successfully deleted {} {} from ES'.format(
                        case_type, case['no']))
Example #39
def create_staging_index():
    """
    Create the index `docs_staging`.
    Move the alias docs_index to point to `docs_staging` instead of `docs`.
    """
    es = utils.get_elasticsearch_connection()
    try:
        logger.info("Delete index 'docs_staging'")
        es.indices.delete('docs_staging')
    except:
        pass

    logger.info("Create index 'docs_staging'")
    es.indices.create('docs_staging', {
        "mappings": MAPPINGS,
        "settings": ANALYZER_SETTINGS,
    })

    logger.info("Move alias 'docs_index' to point to 'docs_staging'")
    es.indices.update_aliases(body={"actions": [
        {"remove": {"index": 'docs', "alias": 'docs_index'}},
        {"add": {"index": 'docs_staging', "alias": 'docs_index'}}
    ]})
Example #40
import re

from elasticsearch_dsl import Search, Q
from webargs import fields
from flask import abort

from webservices import args
from webservices import utils
from webservices.utils import use_kwargs
from elasticsearch import RequestError
from webservices.exceptions import ApiError
import logging


es = utils.get_elasticsearch_connection()
logger = logging.getLogger(__name__)

INNER_HITS = {
    "_source": False,
    "highlight": {
        "require_field_match": False,
        "fields": {
            "documents.text": {},
            "documents.description": {}
        }
    }
}


class GetLegalCitation(utils.Resource):
    @property
Example #41
import re

from elasticsearch_dsl import Search, Q
from webargs import fields
from flask import abort

from webservices import args
from webservices import utils
from webservices.utils import use_kwargs
from webservices.legal_docs import DOCS_SEARCH
from elasticsearch import RequestError
from webservices.exceptions import ApiError
import logging


es = utils.get_elasticsearch_connection()
logger = logging.getLogger(__name__)

INNER_HITS = {
    "_source": False,
    "highlight": {
        "require_field_match": False,
        "fields": {
            "documents.text": {},
            "documents.description": {}
        }
    }
}


class GetLegalCitation(utils.Resource):
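The `INNER_HITS` block above is presumably passed as the `inner_hits` option of a nested query so that highlighted snippets come back per attached document. A sketch of how that might look with `elasticsearch_dsl`, assuming a nested `documents` field as the highlight field names suggest:

q = "contribution"  # illustrative search term
nested_query = Q('nested', path='documents', inner_hits=INNER_HITS,
                 query=Q('match', documents__text=q))
results = Search(using=es, index='docs_search').query(nested_query).execute()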
Example #42
def delete_from_es(index, doc_type):
    """
    Deletes all documents with the given `doc_type` from Elasticsearch
    """
    es = utils.get_elasticsearch_connection()
    es.delete_by_query(index=index, body={'query': {'match_all': {}}}, doc_type=doc_type)
Example #43
def get_citations(ao_names):
    ao_component_to_name_map = {
        tuple(map(int, a.split('-'))): a
        for a in ao_names
    }

    logger.info("Getting citations...")

    rs = db.engine.execute("""SELECT ao_no, ocrtext FROM aouser.document
                                INNER JOIN aouser.ao USING (ao_id)
                              WHERE category = 'Final Opinion'""")

    all_regulatory_citations = set()
    all_statutory_citations = set()
    raw_citations = defaultdict(lambda: defaultdict(set))
    for row in rs:
        logger.debug("Getting citations for AO %s" % row["ao_no"])

        ao_citations_in_doc = parse_ao_citations(row["ocrtext"],
                                                 ao_component_to_name_map)
        ao_citations_in_doc.discard(row["ao_no"])  # Remove self

        raw_citations[row["ao_no"]]["ao"].update(ao_citations_in_doc)

        for citation in ao_citations_in_doc:
            raw_citations[citation]["aos_cited_by"].add(row["ao_no"])

        statutory_citations = parse_statutory_citations(row["ocrtext"])
        regulatory_citations = parse_regulatory_citations(row["ocrtext"])
        all_statutory_citations.update(statutory_citations)
        all_regulatory_citations.update(regulatory_citations)
        raw_citations[row["ao_no"]]["statutes"].update(statutory_citations)
        raw_citations[row["ao_no"]]["regulations"].update(regulatory_citations)

    citations = defaultdict(lambda: defaultdict(list))
    for ao in raw_citations:
        citations[ao]["ao"] = sorted([{
            "no": c,
            "name": ao_names[c]
        } for c in raw_citations[ao]["ao"]],
                                     key=lambda d: d["no"])
        citations[ao]["aos_cited_by"] = sorted([{
            "no": c,
            "name": ao_names[c]
        } for c in raw_citations[ao]["aos_cited_by"]],
                                               key=lambda d: d["no"])
        citations[ao]["statutes"] = sorted([{
            "title": c[0],
            "section": c[1]
        } for c in raw_citations[ao]["statutes"]],
                                           key=lambda d:
                                           (d["title"], d["section"]))
        citations[ao]["regulations"] = sorted(
            [{
                "title": c[0],
                "part": c[1],
                "section": c[2]
            } for c in raw_citations[ao]["regulations"]],
            key=lambda d: (d["title"], d["part"], d["section"]))

    es = get_elasticsearch_connection()

    for citation in all_regulatory_citations:
        entry = {
            'citation_text':
            '%d CFR §%d.%d' % (citation[0], citation[1], citation[2]),
            'citation_type':
            'regulation'
        }
        es.index('docs_index', 'citations', entry, id=entry['citation_text'])

    for citation in all_statutory_citations:
        entry = {
            'citation_text': '%d U.S.C. §%d' % (citation[0], citation[1]),
            'citation_type': 'statute'
        }
        es.index('docs_index', 'citations', entry, id=entry['citation_text'])

    logger.info("Citations loaded.")

    return citations