def create_docs_index():
    """
    Initialize Elasticsearch for storing legal documents.
    Create the `docs` index, and set up the aliases `docs_index` and
    `docs_search` to point to the `docs` index. If the `docs` index
    already exists, delete it.
    """
    es = utils.get_elasticsearch_connection()
    try:
        logger.info("Delete index 'docs'")
        es.indices.delete('docs')
    except elasticsearch.exceptions.NotFoundError:
        pass

    try:
        logger.info("Delete index 'docs_index'")
        es.indices.delete('docs_index')
    except elasticsearch.exceptions.NotFoundError:
        pass

    logger.info("Create index 'docs'")
    es.indices.create('docs', {
        "mappings": MAPPINGS,
        "settings": ANALYZER_SETTINGS,
        "aliases": {
            'docs_index': {},
            'docs_search': {}
        }
    })
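# Note on the aliases above (a reading of this module, not an extra API guarantee):
# the loader functions below write through 'docs_index' while search reads go
# through 'docs_search'. Keeping writes and reads on separate aliases is what
# lets the staging and restore helpers further down swap one alias at a time
# without search downtime.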
def move_archived_murs():
    """
    Move archived MURs from the `docs` index to the `archived_murs` index.

    This should only need to be run once. Once archived MURs are on their own
    index, we will be able to re-index current legal docs after a schema change
    much more quickly.
    """
    es = utils.get_elasticsearch_connection()
    body = {
        "source": {
            "index": "docs",
            "type": "murs",
            "query": {
                "match": {
                    "mur_type": "archived"
                }
            }
        },
        "dest": {
            "index": "archived_murs"
        }
    }
    logger.info("Copy archived MURs from 'docs' index to 'archived_murs' index")
    es.reindex(body=body, wait_for_completion=True, request_timeout=1500)
def get_title_26_statutes():
    es = utils.get_elasticsearch_connection()

    title_parsed = get_xml_tree_from_url('http://uscode.house.gov/download/' +
        'releasepoints/us/pl/114/219/[email protected]')
    tag_name = '{{http://xml.house.gov/schemas/uslm/1.0}}{0}'

    for subtitle in title_parsed.iter(tag_name.format('subtitle')):
        if subtitle.attrib['identifier'] == '/us/usc/t26/stH':
            for chapter in subtitle.iter(tag_name.format('chapter')):
                match = re.match("/us/usc/t26/stH/ch([0-9]+)",
                                 chapter.attrib['identifier'])
                chapter_no = match.group(1)
                for section in chapter.iter(tag_name.format('section')):
                    text = ''
                    for child in section.iter():
                        if child.text:
                            text += ' %s ' % child.text.strip()
                    heading = section.find(tag_name.format('heading')).text.strip()
                    section_no = re.match('/us/usc/t26/s([0-9]+)',
                                          section.attrib['identifier']).group(1)
                    pdf_url = 'http://api.fdsys.gov/link?collection=uscode&' + \
                        'title=26&year=mostrecent&section=%s' % section_no
                    doc = {
                        "doc_id": section.attrib['identifier'],
                        "text": text,
                        "name": heading,
                        "no": section_no,
                        "title": "26",
                        "chapter": chapter_no,
                        "url": pdf_url
                    }
                    es.index('docs', 'statutes', doc, id=doc['doc_id'])
def load_current_murs():
    es = get_elasticsearch_connection()
    bucket = get_bucket()
    bucket_name = env.get_credential('bucket')
    with db.engine.connect() as conn:
        rs = conn.execute(ALL_MURS)
        for row in rs:
            case_id = row['case_id']
            mur = {
                'doc_id': 'mur_%s' % row['case_no'],
                'no': row['case_no'],
                'name': row['name'],
                'mur_type': 'current',
            }
            mur['subject'] = {"text": get_subjects(case_id)}

            participants = get_participants(case_id)
            mur['participants'] = list(participants.values())
            mur['disposition'] = get_disposition(case_id)
            mur['text'], mur['documents'] = get_documents(case_id, bucket, bucket_name)
            mur['open_date'], mur['close_date'] = get_open_and_close_dates(case_id)
            mur['url'] = '/legal/matter-under-review/%s/' % row['case_no']
            es.index('docs', 'murs', mur, id=mur['doc_id'])
def create_archived_murs_index():
    """
    Initialize Elasticsearch for storing archived MURs.
    If the `archived_murs` index already exists, delete it.
    Create the `archived_murs` index.
    Set up the alias `archived_murs_index` to point to the `archived_murs` index.
    Set up the alias `docs_search` to point to the `archived_murs` index, allowing
    legal search to work across current and archived MURs.
    """
    es = utils.get_elasticsearch_connection()
    try:
        logger.info("Delete index 'archived_murs'")
        es.indices.delete('archived_murs')
    except elasticsearch.exceptions.NotFoundError:
        pass

    logger.info("Create index 'archived_murs' with aliases 'docs_search' and 'archived_murs_index'")
    es.indices.create('archived_murs', {
        "mappings": MAPPINGS,
        "settings": ANALYZER_SETTINGS,
        "aliases": {
            'archived_murs_index': {},
            'docs_search': {}
        }
    })
def create_staging_index():
    """
    Create the index `docs_staging`.
    Move the alias `docs_index` to point to `docs_staging` instead of `docs`.
    """
    es = utils.get_elasticsearch_connection()
    try:
        logger.info("Delete index 'docs_staging'")
        es.indices.delete('docs_staging')
    except elasticsearch.exceptions.NotFoundError:
        pass

    logger.info("Create index 'docs_staging'")
    es.indices.create('docs_staging', {
        "mappings": MAPPINGS,
        "settings": ANALYZER_SETTINGS,
    })

    logger.info("Move alias 'docs_index' to point to 'docs_staging'")
    es.indices.update_aliases(body={
        "actions": [
            {"remove": {"index": 'docs', "alias": 'docs_index'}},
            {"add": {"index": 'docs_staging', "alias": 'docs_index'}}
        ]
    })
def index_regulations():
    eregs_api = env.get_credential('FEC_EREGS_API', '')

    if eregs_api:
        reg_versions = requests.get(eregs_api + 'regulation').json()['versions']
        es = utils.get_elasticsearch_connection()
        reg_count = 0
        for reg in reg_versions:
            url = '%sregulation/%s/%s' % (eregs_api, reg['regulation'], reg['version'])
            regulation = requests.get(url).json()
            sections = get_sections(regulation)

            print("Loading part %s" % reg['regulation'])
            for section_label in sections:
                doc_id = '%s_%s' % (section_label[0], section_label[1])
                section_formatted = '%s-%s' % (section_label[0], section_label[1])
                reg_url = '/regulations/{0}/{1}#{0}'.format(section_formatted, reg['version'])
                no = '%s.%s' % (section_label[0], section_label[1])
                name = sections[section_label]['title'].split(no)[1].strip()
                doc = {
                    "doc_id": doc_id,
                    "name": name,
                    "text": sections[section_label]['text'],
                    "url": reg_url,
                    "no": no
                }
                es.index('docs', 'regulations', doc, id=doc['doc_id'])
            reg_count += 1
        print("%d regulation parts indexed." % reg_count)
    else:
        print("Regs could not be indexed, environment variable not set.")
def get_citations(ao_names):
    ao_component_to_name_map = {tuple(map(int, a.split('-'))): a for a in ao_names}

    logger.info("Getting citations...")

    rs = db.engine.execute("""SELECT ao_no, ocrtext FROM aouser.document
        INNER JOIN aouser.ao USING (ao_id) WHERE category = 'Final Opinion'""")

    all_regulatory_citations = set()
    all_statutory_citations = set()
    raw_citations = defaultdict(lambda: defaultdict(set))
    for row in rs:
        logger.debug("Getting citations for AO %s" % row["ao_no"])

        ao_citations_in_doc = parse_ao_citations(row["ocrtext"], ao_component_to_name_map)
        ao_citations_in_doc.discard(row["ao_no"])  # Remove self
        raw_citations[row["ao_no"]]["ao"].update(ao_citations_in_doc)
        for citation in ao_citations_in_doc:
            raw_citations[citation]["aos_cited_by"].add(row["ao_no"])

        statutory_citations = parse_statutory_citations(row["ocrtext"])
        regulatory_citations = parse_regulatory_citations(row["ocrtext"])
        all_statutory_citations.update(statutory_citations)
        all_regulatory_citations.update(regulatory_citations)
        raw_citations[row["ao_no"]]["statutes"].update(statutory_citations)
        raw_citations[row["ao_no"]]["regulations"].update(regulatory_citations)

    citations = defaultdict(lambda: defaultdict(list))
    for ao in raw_citations:
        citations[ao]["ao"] = sorted(
            [{"no": c, "name": ao_names[c]} for c in raw_citations[ao]["ao"]],
            key=lambda d: d["no"])
        citations[ao]["aos_cited_by"] = sorted(
            [{"no": c, "name": ao_names[c]} for c in raw_citations[ao]["aos_cited_by"]],
            key=lambda d: d["no"])
        citations[ao]["statutes"] = sorted(
            [{"title": c[0], "section": c[1]} for c in raw_citations[ao]["statutes"]],
            key=lambda d: (d["title"], d["section"]))
        citations[ao]["regulations"] = sorted(
            [{"title": c[0], "part": c[1], "section": c[2]}
             for c in raw_citations[ao]["regulations"]],
            key=lambda d: (d["title"], d["part"], d["section"]))

    es = get_elasticsearch_connection()
    for citation in all_regulatory_citations:
        entry = {
            'citation_text': '%d CFR §%d.%d' % (citation[0], citation[1], citation[2]),
            'citation_type': 'regulation'
        }
        es.index('docs_index', 'citations', entry, id=entry['citation_text'])
    for citation in all_statutory_citations:
        entry = {
            'citation_text': '%d U.S.C. §%d' % (citation[0], citation[1]),
            'citation_type': 'statute'
        }
        es.index('docs_index', 'citations', entry, id=entry['citation_text'])

    logger.info("Citations loaded.")
    return citations
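# Illustrative only: given the format strings above, a hypothetical statutory
# citation tuple (52, 30101) is indexed with citation_text '52 U.S.C. §30101',
# and a hypothetical regulatory citation tuple (11, 110, 11) as '11 CFR §110.11'.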
def restore_from_staging_index():
    """
    A 4-step process:
    1. Move the alias docs_search to point to `docs_staging` instead of `docs`.
    2. Reinitialize the index `docs`.
    3. Reindex `docs_staging` to `docs`.
    4. Move `docs_index` and `docs_search` aliases to point to the `docs` index.
       Delete index `docs_staging`.
    """
    es = utils.get_elasticsearch_connection()

    logger.info("Move alias 'docs_search' to point to 'docs_staging'")
    es.indices.update_aliases(body={
        "actions": [
            {"remove": {"index": 'docs', "alias": 'docs_search'}},
            {"add": {"index": 'docs_staging', "alias": 'docs_search'}},
        ]
    })

    logger.info("Delete and re-create index 'docs'")
    es.indices.delete('docs')
    es.indices.create('docs', {
        "mappings": MAPPINGS,
        "settings": ANALYZER_SETTINGS
    })

    logger.info("Reindex all documents from index 'docs_staging' to index 'docs'")
    body = {"source": {"index": "docs_staging"}, "dest": {"index": "docs"}}
    es.reindex(body=body, wait_for_completion=True, request_timeout=1500)

    move_aliases_to_docs_index()
def delete_from_es(index, doc_type):
    """
    Deletes all documents with the given `doc_type` from Elasticsearch.
    """
    es = utils.get_elasticsearch_connection()
    es.delete_by_query(
        index=index,
        body={'query': {'match_all': {}}},
        doc_type=doc_type
    )
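# Example usage (illustrative; the index and doc_type names follow the
# conventions used elsewhere in this module):
#
#     delete_from_es('docs_index', 'advisory_opinions')   # drop all AOs
#     delete_from_es('archived_murs_index', 'murs')       # drop all archived MURs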
def process_mur(mur):
    logger.info("processing mur %d of %d" % (mur[0], mur[1]))
    es = utils.get_elasticsearch_connection()
    bucket = get_bucket()
    bucket_name = env.get_credential('bucket')
    mur_names = get_mur_names()
    (mur_no_td, open_date_td, close_date_td,
     parties_td, subject_td, citations_td) = re.findall(
        "<td[^>]*>(.*?)</td>", mur[2], re.S)
    mur_no = re.search(r"/disclosure_data/mur/([0-9_A-Z]+)\.pdf", mur_no_td).group(1)
    logger.info("processing mur %s" % mur_no)
    pdf_key = 'legal/murs/%s.pdf' % mur_no
    if [k for k in bucket.objects.filter(Prefix=pdf_key)]:
        logger.info('already processed %s' % pdf_key)
        return
    text, pdf_size, pdf_pages = process_mur_pdf(mur_no, pdf_key, bucket)
    pdf_url = generate_aws_s3_url(bucket_name, pdf_key)
    open_date, close_date = (None, None)
    if open_date_td:
        open_date = datetime.strptime(open_date_td, '%m/%d/%Y').isoformat()
    if close_date_td:
        close_date = datetime.strptime(close_date_td, '%m/%d/%Y').isoformat()
    parties = re.findall("(.*?)<br>", parties_td)
    complainants = []
    respondents = []
    for party in parties:
        match = re.match(r"\(([RC])\) - (.*)", party)
        name = match.group(2).strip().title()
        if match.group(1) == 'C':
            complainants.append(name)
        if match.group(1) == 'R':
            respondents.append(name)
    subject = get_subject_tree(subject_td)
    citations = get_citations(re.findall("(.*?)<br>", citations_td))
    mur_digits = re.match("([0-9]+)", mur_no).group(1)
    name = mur_names[mur_digits] if mur_digits in mur_names else ''
    doc = {
        'doc_id': 'mur_%s' % mur_no,
        'no': mur_no,
        'name': name,
        'text': text,
        'mur_type': 'archived',
        'pdf_size': pdf_size,
        'pdf_pages': pdf_pages,
        'open_date': open_date,
        'close_date': close_date,
        'complainants': complainants,
        'respondents': respondents,
        'subject': subject,
        'citations': citations,
        'url': pdf_url
    }
    es.index(DOCS_INDEX, 'murs', doc, id=doc['doc_id'])
def remove_legal_docs():
    es = utils.get_elasticsearch_connection()
    es.delete_index('docs')
    es.create_index('docs', {
        "mappings": {
            "_default_": {
                "properties": {
                    "no": {
                        "type": "string",
                        "index": "not_analyzed"
                    }
                }
            }
        }
    })
def delete_docs_index():
    """
    Delete index `docs`.
    This is usually done in preparation for restoring indexes from a
    snapshot backup.
    """
    es = utils.get_elasticsearch_connection()
    try:
        logger.info("Delete index 'docs'")
        es.indices.delete('docs')
    except elasticsearch.exceptions.NotFoundError:
        pass
def get_most_recent_snapshot(repository_name=None):
    """
    Get the list of snapshots (sorted by date, ascending) and
    return the most recent snapshot name.
    """
    es = utils.get_elasticsearch_connection()
    repository_name = repository_name or BACKUP_REPOSITORY_NAME

    logger.info("Retrieving most recent snapshot")
    snapshot_list = es.snapshot.get(repository=repository_name, snapshot="*").get('snapshots')

    return snapshot_list.pop().get('snapshot')
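# The call above assumes the snapshot API returns snapshots oldest-first, as the
# docstring states. A hedged alternative sketch, if explicit ordering is ever
# needed (assumes each snapshot dict exposes 'start_time_in_millis'):
#
#     snapshots = es.snapshot.get(repository=repository_name, snapshot="*")['snapshots']
#     latest = max(snapshots, key=lambda s: s.get('start_time_in_millis', 0))
#     return latest['snapshot']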
def process_mur(mur):
    es = utils.get_elasticsearch_connection()
    bucket = get_bucket()
    mur_names = get_mur_names()
    (mur_no_td, open_date_td, close_date_td,
     parties_td, subject_td, citations_td) = re.findall(
        "<td[^>]*>(.*?)</td>", mur[2], re.S)
    mur_no = re.search(r"/disclosure_data/mur/([0-9_A-Z]+)\.pdf", mur_no_td).group(1)
    logger.info("Loading archived MUR %s: %s of %s", mur_no, mur[0] + 1, mur[1])
    pdf_key = 'legal/murs/%s.pdf' % mur_no
    text, pdf_size, pdf_pages = process_mur_pdf(mur_no, pdf_key, bucket)
    pdf_url = '/files/' + pdf_key
    open_date, close_date = (None, None)
    if open_date_td:
        open_date = datetime.strptime(open_date_td, '%m/%d/%Y').isoformat()
    if close_date_td:
        close_date = datetime.strptime(close_date_td, '%m/%d/%Y').isoformat()
    parties = re.findall("(.*?)<br>", parties_td)
    complainants = []
    respondents = []
    for party in parties:
        match = re.match(r"\(([RC])\) - (.*)", party)
        name = match.group(2).strip().title()
        if match.group(1) == 'C':
            complainants.append(name)
        if match.group(1) == 'R':
            respondents.append(name)
    subject = get_subject_tree(subject_td)
    citations = get_citations(re.findall("(.*?)<br>", citations_td))
    mur_digits = re.match("([0-9]+)", mur_no).group(1)
    name = mur_names[mur_digits] if mur_digits in mur_names else ''
    doc = {
        'doc_id': 'mur_%s' % mur_no,
        'no': mur_no,
        'name': name,
        'text': text,
        'mur_type': 'archived',
        'pdf_size': pdf_size,
        'pdf_pages': pdf_pages,
        'open_date': open_date,
        'close_date': close_date,
        'complainants': complainants,
        'respondents': respondents,
        'subject': subject,
        'citations': citations,
        'url': pdf_url
    }
    es.index(DOCS_INDEX, 'murs', doc, id=doc['doc_id'])
def process_murs(raw_mur_tr_element_list):
    es = utils.get_elasticsearch_connection()
    bucket = get_bucket()
    mur_names = get_mur_names()

    for index, raw_mur_tr_element in enumerate(raw_mur_tr_element_list):
        (mur_no_td, open_date_td, close_date_td,
         parties_td, subject_td, citations_td) = re.findall(
            "<td[^>]*>(.*?)</td>", raw_mur_tr_element, re.S)
        mur_no = re.search(r"/disclosure_data/mur/([0-9]+)(?:_[A-H])*\.pdf", mur_no_td).group(1)
        logger.info("Loading archived MUR %s: %s of %s",
                    mur_no, index + 1, len(raw_mur_tr_element_list))

        open_date, close_date = (None, None)
        if open_date_td:
            open_date = datetime.strptime(open_date_td, '%m/%d/%Y').isoformat()
        if close_date_td:
            close_date = datetime.strptime(close_date_td, '%m/%d/%Y').isoformat()

        parties = re.findall("(.*?)<br>", parties_td)
        complainants = []
        respondents = []
        for party in parties:
            match = re.match(r"\(([RC])\) - (.*)", party)
            name = match.group(2).strip().title()
            if match.group(1) == 'C':
                complainants.append(name)
            if match.group(1) == 'R':
                respondents.append(name)

        mur_name = mur_names.get(mur_no, '')
        mur = {
            'doc_id': 'mur_%s' % mur_no,
            'no': mur_no,
            'name': mur_name,
            'mur_type': 'archived',
            'open_date': open_date,
            'close_date': close_date,
            'complainants': complainants,
            'respondents': respondents,
            'url': '/legal/matter-under-review/{0}/'.format(mur_no)
        }
        mur['subject'] = get_subject_tree(subject_td)
        mur['citations'] = get_citations(re.findall("(.*?)<br>", citations_td))
        mur['documents'] = get_documents(mur_no_td, bucket)

        es.index('archived_murs_index', 'murs', mur, id=mur['doc_id'])
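# Illustrative only: for a hypothetical mur_no_td cell such as
# '<a href="/disclosure_data/mur/101_A.pdf">MUR 101</a>', the pattern above
# captures '101' as mur_no; the optional '_[A-H]' suffix absorbs multi-part
# PDF names (101_A, 101_B, ...) without changing the MUR number.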
def process_mur(mur):
    es = utils.get_elasticsearch_connection()
    bucket = get_bucket()
    mur_names = get_mur_names()
    (mur_no_td, open_date_td, close_date_td,
     parties_td, subject_td, citations_td) = re.findall(
        "<td[^>]*>(.*?)</td>", mur[2], re.S)
    mur_no = re.search(r"/disclosure_data/mur/([0-9_A-Z]+)\.pdf", mur_no_td).group(1)
    logger.info("Loading archived MUR %s: %s of %s", mur_no, mur[0] + 1, mur[1])
    pdf_key = 'legal/murs/%s.pdf' % mur_no
    text, pdf_size, pdf_pages = process_mur_pdf(mur_no, pdf_key, bucket)
    pdf_url = '/files/' + pdf_key
    open_date, close_date = (None, None)
    if open_date_td:
        open_date = datetime.strptime(open_date_td, '%m/%d/%Y').isoformat()
    if close_date_td:
        close_date = datetime.strptime(close_date_td, '%m/%d/%Y').isoformat()
    parties = re.findall("(.*?)<br>", parties_td)
    complainants = []
    respondents = []
    for party in parties:
        match = re.match(r"\(([RC])\) - (.*)", party)
        name = match.group(2).strip().title()
        if match.group(1) == 'C':
            complainants.append(name)
        if match.group(1) == 'R':
            respondents.append(name)
    subject = get_subject_tree(subject_td)
    citations = get_citations(re.findall("(.*?)<br>", citations_td))
    mur_digits = re.match("([0-9]+)", mur_no).group(1)
    name = mur_names[mur_digits] if mur_digits in mur_names else ''
    doc = {
        'doc_id': 'mur_%s' % mur_no,
        'no': mur_no,
        'name': name,
        'text': text,
        'mur_type': 'archived',
        'pdf_size': pdf_size,
        'pdf_pages': pdf_pages,
        'open_date': open_date,
        'close_date': close_date,
        'complainants': complainants,
        'respondents': respondents,
        'subject': subject,
        'citations': citations,
        'url': pdf_url
    }
    es.index('archived_murs_index', 'murs', doc, id=doc['doc_id'])
def load_current_murs(mur_no=None):
    """
    Reads data for current MURs from a Postgres database, assembles a JSON
    document corresponding to the MUR and indexes this document in
    Elasticsearch in the index `docs_index` with a doc_type of `murs`.
    In addition, all documents attached to the MUR are uploaded to an S3
    bucket under the _directory_ `legal/murs/current/`.
    """
    es = get_elasticsearch_connection()
    logger.info("Loading current MUR(s)")
    mur_count = 0
    for mur in get_murs(mur_no):
        logger.info("Loading current MUR: %s", mur['no'])
        es.index('docs_index', 'murs', mur, id=mur['doc_id'])
        mur_count += 1
    logger.info("%d current MUR(s) loaded", mur_count)
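# Example usage (illustrative MUR number):
#
#     load_current_murs()        # index all current MURs
#     load_current_murs('7000')  # re-index a single MUR by case number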
def load_advisory_opinions(from_ao_no=None):
    """
    Reads data for advisory opinions from a Postgres database, assembles a JSON
    document corresponding to the advisory opinion and indexes this document in
    Elasticsearch in the index `docs_index` with a doc_type of
    `advisory_opinions`. In addition, all documents attached to the advisory
    opinion are uploaded to an S3 bucket under the _directory_ `legal/aos/`.
    """
    es = get_elasticsearch_connection()
    logger.info("Loading advisory opinions")
    ao_count = 0
    for ao in get_advisory_opinions(from_ao_no):
        logger.info("Loading AO: %s", ao['no'])
        es.index(DOCS_INDEX, 'advisory_opinions', ao, id=ao['no'])
        ao_count += 1
    logger.info("%d advisory opinions loaded", ao_count)
def restore_from_staging_index():
    """
    A 4-step process:
    1. Move the alias docs_search to point to `docs_staging` instead of `docs`.
    2. Reinitialize the index `docs`.
    3. Reindex `docs_staging` to `docs`.
    4. Move `docs_index` and `docs_search` aliases to point to the `docs` index.
       Delete index `docs_staging`.
    """
    es = utils.get_elasticsearch_connection()

    logger.info("Move alias 'docs_search' to point to 'docs_staging'")
    es.indices.update_aliases(body={
        "actions": [
            {"remove": {"index": 'docs', "alias": 'docs_search'}},
            {"add": {"index": 'docs_staging', "alias": 'docs_search'}}
        ]
    })

    logger.info("Delete and re-create index 'docs'")
    es.indices.delete('docs')
    es.indices.create('docs', {
        "mappings": MAPPINGS,
        "settings": ANALYZER_SETTINGS
    })

    logger.info("Reindex all documents from index 'docs_staging' to index 'docs'")
    body = {
        "source": {
            "index": "docs_staging"
        },
        "dest": {
            "index": "docs"
        }
    }
    es.reindex(body=body, wait_for_completion=True, request_timeout=1500)

    logger.info("Move aliases 'docs_index' and 'docs_search' to point to 'docs'")
    es.indices.update_aliases(body={
        "actions": [
            {"remove": {"index": 'docs_staging', "alias": 'docs_index'}},
            {"remove": {"index": 'docs_staging', "alias": 'docs_search'}},
            {"add": {"index": 'docs', "alias": 'docs_index'}},
            {"add": {"index": 'docs', "alias": 'docs_search'}}
        ]
    })

    logger.info("Delete index 'docs_staging'")
    es.indices.delete('docs_staging')
def configure_backup_repository(repository=BACKUP_REPOSITORY_NAME):
    """
    Configure the S3 backup repository using API credentials.
    This needs to be re-run whenever the S3 credentials change for an
    API deployment.
    """
    es = utils.get_elasticsearch_connection()
    logger.info("Configuring backup repository: {0}".format(repository))
    body = {
        'type': 's3',
        'settings': {
            'bucket': env.get_credential("bucket"),
            'region': env.get_credential("region"),
            'access_key': env.get_credential("access_key_id"),
            'secret_key': env.get_credential("secret_access_key"),
            'base_path': BACKUP_DIRECTORY,
        },
    }
    es.snapshot.create_repository(repository=repository, body=body)
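# Optional sanity check (hedged sketch): after registering the repository, the
# standard snapshot API can confirm that the cluster can reach the S3 bucket.
#
#     es.snapshot.verify_repository(repository=BACKUP_REPOSITORY_NAME)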
def remap_archived_murs_citations():
    """
    Re-map citations for archived MURs.
    To extract the MUR information from the archived PDFs, use load_archived_murs.
    """
    es = utils.get_elasticsearch_connection()

    # Fetch archived MURs from ES
    query = Search() \
        .query(Q('term', mur_type='archived') & Q('term', _type='murs')) \
        .source(include='citations')
    archived_murs = elasticsearch.helpers.scan(
        es,
        query.to_dict(),
        scroll='1m',
        index='docs',
        doc_type='murs',
        size=500)

    # Re-map the citations
    update_murs = (dict(_op_type='update', _id=mur.meta.id, doc=mur.to_dict())
                   for mur in remap_citations(archived_murs))

    # Save MURs to ES
    count, _ = elasticsearch.helpers.bulk(
        es,
        update_murs,
        index='docs',
        doc_type='murs',
        chunk_size=100,
        request_timeout=30)
    logger.info("Re-mapped %d archived MURs" % count)
def create_elasticsearch_backup(repository_name=None, snapshot_name="auto_backup"):
    """
    Create an Elasticsearch snapshot in the `legal_s3_repository` or the
    specified repository.
    """
    es = utils.get_elasticsearch_connection()

    repository_name = repository_name or BACKUP_REPOSITORY_NAME
    configure_backup_repository(repository_name)

    snapshot_name = "{0}_{1}".format(
        datetime.datetime.today().strftime('%Y%m%d'), snapshot_name
    )
    logger.info("Creating snapshot {0}".format(snapshot_name))
    result = es.snapshot.create(repository=repository_name, snapshot=snapshot_name)
    if result.get('accepted'):
        logger.info("Successfully created snapshot: {0}".format(snapshot_name))
    else:
        logger.error("Unable to create snapshot: {0}".format(snapshot_name))
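# Hedged follow-up sketch: snapshot creation returns once the request is
# accepted, so progress can be polled with the snapshot API if needed (the
# repository and snapshot names are the ones computed above):
#
#     status = es.snapshot.get(repository=repository_name, snapshot=snapshot_name)
#     logger.info("Snapshot state: %s", status['snapshots'][0]['state'])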
def load_advisory_opinions(from_ao_no=None):
    """
    Reads data for advisory opinions from a Postgres database, assembles a JSON
    document corresponding to the advisory opinion and indexes this document in
    Elasticsearch in the index `docs_index` with a doc_type of
    `advisory_opinions`. In addition, all documents attached to the advisory
    opinion are uploaded to an S3 bucket under the _directory_ `legal/aos/`.
    """
    es = get_elasticsearch_connection()
    logger.info("Loading advisory opinions")
    ao_count = 0
    for ao in get_advisory_opinions(from_ao_no):
        logger.info("Loading AO: %s", ao['no'])
        es.index('docs_index', 'advisory_opinions', ao, id=ao['no'])
        ao_count += 1
    logger.info("%d advisory opinions loaded", ao_count)
def index_regulations():
    """
    Indexes the regulations relevant to the FEC in Elasticsearch.
    The regulations are accessed from FEC_EREGS_API.
    """
    eregs_api = env.get_credential('FEC_EREGS_API', '')
    if not eregs_api:
        logger.error(
            "Regs could not be indexed, environment variable FEC_EREGS_API not set.")
        return

    logger.info("Indexing regulations")
    reg_versions = requests.get(eregs_api + 'regulation').json()['versions']
    es = utils.get_elasticsearch_connection()
    reg_count = 0
    for reg in reg_versions:
        url = '%sregulation/%s/%s' % (eregs_api, reg['regulation'], reg['version'])
        regulation = requests.get(url).json()
        sections = get_sections(regulation)

        logger.debug("Loading part %s" % reg['regulation'])
        for section_label in sections:
            doc_id = '%s_%s' % (section_label[0], section_label[1])
            section_formatted = '%s-%s' % (section_label[0], section_label[1])
            reg_url = '/regulations/{0}/{1}#{0}'.format(section_formatted, reg['version'])
            no = '%s.%s' % (section_label[0], section_label[1])
            name = sections[section_label]['title'].split(no)[1].strip()
            doc = {
                "doc_id": doc_id,
                "name": name,
                "text": sections[section_label]['text'],
                "url": reg_url,
                "no": no,
                "sort1": int(section_label[0]),
                "sort2": int(section_label[1])
            }
            es.index(DOCS_INDEX, 'regulations', doc, id=doc['doc_id'])
        reg_count += 1
    logger.info("%d regulation parts indexed", reg_count)
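# Illustrative walk-through of the identifiers built above, for a hypothetical
# section_label of ('110', '11') at version 'v1' (example values, not data
# from the eRegs API):
#
#     doc_id            -> '110_11'
#     no                -> '110.11'
#     section_formatted -> '110-11'
#     reg_url           -> '/regulations/110-11/v1#110-11'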
def get_title_52_statutes():
    es = utils.get_elasticsearch_connection()

    title_parsed = get_xml_tree_from_url('http://uscode.house.gov/download/' +
        'releasepoints/us/pl/114/219/[email protected]')
    tag_name = '{{http://xml.house.gov/schemas/uslm/1.0}}{0}'

    section_count = 0
    for subtitle in title_parsed.iter(tag_name.format('subtitle')):
        if subtitle.attrib['identifier'] == '/us/usc/t52/stIII':
            for subchapter in subtitle.iter(tag_name.format('subchapter')):
                match = re.match("/us/usc/t52/stIII/ch([0-9]+)/sch([IVX]+)",
                                 subchapter.attrib['identifier'])
                chapter = match.group(1)
                subchapter_no = match.group(2)
                for section in subchapter.iter(tag_name.format('section')):
                    text = ''
                    for child in section.iter():
                        if child.text:
                            text += ' %s ' % child.text.strip()
                    heading = section.find(tag_name.format('heading')).text.strip()
                    section_no = re.match('/us/usc/t52/s([0-9]+)',
                                          section.attrib['identifier']).group(1)
                    pdf_url = 'http://api.fdsys.gov/link?collection=uscode&' + \
                        'title=52&year=mostrecent&section=%s' % section_no
                    doc = {
                        "doc_id": section.attrib['identifier'],
                        "text": text,
                        "name": heading,
                        "no": section_no,
                        "title": "52",
                        "chapter": chapter,
                        "subchapter": subchapter_no,
                        "url": pdf_url,
                        "sort1": 52,
                        "sort2": int(section_no)
                    }
                    es.index(DOCS_INDEX, 'statutes', doc, id=doc['doc_id'])
                    section_count += 1
    return section_count
def index_advisory_opinions():
    print('Indexing advisory opinions...')

    if legal_loaded():
        count = db.engine.execute('select count(*) from AO').fetchone()[0]
        print('AO count: %d' % count)
        count = db.engine.execute('select count(*) from DOCUMENT').fetchone()[0]
        print('DOC count: %d' % count)

        es = utils.get_elasticsearch_connection()

        result = db.engine.execute("""select DOCUMENT_ID, OCRTEXT, DESCRIPTION,
                                CATEGORY, DOCUMENT.AO_ID, NAME, SUMMARY,
                                TAGS, AO_NO, DOCUMENT_DATE
                                FROM DOCUMENT INNER JOIN AO
                                on AO.AO_ID = DOCUMENT.AO_ID""")

        docs_loaded = 0
        bucket_name = env.get_credential('bucket')
        for row in result:
            key = "legal/aos/%s.pdf" % row[0]
            pdf_url = "https://%s.s3.amazonaws.com/%s" % (bucket_name, key)
            doc = {
                "doc_id": row[0],
                "text": row[1],
                "description": row[2],
                "category": row[3],
                "id": row[4],
                "name": row[5],
                "summary": row[6],
                "tags": row[7],
                "no": row[8],
                "date": row[9],
                "url": pdf_url
            }
            es.index('docs', 'advisory_opinions', doc, id=doc['doc_id'])
            docs_loaded += 1
            if docs_loaded % 500 == 0:
                print("%d docs loaded" % docs_loaded)
        print("%d docs loaded" % docs_loaded)
def index_regulations():
    """
    Indexes the regulations relevant to the FEC in Elasticsearch.
    The regulations are accessed from FEC_EREGS_API.
    """
    eregs_api = env.get_credential('FEC_EREGS_API', '')
    if not eregs_api:
        logger.error(
            "Regs could not be indexed, environment variable FEC_EREGS_API not set.")
        return

    logger.info("Indexing regulations")
    reg_versions = requests.get(eregs_api + 'regulation').json()['versions']
    es = utils.get_elasticsearch_connection()
    reg_count = 0
    for reg in reg_versions:
        url = '%sregulation/%s/%s' % (eregs_api, reg['regulation'], reg['version'])
        regulation = requests.get(url).json()
        sections = get_sections(regulation)

        logger.debug("Loading part %s" % reg['regulation'])
        for section_label in sections:
            doc_id = '%s_%s' % (section_label[0], section_label[1])
            section_formatted = '%s-%s' % (section_label[0], section_label[1])
            reg_url = '/regulations/{0}/{1}#{0}'.format(section_formatted, reg['version'])
            no = '%s.%s' % (section_label[0], section_label[1])
            name = sections[section_label]['title'].split(no)[1].strip()
            doc = {
                "doc_id": doc_id,
                "name": name,
                "text": sections[section_label]['text'],
                "url": reg_url,
                "no": no,
                "sort1": int(section_label[0]),
                "sort2": int(section_label[1])
            }
            es.index('docs_index', 'regulations', doc, id=doc['doc_id'])
        reg_count += 1
    logger.info("%d regulation parts indexed", reg_count)
def load_cases(case_type, case_no=None):
    """
    Reads data for current MURs, AFs, and ADRs from a Postgres database,
    assembles a JSON document corresponding to the case, and indexes this
    document in Elasticsearch in the index `docs_index` with a doc_type of
    `murs`, `adrs`, or `admin_fines`. In addition, all documents attached to
    the case are uploaded to an S3 bucket under the _directory_
    `legal/<doc_type>/<id>/`.
    """
    if case_type in ('MUR', 'ADR', 'AF'):
        es = get_elasticsearch_connection()
        logger.info("Loading {0}(s)".format(case_type))
        case_count = 0
        for case in get_cases(case_type, case_no):
            if case is not None:
                logger.info("Loading {0}: {1}".format(case_type, case['no']))
                es.index('docs_index', get_es_type(case_type), case, id=case['doc_id'])
                case_count += 1
        logger.info("{0} {1}(s) loaded".format(case_count, case_type))
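# Example usage (illustrative case numbers):
#
#     load_cases('MUR')         # index every current MUR
#     load_cases('AF', '100')   # re-index a single admin fine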
def load_archived_murs(mur_no=None):
    """
    Reads data for archived MURs from a Postgres database (under the schema
    `mur_arch`), assembles a JSON document corresponding to the MUR, and
    indexes this document in Elasticsearch in the index `archived_murs` with
    a doc_type of `murs`.
    """
    es = utils.get_elasticsearch_connection()
    mur_count = 0
    for mur in get_murs(mur_no):
        if mur is not None:
            logger.info("Loading archived MUR No: {0}".format(mur["no"]))
            es.index("archived_murs", get_es_type(), mur, id=mur["doc_id"])
            mur_count += 1
            logger.info("{0} Archived Mur(s) loaded".format(mur_count))
        else:
            logger.error("Invalid archived MUR")

        # For debug use, display the JSON format of object "mur"
        logger.debug("mur_json_data =" + json.dumps(mur, indent=4, cls=DateTimeEncoder))
def move_aliases_to_docs_index():
    """
    Move `docs_index` and `docs_search` aliases to point to the `docs` index.
    Delete index `docs_staging`.
    """
    es = utils.get_elasticsearch_connection()

    logger.info("Move aliases 'docs_index' and 'docs_search' to point to 'docs'")
    es.indices.update_aliases(body={
        "actions": [
            {"remove": {"index": 'docs_staging', "alias": 'docs_index'}},
            {"remove": {"index": 'docs_staging', "alias": 'docs_search'}},
            {"add": {"index": 'docs', "alias": 'docs_index'}},
            {"add": {"index": 'docs', "alias": 'docs_search'}},
        ]
    })
    logger.info("Delete index 'docs_staging'")
    es.indices.delete('docs_staging')
def get_title_52_statutes():
    es = utils.get_elasticsearch_connection()

    title_parsed = get_xml_tree_from_url('http://uscode.house.gov/download/' +
        'releasepoints/us/pl/114/219/[email protected]')
    tag_name = '{{http://xml.house.gov/schemas/uslm/1.0}}{0}'

    section_count = 0
    for subtitle in title_parsed.iter(tag_name.format('subtitle')):
        if subtitle.attrib['identifier'] == '/us/usc/t52/stIII':
            for subchapter in subtitle.iter(tag_name.format('subchapter')):
                match = re.match("/us/usc/t52/stIII/ch([0-9]+)/sch([IVX]+)",
                                 subchapter.attrib['identifier'])
                chapter = match.group(1)
                subchapter_no = match.group(2)
                for section in subchapter.iter(tag_name.format('section')):
                    text = ''
                    for child in section.iter():
                        if child.text:
                            text += ' %s ' % child.text.strip()
                    heading = section.find(tag_name.format('heading')).text.strip()
                    section_no = re.match('/us/usc/t52/s([0-9]+)',
                                          section.attrib['identifier']).group(1)
                    pdf_url = 'http://api.fdsys.gov/link?collection=uscode&' + \
                        'title=52&year=mostrecent&section=%s' % section_no
                    doc = {
                        "doc_id": section.attrib['identifier'],
                        "text": text,
                        "name": heading,
                        "no": section_no,
                        "title": "52",
                        "chapter": chapter,
                        "subchapter": subchapter_no,
                        "url": pdf_url,
                        "sort1": 52,
                        "sort2": int(section_no)
                    }
                    es.index('docs_index', 'statutes', doc, id=doc['doc_id'])
                    section_count += 1
    return section_count
def restore_elasticsearch_backup(repository_name=None, snapshot_name=None):
    """
    Restore Elasticsearch from a backup in the event of catastrophic failure
    at the infrastructure layer or user error.
    - Delete the docs index
    - Restore from an Elasticsearch snapshot
    - Default to the most recent snapshot; optionally specify `snapshot_name`
    """
    es = utils.get_elasticsearch_connection()

    repository_name = repository_name or BACKUP_REPOSITORY_NAME
    configure_backup_repository(repository_name)

    most_recent_snapshot_name = get_most_recent_snapshot(repository_name)
    snapshot_name = snapshot_name or most_recent_snapshot_name

    if es.indices.exists('docs'):
        logger.info('Found docs index. Creating staging index for zero-downtime restore')
        create_staging_index()

    delete_all_indices()

    logger.info("Retrieving snapshot: {0}".format(snapshot_name))
    body = {"indices": "docs,archived_murs"}
    result = es.snapshot.restore(
        repository=repository_name,
        snapshot=snapshot_name,
        body=body
    )
    if result.get('accepted'):
        logger.info("Successfully restored snapshot: {0}".format(snapshot_name))
        if es.indices.exists('docs_staging'):
            move_aliases_to_docs_index()
    else:
        logger.error("Unable to restore snapshot: {0}".format(snapshot_name))
        logger.info("You may want to try the most recent snapshot: {0}".format(
            most_recent_snapshot_name))
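# Example usage (illustrative snapshot name, following the date-prefixed naming
# used by create_elasticsearch_backup above):
#
#     restore_elasticsearch_backup()                                    # most recent snapshot
#     restore_elasticsearch_backup(snapshot_name='20200101_auto_backup')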
def load_cases(case_type, case_no=None):
    """
    Reads data for current MURs, AFs, and ADRs from a Postgres database,
    assembles a JSON document corresponding to the case, and indexes this
    document in Elasticsearch in the index `docs_index` with a doc_type of
    `murs`, `adrs`, or `admin_fines`. In addition, all documents attached to
    the case are uploaded to an S3 bucket under the _directory_
    `legal/<doc_type>/<id>/`.
    """
    if case_type in ('MUR', 'ADR', 'AF'):
        es = get_elasticsearch_connection()
        logger.info("Loading {0}(s)".format(case_type))
        case_count = 0
        for case in get_cases(case_type, case_no):
            if case is not None:
                if case.get('published_flg'):
                    logger.info("Loading {0}: {1}".format(case_type, case['no']))
                    es.index('docs_index', get_es_type(case_type), case, id=case['doc_id'])
                    case_count += 1
                    logger.info("{0} {1}(s) loaded".format(case_count, case_type))
                else:
                    logger.info("Found an unpublished case - deleting {0}: {1} from ES".format(
                        case_type, case['no']))
                    es.delete_by_query(
                        index='docs_index',
                        body={'query': {"term": {"no": case['no']}}},
                        doc_type=get_es_type(case_type))
                    logger.info('Successfully deleted {} {} from ES'.format(
                        case_type, case['no']))
import re
from elasticsearch_dsl import Search, Q
from webargs import fields
from flask import abort
from webservices import args
from webservices import utils
from webservices.utils import use_kwargs
from elasticsearch import RequestError
from webservices.exceptions import ApiError
import logging

es = utils.get_elasticsearch_connection()
logger = logging.getLogger(__name__)

INNER_HITS = {
    "_source": False,
    "highlight": {
        "require_field_match": False,
        "fields": {
            "documents.text": {},
            "documents.description": {}
        }
    }
}


class GetLegalCitation(utils.Resource):
    @property
import re
from elasticsearch_dsl import Search, Q
from webargs import fields
from flask import abort
from webservices import args
from webservices import utils
from webservices.utils import use_kwargs
from webservices.legal_docs import DOCS_SEARCH
from elasticsearch import RequestError
from webservices.exceptions import ApiError
import logging

es = utils.get_elasticsearch_connection()
logger = logging.getLogger(__name__)

INNER_HITS = {
    "_source": False,
    "highlight": {
        "require_field_match": False,
        "fields": {
            "documents.text": {},
            "documents.description": {}
        }
    }
}


class GetLegalCitation(utils.Resource):