Example #1
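(All six examples appear to come from one indexing module and omit their imports. The standard-library and Elasticsearch imports below follow directly from the code; rep_es, pagexml_parser, hocr_page_parser, session_parser, inv_parser, TextRepo, text_repo_url and the remaining bare helpers are project-internal names whose modules the snippets do not show.)

import datetime
import json
from typing import Any, Dict

from elasticsearch import Elasticsearch
from elasticsearch.exceptions import RequestError
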
def index_inventory_pages_from_scans(es_anno: Elasticsearch,
                                     inventory_num: int):
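    """Split every scan of an inventory into page documents and index them,
    tagging each page with its type from the per-page type index."""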
    inv_config = set_config_inventory_num(inventory_num, ocr_type="pagexml")
    inv_metadata = rep_es.retrieve_inventory_metadata(
        es_anno, inv_config["inventory_num"], inv_config)
    page_type_index = rep_es.get_per_page_type_index(inv_metadata)
    query = rep_es.make_inventory_query(inventory_num)
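    # drop the query's fixed result size; scroll_hits controls batching itself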
    del query['size']
    for hi, hit in enumerate(
            rep_es.scroll_hits(es_anno, query, index='scans', size=2)):
        scan_doc = json_to_pagexml_scan(hit['_source'])
        pages_doc = pagexml_parser.split_pagexml_scan(scan_doc)
        for page_doc in pages_doc:
            if page_doc.metadata['page_num'] not in page_type_index:
                page_doc.metadata['type'] = "empty_page"
                print("page without page_num:", page_doc.id)
                print("\tpage stats:", page_doc.stats)
            else:
                page_doc.metadata['type'] = [
                    page_doc.metadata['type'],
                    page_type_index[page_doc.metadata['page_num']]
                ]
            page_doc.metadata['index_timestamp'] = datetime.datetime.now()
            es_anno.index(index=inv_config['page_index'],
                          id=page_doc.id,
                          body=page_doc.json)
        if (hi + 1) % 100 == 0:
            print(hi + 1, "scans processed")
Example #2
def index_inventory_from_text_repo(es: Elasticsearch,
                                   inv_num: int,
                                   inventory_config: Dict[str, Any],
                                   ignore_version: bool = False):
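    """Fetch the latest version of every scan from the text repository,
    index the scans, and index the pages split from double-page scans."""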
    text_repo = TextRepo(text_repo_url)
    inventory_metadata = rep_es.retrieve_inventory_metadata(
        es, inv_num, inventory_config)
    page_type_index = get_per_page_type_index(inventory_metadata)
    if "num_scans" not in inventory_metadata:
        return None
    for scan_num in range(1, inventory_metadata["num_scans"] + 1):
        scan_doc = rep_es.parse_latest_version(es,
                                               text_repo,
                                               scan_num,
                                               inventory_metadata,
                                               inventory_config,
                                               ignore_version=ignore_version)
        if not scan_doc:
            continue
        print("Indexing scan", scan_doc["metadata"]["id"])
        index_scan(es, scan_doc, inventory_config)
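        # only double-page scans are split into separate page documents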
        if 'double_page' not in scan_doc['metadata']['scan_type']:
            continue
        if inventory_config['ocr_type'] == 'hocr':
            pages_doc = hocr_page_parser.parse_double_page_scan(
                scan_doc, inventory_config)
        else:
            pages_doc = pagexml_parser.split_pagexml_scan(scan_doc)
        for page_doc in pages_doc:
            page_doc['metadata']['page_type'] = get_pagexml_page_type(
                page_doc, page_type_index)
            page_doc["version"] = scan_doc["version"]
            index_page(es, page_doc, inventory_config)
Example #3
def index_inventory_from_zip(es: Elasticsearch, inventory_num: int,
                             inventory_config: dict):
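    """Parse an inventory's scans from a zip archive, attach their latest
    text-repo version info, and index the scans and their split pages."""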
    inv_metadata = rep_es.retrieve_inventory_metadata(es, inventory_num,
                                                      inventory_config)
    page_type_index = get_per_page_type_index(inv_metadata)
    text_repo = TextRepo(text_repo_url)
    for scan_doc in inv_parser.parse_inventory_from_zip(
            inventory_num, inventory_config):
        if not scan_doc:
            continue
        # look up version info only after confirming the scan parsed
        version_info = text_repo.get_last_version_info(
            scan_doc["metadata"]["id"], file_type=inventory_config['ocr_type'])
        scan_doc["version"] = version_info
        print("Indexing scan", scan_doc["metadata"]["id"])
        index_scan(es, scan_doc, inventory_config)
        if 'double_page' not in scan_doc['metadata']['scan_type']:
            continue
        if inventory_config['ocr_type'] == 'hocr':
            pages_doc = hocr_page_parser.parse_double_page_scan(
                scan_doc, inventory_config)
        else:
            pages_doc = pagexml_parser.split_pagexml_scan(scan_doc)
        for page_doc in pages_doc:
            page_doc.metadata["version"] = version_info
            page_doc.metadata['type'] = [
                page_doc.metadata['type'],
                page_type_index[page_doc.metadata['page_num']]
            ]
            index_page(es, page_doc.json, inventory_config)
Example #4
def index_inventory_sessions_with_lines(es_anno: Elasticsearch, inv_num: int,
                                        config: dict) -> None:
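    """Extract sessions (with their lines) from an inventory's resolution
    pages and index each session, printing the date string found in its
    evidence."""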
    inv_metadata = rep_es.retrieve_inventory_metadata(es_anno, inv_num, config)
    pages = rep_es.retrieve_resolution_pages(es_anno, inv_num, config)
    pages.sort(key=lambda page: page.metadata['page_num'])
    for mi, session in enumerate(
            session_parser.get_sessions(pages, config, inv_metadata)):
        print('session received from get_sessions:', session.id)
        date_string = None
        for match in session.evidence:
            if match.has_label('session_date'):
                date_string = match.string
        print('\tdate string:', date_string)
        es_anno.index(index='session_lines', id=session.id, body=session.json)
Example #5
def add_pagexml_page_types(es: Elasticsearch, inv_config: dict) -> None:
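    """Recompute the page type of every indexed page of an inventory and
    re-index each page with a fresh timestamp."""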
    inv_metadata = rep_es.retrieve_inventory_metadata(
        es, inv_config["inventory_num"], inv_config)
    page_type_index = get_per_page_type_index(inv_metadata)
    pages = rep_es.retrieve_inventory_pages(es, inv_config["inventory_num"],
                                            inv_config)
    for pi, page in enumerate(
            sorted(pages, key=lambda x: x.metadata['page_num'])):
        page.metadata['page_type'] = get_pagexml_page_type(
            page, page_type_index)
        add_timestamp(page)
        es.index(index=inv_config["page_index"],
                 id=page.metadata['id'],
                 body=page.json())
        print(page.metadata['id'], page.metadata["page_type"])
Example #6
def index_sessions_inventory_old(es: Elasticsearch, inv_num: int,
                                 inv_config: dict) -> None:
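    """Older session-indexing routine: extract sessions from resolution
    pages, index filler sessions for missing dates, and divert suspiciously
    long sessions to a quarantine index."""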
    # pages = retrieve_pagexml_resolution_pages(es, inv_num, inv_config)
    pages = rep_es.retrieve_resolution_pages(es, inv_num, inv_config)
    pages.sort(key=lambda page: page.metadata['page_num'])
    inv_metadata = rep_es.retrieve_inventory_metadata(es, inv_num, inv_config)
    prev_date: RepublicDate = make_republic_date(inv_metadata['period_start'])
    if not pages:
        print('No pages retrieved for inventory', inv_num)
        return None
    for mi, session in enumerate(
            session_parser.get_sessions(pages, inv_config, inv_metadata)):
        print(json.dumps(session.metadata, indent=4))
        if session.metadata['num_lines'] > 4000:
            # exceptionally long session docs probably contain multiple sessions
            # so quarantine these
            session.metadata['date_shift_status'] = 'quarantined'
            # print('Error: too many lines for session on date', session.metadata['session_date'])
            # continue
        session_date_string = 'None'
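        # index filler sessions for any dates missing between prev_date
        # and this session's date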
        for missing_session in add_missing_dates(prev_date, session):
            add_timestamp(missing_session)
            es.index(index=inv_config['session_index'],
                     doc_type=inv_config['session_doc_type'],
                     id=missing_session.metadata['id'],
                     body=missing_session.json(with_columns=True,
                                               with_scan_versions=True))

        session.scan_versions = get_session_scans_version(session)
        session_parser.clean_lines(session.lines, clean_copy=False)
        if session.metadata['has_session_date_element']:
            for evidence in session.evidence:
                if evidence['metadata_field'] == 'session_date':
                    session_date_string = evidence['matches'][-1][
                        'match_string']
        page_num = int(
            session.columns[0]['metadata']['page_id'].split('page-')[1])
        num_lines = session.metadata['num_lines']
        session_id = session.metadata['id']
        print(
            f"{mi}\t{session_id}\t{session_date_string: <30}\tnum_lines: {num_lines}\tpage: {page_num}"
        )

        # print('Indexing session on date', session.metadata['session_date'],
        #      '\tdate_string:', session_date_string,
        #      '\tnum session lines:', session.metadata['num_lines'])
        prev_date = session.date
        try:
            add_timestamp(session)
            # .get(): 'date_shift_status' is only set for overlong sessions
            if session.metadata.get('date_shift_status') == 'quarantined':
                quarantine_index = inv_config['session_index'] + '_quarantine'
                es.index(index=quarantine_index,
                         doc_type=inv_config['session_doc_type'],
                         id=session.metadata['id'],
                         body=session.json(with_columns=True,
                                           with_scan_versions=True))
            else:
                es.index(index=inv_config['session_index'],
                         doc_type=inv_config['session_doc_type'],
                         id=session.metadata['id'],
                         body=session.json(with_columns=True,
                                           with_scan_versions=True))
        except RequestError:
            print('skipping doc')
            continue
    return None