Example #1
def run(
    db_name=db_name,
    experts_etl_logger=None,
    smtp_server=None,
    from_address=None,
    ticket_address=None
):
    # This connection API in general needs work. Including this here for the sake of consistency
    # with other ETL module.run() functions.
    sqlapi.setengine(db.engine(db_name))

    if experts_etl_logger is None:
        experts_etl_logger = loggers.experts_etl_logger()
    experts_etl_logger.info('starting: error reporting', extra={'pure_sync_job': 'error_reporting'})

    if smtp_server is None:
        smtp_server = smtplib.SMTP('localhost', 25)
    if from_address is None:
        from_address = os.environ.get('EXPERTS_ETL_FROM_EMAIL_ADDRESS')
    if ticket_address is None:
        ticket_address = os.environ.get('EXPERTS_ETL_TICKET_EMAIL_ADDRESS')

    with sqlapi.transaction():
        report_via_email(smtp_server=smtp_server, from_address=from_address, ticket_address=ticket_address)
    smtp_server.quit()

    experts_etl_logger.info('ending: error reporting', extra={'pure_sync_job': 'error_reporting'})
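
Because every dependency here is an overridable keyword argument, the function can be exercised without touching production infrastructure. A minimal usage sketch (the module path, host, and addresses are hypothetical):

import smtplib

from experts_etl import error_reporting  # hypothetical module path

# Inject a test SMTP server and addresses instead of relying on the
# localhost relay and environment variables; run() calls quit() itself.
smtp = smtplib.SMTP('mailhog.local', 1025)
error_reporting.run(
    smtp_server=smtp,
    from_address='etl-test@example.edu',
    ticket_address='tickets-test@example.edu',
)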
Example #2
def run(
        # Do we need other default functions here?
        #extract_api_persons=extract_api_persons,
        db_name=db_name,
        transaction_record_limit=transaction_record_limit,
        experts_etl_logger=None):
    # This connection API in general needs work. Including this here for the sake of consistency
    # with other ETL module.run() functions.
    sqlapi.setengine(db.engine(db_name))

    if experts_etl_logger is None:
        experts_etl_logger = loggers.experts_etl_logger()
    experts_etl_logger.info('starting: oit -> edw',
                            extra={'pure_sync_job': 'person'})

    engine = db.engine(db_name)
    prepare_target_scratch_tables(engine)

    with db.session(db_name) as session:
        prepare_source_tables(engine, session)

        load_count = 0
        for demog in session.query(
                PureEligibleDemogChngHst.emplid).distinct().all():
            # Stop-gap prevention of unique key constraint violation:
            if demog.emplid == '8004768':
                continue
            person_dict = transform(session, extract(session, demog.emplid))
            if not person_dict['jobs'] and not person_dict['programs']:
                record_person_no_job_data_error(
                    session=session,
                    emplid=demog.emplid,
                    internet_id=person_dict['internet_id'],
                )
                continue
            load_into_scratch(session, person_dict)
            load_count += 1
            if load_count >= transaction_record_limit:
                session.commit()
                load_count = 0

        update_targets_from_scratch()

        session.commit()

    experts_etl_logger.info('ending: oit -> edw',
                            extra={'pure_sync_job': 'person'})
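
The load_count bookkeeping above is a generic batched-commit pattern: accumulate work in one transaction and commit every transaction_record_limit records, so a failure rolls back at most the current batch. The same idea as a standalone sketch (the helper is illustrative, not part of the module):

def commit_in_batches(session, records, load, transaction_record_limit=1000):
    """Apply load() to each record, committing every transaction_record_limit rows."""
    load_count = 0
    for record in records:
        load(session, record)
        load_count += 1
        if load_count >= transaction_record_limit:
            session.commit()
            load_count = 0
    session.commit()  # flush the final partial batch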
Example #3
def run(db_name=db_name,
        template=template,
        output_filename=output_filename,
        experts_etl_logger=None):
    if experts_etl_logger is None:
        experts_etl_logger = loggers.experts_etl_logger()
    experts_etl_logger.info('starting: edw -> pure',
                            extra={'pure_sync_job': 'person'})

    # The XML declaration below promises UTF-8, so encode the file explicitly:
    with open(output_filename, 'w', encoding='utf-8') as output_file:
        output_file.write(
            '<?xml version="1.0" encoding="UTF-8" standalone="yes"?>\n')
        output_file.write(
            '<persons xmlns="v1.unified-person-sync.pure.atira.dk" xmlns:v3="v3.commons.pure.atira.dk">\n'
        )
        with db.session(db_name) as session:
            for person in session.query(PureSyncPersonData).all():
                person_dict = {
                    c.name: getattr(person, c.name)
                    for c in person.__table__.columns
                }
                person_dict['jobs'] = []
                for job in session.query(PureSyncStaffOrgAssociation).filter(
                        PureSyncStaffOrgAssociation.person_id ==
                        person.person_id).all():
                    job_dict = {
                        c.name: getattr(job, c.name)
                        for c in job.__table__.columns
                    }
                    person_dict['jobs'].append(job_dict)
                person_dict['programs'] = []
                for program in session.query(
                        PureSyncStudentOrgAssociation).filter(
                            PureSyncStudentOrgAssociation.person_id ==
                            person.person_id).all():
                    program_dict = {
                        c.name: getattr(program, c.name)
                        for c in program.__table__.columns
                    }
                    person_dict['programs'].append(program_dict)
                output_file.write(template.render(person_dict))

        output_file.write('</persons>')

    experts_etl_logger.info('ending: edw -> pure',
                            extra={'pure_sync_job': 'person'})
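
The dict comprehension over __table__.columns is a standard SQLAlchemy idiom for turning a mapped row into a plain dict. Factored into a helper (a sketch; not part of the original module), it would replace the three comprehensions above:

def row_to_dict(row):
    """Convert a SQLAlchemy ORM instance into {column_name: value}."""
    return {c.name: getattr(row, c.name) for c in row.__table__.columns}

# person_dict = row_to_dict(person), job_dict = row_to_dict(job), etc.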
Example #4
def run(experts_etl_logger=None,
        sync_dir=sync_dir,
        sync_types=sync_types,
        keep_limit=keep_limit):
    if experts_etl_logger is None:
        experts_etl_logger = loggers.experts_etl_logger()
    experts_etl_logger.info('starting: sync file rotation',
                            extra={'pure_sync_job': 'sync_file_rotator'})

    deleted_filenames = []
    for sync_type in sync_types:
        deleted_filenames.extend(
            rotate(sync_type, sync_dir=sync_dir, keep_limit=keep_limit))

    experts_etl_logger.info('ending: sync file rotation',
                            extra={'pure_sync_job': 'sync_file_rotator'})

    return deleted_filenames
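
rotate() is defined elsewhere in the package; from the call site it takes a sync type plus sync_dir and keep_limit keywords and returns the filenames it removed. One plausible implementation, purely as a sketch (the glob pattern and file naming are assumptions):

import os
from glob import glob

def rotate(sync_type, sync_dir, keep_limit):
    """Keep the keep_limit newest files for sync_type; delete and return the rest."""
    files = sorted(
        glob(os.path.join(sync_dir, f'{sync_type}_*')),
        key=os.path.getmtime,
        reverse=True,
    )
    deleted = files[keep_limit:]
    for filename in deleted:
        os.remove(filename)
    return deleted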
Example #5
def run(
    startdate=None,
    startdate_str=None, # yyyy-MM-dd or yyyy-MM-dd_HH-mm-ss format
    db_name=db_name,
    transaction_record_limit=transaction_record_limit,
    experts_etl_logger=None,
    pure_api_config=None
):
    if experts_etl_logger is None:
        experts_etl_logger = loggers.experts_etl_logger()
    experts_etl_logger.info('starting: extracting/loading', extra={'pure_api_record_type': pure_api_record_type})

    if pure_api_config is None:
        pure_api_config = Config()

    with db.session(db_name) as session:
        record_count = 0
        if startdate_str is None:
            startdate_str = ensure_valid_startdate(session, startdate).isoformat()
        for api_change in client.get_all_changes_transformed(startdate_str, config=pure_api_config):
            if not required_fields_exist(api_change):
                continue
            if api_change.familySystemName not in family_system_name_db_class_map:
                continue
            if matching_db_record_exists(session, api_change) \
                and previously_processed_same_or_newer_change(session, api_change):
                continue
            if already_loaded_same_or_newer_change(session, api_change):
                continue

            load_api_change(session, api_change)

            record_count += 1
            if (record_count % transaction_record_limit) == 0:
                session.commit()

    experts_etl_logger.info('ending: extracting/loading', extra={'pure_api_record_type': pure_api_record_type})
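
Callers can pass either a datetime (startdate) or a preformatted string (startdate_str); the string wins, since the datetime is only consulted when startdate_str is None. For example (modulo whatever validation ensure_valid_startdate applies):

from datetime import datetime

run(startdate=datetime(2020, 1, 1))   # derives the string via ensure_valid_startdate
run(startdate_str='2020-01-01')       # bypasses validation and uses the string as-is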
Example #6
def run(
        # Do we need other default functions here?
        #extract_api_changes=extract_api_changes,
        db_name=db_name,
        transaction_record_limit=transaction_record_limit,
        experts_etl_logger=None,
        pure_api_config=None):
    if experts_etl_logger is None:
        experts_etl_logger = loggers.experts_etl_logger()
    experts_etl_logger.info(
        'starting: extracting/loading',
        extra={'pure_api_record_type': pure_api_record_type})

    if pure_api_config is None:
        pure_api_config = Config()

    # Capture the current record for each iteration, so we can log it in case of an exception:
    latest_change = None

    try:
        with db.session(db_name) as session:
            processed_changes = []
            for changes in changes_for_family_ordered_by_uuid_version(
                    session, 'ExternalPerson'):
                latest_change = changes[0]
                db_person = get_db_person(session, latest_change.uuid)

                # We delete here and continue, because there will be no record
                # to download from the Pure API when it has been deleted.
                if latest_change.change_type == 'DELETE':
                    if db_person:
                        delete_db_person(session, db_person)
                    processed_changes.extend(changes)
                    if len(processed_changes) >= transaction_record_limit:
                        record_changes_as_processed(session, processed_changes)
                        processed_changes = []
                        session.commit()
                    continue

                r = None
                try:
                    r = client.get(pure_api_record_type + '/' +
                                   latest_change.uuid,
                                   config=pure_api_config)
                except PureAPIHTTPError as e:
                    if e.response.status_code == 404:
                        if db_person:
                            # This record has been deleted from Pure but still exists in our local db:
                            delete_db_person(session, db_person)
                        processed_changes.extend(changes)
                        if len(processed_changes) >= transaction_record_limit:
                            record_changes_as_processed(
                                session, processed_changes)
                            processed_changes = []
                            session.commit()
                    else:
                        experts_etl_logger.error(
                            f'HTTP error {e.response.status_code} returned during record extraction',
                            extra={
                                'pure_uuid': latest_change.uuid,
                                'pure_api_record_type': pure_api_record_type
                            })
                    continue
                except PureAPIRequestException as e:
                    formatted_exception = loggers.format_exception(e)
                    experts_etl_logger.error(
                        f'mysterious client request exception encountered during record extraction: {formatted_exception}',
                        extra={
                            'pure_uuid': latest_change.uuid,
                            'pure_api_record_type': pure_api_record_type
                        })
                    continue
                except Exception:
                    raise

                api_external_person = response.transform(
                    pure_api_record_type,
                    r.json(),
                    version=pure_api_config.version)

                delete_merged_records(session, api_external_person)

                if db_person and db_person_same_or_newer_than_api_external_person(
                        session, db_person, api_external_person):
                    continue
                if already_loaded_same_api_external_person(
                        session, api_external_person):
                    continue
                load_api_external_person(session, api_external_person, r.text)

                processed_changes.extend(changes)
                if len(processed_changes) >= transaction_record_limit:
                    record_changes_as_processed(session, processed_changes)
                    processed_changes = []
                    session.commit()

            record_changes_as_processed(session, processed_changes)
            session.commit()

    except Exception as e:
        formatted_exception = loggers.format_exception(e)
        experts_etl_logger.error(
            f'exception encountered during record extraction: {formatted_exception}',
            extra={
                # latest_change may be None if the exception occurred before the first iteration:
                'pure_uuid': latest_change.uuid if latest_change else None,
                'pure_api_record_type': pure_api_record_type
            })

    experts_etl_logger.info(
        'ending: extracting/loading',
        extra={'pure_api_record_type': pure_api_record_type})
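
The processed_changes flush appears three times in the loop above. It could be collapsed into a small helper; a sketch using the module's own record_changes_as_processed (the helper name is an assumption):

def flush_processed_changes(session, processed_changes, transaction_record_limit):
    """Record and commit accumulated changes once the batch is large enough.

    Returns the (possibly emptied) accumulator so the caller can rebind it.
    """
    if len(processed_changes) >= transaction_record_limit:
        record_changes_as_processed(session, processed_changes)
        session.commit()
        return []
    return processed_changes

Each of the three sites then becomes processed_changes = flush_processed_changes(session, processed_changes, transaction_record_limit).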
Example #7
def run(
        startdate=None,
        startdate_str=None,  # yyyy-MM-dd or yyyy-MM-dd_HH-mm-ss format
        transaction_record_limit=transaction_record_limit,
        experts_etl_logger=None,
        api_version=None):
    if experts_etl_logger is None:
        experts_etl_logger = loggers.experts_etl_logger()
    experts_etl_logger.info(
        'starting: extracting/loading raw json',
        extra={'pure_api_record_type': pure_api_record_type})

    client_config = client.Config() if api_version is None else client.Config(
        version=api_version)
    api_version = client_config.version

    with db.cx_oracle_connection() as connection:
        try:
            cursor = connection.cursor()
            family_system_names = pure_json.collection_family_system_names_for_api_version(
                cursor, api_version=api_version)

            if startdate_str is None:
                startdate_str = ensure_valid_startdate(cursor, api_version,
                                                       startdate).isoformat()

            documents_to_insert = {}
            for api_document in client.get_all_changes_transformed(
                    startdate_str, config=client_config):
                if not required_fields_exist(api_document):
                    continue
                if api_document.familySystemName not in family_system_names:
                    continue
                if same_or_newer_db_change_exists(cursor, api_document,
                                                  api_version):
                    continue

                # We make this a dict with unique keys to avoid attempts to insert duplicates:
                documents_to_insert[
                    f'{api_document.uuid}:{api_document.version}'] = {
                        'uuid': api_document.uuid,
                        'pure_version': api_document.version,
                        'change_type': api_document.changeType,
                        'family_system_name': api_document.familySystemName,
                        'inserted': datetime.now(),
                        'json_document': json.dumps(api_document),
                    }

                if len(documents_to_insert) % transaction_record_limit == 0:
                    pure_json.insert_change_documents(
                        cursor,
                        documents=list(documents_to_insert.values()),
                        api_version=api_version)
                    documents_to_insert = {}
                    connection.commit()

            if documents_to_insert:
                pure_json.insert_change_documents(
                    cursor,
                    documents=list(documents_to_insert.values()),
                    api_version=api_version)
            connection.commit()

        except Exception as e:
            connection.rollback()
            formatted_exception = loggers.format_exception(e)
            experts_etl_logger.error(
                f'exception encountered during extracting/loading raw json: {formatted_exception}',
                extra={'pure_api_record_type': pure_api_record_type})

    experts_etl_logger.info(
        'ending: extracting/loading raw json',
        extra={'pure_api_record_type': pure_api_record_type})
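
Keying documents_to_insert by uuid:version means a change that appears more than once in the feed is stored only once; the later assignment simply overwrites the earlier one. The dedup behavior in isolation:

docs = {}
for uuid_, version in [('abc', 3), ('abc', 3), ('def', 1)]:
    docs[f'{uuid_}:{version}'] = {'uuid': uuid_, 'pure_version': version}

len(docs)  # 2, not 3: the duplicate ('abc', 3) collapses into one entry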
Example #8
def run(
        # Do we need other default functions here?
        extract_api_persons=extract_api_persons,
        db_name=db_name,
        transaction_record_limit=transaction_record_limit,
        pure_api_record_logger=pure_api_record_logger,
        experts_etl_logger=None,
        pure_api_config=None):
    if experts_etl_logger is None:
        experts_etl_logger = loggers.experts_etl_logger()
    experts_etl_logger.info(
        'starting: transforming/loading',
        extra={'pure_api_record_type': pure_api_record_type})

    if pure_api_config is None:
        pure_api_config = Config()

    # Capture the current record for each iteration, so we can log it in case of an exception:
    api_person = None

    try:
        with db.session(db_name) as session:
            processed_api_person_uuids = []
            for db_api_person in extract_api_persons(session):
                api_person = response.transform(
                    pure_api_record_type,
                    json.loads(db_api_person.json),
                    version=pure_api_config.version)
                db_person = get_db_person(session, db_api_person.uuid)
                db_person_previously_existed = False
                if db_person:
                    db_person_previously_existed = True
                    if db_person.pure_modified and db_person.pure_modified >= db_api_person.modified:
                        # Skip this record, since we already have a newer one:
                        processed_api_person_uuids.append(db_api_person.uuid)
                        continue
                else:
                    db_person = create_db_person(api_person)

                db_person.internet_id = None
                db_person.first_name = api_person.name.firstName
                db_person.last_name = api_person.name.lastName
                db_person.pure_modified = db_api_person.modified

                # Doubt that we will ever get these for external persons:
                db_person.orcid = None
                db_person.hindex = None

                # Check for orgs not in EDW yet:

                api_org_uuids = set()
                for org_assoc in api_person.externalOrganisations:
                    api_org_uuids.add(org_assoc.uuid)

                db_org_uuids = set()
                if db_person_previously_existed:
                    # Avoid trying to query a person that doesn't exist in the db yet:
                    db_org_uuids = {
                        db_org.pure_uuid
                        for db_org in db_person.pure_orgs
                    }

                api_only_org_uuids = api_org_uuids - db_org_uuids
                db_only_org_uuids = db_org_uuids - api_org_uuids

                # For now, skip this person if there are any orgs referenced in the api record
                # that we don't have in EDW:
                if api_only_org_uuids:
                    api_only_orgs_in_db = session.query(PureOrg).filter(
                        PureOrg.pure_uuid.in_(api_only_org_uuids)).all()
                    if len(api_only_org_uuids) > len(api_only_orgs_in_db):
                        experts_etl_logger.info(
                            'skipping updates: some associated orgs do not exist in EDW.',
                            extra={
                                'pure_uuid': api_person.uuid,
                                'pure_api_record_type': pure_api_record_type
                            })
                        continue

                # Now we can add the person to the session, because there are no other
                # reasons for intentionally skipping it:
                session.add(db_person)

                ## person pure orgs

                for org_uuid in api_only_org_uuids:
                    person_pure_org = PersonPureOrg(
                        person_uuid=db_person.uuid,
                        pure_org_uuid=org_uuid,
                    )
                    session.add(person_pure_org)

                session.query(PersonPureOrg).filter(
                    PersonPureOrg.person_uuid == db_person.uuid,
                    PersonPureOrg.pure_org_uuid.in_(db_only_org_uuids)).delete(
                        synchronize_session=False)

                ## scopus ids

                db_scopus_ids = set()
                if db_person_previously_existed:
                    # Avoid trying to query a person that doesn't exist in the db yet:
                    db_scopus_ids = set(db_person.scopus_ids)
                person_ids = get_person_ids(api_person)
                api_only_scopus_ids = person_ids['scopus_ids'] - db_scopus_ids
                db_only_scopus_ids = db_scopus_ids - person_ids['scopus_ids']

                for scopus_id in api_only_scopus_ids:
                    person_scopus_id = PersonScopusId(
                        person_uuid=db_person.uuid,
                        scopus_id=scopus_id,
                    )
                    session.add(person_scopus_id)

                session.query(PersonScopusId).filter(
                    PersonScopusId.person_uuid == db_person.uuid,
                    PersonScopusId.scopus_id.in_(db_only_scopus_ids)).delete(
                        synchronize_session=False)

                processed_api_person_uuids.append(api_person.uuid)
                if len(processed_api_person_uuids) >= transaction_record_limit:
                    mark_api_persons_as_processed(session,
                                                  pure_api_record_logger,
                                                  processed_api_person_uuids)
                    processed_api_person_uuids = []
                    session.commit()

            mark_api_persons_as_processed(session, pure_api_record_logger,
                                          processed_api_person_uuids)
            session.commit()

    except Exception as e:
        formatted_exception = loggers.format_exception(e)
        experts_etl_logger.error(
            f'exception encountered during record transformation: {formatted_exception}',
            extra={
                # api_person may be None if the exception occurred before the first transform:
                'pure_uuid': api_person.uuid if api_person else None,
                'pure_api_record_type': pure_api_record_type
            })

    loggers.rollover(pure_api_record_logger)
    experts_etl_logger.info(
        'ending: transforming/loading',
        extra={'pure_api_record_type': pure_api_record_type})
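
The org and scopus-id bookkeeping above is a set-difference sync: compute what only the API record has (to insert) and what only the local db has (to delete). The core of the pattern in isolation:

api_ids = {'A', 'B', 'C'}     # ids present in the Pure API record
db_ids = {'B', 'C', 'D'}      # ids currently stored in EDW

to_insert = api_ids - db_ids  # {'A'}: create these associations
to_delete = db_ids - api_ids  # {'D'}: delete these associations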
Example #9
def run(
        # Do we need other default functions here?
        extract_api_pubs=extract_api_pubs,
        db_name=db_name,
        transaction_record_limit=transaction_record_limit,
        pure_api_record_logger=pure_api_record_logger,
        experts_etl_logger=None,
        pure_api_config=None,
        force_load=False):
    if experts_etl_logger is None:
        experts_etl_logger = loggers.experts_etl_logger()
    experts_etl_logger.info(
        'starting: transforming/loading',
        extra={'pure_api_record_type': pure_api_record_type})

    if pure_api_config is None:
        pure_api_config = Config()

    # Capture the current record for each iteration, so we can log it in case of an exception:
    api_pub = None

    try:
        with db.session(db_name) as session:
            processed_api_pub_uuids = []
            for db_api_pub in extract_api_pubs(session):
                api_pub = response.transform(pure_api_record_type,
                                             json.loads(db_api_pub.json),
                                             version=pure_api_config.version)
                db_pub = get_db_pub(session, db_api_pub.uuid)
                db_pub_previously_existed = False
                if db_pub:
                    db_pub_previously_existed = True
                    if db_pub.pure_modified and db_pub.pure_modified >= db_api_pub.modified and not force_load:
                        # Skip this record, since we already have a newer one:
                        processed_api_pub_uuids.append(db_api_pub.uuid)
                        continue
                else:
                    db_pub = create_db_pub(api_pub)

                pub_ids = get_pub_ids(api_pub)
                db_pub.scopus_id = pub_ids['scopus_id']
                db_pub.pmid = pub_ids['pmid']
                db_pub.doi = pub_ids['doi']

                # Commented out for now, because we will rely more on pure types and subtypes (below):
                #db_pub.type = 'article-journal'

                type_uri_parts = api_pub.type.uri.split('/')
                type_uri_parts.reverse()
                pure_subtype, pure_type, pure_parent_type = type_uri_parts[0:3]
                db_pub.pure_type = pure_type
                db_pub.pure_subtype = pure_subtype

                db_pub.title = api_pub.title.value

                db_pub.container_title = api_pub.journalAssociation.title.value
                db_pub.issn = api_pub.journalAssociation.issn.value if 'issn' in api_pub.journalAssociation else None

                nullify_pub_states(db_pub)
                for api_pub_state in api_pub.publicationStatuses:
                    update_pub_state(db_pub, api_pub_state)

                db_pub.volume = api_pub.volume
                db_pub.issue = api_pub.journalNumber
                db_pub.pages = api_pub.pages
                db_pub.citation_total = api_pub.totalScopusCitations

                db_pub.pure_modified = db_api_pub.modified

                if 'managingOrganisationalUnit' in api_pub:
                    owner_pure_org_uuid = api_pub.managingOrganisationalUnit.uuid
                    owner_pure_org = session.query(PureOrg).filter(
                        PureOrg.pure_uuid ==
                        owner_pure_org_uuid).one_or_none()
                    if owner_pure_org is None:
                        experts_etl_logger.info(
                            'skipping updates: owner pure org does not exist in EDW.',
                            extra={
                                'pure_uuid': api_pub.uuid,
                                'pure_api_record_type': pure_api_record_type
                            })
                        continue
                    db_pub.owner_pure_org_uuid = owner_pure_org_uuid
                else:
                    # TODO: We do this because currently owner_pure_org_uuid is not null. We may want to change that.
                    experts_etl_logger.info(
                        'skipping updates: no owner pure org.',
                        extra={
                            'pure_uuid': api_pub.uuid,
                            'pure_api_record_type': pure_api_record_type
                        })
                    continue

                ## associations

                author_ordinal = 0
                missing_person = False
                missing_person_pure_uuid = False
                missing_org = False
                pub_author_collabs = []
                all_author_collab_uuids = set()
                all_person_uuids = set()
                pub_persons = []
                pub_person_pure_orgs = []

                # personAssociations can contain authorCollaboration's, which are not persons at all,
                # so we call this variable author_assoc, to be more accurate here:
                for author_assoc in api_pub.personAssociations:
                    author_ordinal += 1

                    if 'authorCollaboration' in author_assoc:
                        author_collab_assoc = author_assoc
                        author_collab_pure_uuid = author_collab_assoc.authorCollaboration.uuid

                        # Sometimes Pure records contain duplicate author collaborations. Grrr...
                        if author_collab_pure_uuid in all_author_collab_uuids:
                            continue
                        all_author_collab_uuids.add(author_collab_pure_uuid)

                        db_author_collab = session.query(
                            AuthorCollaboration).filter(
                                AuthorCollaboration.pure_uuid ==
                                author_collab_pure_uuid).one_or_none()
                        if db_author_collab is None:
                            db_author_collab = AuthorCollaboration(
                                uuid=str(uuid.uuid4()),
                                pure_uuid=author_collab_pure_uuid,
                            )
                        # This is less than ideal, but for now we just update the author collaboration
                        # name with whatever value this record includes:
                        db_author_collab.name = next(
                            (author_collab_text.value for author_collab_text in
                             author_collab_assoc.authorCollaboration.name.text
                             if author_collab_text.locale == 'en_US'), None)
                        session.add(db_author_collab)

                        pub_author_collab = PubAuthorCollaboration(
                            pub_uuid=db_pub.uuid,
                            author_collaboration_uuid=db_author_collab.uuid,
                            author_ordinal=author_ordinal,

                            # TODO: This needs work. We may have tried mapping these to CSL values at
                            # one point, but now we're just taking what Pure gives us.
                            author_role=next(
                                (author_role_text.value for author_role_text in
                                 author_collab_assoc.personRole.term.text
                                 if author_role_text.locale == 'en_US'),
                                None).lower(),
                        )
                        pub_author_collabs.append(pub_author_collab)

                        continue

                    person_assoc = author_assoc
                    person_pure_uuid = None
                    if 'person' in person_assoc:
                        person_pure_uuid = person_assoc.person.uuid
                        person_pure_internal = 'Y'
                    if 'externalPerson' in person_assoc:
                        person_pure_uuid = person_assoc.externalPerson.uuid
                        person_pure_internal = 'N'
                    if person_assoc is not None and person_pure_uuid is None:
                        missing_person_pure_uuid = True
                        break

                    db_person = session.query(Person).filter(
                        Person.pure_uuid == person_pure_uuid).one_or_none()
                    if db_person is None:
                        missing_person = True
                        break

                    if db_person.uuid not in all_person_uuids:
                        pub_person = PubPerson(
                            pub_uuid=db_pub.uuid,
                            person_uuid=db_person.uuid,
                            person_ordinal=author_ordinal,

                            # TODO: This needs work. We may have tried mapping these to CSL values at
                            # one point, but now we're just taking what Pure gives us.
                            person_role=next(
                                (person_role_text.value for person_role_text in
                                 person_assoc.personRole.term.text
                                 if person_role_text.locale == 'en_US'),
                                None).lower(),
                            person_pure_internal=person_pure_internal,
                            first_name=person_assoc.name.firstName
                            if 'firstName' in person_assoc.name else None,
                            last_name=person_assoc.name.lastName
                            if 'lastName' in person_assoc.name else None,
                            emplid=db_person.emplid,
                        )
                        pub_persons.append(pub_person)
                        all_person_uuids.add(db_person.uuid)
                    else:
                        continue

                    all_person_org_uuids = set()
                    for api_pure_org in itertools.chain(
                            person_assoc.organisationalUnits,
                            person_assoc.externalOrganisations):
                        db_pure_org = session.query(PureOrg).filter(
                            PureOrg.pure_uuid ==
                            api_pure_org.uuid).one_or_none()
                        if db_pure_org is None:
                            missing_org = True
                            break

                        person_org_uuids = frozenset(
                            [db_person.uuid, db_pure_org.pure_uuid])
                        if person_org_uuids not in all_person_org_uuids:
                            pub_person_pure_org = PubPersonPureOrg(
                                pub_uuid=db_pub.uuid,
                                person_uuid=db_person.uuid,
                                pure_org_uuid=db_pure_org.pure_uuid,
                            )
                            pub_person_pure_orgs.append(pub_person_pure_org)
                            all_person_org_uuids.add(person_org_uuids)
                        else:
                            continue
                    if missing_org:
                        break

                if missing_person:
                    experts_etl_logger.info(
                        'skipping updates: one or more associated persons do not exist in EDW.',
                        extra={
                            'pure_uuid': api_pub.uuid,
                            'pure_api_record_type': pure_api_record_type
                        })
                    continue

                if missing_person_pure_uuid:
                    experts_etl_logger.info(
                        'skipping updates: one or more associated persons have no pure uuid.',
                        extra={
                            'pure_uuid': api_pub.uuid,
                            'pure_api_record_type': pure_api_record_type
                        })
                    continue

                if missing_org:
                    experts_etl_logger.info(
                        'skipping updates: one or more associated orgs do not exist in EDW.',
                        extra={
                            'pure_uuid': api_pub.uuid,
                            'pure_api_record_type': pure_api_record_type
                        })
                    continue

                # Now we can add the pub to the session, because there are no other
                # reasons for intentionally skipping it:
                session.add(db_pub)

                # Now we can also delete and re-create the associations for this research output:

                session.query(PubAuthorCollaboration).filter(
                    PubAuthorCollaboration.pub_uuid == db_pub.uuid).delete(
                        synchronize_session=False)
                for pub_author_collab in pub_author_collabs:
                    session.add(pub_author_collab)

                session.query(PubPerson).filter(
                    PubPerson.pub_uuid == db_pub.uuid).delete(
                        synchronize_session=False)
                for pub_person in pub_persons:
                    session.add(pub_person)

                session.query(PubPersonPureOrg).filter(
                    PubPersonPureOrg.pub_uuid == db_pub.uuid).delete(
                        synchronize_session=False)
                for pub_person_pure_org in pub_person_pure_orgs:
                    session.add(pub_person_pure_org)

                processed_api_pub_uuids.append(api_pub.uuid)
                if len(processed_api_pub_uuids) >= transaction_record_limit:
                    mark_api_pubs_as_processed(session, pure_api_record_logger,
                                               processed_api_pub_uuids)
                    processed_api_pub_uuids = []
                    session.commit()

            mark_api_pubs_as_processed(session, pure_api_record_logger,
                                       processed_api_pub_uuids)
            session.commit()

    except Exception as e:
        formatted_exception = loggers.format_exception(e)
        experts_etl_logger.error(
            f'exception encountered during record transformation: {formatted_exception}',
            extra={
                # api_pub may be None if the exception occurred before the first transform:
                'pure_uuid': api_pub.uuid if api_pub else None,
                'pure_api_record_type': pure_api_record_type
            })

    loggers.rollover(pure_api_record_logger)
    experts_etl_logger.info(
        'ending: transforming/loading',
        extra={'pure_api_record_type': pure_api_record_type})
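
The pure type and subtype come from reversing the path segments of the Pure classification URI, so the most specific segment lands first. With an illustrative URI (not taken from a real record):

uri = '/dk/atira/pure/researchoutput/researchoutputtypes/contributiontojournal/article'
parts = uri.split('/')
parts.reverse()
pure_subtype, pure_type, pure_parent_type = parts[0:3]
# pure_subtype == 'article', pure_type == 'contributiontojournal',
# pure_parent_type == 'researchoutputtypes'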
Example #10
def run(
        # Do we need other default functions here?
        extract_api_orgs=extract_api_orgs,
        db_name=db_name,
        transaction_record_limit=transaction_record_limit,
        pure_api_record_logger=pure_api_record_logger,
        experts_etl_logger=None,
        pure_api_config=None):
    if experts_etl_logger is None:
        experts_etl_logger = loggers.experts_etl_logger()
    experts_etl_logger.info(
        'starting: transforming/loading',
        extra={'pure_api_record_type': pure_api_record_type})

    if pure_api_config is None:
        pure_api_config = Config()

    # Capture the current record for each iteration, so we can log it in case of an exception:
    api_org = None

    try:
        with db.session(db_name) as session:
            processed_api_org_uuids = []
            for db_api_org in extract_api_orgs(session):
                api_org = response.transform(pure_api_record_type,
                                             json.loads(db_api_org.json),
                                             version=pure_api_config.version)
                db_org = get_db_org(session, db_api_org.uuid)
                if db_org:
                    if db_org.pure_modified and db_org.pure_modified >= db_api_org.modified:
                        # Skip this record, since we already have a newer one:
                        processed_api_org_uuids.append(db_api_org.uuid)
                        continue
                else:
                    db_org = create_db_org(api_org)

                db_org.name_en = next((name_text.value
                                       for name_text in api_org.name.text
                                       if name_text.locale == 'en_US'), None)

                db_org.type = next(
                    (type_text.value for type_text in api_org.type.term.text
                     if type_text.locale == 'en_US'), None).lower()

                db_org.pure_modified = db_api_org.modified
                session.add(db_org)

                processed_api_org_uuids.append(api_org.uuid)
                if len(processed_api_org_uuids) >= transaction_record_limit:
                    mark_api_orgs_as_processed(session, pure_api_record_logger,
                                               processed_api_org_uuids)
                    processed_api_org_uuids = []
                    session.commit()

            mark_api_orgs_as_processed(session, pure_api_record_logger,
                                       processed_api_org_uuids)
            session.commit()

    except Exception as e:
        formatted_exception = loggers.format_exception(e)
        experts_etl_logger.error(
            f'exception encountered during record transformation: {formatted_exception}',
            extra={
                # api_org may be None if the exception occurred before the first transform:
                'pure_uuid': api_org.uuid if api_org else None,
                'pure_api_record_type': pure_api_record_type
            })

    loggers.rollover(pure_api_record_logger)
    experts_etl_logger.info(
        'ending: transforming/loading',
        extra={'pure_api_record_type': pure_api_record_type})
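
The next(...) generator idiom for pulling the en_US value out of a Pure localized-text list recurs throughout these modules. As a reusable helper (a sketch, not part of the original code):

def localized_value(texts, locale='en_US'):
    """Return the value of the first text entry matching locale, else None."""
    return next((t.value for t in texts if t.locale == locale), None)

With it, db_org.name_en becomes localized_value(api_org.name.text). Note that call sites chaining .lower() onto the result, as the type lookup above does, would still need to guard against a None return.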
Example #11
def run(
        # Do we need other default functions here?
        extract_api_persons=extract_api_persons,
        db_name=db_name,
        transaction_record_limit=transaction_record_limit,
        pure_api_record_logger=pure_api_record_logger,
        experts_etl_logger=None,
        pure_api_config=None):
    if experts_etl_logger is None:
        experts_etl_logger = loggers.experts_etl_logger()
    experts_etl_logger.info(
        'starting: transforming/loading',
        extra={'pure_api_record_type': pure_api_record_type})

    if pure_api_config is None:
        pure_api_config = Config()

    # Capture the current record for each iteration, so we can log it in case of an exception:
    api_person = None

    try:
        with db.session(db_name) as session:
            processed_api_person_uuids = []
            for db_api_person in extract_api_persons(session):
                api_person = response.transform(
                    pure_api_record_type,
                    json.loads(db_api_person.json),
                    version=pure_api_config.version)

                person_ids = get_person_ids(api_person)
                # Not sure that we should be requiring either of these in EDW, but we do for now,
                # in umn_person_pure_org, at least:
                if person_ids['emplid'] is None:
                    experts_etl_logger.info(
                        'skipping updates: missing emplid.',
                        extra={
                            'pure_uuid': api_person.uuid,
                            'pure_api_record_type': pure_api_record_type
                        })
                    continue
                if api_person.externalId is None:
                    experts_etl_logger.info(
                        'skipping updates: missing pure id.',
                        extra={
                            'pure_uuid': api_person.uuid,
                            'pure_api_record_type': pure_api_record_type
                        })
                    continue

                db_person = get_db_person(session, person_ids['emplid'])
                db_person_previously_existed = False
                if db_person:
                    db_person_previously_existed = True
                    if db_person.pure_modified and db_person.pure_modified >= db_api_person.modified:
                        # Skip this record, since we already have a newer one:
                        processed_api_person_uuids.append(db_api_person.uuid)
                        continue
                else:
                    db_person = create_db_person(person_ids['emplid'])

                # All internal persons should have this. Usually it will be the same as the emplid,
                # but sometimes the old SciVal identifier.
                db_person.pure_id = api_person.externalId

                db_person.pure_uuid = api_person.uuid
                db_person.internet_id = person_ids['internet_id']
                db_person.first_name = api_person.name.firstName
                db_person.last_name = api_person.name.lastName
                db_person.orcid = api_person.orcid
                db_person.hindex = api_person.scopusHIndex
                db_person.pure_modified = db_api_person.modified

                # TODO:
                # 2. Can internal persons have externalOrganisationAssociations (sp?)?
                #    Not sure yet, but they can have honoraryStaffOrganisationAssociations!
                # 3. If a person has an association with on org not in EDW yet, skip that person...
                #    or should we just immediately attempt to fetch the org and update EDW?
                #    For now, we'll skip that person.

                # Check for orgs not in EDW yet:

                api_org_uuids = set()
                for org_assoc in api_person.staffOrganisationAssociations:
                    api_org_uuids.add(org_assoc.organisationalUnit.uuid)

                db_org_uuids = set()
                if db_person_previously_existed:
                    # Avoid trying to query a person that doesn't exist in the db yet:
                    db_org_uuids = {
                        db_org.pure_uuid
                        for db_org in db_person.pure_orgs
                    }

                api_only_org_uuids = api_org_uuids - db_org_uuids
                db_only_org_uuids = db_org_uuids - api_org_uuids

                # For now, skip this person if there are any orgs referenced in the api record
                # that we don't have in EDW:
                if api_only_org_uuids:
                    api_only_orgs_in_db = session.query(PureOrg).filter(
                        PureOrg.pure_uuid.in_(api_only_org_uuids)).all()
                    if len(api_only_org_uuids) > len(api_only_orgs_in_db):
                        experts_etl_logger.info(
                            'skipping updates: some associated orgs do not exist in EDW.',
                            extra={
                                'pure_uuid': api_person.uuid,
                                'pure_api_record_type': pure_api_record_type
                            })
                        continue

                ## umn person pure orgs aka staff organisation associations aka jobs

                # TODO: We may encounter duplicate jobs that break our uniqueness constraints.

                found_missing_job_description = False
                all_umn_person_pure_org_primary_keys = set()
                umn_person_pure_orgs = []
                for org_assoc in api_person.staffOrganisationAssociations:
                    job_description = next(
                        (job_description_text.value for job_description_text in
                         org_assoc.jobDescription.text
                         if job_description_text.locale == 'en_US'), None)
                    if job_description is None:
                        found_missing_job_description = True
                        break

                    # Due to transitioning from master list to xml syncs of jobs, we may encounter duplicates.
                    # This may also happen due to manual entering of jobs in the Pure UI.
                    umn_person_pure_org_primary_keys = frozenset([
                        'person_uuid:' + db_person.uuid,
                        'pure_org_uuid:' + org_assoc.organisationalUnit.uuid,
                        'job_description:' + job_description,
                        'start_date:' + org_assoc.period.startDate,
                    ])
                    if umn_person_pure_org_primary_keys in all_umn_person_pure_org_primary_keys:
                        experts_etl_logger.info(
                            'duplicate job found',
                            extra={
                                'umn_person_pure_org_primary_keys':
                                umn_person_pure_org_primary_keys,
                                'pure_uuid': api_person.uuid,
                                'pure_api_record_type': pure_api_record_type,
                            })
                        continue
                    all_umn_person_pure_org_primary_keys.add(
                        umn_person_pure_org_primary_keys)

                    umn_person_pure_org = UmnPersonPureOrg(
                        pure_org_uuid=org_assoc.organisationalUnit.uuid,
                        person_uuid=db_person.uuid,
                        pure_person_id=db_person.pure_id,
                        emplid=db_person.emplid,
                        # Skipping this for now:
                        pure_org_id=None,
                        job_description=job_description,

                        # Note: Both employmentType and staffType may be missing because they are not required
                        # fields in the Pure UI, which UMN staff sometimes use to enter jobs not in PeopleSoft.

                        # Called employed_as in EDW, which was all 'Academic' as of 2018-06-05.
                        # Probably due to an artifact of the old master list upload process, or a
                        # misunderstanding of it. The newer EDW tables for upload (sync) to Pure
                        # have similar values in default_employed_as, but they're the last segment of the
                        # employmentType uri.
                        employed_as=next(
                            (employment_type_text.value
                             for employment_type_text in
                             org_assoc.employmentType.term.text
                             if employment_type_text.locale == 'en_US'), None),

                        # Sometimes staffType will be 'non-academic', but we allowed space in EDW
                        # only for 'nonacademic' (without a hyphen):
                        staff_type=re.sub(
                            '[^a-zA-Z]+', '',
                            next((staff_type_text.value for staff_type_text in
                                  org_assoc.staffType.term.text
                                  if staff_type_text.locale == 'en_US'),
                                 None).lower()),
                        start_date=isoparse(org_assoc.period.startDate),
                        end_date=isoparse(org_assoc.period.endDate)
                        if org_assoc.period.endDate else None,
                        primary='Y' if org_assoc.isPrimaryAssociation else 'N',
                    )
                    umn_person_pure_orgs.append(umn_person_pure_org)

                if found_missing_job_description:
                    experts_etl_logger.info(
                        'skipping updates: one or more org associations are missing job descriptions',
                        extra={
                            'pure_uuid': api_person.uuid,
                            'pure_api_record_type': pure_api_record_type
                        })
                    continue

                # Now we can add the person to the session, because there are no other
                # reasons for intentionally skipping it:
                session.add(db_person)

                # Now we can add the umn person pure orgs (i.e., jobs) to the session, too.
                # These are so complex, it's easiest to just delete and re-create them:
                session.query(UmnPersonPureOrg).filter(
                    UmnPersonPureOrg.person_uuid == db_person.uuid).delete()
                for umn_person_pure_org in umn_person_pure_orgs:
                    session.add(umn_person_pure_org)

                ## person pure orgs

                for org_uuid in api_only_org_uuids:
                    person_pure_org = PersonPureOrg(
                        person_uuid=db_person.uuid,
                        pure_org_uuid=org_uuid,
                    )
                    session.add(person_pure_org)

                session.query(PersonPureOrg).filter(
                    PersonPureOrg.person_uuid == db_person.uuid,
                    PersonPureOrg.pure_org_uuid.in_(db_only_org_uuids)).delete(
                        synchronize_session=False)

                ## scopus ids

                db_scopus_ids = set()
                if db_person_previously_existed:
                    # Avoid trying to query a person that doesn't exist in the db yet:
                    db_scopus_ids = set(db_person.scopus_ids)
                api_only_scopus_ids = person_ids['scopus_ids'] - db_scopus_ids
                db_only_scopus_ids = db_scopus_ids - person_ids['scopus_ids']

                for scopus_id in api_only_scopus_ids:
                    person_scopus_id = PersonScopusId(
                        person_uuid=db_person.uuid,
                        scopus_id=scopus_id,
                    )
                    session.add(person_scopus_id)

                session.query(PersonScopusId).filter(
                    PersonScopusId.person_uuid == db_person.uuid,
                    PersonScopusId.scopus_id.in_(db_only_scopus_ids)).delete(
                        synchronize_session=False)

                processed_api_person_uuids.append(api_person.uuid)
                if len(processed_api_person_uuids) >= transaction_record_limit:
                    mark_api_persons_as_processed(session,
                                                  pure_api_record_logger,
                                                  processed_api_person_uuids)
                    processed_api_person_uuids = []
                    session.commit()

            mark_api_persons_as_processed(session, pure_api_record_logger,
                                          processed_api_person_uuids)
            session.commit()

    except Exception as e:
        formatted_exception = loggers.format_exception(e)
        experts_etl_logger.error(
            f'exception encountered during record transformation: {formatted_exception}',
            extra={
                # api_person may be None if the exception occurred before the first transform:
                'pure_uuid': api_person.uuid if api_person else None,
                'pure_api_record_type': pure_api_record_type
            })

    loggers.rollover(pure_api_record_logger)
    experts_etl_logger.info(
        'ending: transforming/loading',
        extra={'pure_api_record_type': pure_api_record_type})
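
Duplicate jobs are filtered with a frozenset of 'field:value' strings: frozensets are hashable and order-independent, so they work as members of the seen-keys set, and the field-name prefixes keep values from different columns from colliding. In isolation:

seen_keys = set()
key = frozenset([
    'person_uuid:123',
    'pure_org_uuid:456',
    'job_description:Professor',
    'start_date:2018-01-01',
])
if key in seen_keys:
    pass  # duplicate job: skip it
else:
    seen_keys.add(key)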
Example #12
import dotenv_switch.auto

import importlib
import multiprocessing as mp
import os
import time

from experts_dw import db, pure_json
from experts_etl import loggers, sync_file_rotator
from experts_etl.pure_to_edw import changes, collection

experts_etl_logger = loggers.experts_etl_logger()
default_interval = 14400  # 4 hours, in seconds

extractor_loaders = list(
    map(lambda x: 'experts_etl.extractor_loaders.' + x, [
        'pure_api_changes',
        'pure_api_external_organisations',
        'pure_api_organisational_units',
        'pure_api_external_persons',
        'pure_api_persons',
        'pure_api_research_outputs',
    ]))

transformer_loaders = list(
    map(lambda x: 'experts_etl.transformer_loaders.' + x, [
        'pure_api_external_org',
        'pure_api_internal_org',
        'pure_api_external_person',
        'pure_api_internal_person',
        'pure_api_pub',
Example #13
def run(
        collection_api_name,
        startdate=None,
        startdate_str=None,  # yyyy-MM-dd or yyyy-MM-dd_HH-mm-ss format
        transaction_record_limit=transaction_record_limit,
        experts_etl_logger=None,
        api_version=None):
    if experts_etl_logger is None:
        experts_etl_logger = loggers.experts_etl_logger()
    experts_etl_logger.info(
        'starting: extracting/loading raw json',
        extra={'pure_api_record_type': collection_api_name})

    client_config = client.Config() if api_version is None else client.Config(
        version=api_version)
    api_version = client_config.version

    with db.cx_oracle_connection() as connection:
        try:
            cursor = connection.cursor()

            pure_json.process_changes_matching_previous_uuids(
                cursor,
                collection_api_name=collection_api_name,
                api_version=api_version)

            uuids = pure_json.distinct_change_uuids_for_collection(
                cursor,
                collection_api_name=collection_api_name,
                api_version=api_version)
            uuids_in_pure = []
            documents_to_insert = {}
            for api_document in client.filter_all_by_uuid_transformed(
                    collection_api_name, uuids=uuids, config=client_config):
                uuids_in_pure.append(api_document.uuid)

                if pure_json.document_exists(
                        cursor,
                        uuid=api_document.uuid,
                        collection_api_name=collection_api_name,
                        api_version=api_version,
                        staging=True):
                    continue

                documents_to_insert[
                    f'{api_document.uuid}:{api_document.version}'] = {
                        'uuid':
                        api_document.uuid,
                        'pure_created':
                        datetime.strptime(api_document.info.createdDate,
                                          iso_8601_format),
                        'pure_modified':
                        datetime.strptime(api_document.info.modifiedDate,
                                          iso_8601_format),
                        'inserted':
                        datetime.now(),
                        'updated':
                        datetime.now(),
                        'json_document':
                        json.dumps(api_document),
                    }

                if len(documents_to_insert) % transaction_record_limit == 0:
                    pure_json.insert_documents(
                        cursor,
                        documents=list(documents_to_insert.values()),
                        collection_api_name=collection_api_name,
                        staging=True,
                        api_version=api_version)
                    connection.commit()
                    documents_to_insert = {}

            if documents_to_insert:
                pure_json.insert_documents(
                    cursor,
                    documents=list(documents_to_insert.values()),
                    collection_api_name=collection_api_name,
                    staging=True,
                    api_version=api_version)
            connection.commit()

            pure_json.process_changes_matching_staging(
                cursor,
                collection_api_name=collection_api_name,
                api_version=api_version)

            missing_uuids = list(
                (Counter(uuids) - Counter(uuids_in_pure)).elements())
            for missing_uuids_sublist in [
                    missing_uuids[i:i + 100]
                    for i in range(0, len(missing_uuids), 100)
            ]:
                pure_json.delete_documents_and_changes_matching_uuids(
                    cursor,
                    uuids=missing_uuids_sublist,
                    collection_api_name=collection_api_name,
                    api_version=api_version)

            pure_json.load_documents_from_staging(
                cursor,
                collection_api_name=collection_api_name,
                api_version=api_version)

            pure_json.process_changes_matching_previous_uuids(
                cursor,
                collection_api_name=collection_api_name,
                api_version=api_version)

        except Exception as e:
            connection.rollback()
            formatted_exception = loggers.format_exception(e)
            experts_etl_logger.error(
                f'exception encountered during extracting/loading raw json: {formatted_exception}',
                extra={'pure_api_record_type': collection_api_name})

    experts_etl_logger.info(
        'ending: extracting/loading raw json',
        extra={'pure_api_record_type': collection_api_name})
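
Two small techniques close out this example: Counter subtraction finds uuids we asked for but Pure no longer returned, and a slice comprehension deletes them in sublists of 100 (comfortably under Oracle's 1,000-expression IN-list limit). Both in isolation:

from collections import Counter

uuids = ['a', 'b', 'c', 'd']
uuids_in_pure = ['a', 'c']
missing = list((Counter(uuids) - Counter(uuids_in_pure)).elements())  # ['b', 'd']

sublists = [missing[i:i + 100] for i in range(0, len(missing), 100)]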