Exemplo n.º 1
0
def main(output: TextIO, rome: str = DEFAULT_ROME_OPTIONS) -> None:
    """Write the soft skills returned by the MatchviaSoftSkills API as CSV.

    API documentation:
    https://www.emploi-store-dev.fr/portail-developpeur-cms/home/catalogue-des-api/documentation-des-api/api-matchviasoftskills-v1.html

    Nothing is written (not even the CSV header) when the Emploi Store Dev
    credentials are missing or when there is no job group to query. The export
    stops at the first IOError raised while fetching or writing skills.

    Args:
        output: text stream receiving the CSV header and rows.
        rome: a single job group identifier, or 'all' to query every job group
            listed by cleaned_data.rome_job_groups().
    """

    if not (_EMPLOI_STORE_DEV_CLIENT_ID and _EMPLOI_STORE_DEV_SECRET):
        logging.warning('Missing Emploi Store Dev identifiers.')
        return

    api_client = emploi_store.Client(
        client_id=_EMPLOI_STORE_DEV_CLIENT_ID,
        client_secret=_EMPLOI_STORE_DEV_SECRET)

    job_group_ids = \
        list(cleaned_data.rome_job_groups().index) if rome == 'all' else [rome]
    if not job_group_ids:
        logging.warning('Missing job group identifiers.')
        return

    csv_writer = csv.DictWriter(output, fieldnames=_SoftSkill._fields)
    csv_writer.writeheader()

    for job_group_id in job_group_ids:
        try:
            soft_skills = api_client.get_match_via_soft_skills(rome=job_group_id)
            for skill_line in _create_skill_csv_lines(job_group_id, soft_skills):
                csv_writer.writerow(skill_line._asdict())
        except IOError as error:
            # Give up on the remaining job groups as soon as one call fails.
            logging.error(
                'Error while calling MatchviaSoftSkills API: %s\nJob group: %s',
                error, job_group_id)
            return
Exemplo n.º 2
0
def csv2dicts(rome_csv_pattern):
    """Build the ROME mobility records to import in MongoDB.

    We group the mobility data as JobGroups (we find a set of similar jobs
    either for a specific job or for a job group).

    To get all mobility data for a given job, you have to look both for the
    data for this job (keyed by OGR code) and for the data for its job group
    (keyed by ROME code). As OGR code and ROME code use different namespaces
    there's no conflict to use it directly with its key.

    Args:
        rome_csv_pattern: pattern of paths to CSV files containing the ROME
            data. It must contain a '{}' that will be replaced by
            'referentiel_code_rome', 'rubrique_mobilite' and
            'referentiel_appellation'.
    Returns:
        The result of dataframe2dicts applied to the mobility table once it
        has been enriched with target names and job samples.
    """

    mobility = pandas.read_csv(
        rome_csv_pattern.format('rubrique_mobilite'), dtype=str)
    job_groups = cleaned_data.rome_job_groups(
        filename=rome_csv_pattern.format('referentiel_code_rome'))

    jobs = cleaned_data.rome_jobs(
        filename=rome_csv_pattern.format('referentiel_appellation'))
    jobs.index.name = 'codeOgr'
    masculine_job_names, feminine_job_names = \
        rome_genderization.genderize(jobs.name)
    jobs['masculineName'] = masculine_job_names
    jobs['feminineName'] = feminine_job_names
    # Grab the names before the index is turned into a regular column below.
    jobs_names = jobs.name
    jobs.reset_index(inplace=True)
    jobs_samples = jobs.groupby('code_rome').apply(_sample_jobs(3))

    mobility = mobility.rename(columns={
        'code_rome': 'source_job_group',
        'code_appellation_source': 'source_job',
        'code_rome_cible': 'target_job_group',
        'code_appellation_cible': 'target_job',
        'code_type_mobilite': 'mobility_type',
    })

    # Missing values are filled by re-assigning the column rather than by
    # calling fillna(inplace=True) on a Series extracted from the frame: the
    # latter is a chained assignment that pandas warns about and that does not
    # update the frame once Copy-on-Write is enabled.
    mobility['target_job_group_name'] = \
        mobility.target_job_group.map(job_groups.name).fillna('')
    mobility['target_job_group_samples'] = \
        mobility.target_job_group.map(jobs_samples).fillna(False)
    mobility['target_job_name'] = \
        mobility.target_job.map(jobs_names).fillna('')
    mobility['target_job_masculine_name'] = \
        mobility.target_job.map(masculine_job_names).fillna('')
    mobility['target_job_feminine_name'] = \
        mobility.target_job.map(feminine_job_names).fillna('')

    return dataframe2dicts(mobility)
def get_data_zones(data_folder='data'):
    """Load the unverified data zones and normalize them in a single table.

    More info can be found in notebooks/research/evaluation/unverified_data_zones.ipynb

    Args:
        data_folder: root folder containing the 'unverified_data_zones' CSV
            files and the data read by cleaned_data.
    Returns:
        A DataFrame with the columns 'rome_id', 'postcode' and 'postcodes',
        without duplicated ('rome_id', 'postcodes') pairs.
    """

    def _load_rome_postcodes(relative_path):
        # Keep only the job group and postcode columns, with their final names.
        zones = pandas.read_csv(path.join(data_folder, relative_path))
        zones = zones[['rome', 'codepostal']]
        zones.columns = ['rome_id', 'postcode']
        return zones

    blocked_postcodes_path = path.join(
        data_folder, 'unverified_data_zones/Code_Postaux_Bloques.csv')

    data_zones = pandas.concat([
        _load_rome_postcodes('unverified_data_zones/assign_mm_ale0.csv'),
        _load_rome_postcodes('unverified_data_zones/assign_mm_aleNOT0.csv'),
    ])

    blocked_postcodes = pandas.read_csv(blocked_postcodes_path)
    blocked_postcodes.columns = ['postcodes', 'postcode']

    # Map each individual zip code to the full '-' separated range of its
    # city, ignoring three cities (presumably Marseille, Paris and Lyon --
    # TODO confirm).
    excluded_city_ids = ['13055', '75056', '69123']
    city_stats = cleaned_data.french_city_stats(data_folder)
    city_stats = city_stats[~city_stats.city_id.isin(excluded_city_ids)]
    postcode_to_range_mapping = {
        zip_code: zip_codes
        for zip_codes in city_stats.zipCode
        for zip_code in zip_codes.split('-')
    }

    # Blocked postcodes apply to every job group: pair each of them with each
    # ROME ID by merging on a constant key.
    rome_ids = cleaned_data.rome_job_groups(data_folder).reset_index()
    rome_ids = rome_ids.assign(merge_id=1)[['code_rome', 'merge_id']]
    rome_ids.columns = ['rome_id', 'merge_id']
    all_pairs = pandas.merge(
        blocked_postcodes.assign(merge_id=1), rome_ids,
        how='outer', on=['merge_id'])

    data_zones = pandas.concat([data_zones, all_pairs[['rome_id', 'postcode']]])

    # Postcodes are left-padded with zeros to 5 characters before the lookup.
    padded_postcodes = data_zones.postcode.astype(str).str.pad(
        width=5, side='left', fillchar='0')
    data_zones['postcodes'] = padded_postcodes.map(postcode_to_range_mapping)
    return data_zones.drop_duplicates(['rome_id', 'postcodes'])
Exemplo n.º 4
0
def make_dicts(
        rome_csv_pattern,
        job_requirements_json,
        job_application_complexity_json,
        application_mode_csv,
        rome_fap_crosswalk_txt,
        handcrafted_assets_airtable,
        domains_airtable,
        info_by_prefix_airtable,
        fap_growth_2012_2022_csv):
    """Build the job group records to import in MongoDB.

    Args:
        rome_csv_pattern: pattern of paths to CSV file containing the ROME data.
            It must contain a '{}' that will be replaced by
            'referentiel_code_rome', 'referentiel_appellation',
            'referentiel_code_rome_riasec', 'texte', 'liens_rome_referentiels',
            'referentiel_env_travail', 'coherence_item' and
            'referentiel_competence'.
        job_requirements_json: path to a JSON file containing requirements per
            job group.
        job_application_complexity_json: path to a JSON file containing the
            application complexity of each job group.
        application_mode_csv: path to a CSV file containing the application mode
            data from emploi-store-dev API.
        rome_fap_crosswalk_txt: path to a TXT file containing the crosswalk
            from FAP codes to ROME job group codes.
        handcrafted_assets_airtable: the base ID and the table named joined by
            a ':' of the AirTable containing the advice per job group (short
            texts describing assets required).
        domains_airtable: the base ID and the table name joined by a ':' of the
            AirTable containing the domain name for each sector.
        info_by_prefix_airtable: the base ID and the table name joined by a ':'
            of the AirTable containing some manually specified info for group of
            job group (by ROME ID prefix).
        fap_growth_2012_2022_csv: path to a CSV file containing the growth of
            FAP job groups for the period 2012-2022.
    Returns:
        A list of dict that maps the JSON representation of JobGroup protos.
    """

    job_groups = cleaned_data.rome_job_groups(
        filename=rome_csv_pattern.format('referentiel_code_rome'))
    jobs = cleaned_data.rome_jobs(
        filename=rome_csv_pattern.format('referentiel_appellation'))
    holland_codes = cleaned_data.rome_holland_codes(
        filename=rome_csv_pattern.format('referentiel_code_rome_riasec'))
    rome_texts = cleaned_data.rome_texts(
        filename=rome_csv_pattern.format('texte'))
    rome_work_environments = cleaned_data.rome_work_environments(
        links_filename=rome_csv_pattern.format('liens_rome_referentiels'),
        ref_filename=rome_csv_pattern.format('referentiel_env_travail'))
    handcrafted_assets = _load_assets_from_airtable(*handcrafted_assets_airtable.split(':'))
    sector_domains = _load_domains_from_airtable(*domains_airtable.split(':'))
    info_by_prefix = _load_prefix_info_from_airtable(
        job_groups.index, *info_by_prefix_airtable.split(':'))
    application_modes = _get_application_modes(
        application_mode_csv, rome_fap_crosswalk_txt)
    fap_growth_2012_2022 = pandas.read_csv(fap_growth_2012_2022_csv)

    # Genderize names.
    masculine, feminine = rome_genderization.genderize(jobs.name)
    jobs['masculineName'] = masculine
    jobs['feminineName'] = feminine

    # List jobs and pick samples.
    jobs.index.name = 'codeOgr'
    jobs.reset_index(inplace=True)
    jobs_grouped = jobs.groupby('code_rome')
    job_groups['samples'] = jobs_grouped.apply(_create_jobs_sampler(3))
    # Job groups without any job get NaN from the alignment: use [] instead.
    job_groups['samples'] = job_groups.samples.apply(
        lambda s: s if isinstance(s, list) else [])
    job_groups['jobs'] = jobs_grouped.apply(_create_jobs_sampler(None))
    job_groups['jobs'] = job_groups.jobs.apply(
        lambda s: s if isinstance(s, list) else [])

    # Add info by prefix.
    job_groups = job_groups.join(info_by_prefix)

    # Add skills.
    rome_to_skills = cleaned_data.rome_to_skills(
        filename_items=rome_csv_pattern.format('coherence_item'),
        filename_skills=rome_csv_pattern.format('referentiel_competence'))
    skills_grouped = rome_to_skills.groupby('code_rome')
    job_groups['requirements'] = skills_grouped.apply(
        _group_skills_as_proto_list)
    # Replace NaN by empty dicts.
    job_groups['requirements'] = job_groups.requirements.apply(
        lambda r: r if isinstance(r, dict) else {})

    # Combine requirements from json file.
    # The encoding is explicit so that parsing does not depend on the locale.
    with open(job_requirements_json, encoding='utf-8') as job_requirements_file:
        job_requirements_list = json.load(job_requirements_file)
        job_requirements_dict = {
            job_requirement.pop('_id'): job_requirement
            for job_requirement in job_requirements_list}
    # The requirements dicts are updated in place through the row tuples.
    for job_group in job_groups.itertuples():
        job_group.requirements.update(
            job_requirements_dict.get(job_group.Index, {}))

    # Combine requirements from AirTable.
    for job_group in job_groups.itertuples():
        job_group.requirements.update(handcrafted_assets.get(job_group.Index, {}))

    # From here on, missing values are filled by re-assigning the column once
    # it has been aligned on the job groups index. Calling fillna(inplace=True)
    # on a column extracted from the frame is a chained assignment that pandas
    # warns about and that does not update the frame with Copy-on-Write.
    application_complexity = pandas.read_json(job_application_complexity_json)
    application_complexity.set_index('_id', inplace=True)
    job_groups['applicationComplexity'] = application_complexity['applicationComplexity']
    job_groups['applicationComplexity'] = job_groups.applicationComplexity.fillna(
        'UNKNOWN_APPLICATION_COMPLEXITY')

    # Add Hollande Code https://en.wikipedia.org/wiki/Holland_Codes.
    # Will later be used for job similarity measures.
    job_groups['hollandCodeMajor'] = holland_codes.major
    job_groups['hollandCodeMajor'] = job_groups.hollandCodeMajor.fillna('')
    job_groups['hollandCodeMinor'] = holland_codes.minor
    job_groups['hollandCodeMinor'] = job_groups.hollandCodeMinor.fillna('')

    # Add description, working environment and requirement as text.
    job_groups['description'] = rome_texts.definition
    job_groups['description'] = job_groups.description.fillna('')
    job_groups['workingEnvironment'] = rome_texts.working_environment
    job_groups['workingEnvironment'] = job_groups.workingEnvironment.fillna('')
    job_groups['requirementsText'] = rome_texts.requirements
    job_groups['requirementsText'] = job_groups.requirementsText.fillna('')

    # Add work environment items.
    rome_work_environments['domain'] = rome_work_environments['name'].map(sector_domains)
    job_groups['workEnvironmentKeywords'] = \
        rome_work_environments.groupby('code_rome').apply(_group_work_environment_items)
    # Fill NaN with empty {}.
    job_groups['workEnvironmentKeywords'] = job_groups.workEnvironmentKeywords.apply(
        lambda k: k if isinstance(k, dict) else {})

    # Add application modes.
    job_groups['applicationModes'] = application_modes
    job_groups['applicationModes'] = job_groups.applicationModes.apply(
        lambda m: m if isinstance(m, dict) else {})

    # Add growth for the 2012-2022 period.
    job_groups['growth20122022'] = _get_growth_2012_2022(
        fap_growth_2012_2022, rome_fap_crosswalk_txt)
    # A real growth of exactly 0 is nudged to a tiny value so that it stays
    # distinguishable from the 0 used just below for missing data.
    job_groups.loc[job_groups.growth20122022 == 0, 'growth20122022'] = .000001
    job_groups['growth20122022'] = job_groups['growth20122022'].fillna(0)

    # Set index as field.
    job_groups.index.name = 'romeId'
    job_groups.reset_index(inplace=True)
    job_groups['_id'] = job_groups['romeId']

    return job_groups.to_dict('records')
Exemplo n.º 5
0
def make_dicts(
        rome_csv_pattern: str,
        job_requirements_json: str,
        job_application_complexity_json: str,
        application_mode_csv: str,
        rome_fap_crosswalk_txt: str,
        handcrafted_assets_airtable: str,
        domains_airtable: str,
        strict_diplomas_airtable: str,
        info_by_prefix_airtable: str,
        fap_growth_2012_2022_csv: str,
        imt_market_score_csv: str,
        jobboards_airtable: Optional[str] = None,
        skills_for_future_airtable: Optional[str] = None,
        specific_to_job_airtable: Optional[str] = None) \
        -> List[Dict[str, Any]]:
    """Build the job group records to import in MongoDB.

    Args:
        rome_csv_pattern: pattern of paths to CSV file containing the ROME data.
            It must contain a '{}' that will be replaced by
            'referentiel_code_rome', 'referentiel_appellation',
            'referentiel_code_rome_riasec', 'texte', 'liens_rome_referentiels'
            and 'referentiel_env_travail'.
        job_requirements_json: path to a JSON file containing requirements per
            job group.
        job_application_complexity_json: path to a JSON file containing the
            application complexity of each job group.
        application_mode_csv: path to a CSV file containing the application mode
            data from emploi-store-dev API.
        rome_fap_crosswalk_txt: path to a TXT file containing the crosswalk
            from FAP codes to ROME job group codes.
        handcrafted_assets_airtable: the base ID and the table named joined by
            a ':' of the AirTable containing the advice per job group (short
            texts describing assets required).
        domains_airtable: the base ID and the table name joined by a ':' of the
            AirTable containing the domain name for each sector.
        strict_diplomas_airtable: the base ID and the table name joined by a ':' of the
            AirTable which tells if a diploma is strictly required.
        info_by_prefix_airtable: the base ID and the table name joined by a ':'
            of the AirTable containing some manually specified info for group of
            job group (by ROME ID prefix).
        fap_growth_2012_2022_csv: path to a CSV file containing the growth of
            FAP job groups for the period 2012-2022.
        imt_market_score_csv: path to a CSV containing market score info from IMT.
        jobboards_airtable: the base ID and the table name joined by a ':' of the Airtable of the
            job boards.
        skills_for_future_airtable: the base ID and the table name joined by a ':' of the Airtable
            of the skills for the future.
        specific_to_job_airtable: the base ID and the table name joined by a ':' of the Airtable
            of the specific to job pieces advice.
    Returns:
        A list of dict that maps the JSON representation of JobGroup protos.
    """

    job_groups = cleaned_data.rome_job_groups(
        filename=rome_csv_pattern.format('referentiel_code_rome'))
    jobs = cleaned_data.rome_jobs(
        filename=rome_csv_pattern.format('referentiel_appellation'))
    holland_codes = cleaned_data.rome_holland_codes(
        filename=rome_csv_pattern.format('referentiel_code_rome_riasec'))
    rome_texts = cleaned_data.rome_texts(
        filename=rome_csv_pattern.format('texte'))
    rome_work_environments = cleaned_data.rome_work_environments(
        links_filename=rome_csv_pattern.format('liens_rome_referentiels'),
        ref_filename=rome_csv_pattern.format('referentiel_env_travail'))
    handcrafted_assets = _load_assets_from_airtable(
        *handcrafted_assets_airtable.split(':'))
    sector_domains = _load_domains_from_airtable(*domains_airtable.split(':'))
    info_by_prefix = _load_prefix_info_from_airtable(
        job_groups.index, *info_by_prefix_airtable.split(':'))
    application_modes = _get_application_modes(application_mode_csv,
                                               rome_fap_crosswalk_txt)
    fap_growth_2012_2022 = pandas.read_csv(fap_growth_2012_2022_csv)
    jobboards_by_rome = _load_items_from_airtable('JobBoard', job_groups.index,
                                                  jobboards_airtable,
                                                  'for-job-group')
    skills_for_future_by_rome = _load_items_from_airtable(
        'Skill', job_groups.index, skills_for_future_airtable, 'rome_prefixes')
    specific_to_job_by_rome = _load_items_from_airtable(
        'DynamicAdvice', job_groups.index, specific_to_job_airtable,
        'for-job-group')
    users_highest_degrees = _load_highest_degrees_from_mongo()

    # Genderize names.
    masculine, feminine = rome_genderization.genderize(jobs.name)
    jobs['masculineName'] = masculine
    jobs['feminineName'] = feminine

    # List jobs and pick samples.
    jobs.index.name = 'codeOgr'
    jobs.reset_index(inplace=True)
    jobs_grouped = jobs.groupby('code_rome')
    job_groups['samples'] = jobs_grouped.apply(_create_jobs_sampler(3))
    # Job groups without any job get NaN from the alignment: use [] instead.
    job_groups['samples'] = job_groups.samples.apply(
        lambda s: s if isinstance(s, list) else [])
    job_groups['jobs'] = jobs_grouped.apply(_create_jobs_sampler(None))
    job_groups['jobs'] = job_groups.jobs.apply(
        lambda s: s if isinstance(s, list) else [])

    # Add info by prefix.
    job_groups = job_groups.join(info_by_prefix)

    # Combine requirements from json file.
    # The encoding is explicit so that parsing does not depend on the locale.
    with open(job_requirements_json, encoding='utf-8') as job_requirements_file:
        job_requirements_list = json.load(job_requirements_file)
        job_requirements_dict = {
            job_requirement.pop('_id'): job_requirement
            for job_requirement in job_requirements_list
        }
    job_groups['requirements'] = job_groups.index.map(job_requirements_dict)
    # Replace NaN by empty dicts.
    job_groups['requirements'] = job_groups.requirements.apply(
        lambda r: r if isinstance(r, dict) else {})

    # Combine requirements from AirTable.
    # The requirements dicts are updated in place through the row tuples.
    for job_group in job_groups.itertuples():
        job_group.requirements.update(
            handcrafted_assets.get(job_group.Index, {}))

    # From here on, missing values are filled by re-assigning the column once
    # it has been aligned on the job groups index. Calling fillna(inplace=True)
    # on a column extracted from the frame is a chained assignment that pandas
    # warns about and that does not update the frame with Copy-on-Write.
    application_complexity = pandas.read_json(job_application_complexity_json)
    application_complexity.set_index('_id', inplace=True)
    job_groups['applicationComplexity'] = application_complexity[
        'applicationComplexity']
    job_groups['applicationComplexity'] = job_groups.applicationComplexity.fillna(
        'UNKNOWN_APPLICATION_COMPLEXITY')

    # Add Hollande Code https://en.wikipedia.org/wiki/Holland_Codes.
    # Will later be used for job similarity measures.
    job_groups['hollandCodeMajor'] = holland_codes.major
    job_groups['hollandCodeMajor'] = job_groups.hollandCodeMajor.fillna('')
    job_groups['hollandCodeMinor'] = holland_codes.minor
    job_groups['hollandCodeMinor'] = job_groups.hollandCodeMinor.fillna('')

    # Add description, working environment and requirement as text.
    job_groups['description'] = rome_texts.definition
    job_groups['description'] = job_groups.description.fillna('')
    job_groups['workingEnvironment'] = rome_texts.working_environment
    job_groups['workingEnvironment'] = job_groups.workingEnvironment.fillna('')
    job_groups['requirementsText'] = rome_texts.requirements
    job_groups['requirementsText'] = job_groups.requirementsText.fillna('')

    # Add work environment items.
    rome_work_environments['domain'] = rome_work_environments['name'].map(
        sector_domains)
    job_groups['workEnvironmentKeywords'] = \
        rome_work_environments.groupby('code_rome').apply(_group_work_environment_items)
    # Fill NaN with empty {}.
    job_groups['workEnvironmentKeywords'] = job_groups.workEnvironmentKeywords.apply(
        lambda k: k if isinstance(k, dict) else {})

    # Add application modes.
    job_groups['applicationModes'] = application_modes
    job_groups['applicationModes'] = job_groups.applicationModes.apply(
        lambda m: m if isinstance(m, dict) else {})

    # Add growth for the 2012-2022 period.
    job_groups['growth20122022'] = _get_growth_2012_2022(
        fap_growth_2012_2022, rome_fap_crosswalk_txt)
    # A real growth of exactly 0 is nudged to a tiny value so that it stays
    # distinguishable from the 0 used just below for missing data.
    job_groups.loc[job_groups.growth20122022 == 0, 'growth20122022'] = .000001
    job_groups['growth20122022'] = job_groups['growth20122022'].fillna(0)

    # Add best departements.
    job_groups['departementScores'] = _get_less_stressful_departements_count(
        imt_market_score_csv)
    # Fill NaN with empty [].
    job_groups['departementScores'] = job_groups.departementScores.apply(
        lambda s: s if isinstance(s, list) else [])
    # Only the first 11 entries of the scores are kept as best departements.
    job_groups['bestDepartements'] = job_groups.departementScores.apply(
        lambda ds: ds[:11])

    # Add national market score.
    job_groups['nationalMarketScore'] = _get_national_market_scores(
        imt_market_score_csv)
    job_groups['nationalMarketScore'] = job_groups['nationalMarketScore'].fillna(0)

    # Add diploma requirements.
    job_groups['is_diploma_strictly_required'] = _load_strict_diplomas_from_airtable(
        *strict_diplomas_airtable.split(':'))
    job_groups['is_diploma_strictly_required'] = \
        job_groups['is_diploma_strictly_required'].fillna(False)

    # Add job_boards.
    if jobboards_by_rome:
        job_groups['jobBoards'] = job_groups.index.map(jobboards_by_rome)

    # Add skills for the future.
    if skills_for_future_by_rome:
        job_groups['skillsForFuture'] = job_groups.index.map(
            skills_for_future_by_rome)

    # Add specific to job advice.
    if specific_to_job_by_rome:
        job_groups['specificAdvice'] = job_groups.index.map(
            specific_to_job_by_rome)

    # Add highest degree counts from user base.
    if users_highest_degrees is not None:
        job_groups['userDegrees'] = users_highest_degrees
        # Fill NaN with empty [].
        job_groups['userDegrees'] = job_groups.userDegrees.apply(
            lambda d: d if isinstance(d, list) else [])

    # Set index as field.
    job_groups.index.name = 'romeId'
    job_groups.reset_index(inplace=True)
    job_groups['_id'] = job_groups['romeId']

    return typing.cast(List[Dict[str, Any]], job_groups.to_dict('records'))
Exemplo n.º 6
0
def make_dicts(
        *,
        rome_csv_pattern: str,
        application_mode_csv: Optional[str] = None,
        brookings_json: Optional[str] = None,
        domains_airtable: Optional[str] = None,
        fap_growth_2012_2022_csv: Optional[str] = None,
        handcrafted_assets_airtable: Optional[str] = None,
        imt_market_score_csv: Optional[str] = None,
        info_by_prefix_airtable: Optional[str] = None,
        jobboards_airtable: Optional[str] = None,
        job_application_complexity_json: Optional[str] = None,
        job_requirements_json: Optional[str] = None,
        rome_fap_crosswalk_txt: Optional[str] = None,
        rome_isco_crosswalk_xlsx: Optional[str] = None,
        skills_for_future_airtable: Optional[str] = None,
        soc_2010_xls: Optional[str] = None,
        soc_isco_crosswalk_xls: Optional[str] = None,
        specific_to_job_airtable: Optional[str] = None,
        strict_diplomas_airtable: Optional[str] = None,
        trainings_csv: Optional[str] = None,
        sampler_generator: Callable[[Optional[int]], Sampler] = _create_jobs_sampler) \
        -> list[dict[str, Any]]:
    """Import job info in MongoDB.

    Args:
        rome_csv_pattern: pattern of paths to CSV file containing the ROME data.
            It must contain a '{}' that will be replaced by
            'referentiel_code_rome', 'referentiel_env_travail',
            'liens_rome_referentiels' and 'referentiel_appellation'.
        job_requirements_json: path to a JSON file containing requirements per
            job group.
                For `requirements`.
        job_application_complexity_json: path to a JSON file containing the
            application complexity of each job group.
                For `application_complexity`.
        application_mode_csv: path to a CSV file containing the application mode
            data from emploi-store-dev API.
                For `application_modes`.
        rome_fap_crosswalk_txt: path to a TXT file containing the crosswalk
            from FAP codes to ROME job group codes.
                For `application_modes`, `growth_2012_2022`.
        handcrafted_assets_airtable: the base ID and the table named joined by
            a ':' of the AirTable containing the advice per job group (short
            texts describing assets required).
                For `requirements`.
        domains_airtable: the base ID and the table name joined by a ':' of the
            AirTable containing the domain name for each sector.
                For `work_environment_keywords`.
        strict_diplomas_airtable: the base ID and the table name joined by a ':' of the
            AirTable which tells if a diploma is strictly required.
                For `is_diploma_strictly_required`.
        info_by_prefix_airtable: the base ID and the table name joined by a ':'
                For `covidRisk`, `domain`, `hasFreelancers`, `inAWorkplace`, `inDomain`,
                `likeYourWorkplace`, `placePlural`, `preferredApplicationMedium`, `whatILoveAbout`,
                `toTheWorkplace`, `whySpecificCompany`, `atVariousCompanies`,
                `whatILoveAboutFeminine`.
            of the AirTable containing some manually specified info for group of
            job group (by ROME ID prefix).
        fap_growth_2012_2022_csv: path to a CSV file containing the growth of
            FAP job groups for the period 2012-2022.
                For `growth_2012_2022`.
        imt_market_score_csv: path to a CSV containing market score info from IMT.
                For `best_departements`, `departement_scores`, `nationam_market_score`.
        jobboards_airtable: the base ID and the table name joined by a ':' of the Airtable of the
            job boards.
                For `job_boards`.
        skills_for_future_airtable: the base ID and the table name joined by a ':' of the Airtable
            of the skills for the future.
                For `skills_for_future`.
        specific_to_job_airtable: the base ID and the table name joined by a ':' of the Airtable
            of the specific to job pieces advice.
                For `specific_advice`
        brookings_json: path to a JSON file with data from Brookings report for automation risk.
                For `automation_risk`.
        soc_2010_xls: path to an XLS file with the names of US SOC 2010 groups.
                For `automation_risk`.
        soc_isco_crosswalk_xls: path to an XLS file of the crosswalk btw US SOC 2010 and ISCO-08.
                For `automation_risk`.
        rome_isco_crosswalk_xlsx: path to an XLSX file of the crosswalk btw ROME and ISCO-08.
                For `automation_risk`.
        trainings_csv: path to a CSV with trainings data.
                For `training_count`.
    Returns:
        A list of dict that maps the JSON representation of JobGroup protos.
    """

    job_groups = cleaned_data.rome_job_groups(
        filename=rome_csv_pattern.format('referentiel_code_rome'))
    jobs = cleaned_data.rome_jobs(
        filename=rome_csv_pattern.format('referentiel_appellation'))
    holland_codes = cleaned_data.rome_holland_codes(
        filename=rome_csv_pattern.format('referentiel_code_rome_riasec'))
    rome_texts = cleaned_data.rome_texts(
        filename=rome_csv_pattern.format('texte'))
    rome_work_environments = cleaned_data.rome_work_environments(
        links_filename=rome_csv_pattern.format('liens_rome_referentiels'),
        ref_filename=rome_csv_pattern.format('referentiel_env_travail'))
    handcrafted_assets = _load_assets_from_airtable(*handcrafted_assets_airtable.split(':')) \
        if handcrafted_assets_airtable else {}
    sector_domains = _load_domains_from_airtable(*domains_airtable.split(':')) \
        if domains_airtable else {}
    info_by_prefix = _load_prefix_info_from_airtable(job_groups.index, info_by_prefix_airtable) \
        if info_by_prefix_airtable else None
    application_modes = _get_application_modes(
        application_mode_csv, rome_fap_crosswalk_txt,
    ) if application_mode_csv and rome_fap_crosswalk_txt else None
    fap_growth_2012_2022 = pandas.read_csv(fap_growth_2012_2022_csv) if fap_growth_2012_2022_csv \
        else None
    jobboards_by_rome = airtable_to_protos.load_items_from_prefix(
        'JobBoard', job_groups.index, jobboards_airtable, 'for-job-group',
    ) if jobboards_airtable else None
    skills_for_future_by_rome = airtable_to_protos.load_items_from_prefix(
        'Skill', job_groups.index, skills_for_future_airtable, 'rome_prefixes',
    ) if skills_for_future_airtable else None
    specific_to_job_by_rome = airtable_to_protos.load_items_from_prefix(
        'DynamicAdvice', job_groups.index, specific_to_job_airtable, 'fr:for-job-group',
    ) if specific_to_job_airtable else None
    users_highest_degrees = _load_highest_degrees_from_mongo()

    # Genderize names.
    masculine, feminine = rome_genderization.genderize(jobs.name)
    jobs['masculineName'] = masculine
    jobs['feminineName'] = feminine

    # List jobs and pick samples.
    jobs.index.name = 'codeOgr'
    jobs.reset_index(inplace=True)
    jobs_grouped = jobs.groupby('code_rome')
    job_groups['samples'] = jobs_grouped.apply(sampler_generator(3))
    job_groups['samples'] = job_groups.samples.apply(
        lambda s: s if isinstance(s, list) else [])
    job_groups['jobs'] = jobs_grouped.apply(sampler_generator(None))
    job_groups['jobs'] = job_groups.jobs.apply(
        lambda s: s if isinstance(s, list) else [])

    # Add info by prefix.
    if info_by_prefix is not None:
        job_groups = job_groups.join(info_by_prefix)

    # Combine requirements from json file.
    if job_requirements_json:
        with open(job_requirements_json, encoding='utf-8') as job_requirements_file:
            job_requirements_list = json.load(job_requirements_file)
            job_requirements_dict = {
                job_requirement.pop('_id'): job_requirement
                for job_requirement in job_requirements_list}
        job_groups['requirements'] = job_groups.index.map(job_requirements_dict)
        # Replace NaN by empty dicts.
        job_groups['requirements'] = job_groups.requirements.apply(
            lambda r: r if isinstance(r, dict) else {})

        # Combine requirements from AirTable.
        if handcrafted_assets:
            for job_group in job_groups.itertuples():
                job_group.requirements.update(handcrafted_assets.get(job_group.Index, {}))

    if job_application_complexity_json:
        application_complexity = pandas.read_json(job_application_complexity_json)
        application_complexity.set_index('_id', inplace=True)
        job_groups['applicationComplexity'] = application_complexity['applicationComplexity']
        job_groups.applicationComplexity.fillna('UNKNOWN_APPLICATION_COMPLEXITY', inplace=True)

    # Add Hollande Code https://en.wikipedia.org/wiki/Holland_Codes.
    # Will later be used for job similarity measures.
    job_groups['hollandCodeMajor'] = holland_codes.major
    job_groups.hollandCodeMajor.fillna('', inplace=True)
    job_groups['hollandCodeMinor'] = holland_codes.minor
    job_groups.hollandCodeMinor.fillna('', inplace=True)

    # Add description, working environment and requirement as text.
    job_groups['description'] = rome_texts.definition
    job_groups.description.fillna('', inplace=True)
    job_groups['workingEnvironment'] = rome_texts.working_environment
    job_groups.workingEnvironment.fillna('', inplace=True)
    job_groups['requirementsText'] = rome_texts.requirements
    job_groups.requirementsText.fillna('', inplace=True)

    # Add work environment items.
    if sector_domains:
        rome_work_environments['domain'] = rome_work_environments['name'].map(sector_domains)
        job_groups['workEnvironmentKeywords'] = \
            rome_work_environments.groupby('code_rome').apply(_group_work_environment_items)
        # Fill NaN with empty {}.
        job_groups['workEnvironmentKeywords'] = job_groups.workEnvironmentKeywords.apply(
            lambda k: k if isinstance(k, dict) else {})

    # Add application modes.
    if application_modes is not None:
        job_groups['applicationModes'] = application_modes
        job_groups['applicationModes'] = job_groups.applicationModes.apply(
            lambda m: m if isinstance(m, dict) else {})

    # Add growth for the 2012-2022 period.
    if fap_growth_2012_2022 is not None and rome_fap_crosswalk_txt:
        job_groups['growth20122022'] = _get_growth_2012_2022(
            fap_growth_2012_2022, rome_fap_crosswalk_txt)
        job_groups.loc[job_groups.growth20122022 == 0, 'growth20122022'] = .000001
        job_groups['growth20122022'].fillna(0, inplace=True)

    # Add automation risk.
    if brookings_json and soc_2010_xls and soc_isco_crosswalk_xls and rome_isco_crosswalk_xlsx:
        job_groups['automationRisk'] = _get_automation_risk(
            brookings_json=brookings_json,
            soc_2010_xls=soc_2010_xls,
            soc_isco_crosswalk_xls=soc_isco_crosswalk_xls,
            rome_isco_crosswalk_xlsx=rome_isco_crosswalk_xlsx,
        ).mul(100).round(0).astype(int)
        # Mark 0 values as 1, as 0 means undefined.
        job_groups.loc[job_groups['automationRisk'] == 0, 'automationRisk'] = 1
        job_groups['automationRisk'].fillna(0, inplace=True)

    # Add best departements.
    if imt_market_score_csv:
        market_scores = cleaned_data.market_scores(filename=imt_market_score_csv)
        market_scores = market_scores[market_scores.AREA_TYPE_CODE == 'D'].\
            reset_index().\
            drop([
                'market_score',
                'yearly_avg_offers_denominator',
                'AREA_TYPE_CODE',
            ], axis='columns').\
            rename({
                'departement_id': 'district_id',
                'rome_id': 'job_group',
                'yearly_avg_offers_per_10_candidates': 'market_score',
            }, axis='columns')
        job_groups['departementScores'] = market_score_derivatives.get_less_stressful_districts(
            market_scores)
        # Fill NaN with empty [].
        job_groups['departementScores'] = job_groups.departementScores.apply(
            lambda s: s if isinstance(s, list) else [])
        # TODO(cyrille): Drop this once we're sure it's no longer used in the server.
        job_groups['bestDepartements'] = job_groups.departementScores.apply(lambda ds: ds[:11])

        # Add national market score.
        # TODO(cyrille): Add this in market_score_derivatives.
        job_groups['nationalMarketScore'] = _get_national_market_scores(imt_market_score_csv)
        job_groups['nationalMarketScore'].fillna(0, inplace=True)

    # Add diploma requirements.
    if strict_diplomas_airtable:
        job_groups['is_diploma_strictly_required'] = _load_strict_diplomas_from_airtable(
            *strict_diplomas_airtable.split(':'))
        job_groups['is_diploma_strictly_required'].fillna(False, inplace=True)

    # Add job_boards.
    if jobboards_by_rome:
        job_groups['jobBoards'] = job_groups.index.map(jobboards_by_rome)

    # Add skills for the future.
    if skills_for_future_by_rome:
        job_groups['skillsForFuture'] = job_groups.index.map(skills_for_future_by_rome)

    # Add job-specific advice.
    if specific_to_job_by_rome:
        job_groups['specificAdvice'] = job_groups.index.map(specific_to_job_by_rome)

    # Add highest degree counts from user base.
    if users_highest_degrees is not None:
        job_groups['userDegrees'] = users_highest_degrees
        # Fill NaN with empty [].
        job_groups['userDegrees'] = job_groups.userDegrees.apply(
            lambda d: d if isinstance(d, list) else [])

    # Add training data.
    if trainings_csv:
        trainings = pandas.read_csv(trainings_csv)
        job_groups['trainingCount'] = trainings.groupby('formation.proximiteRomes.code')\
            .apply(_count_trainings)
        job_groups['trainingCount'] = job_groups.trainingCount.apply(
            lambda counts: counts if isinstance(counts, dict) else {})

    # Add no-requirement flag.
    job_groups['hasAnyRequirements'] = cleaned_data.jobs_without_qualifications(
        filename=rome_csv_pattern.format('item_arborescence'))\
        .no_requirements.map(lambda unused: 'FALSE')
    job_groups['hasAnyRequirements'].fillna('TRUE', inplace=True)

    # Set index as field.
    job_groups.index.name = 'romeId'
    job_groups.reset_index(inplace=True)
    job_groups['_id'] = job_groups['romeId']

    return typing.cast(list[dict[str, Any]], job_groups.to_dict('records'))