Exemplo n.º 1
0
 def _genderize_lists(self, names):
     """Run genderize on a plain list of names and return plain lists.

     The names are wrapped in a pandas Series before calling genderize, and
     each returned Series is checked to have as many items as the input.

     Args:
         names: a list of job names.
     Returns:
         A (masculine, feminine) pair of lists.
     """
     names_series = pandas.Series(names)
     genderized = rome_genderization.genderize(names_series)
     masculine_names, feminine_names = genderized
     expected_count = len(names)
     self.assertEqual(len(masculine_names), expected_count)
     self.assertEqual(len(feminine_names), expected_count)
     return masculine_names.tolist(), feminine_names.tolist()
Exemplo n.º 2
0
def csv2dicts(rome_csv_pattern):
    """Build the ROME mobility records to import in MongoDB.

    We group the mobility data as JobGroups (we find a set of similar jobs
    either for a specific job or for a job group).

    To get all mobility data for a given job, you have to look both for the
    data for this job (keyed by OGR code) and for the data for its job group
    (keyed by ROME code). As OGR code and ROME code use different namespaces
    there's no conflict to use it directly with its key.

    Args:
        rome_csv_pattern: pattern of paths to CSV files containing the ROME
            data. It must contain a '%s' that will be replaced by
            'referentiel_code_rome', 'rubrique_mobilite' and
            'referentiel_appellation'.
    Returns:
        The result of dataframe2dicts applied to the mobility table once it
        has been enriched with target names and samples.
    """
    mobility = pandas.read_csv(rome_csv_pattern % 'rubrique_mobilite',
                               dtype=str)
    job_groups = cleaned_data.rome_job_groups(filename=rome_csv_pattern %
                                              'referentiel_code_rome')

    jobs = cleaned_data.rome_jobs(filename=rome_csv_pattern %
                                  'referentiel_appellation')
    jobs.index.name = 'codeOgr'
    masculine_job_names, feminine_job_names = (rome_genderization.genderize(
        jobs.name))
    jobs['masculineName'] = masculine_job_names
    jobs['feminineName'] = feminine_job_names
    # Keep a handle on the names while they still carry the 'codeOgr' index:
    # the index is turned into a regular column just below.
    jobs_names = jobs.name
    jobs.reset_index(inplace=True)
    jobs_samples = jobs.groupby('code_rome').apply(_sample_jobs(3))

    mobility = mobility.rename(columns={
        'code_rome': 'source_job_group',
        'code_appellation_source': 'source_job',
        'code_rome_cible': 'target_job_group',
        'code_appellation_cible': 'target_job',
    })

    # Missing values are filled through explicit column assignment: calling
    # fillna(inplace=True) on a column accessed from the frame acts on a
    # temporary object and does not update the frame under pandas
    # Copy-on-Write.
    mobility['target_job_group_name'] = (
        mobility.target_job_group.map(job_groups.name).fillna(''))
    mobility['target_job_group_samples'] = (
        mobility.target_job_group.map(jobs_samples).fillna(False))
    target_job_columns = (
        ('target_job_name', jobs_names),
        ('target_job_masculine_name', masculine_job_names),
        ('target_job_feminine_name', feminine_job_names),
    )
    for column, names_by_job in target_job_columns:
        mobility[column] = mobility.target_job.map(names_by_job).fillna('')

    return dataframe2dicts(mobility)
Exemplo n.º 3
0
def main(rome_appellation_csv, output_txt):
    """Sample one ROME job per job group and write their names to a file.

    Args:
        rome_appellation_csv: path to a CSV file containing all ROME jobs.
        output_txt: path where to create the output txt file. It will get
            populated with a list of masculine job names, one per line.
    """
    jobs = cleaned_data.rome_jobs(filename=rome_appellation_csv)
    # Randomly pick a single job in each job group.
    samples = jobs.groupby('code_rome').apply(lambda d: d.sample(1))
    names, unused_ = rome_genderization.genderize(samples.name)
    # Job names are free text that may contain non-ASCII characters: set the
    # encoding explicitly so that the output does not depend on the locale of
    # the machine running the script.
    with open(output_txt, 'w', encoding='utf-8') as output:
        output.write('\n'.join(names.tolist()) + '\n')
Exemplo n.º 4
0
def make_dicts(rome_csv_pattern, job_requirements_json,
               job_application_complexity_json, handcrafted_assets_airtable):
    """Build the job group records to import in MongoDB.

    Args:
        rome_csv_pattern: pattern of paths to CSV file containing the ROME data.
            It must contain a '%s' that will be replaced by the name of each
            ROME table to load, e.g. 'referentiel_code_rome',
            'referentiel_env_travail', 'liens_rome_referentiels' and
            'referentiel_appellation'.
        job_requirements_json: path to a JSON file containing requirements per
            job group.
        job_application_complexity_json: path to a JSON file containing the
            application complexity of each job group.
        handcrafted_assets_airtable: the base ID and the table name joined by
            a ':' of the AirTable containing the advice per job group (short
            texts describing assets required).
    Returns:
        A list of dict that maps the JSON representation of JobGroup protos.
    """
    job_groups = cleaned_data.rome_job_groups(filename=rome_csv_pattern %
                                              'referentiel_code_rome')
    jobs = cleaned_data.rome_jobs(filename=rome_csv_pattern %
                                  'referentiel_appellation')
    holland_codes = cleaned_data.rome_holland_codes(
        filename=rome_csv_pattern % 'referentiel_code_rome_riasec')
    rome_texts = cleaned_data.rome_texts(filename=rome_csv_pattern % 'texte')
    rome_work_environments = cleaned_data.rome_work_environments(
        links_filename=rome_csv_pattern % 'liens_rome_referentiels',
        ref_filename=rome_csv_pattern % 'referentiel_env_travail')
    handcrafted_assets = _load_assets_from_airtable(
        *handcrafted_assets_airtable.split(':'))

    # Genderize names.
    masculine, feminine = rome_genderization.genderize(jobs.name)
    jobs['masculineName'] = masculine
    jobs['feminineName'] = feminine

    # List jobs and pick samples.
    jobs.index.name = 'codeOgr'
    jobs.reset_index(inplace=True)
    jobs_grouped = jobs.groupby('code_rome')
    job_groups['samples'] = jobs_grouped.apply(_create_jobs_sampler(3))
    # Job groups without any job get NaN from the assignment above: replace
    # it by an empty list.
    job_groups['samples'] = job_groups.samples.apply(
        lambda s: s if isinstance(s, list) else [])
    job_groups['jobs'] = jobs_grouped.apply(_create_jobs_sampler(None))
    job_groups['jobs'] = job_groups.jobs.apply(
        lambda s: s if isinstance(s, list) else [])

    # Add skills.
    rome_to_skills = cleaned_data.rome_to_skills(
        filename_items=rome_csv_pattern % 'coherence_item',
        filename_skills=rome_csv_pattern % 'referentiel_competence')
    skills_grouped = rome_to_skills.groupby('code_rome')
    job_groups['requirements'] = skills_grouped.apply(
        _group_skills_as_proto_list)
    # Replace NaN by empty dicts: a job group missing from the skills table
    # would otherwise hold a float here and make the update() calls below
    # fail.
    job_groups['requirements'] = job_groups.requirements.apply(
        lambda r: r if isinstance(r, dict) else {})

    # Load extra requirements from json file.
    with open(job_requirements_json) as job_requirements_file:
        job_requirements_list = json.load(job_requirements_file)
    job_requirements_dict = {
        job_requirement.pop('_id'): job_requirement
        for job_requirement in job_requirements_list
    }

    # Combine requirements from json file first, then from AirTable, so that
    # AirTable values win on conflicting keys.
    for extra_requirements in (job_requirements_dict, handcrafted_assets):
        for job_group in job_groups.itertuples():
            job_group.requirements.update(
                extra_requirements.get(job_group.Index, {}))

    application_complexity = pandas.read_json(job_application_complexity_json)
    application_complexity.set_index('_id', inplace=True)
    job_groups['applicationComplexity'] = application_complexity[
        'applicationComplexity']

    # Add Holland Codes https://en.wikipedia.org/wiki/Holland_Codes.
    # Will later be used for job similarity measures.
    job_groups['hollandCodeMajor'] = holland_codes.major
    job_groups['hollandCodeMinor'] = holland_codes.minor

    # Add description, working environment and requirement as text.
    job_groups['description'] = rome_texts.definition
    job_groups['workingEnvironment'] = rome_texts.working_environment
    job_groups['requirementsText'] = rome_texts.requirements

    # Add work environment items.
    job_groups['workEnvironmentKeywords'] = (rome_work_environments.groupby(
        'code_rome').apply(_group_work_environment_items))
    # Fill NaN with empty {}.
    job_groups['workEnvironmentKeywords'] = (
        job_groups.workEnvironmentKeywords.apply(
            lambda k: k if isinstance(k, dict) else {}))

    # Set index as field.
    job_groups.index.name = 'romeId'
    job_groups.reset_index(inplace=True)
    job_groups['_id'] = job_groups['romeId']

    return job_groups.to_dict('records')
Exemplo n.º 5
0
def make_dicts(rome_csv_pattern, job_requirements_json,
               job_application_complexity_json, application_mode_csv,
               rome_fap_crosswalk_txt, handcrafted_assets_airtable,
               domains_airtable, info_by_prefix_airtable):
    """Build the job group records to import in MongoDB.

    Args:
        rome_csv_pattern: pattern of paths to CSV file containing the ROME data.
            It must contain a '{}' that will be replaced by the name of each
            ROME table to load, e.g. 'referentiel_code_rome',
            'referentiel_env_travail', 'liens_rome_referentiels' and
            'referentiel_appellation'.
        job_requirements_json: path to a JSON file containing requirements per
            job group.
        job_application_complexity_json: path to a JSON file containing the
            application complexity of each job group.
        application_mode_csv: path to a CSV file containing the application mode
            data from emploi-store-dev API.
        rome_fap_crosswalk_txt: path to a TXT file containing the crosswalk
            from FAP codes to ROME job group codes.
        handcrafted_assets_airtable: the base ID and the table name joined by
            a ':' of the AirTable containing the advice per job group (short
            texts describing assets required).
        domains_airtable: the base ID and the table name joined by a ':' of the
            AirTable containing the domain name for each sector.
        info_by_prefix_airtable: the base ID and the table name joined by a ':'
            of the AirTable containing some manually specified info for group of
            job group (by ROME ID prefix).
    Returns:
        A list of dict that maps the JSON representation of JobGroup protos.
    """
    job_groups = cleaned_data.rome_job_groups(
        filename=rome_csv_pattern.format('referentiel_code_rome'))
    jobs = cleaned_data.rome_jobs(
        filename=rome_csv_pattern.format('referentiel_appellation'))
    holland_codes = cleaned_data.rome_holland_codes(
        filename=rome_csv_pattern.format('referentiel_code_rome_riasec'))
    rome_texts = cleaned_data.rome_texts(
        filename=rome_csv_pattern.format('texte'))
    rome_work_environments = cleaned_data.rome_work_environments(
        links_filename=rome_csv_pattern.format('liens_rome_referentiels'),
        ref_filename=rome_csv_pattern.format('referentiel_env_travail'))
    handcrafted_assets = _load_assets_from_airtable(
        *handcrafted_assets_airtable.split(':'))
    sector_domains = _load_domains_from_airtable(*domains_airtable.split(':'))
    info_by_prefix = _load_prefix_info_from_airtable(
        job_groups.index, *info_by_prefix_airtable.split(':'))
    application_modes = _get_application_modes(application_mode_csv,
                                               rome_fap_crosswalk_txt)

    # Genderize names.
    masculine, feminine = rome_genderization.genderize(jobs.name)
    jobs['masculineName'] = masculine
    jobs['feminineName'] = feminine

    # List jobs and pick samples.
    jobs.index.name = 'codeOgr'
    jobs.reset_index(inplace=True)
    jobs_grouped = jobs.groupby('code_rome')
    job_groups['samples'] = jobs_grouped.apply(_create_jobs_sampler(3))
    # Job groups without any job get NaN from the assignment above: replace
    # it by an empty list.
    job_groups['samples'] = job_groups.samples.apply(
        lambda s: s if isinstance(s, list) else [])
    job_groups['jobs'] = jobs_grouped.apply(_create_jobs_sampler(None))
    job_groups['jobs'] = job_groups.jobs.apply(
        lambda s: s if isinstance(s, list) else [])

    # Add info by prefix.
    job_groups = job_groups.join(info_by_prefix)

    # Add skills.
    rome_to_skills = cleaned_data.rome_to_skills(
        filename_items=rome_csv_pattern.format('coherence_item'),
        filename_skills=rome_csv_pattern.format('referentiel_competence'))
    skills_grouped = rome_to_skills.groupby('code_rome')
    job_groups['requirements'] = skills_grouped.apply(
        _group_skills_as_proto_list)
    # Replace NaN by empty dicts.
    job_groups['requirements'] = job_groups.requirements.apply(
        lambda r: r if isinstance(r, dict) else {})

    # Load extra requirements from json file.
    with open(job_requirements_json) as job_requirements_file:
        job_requirements_list = json.load(job_requirements_file)
    job_requirements_dict = {
        job_requirement.pop('_id'): job_requirement
        for job_requirement in job_requirements_list
    }

    # Combine requirements from json file first, then from AirTable, so that
    # AirTable values win on conflicting keys.
    for extra_requirements in (job_requirements_dict, handcrafted_assets):
        for job_group in job_groups.itertuples():
            job_group.requirements.update(
                extra_requirements.get(job_group.Index, {}))

    # NOTE: in the sections below, each column is first assigned (which aligns
    # the values on the job groups index and introduces NaN for missing job
    # groups) and only then filled. The fill goes through explicit column
    # assignment because fillna(inplace=True) on a column accessed from the
    # frame does not update the frame under pandas Copy-on-Write.
    application_complexity = pandas.read_json(job_application_complexity_json)
    application_complexity.set_index('_id', inplace=True)
    job_groups['applicationComplexity'] = application_complexity[
        'applicationComplexity']
    job_groups['applicationComplexity'] = (
        job_groups.applicationComplexity.fillna(
            'UNKNOWN_APPLICATION_COMPLEXITY'))

    # Add Holland Codes https://en.wikipedia.org/wiki/Holland_Codes.
    # Will later be used for job similarity measures.
    job_groups['hollandCodeMajor'] = holland_codes.major
    job_groups['hollandCodeMajor'] = job_groups.hollandCodeMajor.fillna('')
    job_groups['hollandCodeMinor'] = holland_codes.minor
    job_groups['hollandCodeMinor'] = job_groups.hollandCodeMinor.fillna('')

    # Add description, working environment and requirement as text.
    job_groups['description'] = rome_texts.definition
    job_groups['description'] = job_groups.description.fillna('')
    job_groups['workingEnvironment'] = rome_texts.working_environment
    job_groups['workingEnvironment'] = (
        job_groups.workingEnvironment.fillna(''))
    job_groups['requirementsText'] = rome_texts.requirements
    job_groups['requirementsText'] = job_groups.requirementsText.fillna('')

    # Add work environment items.
    rome_work_environments['domain'] = rome_work_environments['name'].map(
        sector_domains)
    job_groups['workEnvironmentKeywords'] = (
        rome_work_environments.groupby('code_rome').apply(
            _group_work_environment_items))
    # Fill NaN with empty {}.
    job_groups['workEnvironmentKeywords'] = (
        job_groups.workEnvironmentKeywords.apply(
            lambda k: k if isinstance(k, dict) else {}))

    # Add application modes.
    job_groups['applicationModes'] = application_modes
    job_groups['applicationModes'] = job_groups.applicationModes.apply(
        lambda m: m if isinstance(m, dict) else {})

    # Set index as field.
    job_groups.index.name = 'romeId'
    job_groups.reset_index(inplace=True)
    job_groups['_id'] = job_groups['romeId']

    return job_groups.to_dict('records')