def csv2dicts(rome_csv_pattern):
    """Import the ROME mobility data in MongoDB.

    We group the mobility data as JobGroups (we find a set of similar jobs
    either for a specific job or for a job group). To get all mobility data
    for a given job, you have to look both for the data for this job (keyed
    by OGR code) and for the data for its job group (keyed by ROME code). As
    OGR code and ROME code use different namespaces there's no conflict to
    use it directly with its key.

    Args:
        rome_csv_pattern: pattern of paths to CSV files containing the ROME
            data. It must contain a '%s' that will be replaced by
            'referentiel_code_rome', 'rubrique_mobilite' and
            'referentiel_appellation'.

    Returns:
        Whatever `dataframe2dicts` returns for the mobility table once it has
        been enriched with target names and job samples.
    """
    mobility = pandas.read_csv(
        rome_csv_pattern % 'rubrique_mobilite', dtype=str)
    job_groups = cleaned_data.rome_job_groups(
        filename=rome_csv_pattern % 'referentiel_code_rome')
    jobs = cleaned_data.rome_jobs(
        filename=rome_csv_pattern % 'referentiel_appellation')

    jobs.index.name = 'codeOgr'
    masculine_job_names, feminine_job_names = (
        rome_genderization.genderize(jobs.name))
    jobs['masculineName'] = masculine_job_names
    jobs['feminineName'] = feminine_job_names
    # Grab the names while they are still indexed by the jobs' original index
    # (named 'codeOgr' above): they are used below to map target job codes.
    jobs_names = jobs.name
    jobs.reset_index(inplace=True)
    jobs_samples = jobs.groupby('code_rome').apply(_sample_jobs(3))

    mobility = mobility.rename(columns={
        'code_rome': 'source_job_group',
        'code_appellation_source': 'source_job',
        'code_rome_cible': 'target_job_group',
        'code_appellation_cible': 'target_job',
    })

    # NOTE: missing values are filled through explicit column assignment
    # rather than `mobility.col.fillna(..., inplace=True)`: the chained
    # in-place form acts on a temporary Series and does not update the frame
    # when pandas Copy-on-Write is enabled.
    mobility['target_job_group_name'] = (
        mobility.target_job_group.map(job_groups.name).fillna(''))
    mobility['target_job_group_samples'] = (
        mobility.target_job_group.map(jobs_samples).fillna(False))
    mobility['target_job_name'] = (
        mobility.target_job.map(jobs_names).fillna(''))
    mobility['target_job_masculine_name'] = (
        mobility.target_job.map(masculine_job_names).fillna(''))
    mobility['target_job_feminine_name'] = (
        mobility.target_job.map(feminine_job_names).fillna(''))

    return dataframe2dicts(mobility)
def main(rome_appellation_csv, output_txt):
    """Sample ROME jobs in job groups.

    Args:
        rome_appellation_csv: path to a CSV file containing all ROME jobs.
        output_txt: path where to create the output txt file. It will get
            populated with a list of masculine job names, one per line. The
            file is written as UTF-8.
    """
    jobs = cleaned_data.rome_jobs(filename=rome_appellation_csv)
    # Pick one random job in each job group.
    samples = jobs.groupby('code_rome').apply(lambda d: d.sample(1))
    # Only the first element returned by genderize is written out.
    names, unused_ = rome_genderization.genderize(samples.name)

    # Explicit encoding: job names may contain accented characters, and the
    # platform default encoding is not guaranteed to be able to encode them.
    with open(output_txt, 'w', encoding='utf-8') as output:
        output.write('\n'.join(names.tolist()) + '\n')
def make_dicts(rome_csv_pattern, job_requirements_json,
               job_application_complexity_json, handcrafted_assets_airtable):
    """Import job info in MongoDB.

    Args:
        rome_csv_pattern: pattern of paths to CSV file containing the ROME
            data. It must contain a '%s' that will be replaced by
            'referentiel_code_rome', 'referentiel_appellation',
            'referentiel_code_rome_riasec', 'texte',
            'liens_rome_referentiels', 'referentiel_env_travail',
            'coherence_item' and 'referentiel_competence'.
        job_requirements_json: path to a JSON file containing requirements
            per job group.
        job_application_complexity_json: path to a JSON file containing the
            application complexity of each job group.
        handcrafted_assets_airtable: the base ID and the table named joined
            by a ':' of the AirTable containing the advice per job group
            (short texts describing assets required).

    Returns:
        A list of dict that maps the JSON representation of JobGroup protos.
    """
    job_groups = cleaned_data.rome_job_groups(
        filename=rome_csv_pattern % 'referentiel_code_rome')
    jobs = cleaned_data.rome_jobs(
        filename=rome_csv_pattern % 'referentiel_appellation')
    holland_codes = cleaned_data.rome_holland_codes(
        filename=rome_csv_pattern % 'referentiel_code_rome_riasec')
    rome_texts = cleaned_data.rome_texts(filename=rome_csv_pattern % 'texte')
    rome_work_environments = cleaned_data.rome_work_environments(
        links_filename=rome_csv_pattern % 'liens_rome_referentiels',
        ref_filename=rome_csv_pattern % 'referentiel_env_travail')
    handcrafted_assets = _load_assets_from_airtable(
        *handcrafted_assets_airtable.split(':'))

    # Genderize names.
    masculine, feminine = rome_genderization.genderize(jobs.name)
    jobs['masculineName'] = masculine
    jobs['feminineName'] = feminine

    # List jobs and pick samples. Job groups without any job end up with a
    # non-list value after index alignment, which is replaced by [].
    jobs.index.name = 'codeOgr'
    jobs.reset_index(inplace=True)
    jobs_grouped = jobs.groupby('code_rome')
    job_groups['samples'] = jobs_grouped.apply(_create_jobs_sampler(3))
    job_groups['samples'] = job_groups.samples.apply(
        lambda s: s if isinstance(s, list) else [])
    job_groups['jobs'] = jobs_grouped.apply(_create_jobs_sampler(None))
    job_groups['jobs'] = job_groups.jobs.apply(
        lambda s: s if isinstance(s, list) else [])

    # Add skills.
    rome_to_skills = cleaned_data.rome_to_skills(
        filename_items=rome_csv_pattern % 'coherence_item',
        filename_skills=rome_csv_pattern % 'referentiel_competence')
    skills_grouped = rome_to_skills.groupby('code_rome')
    job_groups['requirements'] = skills_grouped.apply(
        _group_skills_as_proto_list)
    # Replace NaN by empty dicts: job groups without any skill would
    # otherwise hold a float and break the `.update()` calls below.
    job_groups['requirements'] = job_groups.requirements.apply(
        lambda r: r if isinstance(r, dict) else {})

    # Load extra requirements from the JSON file, keyed by their '_id'.
    with open(job_requirements_json, encoding='utf-8') as job_requirements_file:
        job_requirements_list = json.load(job_requirements_file)
    job_requirements_dict = {
        job_requirement.pop('_id'): job_requirement
        for job_requirement in job_requirements_list
    }

    # Combine requirements from the JSON file, then from AirTable. The dicts
    # stored in the column are updated in place.
    for job_group in job_groups.itertuples():
        job_group.requirements.update(
            job_requirements_dict.get(job_group.Index, {}))
        job_group.requirements.update(
            handcrafted_assets.get(job_group.Index, {}))

    application_complexity = pandas.read_json(job_application_complexity_json)
    application_complexity.set_index('_id', inplace=True)
    job_groups['applicationComplexity'] = application_complexity[
        'applicationComplexity']

    # Add Holland Code https://en.wikipedia.org/wiki/Holland_Codes.
    # Will later be used for job similarity measures.
    job_groups['hollandCodeMajor'] = holland_codes.major
    job_groups['hollandCodeMinor'] = holland_codes.minor

    # Add description, working environment and requirement as text.
    job_groups['description'] = rome_texts.definition
    job_groups['workingEnvironment'] = rome_texts.working_environment
    job_groups['requirementsText'] = rome_texts.requirements

    # Add work environment items.
    job_groups['workEnvironmentKeywords'] = (
        rome_work_environments.groupby('code_rome').apply(
            _group_work_environment_items))
    # Fill NaN with empty {}.
    job_groups['workEnvironmentKeywords'] = (
        job_groups.workEnvironmentKeywords.apply(
            lambda k: k if isinstance(k, dict) else {}))

    # Set index as field.
    job_groups.index.name = 'romeId'
    job_groups.reset_index(inplace=True)
    job_groups['_id'] = job_groups['romeId']

    return job_groups.to_dict('records')
def make_dicts(rome_csv_pattern, job_requirements_json,
               job_application_complexity_json, application_mode_csv,
               rome_fap_crosswalk_txt, handcrafted_assets_airtable,
               domains_airtable, info_by_prefix_airtable):
    """Import job info in MongoDB.

    Args:
        rome_csv_pattern: pattern of paths to CSV file containing the ROME
            data. It must contain a '{}' that will be replaced by
            'referentiel_code_rome', 'referentiel_appellation',
            'referentiel_code_rome_riasec', 'texte',
            'liens_rome_referentiels', 'referentiel_env_travail',
            'coherence_item' and 'referentiel_competence'.
        job_requirements_json: path to a JSON file containing requirements
            per job group.
        job_application_complexity_json: path to a JSON file containing the
            application complexity of each job group.
        application_mode_csv: path to a CSV file containing the application
            mode data from emploi-store-dev API.
        rome_fap_crosswalk_txt: path to a TXT file containing the crosswalk
            from FAP codes to ROME job group codes.
        handcrafted_assets_airtable: the base ID and the table named joined
            by a ':' of the AirTable containing the advice per job group
            (short texts describing assets required).
        domains_airtable: the base ID and the table name joined by a ':' of
            the AirTable containing the domain name for each sector.
        info_by_prefix_airtable: the base ID and the table name joined by a
            ':' of the AirTable containing some manually specified info for
            group of job group (by ROME ID prefix).

    Returns:
        A list of dict that maps the JSON representation of JobGroup protos.
    """
    job_groups = cleaned_data.rome_job_groups(
        filename=rome_csv_pattern.format('referentiel_code_rome'))
    jobs = cleaned_data.rome_jobs(
        filename=rome_csv_pattern.format('referentiel_appellation'))
    holland_codes = cleaned_data.rome_holland_codes(
        filename=rome_csv_pattern.format('referentiel_code_rome_riasec'))
    rome_texts = cleaned_data.rome_texts(
        filename=rome_csv_pattern.format('texte'))
    rome_work_environments = cleaned_data.rome_work_environments(
        links_filename=rome_csv_pattern.format('liens_rome_referentiels'),
        ref_filename=rome_csv_pattern.format('referentiel_env_travail'))
    handcrafted_assets = _load_assets_from_airtable(
        *handcrafted_assets_airtable.split(':'))
    sector_domains = _load_domains_from_airtable(*domains_airtable.split(':'))
    info_by_prefix = _load_prefix_info_from_airtable(
        job_groups.index, *info_by_prefix_airtable.split(':'))
    application_modes = _get_application_modes(
        application_mode_csv, rome_fap_crosswalk_txt)

    # Genderize names.
    masculine, feminine = rome_genderization.genderize(jobs.name)
    jobs['masculineName'] = masculine
    jobs['feminineName'] = feminine

    # List jobs and pick samples. Job groups without any job end up with a
    # non-list value after index alignment, which is replaced by [].
    jobs.index.name = 'codeOgr'
    jobs.reset_index(inplace=True)
    jobs_grouped = jobs.groupby('code_rome')
    job_groups['samples'] = jobs_grouped.apply(_create_jobs_sampler(3))
    job_groups['samples'] = job_groups.samples.apply(
        lambda s: s if isinstance(s, list) else [])
    job_groups['jobs'] = jobs_grouped.apply(_create_jobs_sampler(None))
    job_groups['jobs'] = job_groups.jobs.apply(
        lambda s: s if isinstance(s, list) else [])

    # Add info by prefix.
    job_groups = job_groups.join(info_by_prefix)

    # Add skills.
    rome_to_skills = cleaned_data.rome_to_skills(
        filename_items=rome_csv_pattern.format('coherence_item'),
        filename_skills=rome_csv_pattern.format('referentiel_competence'))
    skills_grouped = rome_to_skills.groupby('code_rome')
    job_groups['requirements'] = skills_grouped.apply(
        _group_skills_as_proto_list)
    # Replace NaN by empty dicts so that `.update()` below always works.
    job_groups['requirements'] = job_groups.requirements.apply(
        lambda r: r if isinstance(r, dict) else {})

    # Load extra requirements from the JSON file, keyed by their '_id'.
    with open(job_requirements_json, encoding='utf-8') as job_requirements_file:
        job_requirements_list = json.load(job_requirements_file)
    job_requirements_dict = {
        job_requirement.pop('_id'): job_requirement
        for job_requirement in job_requirements_list
    }

    # Combine requirements from the JSON file, then from AirTable. The dicts
    # stored in the column are updated in place.
    for job_group in job_groups.itertuples():
        job_group.requirements.update(
            job_requirements_dict.get(job_group.Index, {}))
        job_group.requirements.update(
            handcrafted_assets.get(job_group.Index, {}))

    # NOTE: below, each column is first assigned (which aligns the values on
    # the job groups index and leaves NaN for missing groups) and then
    # re-assigned with its NaN filled. The chained form
    # `job_groups.col.fillna(..., inplace=True)` is avoided because it acts on
    # a temporary Series and does not update the frame when pandas
    # Copy-on-Write is enabled.
    application_complexity = pandas.read_json(job_application_complexity_json)
    application_complexity.set_index('_id', inplace=True)
    job_groups['applicationComplexity'] = application_complexity[
        'applicationComplexity']
    job_groups['applicationComplexity'] = (
        job_groups['applicationComplexity'].fillna(
            'UNKNOWN_APPLICATION_COMPLEXITY'))

    # Add Holland Code https://en.wikipedia.org/wiki/Holland_Codes.
    # Will later be used for job similarity measures.
    job_groups['hollandCodeMajor'] = holland_codes.major
    job_groups['hollandCodeMajor'] = job_groups['hollandCodeMajor'].fillna('')
    job_groups['hollandCodeMinor'] = holland_codes.minor
    job_groups['hollandCodeMinor'] = job_groups['hollandCodeMinor'].fillna('')

    # Add description, working environment and requirement as text.
    job_groups['description'] = rome_texts.definition
    job_groups['description'] = job_groups['description'].fillna('')
    job_groups['workingEnvironment'] = rome_texts.working_environment
    job_groups['workingEnvironment'] = (
        job_groups['workingEnvironment'].fillna(''))
    job_groups['requirementsText'] = rome_texts.requirements
    job_groups['requirementsText'] = job_groups['requirementsText'].fillna('')

    # Add work environment items.
    rome_work_environments['domain'] = rome_work_environments['name'].map(
        sector_domains)
    job_groups['workEnvironmentKeywords'] = (
        rome_work_environments.groupby('code_rome').apply(
            _group_work_environment_items))
    # Fill NaN with empty {}.
    job_groups['workEnvironmentKeywords'] = (
        job_groups.workEnvironmentKeywords.apply(
            lambda k: k if isinstance(k, dict) else {}))

    # Add application modes.
    job_groups['applicationModes'] = application_modes
    job_groups['applicationModes'] = job_groups.applicationModes.apply(
        lambda m: m if isinstance(m, dict) else {})

    # Set index as field.
    job_groups.index.name = 'romeId'
    job_groups.reset_index(inplace=True)
    job_groups['_id'] = job_groups['romeId']

    return job_groups.to_dict('records')