def extract_offers_per_cities(offers_file, colnames, min_creation_date, data_folder='data'):
    """Extract the interesting cities in terms of number of offers for each job group.

    Args:
        offers_file: path of csv file with offers.
        colnames: the names of the columns in the offer file.
        min_creation_date: the date from which we consider the offers.
        data_folder: folder containing auxiliary data files, forwarded to
            _list_hiring_cities.

    Returns:
        A list of JSON-proto compatible dicts, one per job group, each with
        the job group ID and its hiring cities sorted by decreasing number
        of offers.
    """
    required_fields = {
        _CITY_CODE_FIELD, _JOB_GROUP_CODE_FIELD, _LATITUDE_CODE_FIELD,
        _LONGITUDE_CODE_FIELD, _CITY_NAME_CODE_FIELD}
    offers_rows = job_offers.iterate(offers_file, colnames, required_fields)
    city_data = _list_hiring_cities(offers_rows, min_creation_date, data_folder)

    # Per-job-group threshold, sub-linear (exponent .6) in the total number
    # of offers so that small job groups still keep a few cities.
    job_group_threshold = {
        job_group: math.pow(offers, 0.6) / 40
        for job_group, offers in city_data.offers_per_job_group.items()}

    job_group_to_kept_cities = {}
    for job_group, city_ids in city_data.job_group_to_city_ids.items():
        # .get(..., 0.) mirrors the previous defaultdict(float) behavior for
        # any job group missing from offers_per_job_group.
        threshold = job_group_threshold.get(job_group, 0.)
        kept_cities = [
            {'city': city_data.city_info[city_id], 'offers': offer_count}
            for city_id, offer_count in city_ids.items()
            if offer_count > threshold]
        job_group_to_kept_cities[job_group] = sorted(
            kept_cities, key=lambda k: k['offers'], reverse=True)

    return [
        {'_id': job_group_id, 'hiringCities': job_group_weighted_cities}
        for job_group_id, job_group_weighted_cities
        in job_group_to_kept_cities.items()]
def trim_job_offers_csv(in_csv, colnames_txt, out_csv, min_creation_date='', fields=_DEFAULT_FIELDS):
    """Trim job offers CSV.

    Args:
        in_csv: the path of the CSV file containing all job offers in the
            Pôle Emploi format (using latin-1 encoding, | separators, etc).
        colnames_txt: the TXT file containing the list of column names.
        out_csv: the path where to store the output CSV file.
        min_creation_date: offers whose creation_date compares (as a string)
            strictly below this value are dropped; the empty-string default
            keeps all offers.
        fields: the list of fields to keep, separated by commas.
    """
    fieldnames = fields.split(',')
    all_job_offers = job_offers.iterate(
        in_csv, colnames_txt,
        required_fields=set(fieldnames + ['creation_date']))
    # newline='' lets the csv module control line endings itself; without it
    # blank rows appear on platforms with \r\n translation.
    with open(out_csv, 'w', newline='') as out_file:
        writer = csv.DictWriter(out_file, fieldnames=fieldnames)
        writer.writeheader()
        for job_offer in all_job_offers:
            if job_offer.creation_date < min_creation_date:
                continue
            writer.writerow(
                {field: getattr(job_offer, field) for field in fieldnames})
def test_missing_required_fields(self):
    """Requiring an unknown field should make iteration raise ValueError."""
    offers_csv = path.join(self.testdata_folder, 'job_offers.csv')
    colnames_txt = path.join(self.testdata_folder, 'column_names.txt')
    offers = job_offers.iterate(
        offers_csv, colnames_txt, required_fields={'foobar'})
    self.assertRaises(ValueError, next, offers)
def test_basic(self):
    """Test basic usage."""
    offers_csv = path.join(self.testdata_folder, 'job_offers.csv')
    colnames_txt = path.join(self.testdata_folder, 'column_names.txt')
    offers = list(job_offers.iterate(offers_csv, colnames_txt))

    # Golden values.
    self.assertEqual(8, len(offers))
    self.assertEqual('000053Q', offers[0].id_offre)
    self.assertEqual('Contrat travail', offers[1].contract_nature_name)
def csv2dicts(job_offers_csv, colnames_txt, last_year='2015'):
    """Import the changes of # of job offers per job group and dept in MongoDB.

    Args:
        job_offers_csv: Path of the csv containing the data.
        colnames_txt: Path to a file containing the name of the CSV's columns.
        last_year: The year to consider to compute the metrics.

    Returns:
        Evolution data as a LocalJobStats JSON-proto compatible dict.
    """
    counter = _EvolutionCounter(int(last_year))
    all_offers = job_offers.iterate(
        job_offers_csv, colnames_txt, _REQUIRED_FIELDS)
    for offer in all_offers:
        counter.collect(offer)
    return list(counter.get_proto_dicts())
def csv2dicts(job_offers_csv, colnames_txt):
    """Import the requirement from job offers grouped by Job Group in MongoDB.

    Args:
        job_offers_csv: Path of the csv containing the data.
        colnames_txt: Path to a file containing the name of the CSV's columns.

    Returns:
        Requirements as a JobRequirements JSON-proto compatible dict.
    """
    job_groups = collections.defaultdict(_RequirementsCollector)
    all_offers = job_offers.iterate(
        job_offers_csv, colnames_txt, _REQUIRED_FIELDS)
    for offer in all_offers:
        job_groups[offer.rome_profession_card_code].collect(offer)

    # Emit one dict per job group, in deterministic (sorted) order.
    result = []
    for job_group_id in sorted(job_groups):
        proto_dict = job_groups[job_group_id].get_proto_dict()
        result.append(dict(proto_dict, _id=job_group_id))
    return result