def check_scoring_models(mongo_db: pymongo.database.Database) -> None:
    """Check that all scoring models are valid and warn on unused ones."""

    scoring_model_fields = _list_formatted_fields(
        import_status.get_importers().items(), options_pb2.SCORING_MODEL_ID)

    used_scoring_models = set()
    records = list(_iterate_all_records(mongo_db, scoring_model_fields, 'scoring model'))
    for collection, unused_field, field_value, record_id in tqdm.tqdm(records):
        if isinstance(field_value, list):
            field_values = field_value
        else:
            field_values = [field_value]
        for scoring_model in field_values:
            if not scoring.get_scoring_model(scoring_model):
                logging.error(
                    'Unknown scoring model "%s" in the collection "%s", record "%s".',
                    scoring_model, collection, record_id)
            used_scoring_models.add(scoring_model)

    # TODO(cyrille): Also check SCORING_MODEL_REGEXPS.
    unused_scoring_models = scoring.SCORING_MODELS.keys() - used_scoring_models
    for scoring_model in unused_scoring_models:
        logging.warning('Scoring model unused: %s', scoring_model)

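# A minimal usage sketch for the checker above; the connection URI and
# database name are illustrative, any pymongo.database.Database holding the
# imported collections works:
#
#     client = pymongo.MongoClient('mongodb://localhost:27017/')
#     check_scoring_models(client.get_database('test'))
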
def _group_filter_fields(self, record, record_name, field='filters', others=None):
    """Group multiple fields to specify filters.

    Args:
        record: the record to convert.
        record_name: the name of the type of record for error messages.
        field: the main field for filters, it should contain an array of
            filter IDs.
        others: a list of fields which, if not empty, create extra fields by
            combining the field name and their content, e.g. "for-departement"
            with value "75,69" would add a filter "for-departement(75,69)".
    Returns:
        A list of valid filters.
    Raises:
        ValueError: if one of the filters is not implemented.
    """
    filters = record['fields'].get(field, [])
    if others:
        for filter_type in others:
            filter_value = record['fields'].get(filter_type)
            if filter_value:
                filters.append('{}({})'.format(filter_type, filter_value))
    for one_filter in filters:
        if not scoring.get_scoring_model(one_filter):
            raise ValueError(
                '{} uses the filter "{}" that is not implemented yet'
                .format(record_name, one_filter))
    return filters

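# A minimal usage sketch, assuming `converter` is an instance of the class
# defining this method and the record is an AirTable-style dict (the field
# values below are illustrative):
#
#     record = {'fields': {
#         'filters': ['for-women'],
#         'for-departement': '75,69',
#     }}
#     filters = converter._group_filter_fields(
#         record, 'Advice module', others=['for-departement'])
#     # filters == ['for-women', 'for-departement(75,69)'], provided both
#     # scoring models exist; otherwise a ValueError is raised.
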
def _compute_extra_data(piece_of_advice, module, scoring_project):
    if not module.extra_data_field_name:
        return
    scoring_model = scoring.get_scoring_model(module.trigger_scoring_model)
    try:
        compute_extra_data = scoring_model.compute_extra_data
    except AttributeError:
        logging.warning(
            'The scoring model %s has no compute_extra_data method',
            module.trigger_scoring_model)
        return
    try:
        extra_data = compute_extra_data(scoring_project)
    except Exception:  # pylint: disable=broad-except
        logging.exception(
            'Computing extra data "%s" crashed for:\n%s\n%s',
            module.trigger_scoring_model, scoring_project.user_profile,
            scoring_project.details)
        return
    if not extra_data:
        return
    try:
        data_field = getattr(piece_of_advice, module.extra_data_field_name)
    except AttributeError:
        # Note: getattr raises AttributeError (not NameError) on a missing field.
        logging.warning(
            'The Advice proto does not have a %s field as requested by the module %s',
            module.extra_data_field_name, module.advice_id)
        return
    data_field.CopyFrom(extra_data)

def check_value(self, field_value: str) -> bool:
    """Whether this specific value passes the check or not."""

    if not scoring.get_scoring_model(field_value):
        raise ValueError(
            f'The scoring model "{field_value}" is not implemented yet')
    return True

def _score(self, model: Union[str, scoring.ModelBase, None] = None,
           persona: Optional[_Persona] = None,
           name: Optional[str] = None) -> float:
    # Keep the original argument so the error message does not just print None.
    model_or_id = model
    if isinstance(model, str):
        model = scoring.get_scoring_model(model)
    if model is None:  # pragma: no-cover
        raise NotImplementedError(
            f'The model_id {model_or_id} is not the ID of any known model')
    return model.score(self._scoring_project(persona, name))

def _maybe_override_advice_data(
        piece_of_advice: project_pb2.Advice, module: advisor_pb2.AdviceModule,
        scoring_project: scoring.ScoringProject) -> None:
    scoring_model = scoring.get_scoring_model(module.trigger_scoring_model)
    if not scoring_model:
        return
    override_data = scoring_model.get_advice_override(scoring_project, piece_of_advice)
    if not override_data:
        # Nothing to override.
        return
    piece_of_advice.MergeFrom(override_data)

def _maybe_override_advice_data(piece_of_advice, module, scoring_project):
    scoring_model = scoring.get_scoring_model(module.trigger_scoring_model)
    try:
        get_advice_override = scoring_model.get_advice_override
    except AttributeError:
        # The scoring model has no get_advice_override method.
        return
    override_data = get_advice_override(scoring_project, piece_of_advice)
    if not override_data:
        # Nothing to override.
        return
    piece_of_advice.MergeFrom(override_data)

def convert_record(self, airtable_record):
    """Convert an AirTable record to a proto-JSON-ready dict."""

    fields = super(_AdviceModuleConverter, self).convert_record(airtable_record)
    trigger_scoring_model = fields.get('triggerScoringModel')
    if not scoring.get_scoring_model(trigger_scoring_model):
        raise ValueError(
            'Advice module "{}" uses the scoring model "{}" that is not implemented yet'
            .format(fields['_id'], trigger_scoring_model))
    if 'emailFacts' in fields:
        fields['emailFacts'] = fields['emailFacts'].split('\n')
    return fields

def setUpClass(cls) -> None:
    super().setUpClass()
    if cls.model_id is None:  # pragma: no-cover
        raise NotImplementedError(f'Add a model_id in "{cls.__name__}"')
    cls._patcher = mock.patch.dict(scoring.SCORING_MODELS, {})  # type: ignore
    cls._patcher.start()  # type: ignore
    model = scoring.get_scoring_model(cls.model_id)
    if model is None:  # pragma: no-cover
        raise NotImplementedError(
            f'The model_id {cls.model_id} is not the ID of any known model')
    cls.model = model

def _get_expanded_card_data(user_proto, project, advice_id):
    module = advisor.get_advice_module(advice_id, _DB)
    if not module or not module.trigger_scoring_model:
        flask.abort(404, 'Le module "{}" n\'existe pas'.format(advice_id))
    model = scoring.get_scoring_model(module.trigger_scoring_model)
    if not model or not hasattr(model, 'get_expanded_card_data'):
        flask.abort(
            404,
            'Le module "{}" n\'a pas de données supplémentaires'.format(advice_id))
    scoring_project = scoring.ScoringProject(
        project, user_proto.profile, user_proto.features_enabled, _DB, now=now.get())
    return model.get_expanded_card_data(scoring_project)

def check_scoring_models(mongo_db):
    """Check that all scoring models are valid and warn on unused ones."""

    used_scoring_models = set()
    records = list(_iterate_all_records(mongo_db, _SCORING_MODEL_FIELDS, 'scoring model'))
    for field_value, record, collection in tqdm.tqdm(records):
        if isinstance(field_value, list):
            field_values = field_value
        else:
            field_values = [field_value]
        for scoring_model in field_values:
            if not scoring.get_scoring_model(scoring_model):
                logging.error(
                    'Unknown scoring model "%s" in the collection "%s", record "%s".',
                    scoring_model, collection, record['_id'])
            used_scoring_models.add(scoring_model)
    unused_scoring_models = scoring.SCORING_MODELS.keys() - used_scoring_models
    for scoring_model in unused_scoring_models:
        logging.warning('Scoring model unused: %s', scoring_model)

def _get_jobbing_vars(
        user: user_pb2.User, *, database: mongo.NoPiiMongoDatabase,
        **unused_kwargs: Any) -> dict[str, Any]:
    """Compute vars for the "Jobbing" email."""

    if not user.projects:
        raise scoring.NotEnoughDataException('No project yet', {'projects.0'})
    project = user.projects[0]

    if not any(s.strategy_id == 'diploma-free-job' for s in project.opened_strategies):
        raise campaign.DoNotSend(
            'The user has not started a strategy to get a job without a diploma')

    scoring_project = scoring.ScoringProject(project, user, database)
    model = scoring.get_scoring_model('advice-reorient-jobbing')
    if not model:
        raise campaign.DoNotSend('The advice-reorient-jobbing model is not implemented')
    reorient_jobs = typing.cast(
        reorient_jobbing_pb2.JobbingReorientJobs,
        model.get_expanded_card_data(scoring_project),
    ).reorient_jobbing_jobs
    if not reorient_jobs:
        raise campaign.DoNotSend("We didn't find any jobbing jobs to reorient to for the user")

    if project.target_job.name:
        of_job_name = scoring_project.populate_template('%ofJobName')
    else:
        # This is not translated to fr@tu because the email templates are only in fr for now.
        of_job_name = 'de definir votre projet professionnel'

    return campaign.get_default_coaching_email_vars(user) | {
        'inDepartement': scoring_project.populate_template('%inDepartement'),
        'jobs': [{'name': job.name} for job in reorient_jobs],
        'loginUrl': campaign.create_logged_url(user.user_id, f'/projet/{project.project_id}'),
        'ofJobName': of_job_name,
    }

def _get_jobbing_vars(
        user: user_pb2.User, database: Optional[pymongo.database.Database] = None,
        **unused_kwargs: Any) -> Optional[Dict[str, Any]]:
    """Compute vars for the "Jobbing" email."""

    project = user.projects[0]

    if not any(s.strategy_id == 'diploma-free-job' for s in project.opened_strategies):
        return None

    assert database
    scoring_project = scoring.ScoringProject(project, user, database)
    model = scoring.get_scoring_model('advice-reorient-jobbing')
    if not model:
        return None
    reorient_jobs = typing.cast(
        reorient_jobbing_pb2.JobbingReorientJobs,
        model.get_expanded_card_data(scoring_project),
    ).reorient_jobbing_jobs
    if not reorient_jobs:
        return None

    return dict(campaign.get_default_coaching_email_vars(user), **{
        'inDepartement': scoring_project.populate_template('%inDepartement'),
        'jobs': [{'name': job.name} for job in reorient_jobs],
        'loginUrl': campaign.create_logged_url(user.user_id, f'/projet/{project.project_id}'),
        'ofJobName': scoring_project.populate_template('%ofJobName'),
    })

def test_inexistant_frustration(self):
    """Cannot make a scoring model for an inexistent frustration."""

    with self.assertRaises(ValueError):
        scoring.get_scoring_model('for-frustrated(INEXISTANT)')

def setUpClass(cls):
    super(_TestCase, cls).setUpClass()
    cls.model_id = model_id
    cls.model = scoring.get_scoring_model(model_id)

def test_run_all(self, mock_carif_get_trainings):
    """Run all scoring models on all personas."""

    mock_carif_get_trainings.return_value = [
        training_pb2.Training(),
        training_pb2.Training(),
        training_pb2.Training(),
    ]
    database = mongomock.MongoClient().test
    _load_json_to_mongo(database, 'job_group_info')
    _load_json_to_mongo(database, 'local_diagnosis')
    _load_json_to_mongo(database, 'associations')
    _load_json_to_mongo(database, 'volunteering_missions')
    _load_json_to_mongo(database, 'hiring_cities')
    _load_json_to_mongo(database, 'cities')
    _load_json_to_mongo(database, 'departements')
    _load_json_to_mongo(database, 'seasonal_jobbing')
    _load_json_to_mongo(database, 'specific_to_job_advice')
    _load_json_to_mongo(database, 'reorient_jobbing')
    _load_json_to_mongo(database, 'reorient_to_close')
    scores = collections.defaultdict(lambda: collections.defaultdict(float))
    # Mock the "now" date so that scoring models that are based on time
    # (like "Right timing") are deterministic.
    now = datetime.datetime(2016, 9, 27)
    for model_name in list(scoring.SCORING_MODELS.keys()):
        model = scoring.get_scoring_model(model_name)
        self.assertTrue(model, msg=model_name)
        scores[model_name] = {}
        for name, persona in _PERSONAS.items():
            scoring_project = persona.scoring_project(database, now=now)
            try:
                score, explanations = model.score_and_explain(scoring_project)
            except scoring.NotEnoughDataException:
                score = -1
                explanations = []
            scores[model_name][name] = score
            self.assertIsInstance(
                scores[model_name][name], numbers.Number,
                msg='while using the model "{}" to score "{}"'.format(model_name, name))
            self._assert_proper_explanations(
                explanations, scoring_project,
                msg='while using the model "{}" to explain the score of "{}"'.format(
                    model_name, name))

    for name in _PERSONAS:
        persona_scores = [
            max(model_scores[name], 0) for model_scores in scores.values()]
        self.assertLess(
            1, len(set(persona_scores)),
            msg='Persona "{}" has the same score across all models.'.format(name))

    model_scores_hashes = collections.defaultdict(list)
    for model_name, model_scores in scores.items():
        model = scoring.SCORING_MODELS[model_name]
        if isinstance(model, scoring.ConstantScoreModel):
            continue
        self.assertLess(
            1, len(set(model_scores.values())),
            msg='Model "{}" has the same score for all personas.'.format(model_name))
        scores_hash = json.dumps(model_scores, sort_keys=True)
        model_scores_hashes[scores_hash].append(model_name)
    models_with_same_score = [
        models for models in model_scores_hashes.values() if len(models) > 1]
    self.assertFalse(models_with_same_score, msg='Some models always have the same scores')

def test_inexistant_passionate(self):
    """Cannot make a scoring model for an inexistent passionate level."""

    with self.assertRaises(ValueError):
        scoring.get_scoring_model('for-passionate(INEXISTANT)')

def test_wrong_type_field(self):
    """Tries to create a filter based on a field that is not a binary experiment."""

    with self.assertRaises(ValueError):
        scoring.get_scoring_model('for-active-experiment(alpha)')

def test_run_all(self, mock_carif_get_trainings: mock.MagicMock) -> None:
    """Run all scoring models on all personas."""

    mock_carif_get_trainings.return_value = [
        training_pb2.Training(),
        training_pb2.Training(),
        training_pb2.Training(),
    ]
    database = mongomock.MongoClient().test
    _load_json_to_mongo(database, 'associations')
    _load_json_to_mongo(database, 'cities')
    _load_json_to_mongo(database, 'departements')
    _load_json_to_mongo(database, 'hiring_cities')
    _load_json_to_mongo(database, 'job_group_info')
    _load_json_to_mongo(database, 'local_diagnosis')
    _load_json_to_mongo(database, 'online_salons')
    _load_json_to_mongo(database, 'reorient_jobbing')
    _load_json_to_mongo(database, 'reorient_to_close')
    _load_json_to_mongo(database, 'seasonal_jobbing')
    _load_json_to_mongo(database, 'skills_for_future')
    _load_json_to_mongo(database, 'specific_to_job_advice')
    _load_json_to_mongo(database, 'volunteering_missions')
    scores: Dict[str, Dict[str, float]] = collections.defaultdict(
        lambda: collections.defaultdict(float))
    # Mock the "now" date so that scoring models that are based on time
    # (like "Right timing") are deterministic.
    now = datetime.datetime(2016, 9, 27)
    for model_name in list(scoring.SCORING_MODELS.keys()):
        model = scoring.get_scoring_model(model_name)
        if not model:  # pragma: no-cover
            raise KeyError(f'No scoring model with name "{model_name}".')
        self.assertTrue(model, msg=model_name)
        scores[model_name] = {}
        for name, persona in _PERSONAS.items():
            scoring_project = persona.scoring_project(database, now=now)
            try:
                score, explanations = model.score_and_explain(scoring_project)
            except scoring.NotEnoughDataException:
                score = -1
                explanations = []
            scores[model_name][name] = score
            self.assertIsInstance(
                scores[model_name][name], numbers.Number,
                msg=f'while using the model "{model_name}" to score "{name}"')
            self._assert_proper_explanations(
                explanations, scoring_project,
                msg=f'while using the model "{model_name}" to explain the score of "{name}"')

    for name in _PERSONAS:
        persona_scores = [
            max(model_scores[name], 0) for model_scores in scores.values()]
        self.assertLess(
            1, len(set(persona_scores)),
            msg=f'Persona "{name}" has the same score across all models.')

    model_scores_hashes: Dict[str, Set[str]] = collections.defaultdict(set)
    # A mapping of renamings in progress.
    renamings = {
        'for-exact-experienced(internship)': 'for-exact-experienced(intern)',
    }
    for base_name, target_name in renamings.items():
        self.assertEqual(
            json.dumps(scores.pop(base_name), sort_keys=True),
            json.dumps(scores[target_name], sort_keys=True),
            msg=f'The model "{base_name}" is not consistent with its renaming "{target_name}"')
    for model_name, model_scores in scores.items():
        model = scoring.SCORING_MODELS[model_name]
        if isinstance(model, scoring.ConstantScoreModel):
            continue
        self.assertLess(
            1, len(set(model_scores.values())),
            msg=f'Model "{model_name}" has the same score for all personas.')
        scores_hash = json.dumps(model_scores, sort_keys=True)
        model_scores_hashes[scores_hash].add(model_name)
    models_with_same_score = [
        models for models in model_scores_hashes.values() if len(models) > 1]
    self.assertFalse(models_with_same_score, msg='Some models always have the same scores')

def _compute_diagnostic_topic_score_and_text(topic, scorers, scoring_project):
    """Create the score and text for a given diagnostic submetric on a given project.

    Args:
        topic: the diagnostic topic we wish to evaluate.
        scorers: a list of scorers for the given topic, with their template sentences.
        scoring_project: the project we want to score.
    Returns:
        the populated subdiagnostic protobuf, or None if no scorer could score the project.
    """
    topic_score = 0
    topic_weight = 0
    min_score = None
    max_score = None
    max_scorer = None
    min_scorer = None
    for scorer in scorers:
        model = scoring.get_scoring_model(scorer.trigger_scoring_model)
        if not model:
            logging.error(
                'Diagnostic for topic "%s" uses the scoring model "%s" which does not exist.',
                diagnostic_pb2.DiagnosticTopic.Name(topic), scorer.trigger_scoring_model)
            continue
        try:
            score = model.score(scoring_project)
        except scoring.NotEnoughDataException:
            continue
        # Use a default weight of 1.
        weight = scorer.weight or 1
        weighted_score = score * weight
        topic_score += weighted_score
        topic_weight += weight
        # Use a positive sentence only for scores above average.
        if score > _SCORE_AVERAGE:
            positive_score = (score - _SCORE_AVERAGE) * weight
            if max_score is None or positive_score > max_score:
                max_score = positive_score
                max_scorer = scorer
        # Use a negative sentence only for scores below average.
        else:
            negative_score = (_SCORE_AVERAGE - score) * weight
            if min_score is None or negative_score > min_score:
                min_score = negative_score
                min_scorer = scorer
    if not topic_weight:
        return None

    sub_diagnostic = diagnostic_pb2.SubDiagnostic(
        topic=topic, score=round(topic_score / topic_weight * 100 / 3))

    sentences = []

    def _append_sentence(template):
        translated_template = scoring_project.translate_string(template)
        sentences.append(scoring_project.populate_template(translated_template))

    # Do not put a positive sentence if the score is below 40.
    if max_scorer and sub_diagnostic.score > 40:
        _append_sentence(max_scorer.positive_sentence_template)
    # Do not put a negative sentence if the score is above 80.
    if min_scorer and sub_diagnostic.score < 80:
        _append_sentence(min_scorer.negative_sentence_template)
    sub_diagnostic.text = french.join_sentences_properly(sentences)
    return sub_diagnostic

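# A worked example of the weighted average above, assuming raw scores on the
# 0-3 scale implied by the `* 100 / 3` normalization and _SCORE_AVERAGE = 1.5
# (the midpoint of that scale): two scorers returning 1 (weight 1) and
# 3 (weight 2) yield
#     topic_score = 1*1 + 3*2 = 7 and topic_weight = 3,
# hence a submetric score of round(7 / 3 * 100 / 3) = 78. Since 40 < 78 < 80,
# both the positive sentence (from the scorer at 3) and the negative sentence
# (from the scorer at 1) are kept in the final text.
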
def compute_advices_for_project(user, project, database, scoring_timeout_seconds=3):
    """Advise on a user project.

    Args:
        user: the user's data, mainly used for their profile and features_enabled.
        project: the project data. It will not be modified.
        database: access to the MongoDB with market data.
        scoring_timeout_seconds: how long to wait for each scoring model before
            giving up on it.
    Returns:
        an Advices protobuffer containing a list of recommendations.
    """
    scoring_project = scoring.ScoringProject(
        project, user.profile, user.features_enabled, database, now=now.get())
    scores = {}
    reasons = {}
    advice_modules = _advice_modules(database)
    advice = project_pb2.Advices()
    for module in advice_modules:
        if not module.is_ready_for_prod and not user.features_enabled.alpha:
            continue
        scoring_model = scoring.get_scoring_model(module.trigger_scoring_model)
        if scoring_model is None:
            logging.warning(
                'Not able to score advice "%s", the scoring model "%s" is unknown.',
                module.advice_id, module.trigger_scoring_model)
            continue
        if user.features_enabled.all_modules:
            scores[module.advice_id] = 3
        else:
            thread = threading.Thread(
                target=_compute_score_and_reasons,
                args=(scores, reasons, module, scoring_model, scoring_project))
            thread.start()
            # TODO(pascal): Consider scoring different models in parallel.
            thread.join(timeout=scoring_timeout_seconds)
            if thread.is_alive():
                logging.warning(
                    'Timeout while scoring advice "%s" for:\n%s\n%s',
                    module.trigger_scoring_model, scoring_project.user_profile,
                    scoring_project.details)

    modules = sorted(
        advice_modules,
        key=lambda m: (scores.get(m.advice_id, 0), m.advice_id),
        reverse=True)
    incompatible_modules = set()
    for module in modules:
        if not scores.get(module.advice_id):
            # We can break as the others will have a 0 score as well.
            break
        if module.airtable_id in incompatible_modules and not user.features_enabled.all_modules:
            continue
        piece_of_advice = advice.advices.add()
        piece_of_advice.advice_id = module.advice_id
        piece_of_advice.num_stars = scores.get(module.advice_id)
        piece_of_advice.explanations.extend(
            scoring_project.populate_template(reason)
            for reason in reasons.get(module.advice_id, []))
        incompatible_modules.update(module.incompatible_advice_ids)
        _compute_extra_data(piece_of_advice, module, scoring_project)
        _maybe_override_advice_data(piece_of_advice, module, scoring_project)
    return advice

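# A minimal usage sketch, assuming the user_pb2/project_pb2 protos imported by
# the surrounding module and an open MongoDB handle (field values are
# illustrative):
#
#     user = user_pb2.User()
#     user.profile.name = 'Test'
#     project = user.projects.add(project_id='0')
#     advices = compute_advices_for_project(user, project, database)
#     for piece_of_advice in advices.advices:
#         print(piece_of_advice.advice_id, piece_of_advice.num_stars)
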
def _compute_available_methods(
        scoring_project: scoring.ScoringProject,
        method_modules: Iterable[advisor_pb2.AdviceModule],
        scoring_timeout_seconds: float) -> Generator[project_pb2.Advice, None, Set[str]]:
    scores: Dict[str, float] = {}
    reasons: Dict[str, List[str]] = {}
    missing_fields: Set[str] = set()
    for module in method_modules:
        if not module.is_ready_for_prod and not scoring_project.features_enabled.alpha:
            continue
        scoring_model = scoring.get_scoring_model(module.trigger_scoring_model)
        if scoring_model is None:
            logging.warning(
                'Not able to score advice "%s", the scoring model "%s" is unknown.',
                module.advice_id, module.trigger_scoring_model)
            continue
        if scoring_project.user.features_enabled.all_modules:
            scores[module.advice_id] = 3
        else:
            thread = threading.Thread(
                target=_compute_score_and_reasons,
                args=(scores, reasons, module, scoring_model, scoring_project, missing_fields))
            thread.start()
            # TODO(pascal): Consider scoring different models in parallel.
            thread.join(timeout=scoring_timeout_seconds)
            if thread.is_alive():
                logging.warning(
                    'Timeout while scoring advice "%s" for:\n%s',
                    module.trigger_scoring_model, scoring_project)

    modules = sorted(
        method_modules,
        key=lambda m: (scores.get(m.advice_id, 0), m.advice_id),
        reverse=True)
    incompatible_modules: Set[str] = set()
    has_module = False
    for module in modules:
        score = scores.get(module.advice_id)
        if not score:
            # We can break as the others will have a 0 score as well.
            break
        if module.airtable_id in incompatible_modules and \
                not scoring_project.user.features_enabled.all_modules:
            continue
        piece_of_advice = project_pb2.Advice(
            advice_id=module.advice_id,
            num_stars=score,
            is_for_alpha_only=not module.is_ready_for_prod)
        piece_of_advice.explanations.extend(
            scoring_project.populate_template(reason)
            for reason in reasons.get(module.advice_id, []))
        incompatible_modules.update(module.incompatible_advice_ids)
        _maybe_override_advice_data(piece_of_advice, module, scoring_project)
        has_module = True
        yield piece_of_advice
    if not has_module and method_modules:
        logging.warning(
            'We could not find *any* advice for a project:\nModules tried:\n"%s"\nProject:\n%s',
            '", "'.join(m.advice_id for m in method_modules), scoring_project)
    return missing_fields

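# The generator above returns its set of missing fields via StopIteration
# (PEP 380 semantics), so a caller either delegates with `yield from` or drains
# it explicitly. A minimal consumption sketch (the wrapper name is
# hypothetical, not from the source):
def _drain_available_methods(scoring_project, method_modules):
    advices = []
    generator = _compute_available_methods(scoring_project, method_modules, 3)
    while True:
        try:
            advices.append(next(generator))
        except StopIteration as stop:
            # stop.value carries the generator's return value: the missing fields.
            return advices, stop.value
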
def test_unknown_field(self):
    """Tries to create a filter based on an experiment that does not exist."""

    with self.assertRaises(ValueError):
        scoring.get_scoring_model('for-active-experiment(unknown)')