def run(self) -> beam.PCollection[job_run_result.JobRunResult]:
    collection_pairs = (
        self.pipeline
        | 'Get collection models' >> ndb_io.GetModels(
            collection_models.CollectionRightsModel.get_all())
        | 'Flatten owner_ids and format' >> beam.FlatMap(
            self._extract_user_and_collection_ids))

    user_pairs = (
        self.pipeline
        | 'Get all user settings models' >> ndb_io.GetModels(
            user_models.UserSettingsModel.get_all())
        | 'Extract id and email' >> beam.Map(
            lambda user_setting: (user_setting.id, user_setting.email)))

    collection_ids_to_email_mapping = (
        (collection_pairs, user_pairs)
        | 'Group by user_id' >> beam.CoGroupByKey()
        | 'Drop user id' >> beam.Values()  # pylint: disable=no-value-for-parameter
        | 'Filter out results without any collection' >> beam.Filter(
            lambda collection_ids_and_email: len(
                collection_ids_and_email[0]) > 0))

    return (
        collection_ids_to_email_mapping
        | 'Get final result' >> beam.MapTuple(
            lambda collection, email: job_run_result.JobRunResult.as_stdout(
                'collection_ids: %s, email: %s' % (collection, email))))
def run(self) -> beam.PCollection[job_run_result.JobRunResult]:
    user_settings_models = (
        self.pipeline
        | 'Get all UserSettingsModels' >> (
            ndb_io.GetModels(user_models.UserSettingsModel.get_all())))

    old_user_stats_models = (
        self.pipeline
        | 'Get all UserStatsModels' >> (
            ndb_io.GetModels(user_models.UserStatsModel.get_all())))

    # Creates UserStatsModels for the users that do not have one yet.
    new_user_stats_models = (
        (user_settings_models, old_user_stats_models)
        | 'Merge models' >> beam.Flatten()
        # Returns a PCollection of
        # (model.id, (user_settings_models, user_stats_models)) or
        # (model.id, (user_settings_models,)).
        | 'Group models with same ID' >> beam.GroupBy(lambda m: m.id)
        # Discards model.id from the PCollection.
        | 'Get rid of key' >> beam.Values()  # pylint: disable=no-value-for-parameter
        # Only keep groupings that indicate that the UserStatsModel is
        # missing.
        | 'Filter pairs of models' >> beam.Filter(
            lambda models: (
                len(list(models)) == 1 and
                isinstance(list(models)[0], user_models.UserSettingsModel)))
        # Choosing the first element.
        | 'Transform tuples into models' >> beam.Map(
            lambda models: list(models)[0])
        # Creates the missing UserStatsModels.
        | 'Create new user stat models' >> beam.ParDo(CreateUserStatsModel()))

    unused_put_result = (
        (new_user_stats_models, old_user_stats_models)
        | 'Merge new and old models together' >> beam.Flatten()
        | 'Update the dashboard stats' >> beam.ParDo(
            UpdateWeeklyCreatorStats())
        | 'Put models into the datastore' >> ndb_io.PutModels())

    new_user_stats_job_result = (
        new_user_stats_models
        | 'Count all new models' >> beam.combiners.Count.Globally()
        | 'Only create result for new models when > 0' >> (
            beam.Filter(lambda x: x > 0))
        | 'Create result for new models' >> beam.Map(
            lambda x: job_run_result.JobRunResult(
                stdout='SUCCESS NEW %s' % x)))

    old_user_stats_job_result = (
        old_user_stats_models
        | 'Count all old models' >> beam.combiners.Count.Globally()
        | 'Only create result for old models when > 0' >> (
            beam.Filter(lambda x: x > 0))
        | 'Create result for old models' >> beam.Map(
            lambda x: job_run_result.JobRunResult(
                stdout='SUCCESS OLD %s' % x)))

    return (
        (new_user_stats_job_result, old_user_stats_job_result)
        | 'Merge new and old results together' >> beam.Flatten())
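# A minimal, runnable sketch (plain Apache Beam, no Oppia dependencies;
# the IDs and labels are invented) of the Flatten -> GroupBy -> Values ->
# Filter pattern used above to find users whose stats model is missing:
# after grouping by ID, a group of size one can only have come from the
# settings collection.
import apache_beam as beam

with beam.Pipeline() as pipeline:
    settings = pipeline | 'Settings' >> beam.Create(
        [('settings', 'u1'), ('settings', 'u2')])
    stats = pipeline | 'Stats' >> beam.Create([('stats', 'u1')])
    missing = (
        (settings, stats)
        | beam.Flatten()
        | beam.GroupBy(lambda kind_and_id: kind_and_id[1])
        | beam.Values()
        # A group of size 1 means no stats entry existed for that ID.
        | beam.Filter(lambda group: len(list(group)) == 1)
        | beam.Map(lambda group: list(group)[0][1])
        | beam.Map(print))  # Prints: u2.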
def run(self) -> beam.PCollection[job_run_result.JobRunResult]: """Returns a PCollection of 'SUCCESS' or 'FAILURE' results from generating ExplorationOpportunitySummaryModel. Returns: PCollection. A PCollection of 'SUCCESS' or 'FAILURE' results from generating ExplorationOpportunitySummaryModel. """ topics = (self.pipeline | 'Get all non-deleted topic models' >> (ndb_io.GetModels( topic_models.TopicModel.get_all(include_deleted=False))) | 'Get topic from model' >> beam.Map( topic_fetchers.get_topic_from_model)) story_ids_to_story = ( self.pipeline | 'Get all non-deleted story models' >> ndb_io.GetModels( story_models.StoryModel.get_all(include_deleted=False)) | 'Get story from model' >> beam.Map( story_fetchers.get_story_from_model) | 'Combine stories and ids' >> beam.Map(lambda story: (story.id, story))) exp_ids_to_exp = ( self.pipeline | 'Get all non-deleted exp models' >> ndb_io.GetModels( exp_models.ExplorationModel.get_all(include_deleted=False)) | 'Get exploration from model' >> beam.Map( exp_fetchers.get_exploration_from_model) | 'Combine exploration and ids' >> beam.Map(lambda exp: (exp.id, exp))) stories_dict = beam.pvalue.AsDict(story_ids_to_story) exps_dict = beam.pvalue.AsDict(exp_ids_to_exp) opportunities_results = ( topics | beam.Map(self._generate_opportunities_related_to_topic, stories_dict=stories_dict, exps_dict=exps_dict)) unused_put_result = ( opportunities_results | 'Filter the results with SUCCESS status' >> beam.Filter(lambda result: result.is_ok()) | 'Fetch the models to be put' >> beam.FlatMap(lambda result: result.unwrap()) | 'Add ID as a key' >> beam.WithKeys(lambda model: model.id) # pylint: disable=no-value-for-parameter | 'Allow only one item per key' >> (beam.combiners.Sample.FixedSizePerKey(1)) | 'Remove the IDs' >> beam.Values() # pylint: disable=no-value-for-parameter | 'Flatten the list of lists of models' >> beam.FlatMap(lambda x: x) | 'Put models into the datastore' >> ndb_io.PutModels()) return (opportunities_results | 'Count the output' >> (job_result_transforms.ResultsToJobRunResults()))
def run(self) -> beam.PCollection[job_run_result.JobRunResult]: """Generates the translation contributins stats. Returns: PCollection. A PCollection of 'SUCCESS x' results, where x is the number of generated stats.. """ suggestions_grouped_by_target = ( self.pipeline | 'Get all non-deleted suggestion models' >> ndb_io.GetModels( suggestion_models.GeneralSuggestionModel.get_all( include_deleted=False)) # We need to window the models so that CoGroupByKey below # works properly. | 'Filter translate suggestions' >> beam.Filter(lambda m: ( m.suggestion_type == feconf.SUGGESTION_TYPE_TRANSLATE_CONTENT)) | 'Transform to suggestion domain object' >> beam.Map( suggestion_services.get_suggestion_from_model) | 'Group by target' >> beam.GroupBy(lambda m: m.target_id)) exp_opportunities = ( self.pipeline | 'Get all non-deleted opportunity models' >> ndb_io.GetModels( opportunity_models.ExplorationOpportunitySummaryModel.get_all( include_deleted=False)) # We need to window the models so that CoGroupByKey below # works properly. | 'Transform to opportunity domain object' >> beam.Map(opportunity_services. get_exploration_opportunity_summary_from_model) | 'Group by ID' >> beam.GroupBy(lambda m: m.id)) new_user_stats_models = ( { 'suggestion': suggestions_grouped_by_target, 'opportunity': exp_opportunities } | 'Merge models' >> beam.CoGroupByKey() | 'Get rid of key' >> beam.Values() # pylint: disable=no-value-for-parameter | 'Generate stats' >> beam.ParDo(lambda x: self._generate_stats( x['suggestion'][0] if len(x['suggestion']) else [], list(x['opportunity'][0])[0] if len(x['opportunity']) else None)) | 'Combine the stats' >> beam.CombinePerKey(CombineStats()) | 'Generate models from stats' >> beam.MapTuple( self._generate_translation_contribution_model)) unused_put_result = ( new_user_stats_models | 'Put models into the datastore' >> ndb_io.PutModels()) return (new_user_stats_models | 'Count all new models' >> (beam.combiners.Count.Globally().without_defaults()) | 'Only create result for new models when > 0' >> (beam.Filter(lambda x: x > 0)) | 'Create result for new models' >> beam.Map(lambda x: job_run_result.JobRunResult( stdout='SUCCESS %s' % x)))
def run(self) -> beam.PCollection[job_run_result.JobRunResult]: """Returns a PCollection of 'SUCCESS' or 'FAILURE' results from generating ExplorationOpportunitySummaryModel. Returns: PCollection. A PCollection of 'SUCCESS' or 'FAILURE' results from generating ExplorationOpportunitySummaryModel. """ topics = (self.pipeline | 'Get all non-deleted topic models' >> (ndb_io.GetModels( topic_models.TopicModel.get_all(include_deleted=False))) | 'Get topic from model' >> beam.Map( topic_fetchers.get_topic_from_model)) story_ids_to_story = ( self.pipeline | 'Get all non-deleted story models' >> ndb_io.GetModels( story_models.StoryModel.get_all(include_deleted=False)) | 'Get story from model' >> beam.Map( story_fetchers.get_story_from_model) | 'Combine stories and ids' >> beam.Map(lambda story: (story.id, story))) exp_ids_to_exp = ( self.pipeline | 'Get all non-deleted exp models' >> ndb_io.GetModels( exp_models.ExplorationModel.get_all(include_deleted=False)) | 'Get exploration from model' >> beam.Map( exp_fetchers.get_exploration_from_model) | 'Combine exploration and ids' >> beam.Map(lambda exp: (exp.id, exp))) stories_dict = beam.pvalue.AsDict(story_ids_to_story) exps_dict = beam.pvalue.AsDict(exp_ids_to_exp) opportunities_results = ( topics | beam.Map(self._generate_opportunities_related_to_topic, stories_dict=stories_dict, exps_dict=exps_dict)) unused_put_result = ( opportunities_results | 'Filter the results with SUCCESS status' >> beam.Filter(lambda result: result['status'] == 'SUCCESS') | 'Fetch the models to be put' >> beam.FlatMap(lambda result: result['models']) | 'Put models into the datastore' >> ndb_io.PutModels()) return (opportunities_results | 'Fetch the job results' >> beam.Map(lambda result: result['job_result']))
def run(self) -> beam.PCollection[job_run_result.JobRunResult]:
    # PCollection that returns the relevant config property with batch
    # index.
    config_property = (
        self.pipeline
        | 'Get all config properties' >> ndb_io.GetModels(
            config_models.ConfigPropertyModel.get_all())
        | 'Get the batch_index_for_mailchimp property value' >> beam.Filter(
            lambda model: model.id == 'batch_index_for_mailchimp')
        | 'Get value' >> beam.Map(lambda model: model.value))

    batch_index_dict = beam.pvalue.AsSingleton(config_property)

    # PCollection with all user ids that have opted in for email
    # newsletters.
    relevant_user_ids = (
        self.pipeline
        | 'Get all UserEmailPreferencesModel' >> ndb_io.GetModels(
            user_models.UserEmailPreferencesModel.get_all().filter(
                user_models.UserEmailPreferencesModel.site_updates == True  # pylint: disable=singleton-comparison
            ))
        | 'Extract user ID' >> beam.Map(
            lambda preferences_model: preferences_model.id))

    valid_user_ids = beam.pvalue.AsIter(relevant_user_ids)

    # PCollection of all user emails opted in for newsletters.
    relevant_user_emails = (
        self.pipeline
        | 'Get all user settings models' >> ndb_io.GetModels(
            user_models.UserSettingsModel.get_all())
        | 'Filter user models' >> (
            beam.Filter(
                lambda model, ids: model.id in ids, ids=valid_user_ids))
        | 'Get email' >> (beam.Map(lambda model: model.email)))

    mailchimp_results = (
        relevant_user_emails
        # A large batch size is given so that all emails are included in
        # a single list.
        | 'Combine into a list' >> beam.CombineGlobally(CombineItems())
        | 'Send mailchimp request for current batch' >> beam.ParDo(
            SendBatchMailchimpRequest(),
            batch_index_dict=batch_index_dict,
            test_run=True)
        | 'Get final result' >> beam.Map(
            lambda result: job_run_result.JobRunResult.as_stdout(
                result.value)))

    return mailchimp_results
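# A minimal sketch (plain Apache Beam; the data is invented) of the
# beam.pvalue.AsIter side-input pattern used above: an allow-list
# PCollection is passed to Filter as an extra argument and consulted for
# every element.
import apache_beam as beam

with beam.Pipeline() as pipeline:
    opted_in_ids = pipeline | 'Opted in' >> beam.Create(['u1', 'u3'])
    users = pipeline | 'Users' >> beam.Create(
        [('u1', 'a@example.com'), ('u2', 'b@example.com'),
         ('u3', 'c@example.com')])
    _ = (
        users
        | beam.Filter(
            lambda user, ids: user[0] in ids,
            ids=beam.pvalue.AsIter(opted_in_ids))
        | beam.Map(print))  # Prints the u1 and u3 rows only.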
def run(self) -> beam.PCollection[job_run_result.JobRunResult]: """Returns a PCollection of 'SUCCESS' or 'FAILURE' results from deleting ExplorationOpportunitySummaryModel. Returns: PCollection. A PCollection of 'SUCCESS' or 'FAILURE' results from deleting ExplorationOpportunitySummaryModel. """ exp_opportunity_summary_model = ( self.pipeline | 'Get all non-deleted opportunity models' >> ndb_io.GetModels( opportunity_models.ExplorationOpportunitySummaryModel.get_all( include_deleted=False)) ) unused_delete_result = ( exp_opportunity_summary_model | beam.Map(lambda model: model.key) | 'Delete all models' >> ndb_io.DeleteModels() ) return ( exp_opportunity_summary_model | 'Create job run result' >> ( job_result_transforms.CountObjectsToJobRunResult()) )
def run(self) -> beam.PCollection[job_run_result.JobRunResult]:
    exp_models_pcoll = (
        self.pipeline
        | 'Get all ExplorationModels' >> ndb_io.GetModels(
            exp_models.ExplorationModel.get_all()))

    exp_models_filtered = (
        exp_models_pcoll
        | 'Filter Math ExplorationModels' >> beam.Filter(
            self.contains_math_interactions))

    exp_models_with_states = (
        exp_models_filtered
        | 'Mapping exp_ids with states' >> (
            beam.FlatMap(self.flat_map_exp_with_states)))

    exp_models_with_states_filtered = (
        exp_models_with_states
        | 'Filtering out states without math interactions' >> (
            beam.Filter(
                lambda tup: tup[2]['interaction']['id'] in
                feconf.MATH_INTERACTION_IDS)))

    exp_models_with_states_and_rules = (
        exp_models_with_states_filtered
        | 'Mapping with rule types list' >> (
            beam.Map(self.map_with_rule_types)))

    return (
        exp_models_with_states_and_rules
        | 'Final output' >> beam.Map(job_run_result.JobRunResult.as_stdout))
def run(self) -> beam.PCollection[job_run_result.JobRunResult]: """Returns a PCollection of 'SUCCESS' or 'FAILURE' results from deleting ExplorationOpportunitySummaryModel. Returns: PCollection. A PCollection of 'SUCCESS' or 'FAILURE' results from deleting ExplorationOpportunitySummaryModel. """ exp_opportunity_summary_model = ( self.pipeline | 'Get all non-deleted opportunity models' >> ndb_io.GetModels( opportunity_models.ExplorationOpportunitySummaryModel.get_all( include_deleted=False))) unused_delete_result = (exp_opportunity_summary_model | beam.Map(lambda model: model.key) | 'Delete all models' >> ndb_io.DeleteModels()) return (exp_opportunity_summary_model | 'Count all new models' >> beam.combiners.Count.Globally() | 'Only create result for new models when > 0' >> (beam.Filter(lambda n: n > 0)) | 'Create result for new models' >> beam.Map(lambda n: job_run_result.JobRunResult( stdout='SUCCESS %s' % n)))
def run(self) -> beam.PCollection[job_run_result.JobRunResult]: """Returns a PCollection of 'SUCCESS' or 'FAILURE' results from the Elastic Search. Returns: PCollection. A PCollection of 'SUCCESS' or 'FAILURE' results from the Elastic Search. """ exp_summary_models = ( self.pipeline | 'Get all non-deleted models' >> (ndb_io.GetModels(exp_models.ExpSummaryModel.get_all()))) exp_summary_iter = beam.pvalue.AsIter(exp_summary_models) exp_recommendations_models = ( exp_summary_models | 'Compute similarity' >> beam.ParDo(ComputeSimilarity(), exp_summary_iter) | 'Group similarities per exploration ID' >> beam.GroupByKey() | 'Sort and slice similarities' >> beam.MapTuple( lambda exp_id, similarities: (exp_id, self._sort_and_slice_similarities(similarities))) | 'Create recommendation models' >> beam.MapTuple( self._create_recommendation)) unused_put_result = ( exp_recommendations_models | 'Put models into the datastore' >> ndb_io.PutModels()) return (exp_recommendations_models | 'Create job run result' >> (job_result_transforms.CountObjectsToJobRunResult()))
def run(
    self
) -> beam.PCollection[blog_validation_errors.DuplicateBlogTitleError]:
    return (
        self.pipeline
        | 'Get every Blog Summary Model' >> (
            ndb_io.GetModels(blog_models.BlogPostSummaryModel.query()))
        | GetModelsWithDuplicatePropertyValues('title')
        | 'Flatten models into a list of errors' >> beam.FlatMap(
            lambda models: [
                blog_validation_errors.DuplicateBlogTitleError(model)
                for model in models
            ]))
def run(
    self
) -> beam.PCollection[blog_validation_errors.DuplicateBlogUrlError]:
    return (
        self.pipeline
        | 'Get every Blog Post Model' >> (
            ndb_io.GetModels(blog_models.BlogPostModel.query()))
        | GetModelsWithDuplicatePropertyValues('url_fragment')
        | 'Flatten models into a list of errors' >> beam.FlatMap(
            lambda models: [
                blog_validation_errors.DuplicateBlogUrlError(model)
                for model in models
            ]))
def test_read_from_datastore(self) -> None:
    model_list = [
        self.create_model(base_models.BaseModel, id='a'),
        self.create_model(base_models.BaseModel, id='b'),
        self.create_model(base_models.BaseModel, id='c'),
    ]
    self.put_multi(model_list)

    self.assertItemsEqual(self.get_base_models(), model_list)  # type: ignore[no-untyped-call]

    model_pcoll = (
        self.pipeline
        | ndb_io.GetModels(base_models.BaseModel.get_all()))

    self.assert_pcoll_equal(model_pcoll, model_list)
def run(self) -> beam.PCollection[job_run_result.JobRunResult]: """Returns a PCollection of 'SUCCESS' or 'FAILURE' results from the Elastic Search. Returns: PCollection. A PCollection of 'SUCCESS' or 'FAILURE' results from the Elastic Search. """ return ( self.pipeline | 'Get all non-deleted models' >> (ndb_io.GetModels( exp_models.ExpSummaryModel.get_all(include_deleted=False))) | 'Split models into batches' >> beam.transforms.util.BatchElements( max_batch_size=self.MAX_BATCH_SIZE) | 'Index batches of models' >> beam.ParDo( IndexExplorationSummaries()))
def run(self) -> beam.PCollection[job_run_result.JobRunResult]: """Returns PCollection of invalid explorations with their id and actual length. Returns: PCollection. Returns PCollection of invalid explorations with their id and actual length. """ total_explorations = ( self.pipeline | 'Get all ExplorationModels' >> ndb_io.GetModels( exp_models.ExplorationModel.get_all(include_deleted=False)) | 'Get exploration from model' >> beam.Map( exp_fetchers.get_exploration_from_model)) exp_ids_with_exceeding_max_title_len = ( total_explorations | 'Combine exploration title and ids' >> beam.Map(lambda exp: (exp.id, exp.title)) | 'Filter exploraton with title length greater than 36' >> beam.Filter(lambda exp: len(exp[1]) > 36)) report_number_of_exps_queried = ( total_explorations | 'Report count of exp models' >> (job_result_transforms.CountObjectsToJobRunResult('EXPS'))) report_number_of_invalid_exps = ( exp_ids_with_exceeding_max_title_len | 'Report count of invalid exp models' >> (job_result_transforms.CountObjectsToJobRunResult('INVALID'))) report_invalid_ids_and_their_actual_len = ( exp_ids_with_exceeding_max_title_len | 'Save info on invalid exps' >> beam.Map(lambda objects: job_run_result.JobRunResult. as_stderr('The id of exp is %s and its actual len is %s' % (objects[0], len(objects[1]))))) return ((report_number_of_exps_queried, report_number_of_invalid_exps, report_invalid_ids_and_their_actual_len) | 'Combine results' >> beam.Flatten())
def run(self) -> beam.PCollection[job_run_result.JobRunResult]: """Returns a PCollection of 'SUCCESS' or 'FAILURE' results from matching entity_type as collection. Returns: PCollection. A PCollection of 'SUCCESS' or 'FAILURE' results from matching entity_type as collection. """ feedback_model_matched_as_collection = ( self.pipeline | 'Get all GeneralFeedbackThread models' >> ndb_io.GetModels( feedback_models.GeneralFeedbackThreadModel.get_all()) | 'Extract entity_type' >> beam.Map(lambda feeback_model: feeback_model.entity_type) | 'Match entity_type' >> beam.Filter(lambda entity_type: entity_type == 'collection')) return (feedback_model_matched_as_collection | 'Count the output' >> (job_result_transforms.CountObjectsToJobRunResult()))
def run(self) -> beam.PCollection[job_run_result.JobRunResult]: """Generates the translation contributins stats. Returns: PCollection. A PCollection of 'SUCCESS x' results, where x is the number of generated stats.. """ suggestions_grouped_by_target = ( self.pipeline | 'Get all non-deleted suggestion models' >> ndb_io.GetModels( suggestion_models.GeneralSuggestionModel.get_all( include_deleted=False)) # We need to window the models so that CoGroupByKey below # works properly. | 'Filter translate suggestions' >> beam.Filter(lambda m: ( m.suggestion_type == feconf.SUGGESTION_TYPE_TRANSLATE_CONTENT)) | 'Transform to suggestion domain object' >> beam.Map( suggestion_services.get_suggestion_from_model) | 'Group by target' >> beam.GroupBy(lambda m: m.target_id)) exp_opportunities = ( self.pipeline | 'Get all non-deleted opportunity models' >> ndb_io.GetModels( opportunity_models.ExplorationOpportunitySummaryModel.get_all( include_deleted=False)) # We need to window the models so that CoGroupByKey below # works properly. | 'Transform to opportunity domain object' >> beam.Map(opportunity_services. get_exploration_opportunity_summary_from_model) | 'Group by ID' >> beam.GroupBy(lambda m: m.id)) user_stats_results = ( { 'suggestion': suggestions_grouped_by_target, 'opportunity': exp_opportunities } | 'Merge models' >> beam.CoGroupByKey() | 'Get rid of key' >> beam.Values() # pylint: disable=no-value-for-parameter | 'Generate stats' >> beam.ParDo(lambda x: self._generate_stats( x['suggestion'][0] if len(x['suggestion']) else [], list(x['opportunity'][0])[0] if len(x['opportunity']) else None))) user_stats_models = ( user_stats_results | 'Filter ok results' >> beam.Filter(lambda key_and_result: key_and_result[1].is_ok()) | 'Unpack result' >> beam.MapTuple(lambda key, result: (key, result.unwrap())) | 'Combine the stats' >> beam.CombinePerKey(CombineStats()) | 'Generate models from stats' >> beam.MapTuple( self._generate_translation_contribution_model)) user_stats_error_job_run_results = ( user_stats_results | 'Filter err results' >> beam.Filter(lambda key_and_result: key_and_result[1].is_err()) # Pylint disable is needed because pylint is not able to correctly # detect that the value is passed through the pipe. | 'Remove keys' >> beam.Values() # pylint: disable=no-value-for-parameter | 'Transform result to job run result' >> (job_result_transforms.ResultsToJobRunResults())) unused_put_result = ( user_stats_models | 'Put models into the datastore' >> ndb_io.PutModels()) user_stats_models_job_run_results = ( user_stats_models | 'Create job run result' >> (job_result_transforms.CountObjectsToJobRunResult())) return ((user_stats_error_job_run_results, user_stats_models_job_run_results) | 'Merge job run results' >> beam.Flatten())
def run(self): """Returns a PCollection of audit errors aggregated from all models. Returns: PCollection. A PCollection of audit errors discovered during the audit. """ existing_models, deleted_models = ( self.pipeline | 'Get all models' >> (ndb_io.GetModels(datastore_services.query_everything())) | 'Partition by model.deleted' >> (beam.Partition(lambda model, _: int(model.deleted), 2))) models_of_kind_by_index = ( existing_models # NOTE: Partition returns a statically-sized list of PCollections. # Creating partitions is wasteful when there are fewer items than # there are partitions, like in our unit tests. In exchange, in # production the job will be able to take advantage of the high # parallelizability of PCollections, which are designed for enormous # datasets and parallel processing. # # Alternatively, we could have used GroupBy. However, that returns # an _iterable_ of items rather than a PCollection, and so it is # vulnerable to out-of-memory errors. # # Since this job is concerned with running audits on EVERY MODEL IN # STORAGE, Partition is the clear winner regardless of the overhead # we'll see in unit tests. | 'Split models into parallelizable PCollections' >> beam.Partition( lambda m, _, kinds: kinds.index(job_utils.get_model_kind(m)), # NOTE: Partition requires a hard-coded number of slices; it # cannot be used with dynamic numbers generated in a pipeline. # KIND_BY_INDEX is a constant tuple so that requirement is # satisfied in this case. len(KIND_BY_INDEX), KIND_BY_INDEX)) existing_key_count_pcolls = [] missing_key_error_pcolls = [] audit_error_pcolls = [ deleted_models | 'Apply ValidateDeletedModel on deleted models' >> (beam.ParDo(base_validation.ValidateDeletedModel())) ] model_groups = python_utils.ZIP(KIND_BY_INDEX, models_of_kind_by_index) for kind, models_of_kind in model_groups: audit_error_pcolls.extend(models_of_kind | ApplyAuditDoFns(kind)) if kind in ALL_MODEL_KINDS_REFERENCED_BY_PROPERTIES: existing_key_count_pcolls.append( models_of_kind | GetExistingModelKeyCounts(kind)) if kind in ID_REFERENCING_PROPERTIES_BY_KIND_OF_POSSESSOR: missing_key_error_pcolls.extend( models_of_kind | GetMissingModelKeyErrors(kind)) existing_key_counts = ( existing_key_count_pcolls | 'Flatten PCollections of existing key counts' >> beam.Flatten()) missing_key_errors = ( missing_key_error_pcolls | 'Flatten PCollections of missing key errors' >> beam.Flatten()) audit_error_pcolls.append( (existing_key_counts, missing_key_errors) | 'Group counts and errors by key' >> beam.CoGroupByKey() | 'Filter keys without any errors' >> (beam.FlatMapTuple(self._get_model_relationship_errors))) return audit_error_pcolls | 'Combine audit results' >> beam.Flatten()
def run(self) -> beam.PCollection[job_run_result.JobRunResult]: """Returns a PCollection of results from the story migration. Returns: PCollection. A PCollection of results from the story migration. """ unmigrated_story_models = ( self.pipeline | 'Get all non-deleted story models' >> ( ndb_io.GetModels(story_models.StoryModel.get_all())) # Pylint disable is needed because pylint is not able to correctly # detect that the value is passed through the pipe. | 'Add story keys' >> beam.WithKeys( # pylint: disable=no-value-for-parameter lambda story_model: story_model.id) ) story_summary_models = ( self.pipeline | 'Get all non-deleted story summary models' >> ( ndb_io.GetModels(story_models.StorySummaryModel.get_all())) # Pylint disable is needed because pylint is not able to correctly # detect that the value is passed through the pipe. | 'Add story summary keys' >> beam.WithKeys( # pylint: disable=no-value-for-parameter lambda story_summary_model: story_summary_model.id) ) topics = ( self.pipeline | 'Get all non-deleted topic models' >> ( ndb_io.GetModels(topic_models.TopicModel.get_all())) | 'Transform model into domain object' >> beam.Map( topic_fetchers.get_topic_from_model) # Pylint disable is needed because pylint is not able to correctly # detect that the value is passed through the pipe. | 'Add topic keys' >> beam.WithKeys( # pylint: disable=no-value-for-parameter lambda topic: topic.id) ) topic_id_to_topic = beam.pvalue.AsDict(topics) migrated_story_results = ( unmigrated_story_models | 'Transform and migrate model' >> beam.MapTuple( self._migrate_story, topic_id_to_topic=topic_id_to_topic) ) migrated_stories = ( migrated_story_results | 'Filter oks' >> beam.Filter( lambda result_item: result_item.is_ok()) | 'Unwrap ok' >> beam.Map( lambda result_item: result_item.unwrap()) ) migrated_story_job_run_results = ( migrated_story_results | 'Generate results for migration' >> ( job_result_transforms.ResultsToJobRunResults('STORY PROCESSED')) ) story_changes = ( unmigrated_story_models | 'Generate story changes' >> beam.FlatMapTuple( self._generate_story_changes) ) story_objects_list = ( { 'story_model': unmigrated_story_models, 'story_summary_model': story_summary_models, 'story': migrated_stories, 'story_change': story_changes } | 'Merge objects' >> beam.CoGroupByKey() | 'Get rid of ID' >> beam.Values() # pylint: disable=no-value-for-parameter | 'Remove unmigrated stories' >> beam.Filter( lambda x: len(x['story_change']) > 0 and len(x['story']) > 0) | 'Reorganize the story objects' >> beam.Map(lambda objects: { 'story_model': objects['story_model'][0], 'story_summary_model': objects['story_summary_model'][0], 'story': objects['story'][0], 'story_change': objects['story_change'][0] }) ) story_objects_list_job_run_results = ( story_objects_list | 'Transform story objects into job run results' >> ( job_result_transforms.CountObjectsToJobRunResult( 'STORY MIGRATED')) ) cache_deletion_job_run_results = ( story_objects_list | 'Delete story from cache' >> beam.Map( lambda story_objects: self._delete_story_from_cache( story_objects['story'])) | 'Generate results for cache deletion' >> ( job_result_transforms.ResultsToJobRunResults('CACHE DELETION')) ) story_models_to_put = ( story_objects_list | 'Generate story models to put' >> beam.FlatMap( lambda story_objects: self._update_story( story_objects['story_model'], story_objects['story'], story_objects['story_change'], )) ) story_summary_models_to_put = ( story_objects_list | 'Generate story summary models to put' >> beam.Map( lambda story_objects: 
self._update_story_summary( story_objects['story'], story_objects['story_summary_model'] )) ) unused_put_results = ( (story_models_to_put, story_summary_models_to_put) | 'Merge models' >> beam.Flatten() | 'Put models into the datastore' >> ndb_io.PutModels() ) return ( ( cache_deletion_job_run_results, migrated_story_job_run_results, story_objects_list_job_run_results ) | beam.Flatten() )
def run(self) -> beam.PCollection[job_run_result.JobRunResult]: """Returns a PCollection of 'SUCCESS' or 'FAILURE' results from generating SkillOpportunityModel. Returns: PCollection. A PCollection of 'SUCCESS' or 'FAILURE' results from generating SkillOpportunityModel. """ question_skill_link_models = ( self.pipeline | 'Get all non-deleted QuestionSkillLinkModels' >> (ndb_io.GetModels( question_models.QuestionSkillLinkModel.get_all( include_deleted=False))) | 'Group QuestionSkillLinkModels by skill ID' >> beam.GroupBy(lambda n: n.skill_id)) skills = ( self.pipeline | 'Get all non-deleted SkillModels' >> (ndb_io.GetModels( skill_models.SkillModel.get_all(include_deleted=False))) | 'Get skill object from model' >> beam.Map( skill_fetchers.get_skill_from_model) | 'Group skill objects by skill ID' >> beam.GroupBy(lambda m: m.id)) skills_with_question_counts = ( { 'skill': skills, 'question_skill_links': question_skill_link_models } | 'Merge by skill ID' >> beam.CoGroupByKey() # Pylint disable is needed because pylint is not able to correctly # detect that the value is passed through the pipe. | 'Remove skill IDs' >> beam.Values() # pylint: disable=no-value-for-parameter # We are using itertools.chain.from_iterable to flatten # question_skill_links from a 2D list into a 1D list. | 'Flatten skill and question_skill_links' >> beam.Map( lambda object: { 'skill': list(object['skill'][0])[0], 'question_skill_links': list( itertools.chain.from_iterable(object[ 'question_skill_links'])) })) opportunities_results = ( skills_with_question_counts | beam.Map(lambda object: self._create_skill_opportunity_model( object['skill'], object['question_skill_links']))) unused_put_result = ( opportunities_results | 'Filter the results with OK status' >> beam.Filter(lambda result: result.is_ok()) | 'Fetch the models to be put' >> beam.Map(lambda result: result.unwrap()) | 'Put models into the datastore' >> ndb_io.PutModels()) return (opportunities_results | 'Transform Results to JobRunResults' >> (job_result_transforms.ResultsToJobRunResults()))
def run(self) -> beam.PCollection[job_run_result.JobRunResult]: """Returns a PCollection of results from the skill migration. Returns: PCollection. A PCollection of results from the skill migration. """ unmigrated_skill_models = ( self.pipeline | 'Get all non-deleted skill models' >> (ndb_io.GetModels(skill_models.SkillModel.get_all())) # Pylint disable is needed because pylint is not able to correctly # detect that the value is passed through the pipe. | 'Add skill model ID' >> beam.WithKeys( # pylint: disable=no-value-for-parameter lambda skill_model: skill_model.id)) skill_summary_models = ( self.pipeline | 'Get all non-deleted skill summary models' >> (ndb_io.GetModels(skill_models.SkillSummaryModel.get_all())) # Pylint disable is needed because pylint is not able to correctly # detect that the value is passed through the pipe. | 'Add skill summary ID' >> beam.WithKeys( # pylint: disable=no-value-for-parameter lambda skill_summary_model: skill_summary_model.id)) migrated_skill_results = (unmigrated_skill_models | 'Transform and migrate model' >> beam.MapTuple(self._migrate_skill)) migrated_skills = ( migrated_skill_results | 'Filter oks' >> beam.Filter(lambda result_item: result_item.is_ok()) | 'Unwrap ok' >> beam.Map(lambda result_item: result_item.unwrap())) migrated_skill_job_run_results = ( migrated_skill_results | 'Generate results for migration' >> (job_result_transforms.ResultsToJobRunResults('SKILL PROCESSED'))) skill_changes = (unmigrated_skill_models | 'Generate skill changes' >> beam.FlatMapTuple( self._generate_skill_changes)) skill_objects_list = ( { 'skill_model': unmigrated_skill_models, 'skill_summary_model': skill_summary_models, 'skill': migrated_skills, 'skill_changes': skill_changes } | 'Merge objects' >> beam.CoGroupByKey() | 'Get rid of ID' >> beam.Values() # pylint: disable=no-value-for-parameter | 'Remove unmigrated skills' >> beam.Filter( lambda x: len(x['skill_changes']) > 0 and len(x['skill']) > 0) | 'Reorganize the skill objects' >> beam.Map( lambda objects: { 'skill_model': objects['skill_model'][0], 'skill_summary_model': objects['skill_summary_model'][0], 'skill': objects['skill'][0], 'skill_changes': objects['skill_changes'] })) skill_objects_list_job_run_results = ( skill_objects_list | 'Transform skill objects into job run results' >> (job_result_transforms.CountObjectsToJobRunResult('SKILL MIGRATED') )) cache_deletion_job_run_results = ( skill_objects_list | 'Delete skill from cache' >> beam.Map(lambda skill_object: self._delete_skill_from_cache( skill_object['skill'])) | 'Generate results for cache deletion' >> (job_result_transforms.ResultsToJobRunResults('CACHE DELETION'))) skill_models_to_put = ( skill_objects_list | 'Generate skill models to put' >> beam.FlatMap(lambda skill_objects: self._update_skill( skill_objects['skill_model'], skill_objects['skill'], skill_objects['skill_changes'], ))) skill_summary_models_to_put = ( skill_objects_list | 'Generate skill summary models to put' >> beam.Map(lambda skill_objects: self._update_skill_summary( skill_objects['skill'], skill_objects['skill_summary_model']))) unused_put_results = ( (skill_models_to_put, skill_summary_models_to_put) | 'Merge models' >> beam.Flatten() | 'Put models into the datastore' >> ndb_io.PutModels()) return ( (cache_deletion_job_run_results, migrated_skill_job_run_results, skill_objects_list_job_run_results) | beam.Flatten())
def run(self) -> beam.PCollection[job_run_result.JobRunResult]:
    deleted_user_ids_collection = (
        self.pipeline
        | 'Get all deleted user models' >> ndb_io.GetModels(
            user_models.DeletedUserModel.get_all())
        | 'Extract user IDs' >> beam.Map(
            lambda deleted_user_model: deleted_user_model.id))
    deleted_user_ids = beam.pvalue.AsIter(deleted_user_ids_collection)

    sent_email_models_to_delete = (
        self.pipeline
        | 'Get all sent email models' >> ndb_io.GetModels(
            email_models.SentEmailModel.get_all())
        | 'Filter sent email models that belong to deleted users' >> (
            beam.Filter(
                lambda model, ids: (
                    model.sender_id in ids or model.recipient_id in ids),
                ids=deleted_user_ids)))
    sent_email_models_to_delete_result = (
        sent_email_models_to_delete
        | 'Count sent email models to be deleted' >> (
            job_result_transforms.CountObjectsToJobRunResult('SENT EMAILS')))

    bulk_email_models_to_delete = (
        self.pipeline
        | 'Get all bulk email models' >> ndb_io.GetModels(
            email_models.BulkEmailModel.get_all())
        | 'Filter bulk email models that belong to deleted users' >> (
            beam.Filter(
                lambda model, ids: model.sender_id in ids,
                ids=deleted_user_ids)))
    bulk_email_models_to_delete_result = (
        bulk_email_models_to_delete
        | 'Count bulk email models to be deleted' >> (
            job_result_transforms.CountObjectsToJobRunResult('BULK EMAILS')))

    unsent_feedback_email_models_to_delete = (
        self.pipeline
        | 'Get all unsent feedback models' >> ndb_io.GetModels(
            feedback_models.UnsentFeedbackEmailModel.get_all())
        | 'Filter unsent feedback models that belong to deleted users' >> (
            beam.Filter(
                lambda model, ids: model.id in ids, ids=deleted_user_ids)))
    unsent_feedback_email_models_to_delete_result = (
        unsent_feedback_email_models_to_delete
        | 'Count unsent feedback email models to be deleted' >> (
            job_result_transforms.CountObjectsToJobRunResult(
                'FEEDBACK EMAILS')))

    user_bulk_emails_models_to_delete = (
        self.pipeline
        | 'Get all user bulk email models' >> ndb_io.GetModels(
            user_models.UserBulkEmailsModel.get_all())
        | 'Filter user bulk email models that belong to deleted users' >> (
            beam.Filter(
                lambda model, ids: model.id in ids, ids=deleted_user_ids)))
    user_bulk_emails_models_to_delete_result = (
        user_bulk_emails_models_to_delete
        | 'Count user bulk email models to be deleted' >> (
            job_result_transforms.CountObjectsToJobRunResult(
                'USER BULK EMAILS')))

    unused_models_deletion = (
        (
            sent_email_models_to_delete,
            bulk_email_models_to_delete,
            unsent_feedback_email_models_to_delete,
            user_bulk_emails_models_to_delete
        )
        | 'Merge models' >> beam.Flatten()
        | 'Extract keys' >> beam.Map(lambda model: model.key)
        | 'Delete models' >> ndb_io.DeleteModels())

    return (
        (
            sent_email_models_to_delete_result,
            bulk_email_models_to_delete_result,
            unsent_feedback_email_models_to_delete_result,
            user_bulk_emails_models_to_delete_result,
        )
        | 'Merge results' >> beam.Flatten())