log = create_logger(__name__)


async def _start_workflow(stories_id: int) -> None:
    """Start the podcast transcription workflow for a single story.

    A workflow stub is created with the story ID (as a string) as its workflow ID, and the stub's
    ``transcribe_episode`` method is started with the story ID as its only argument.

    :param stories_id: ID of the story to pass to the workflow.
    """
    log.info(f"Starting a workflow for story {stories_id}...")

    wf_client = workflow_client()
    options = WorkflowOptions(workflow_id=str(stories_id))
    stub: PodcastTranscribeWorkflow = wf_client.new_workflow_stub(
        cls=PodcastTranscribeWorkflow,
        workflow_options=options,
    )

    # Fire and forget, as the workflow will do everything (including adding an extraction job) itself
    await WorkflowClient.start(stub.transcribe_episode, stories_id)

    log.info(f"Started a workflow for story {stories_id}...")


def run_podcast_fetch_episode(stories_id: int) -> None:
    """Job handler: start the transcription workflow for a story.

    :param stories_id: Story ID; a ``bytes`` value is first passed through
        ``decode_object_from_bytes_if_needed()``, and the result is then converted with ``int()``.
    """
    raw_id = stories_id
    if isinstance(raw_id, bytes):
        raw_id = decode_object_from_bytes_if_needed(raw_id)

    # _start_workflow() is a coroutine, so drive it to completion from this synchronous handler
    asyncio.run(_start_workflow(stories_id=int(raw_id)))


if __name__ == '__main__':
    app = JobBroker(
        queue_name='MediaWords::Job::Podcast::TranscribeEpisode',
    )
    app.start_worker(handler=run_podcast_fetch_episode)
stories_id = decode_object_from_bytes_if_needed(stories_id) if stories_id is None: raise McCLIFFFetchAnnotationJobException("'stories_id' is None.") stories_id = int(stories_id) db = connect_to_db() log.info("Fetching annotation for story ID %d..." % stories_id) story = db.find_by_id(table='stories', object_id=stories_id) if story is None: raise McCLIFFFetchAnnotationJobException("Story with ID %d was not found." % stories_id) cliff = CLIFFAnnotatorFetcher() try: cliff.annotate_and_store_for_story(db=db, stories_id=stories_id) except Exception as ex: raise McCLIFFFetchAnnotationJobException("Unable to process story $stories_id with CLIFF: %s" % str(ex)) log.info("Adding story ID %d to the update story tags queue..." % stories_id) JobBroker(queue_name='MediaWords::Job::CLIFF::UpdateStoryTags').add_to_queue(stories_id=stories_id) log.info("Finished fetching annotation for story ID %d" % stories_id) if __name__ == '__main__': app = JobBroker(queue_name='MediaWords::Job::CLIFF::FetchAnnotation') app.start_worker(handler=run_cliff_fetch_annotation)
"""Generate word2vec model for a given snapshot.""" # MC_REWRITE_TO_PYTHON: remove after Python rewrite if isinstance(snapshots_id, bytes): snapshots_id = decode_object_from_bytes_if_needed(snapshots_id) if snapshots_id is None: raise McWord2vecGenerateSnapshotModelException( "'snapshots_id' is None.") snapshots_id = int(snapshots_id) db = connect_to_db() log.info("Generating word2vec model for snapshot %d..." % snapshots_id) sentence_iterator = SnapshotSentenceIterator(db=db, snapshots_id=snapshots_id) model_store = SnapshotDatabaseModelStore(db=db, snapshots_id=snapshots_id) train_word2vec_model(sentence_iterator=sentence_iterator, model_store=model_store) log.info("Finished generating word2vec model for snapshot %d." % snapshots_id) if __name__ == '__main__': app = JobBroker( queue_name='MediaWords::Job::Word2vec::GenerateSnapshotModel') app.start_worker(handler=run_word2vec_generate_snapshot_model)
# FIXME could be passed as an argument topics_id = db.query(""" SELECT topics_id FROM timespans WHERE timespans_id = %(timespans_id)s """, { 'timespans_id': timespans_id, }).flat()[0] log.info(f"Generating maps for topic {topics_id}, timespan {timespans_id}") generate_and_store_maps( db=db, topics_id=topics_id, timespans_id=timespans_id, memory_limit_mb=_memory_limit_mb, ) if __name__ == '__main__': parser = argparse.ArgumentParser(description="Run topics map worker.") parser.add_argument("-m", "--memory_limit_mb", type=int, required=True, help="Memory limit (MB) for Java subprocess") args = parser.parse_args() _memory_limit_mb = args.memory_limit_mb assert _memory_limit_mb, "Memory limit is not set (no idea what to set -Xmx to)." app = JobBroker(queue_name=QUEUE_NAME) app.start_worker(handler=run_job)
db=db, podcast_episode_transcript_fetches_id= podcast_episode_transcript_fetches_id, ) if stories_id: JobBroker( queue_name='MediaWords::Job::ExtractAndVector').add_to_queue( stories_id=stories_id) except McPodcastFetchTranscriptSoftException as ex: # Soft exceptions log.error( f"Unable to fetch transcript for fetch ID {podcast_episode_transcript_fetches_id}: {ex}" ) raise ex except Exception as ex: # Hard and other exceptions fatal_error((f"Fatal / unknown error while fetching transcript " f"for ID {podcast_episode_transcript_fetches_id}: {ex}")) log.info( f"Done fetching transcript for ID {podcast_episode_transcript_fetches_id}" ) if __name__ == '__main__': app = JobBroker(queue_name='MediaWords::Job::Podcast::FetchTranscript') app.start_worker(handler=run_podcast_fetch_transcript)
db = connect_to_db() log.info("Updating tags for story ID %d..." % stories_id) story = db.find_by_id(table='stories', object_id=stories_id) if story is None: raise McNYTLabelsTagsFromAnnotationJobException( "Story with ID %d was not found." % stories_id) nytlabels = NYTLabelsTagsFromAnnotation() try: nytlabels.update_tags_for_story(db=db, stories_id=stories_id) except Exception as ex: raise McNYTLabelsTagsFromAnnotationJobException( "Unable to process story ID %d with NYTLabels: %s" % ( stories_id, str(ex), )) log.info("Marking story ID %d as processed..." % stories_id) mark_as_processed(db=db, stories_id=stories_id) log.info("Finished updating tags for story ID %d" % stories_id) if __name__ == '__main__': app = JobBroker( queue_name='MediaWords::Job::NYTLabels::FetchAnnotationAndTag') app.start_worker(handler=run_nytlabels_tags_from_annotation)
stories_id = int(stories_id) db = connect_to_db() log.info("Updating tags for story ID %d..." % stories_id) story = db.find_by_id(table='stories', object_id=stories_id) if story is None: raise McNYTLabelsUpdateStoryTagsJobException( "Story with ID %d was not found." % stories_id) nytlabels = NYTLabelsTagger() try: nytlabels.update_tags_for_story(db=db, stories_id=stories_id) except Exception as ex: raise McNYTLabelsUpdateStoryTagsJobException( "Unable to process story ID %d with NYTLabels: %s" % ( stories_id, str(ex), )) log.info("Marking story ID %d as processed..." % stories_id) mark_as_processed(db=db, stories_id=stories_id) log.info("Finished updating tags for story ID %d" % stories_id) if __name__ == '__main__': app = JobBroker(queue_name='MediaWords::Job::NYTLabels::UpdateStoryTags') app.start_worker(handler=run_nytlabels_update_story_tags)
try: if not fetch_topic_url_update_state( db=db, topics_id=topics_id, topic_fetch_urls_id=topic_fetch_urls_id, domain_timeout=domain_timeout): JobBroker(queue_name=QUEUE_NAME).add_to_queue( topic_fetch_urls_id=topic_fetch_urls_id) _consecutive_requeues += 1 if _consecutive_requeues > REQUEUES_UNTIL_SLEEP: log.info("sleeping after %d consecutive retries ..." % _consecutive_requeues) time.sleep(1) except Exception as ex: # Error has already been logged by fetch_topic_url_update_state(), so we only need to work out the # "consecutive retries" here log.error(f"Fetching URL for ID {topic_fetch_urls_id} failed: {ex}") _consecutive_requeues = 0 log.info( f"Finished fetch for topic {topics_id}, topic_fetch_url {topic_fetch_urls_id}" ) if __name__ == '__main__': app = JobBroker(queue_name=QUEUE_NAME) app.start_worker(handler=run_topics_fetch_link)
db = connect_to_db() log.info("Updating tags for story ID %d..." % stories_id) story = db.find_by_id(table='stories', object_id=stories_id) if story is None: raise McCLIFFTagsFromAnnotationJobException( "Story with ID %d was not found." % stories_id) cliff = CLIFFTagsFromAnnotation() try: cliff.update_tags_for_story(db=db, stories_id=stories_id) except Exception as ex: raise McCLIFFTagsFromAnnotationJobException( "Unable to process story ID %s with CLIFF: %s" % ( stories_id, str(ex), )) log.info("Adding story ID %d to NYTLabels fetch queue..." % stories_id) JobBroker(queue_name='MediaWords::Job::NYTLabels::FetchAnnotationAndTag' ).add_to_queue(stories_id=stories_id) log.info("Finished updating tags for story ID %d" % stories_id) if __name__ == '__main__': app = JobBroker(queue_name='MediaWords::Job::CLIFF::FetchAnnotationAndTag') app.start_worker(handler=run_cliff_tags_from_annotation)
if topics_id is None: raise McExtractStoryLinksJobException("'topics_id' is None.") stories_id = int(stories_id) topics_id = int(topics_id) db = connect_to_db() log.info("Start fetching extracting links for stories_id %d topics_id %d" % (stories_id, topics_id)) try: extract_links_for_topic_story(db=db, stories_id=stories_id, topics_id=topics_id) except Exception as ex: log.error("Error while processing story {}: {}".format(stories_id, ex)) raise McExtractStoryLinksJobException( "Unable to process story {}: {}".format(stories_id, traceback.format_exc())) log.info( "Finished fetching extracting links for stories_id %d topics_id %d" % (stories_id, topics_id)) if __name__ == '__main__': app = JobBroker(queue_name='MediaWords::Job::TM::ExtractStoryLinks') app.start_worker(handler=run_topics_extract_story_links)
log = create_logger(__name__)


class McFetchTwitterUrlsJobException(Exception):
    """Raised for problems with job setup and routing."""


def run_topics_fetch_twitter_urls(topic_fetch_urls_ids: list):
    """Job handler: fetch a batch of twitter URLs and update their state.

    The actual work is delegated to fetch_twitter_urls_update_state(); this wrapper only validates
    the argument, opens a database connection and logs the start and the end of the job.

    :param topic_fetch_urls_ids: IDs of the topic_fetch_urls rows to process.
    :raises McFetchTwitterUrlsJobException: if 'topic_fetch_urls_ids' is None.
    """
    if topic_fetch_urls_ids is None:
        raise McFetchTwitterUrlsJobException("'topic_fetch_urls_ids' is None.")

    url_count = len(topic_fetch_urls_ids)
    log.info("Start fetch twitter urls for %d topic_fetch_urls" % url_count)

    fetch_twitter_urls_update_state(
        db=connect_to_db(),
        topic_fetch_urls_ids=topic_fetch_urls_ids,
    )

    log.info("Finished fetching twitter urls")


if __name__ == '__main__':
    app = JobBroker(
        queue_name='MediaWords::Job::TM::FetchTwitterUrls',
    )
    app.start_worker(handler=run_topics_fetch_twitter_urls)
JobBroker(queue_name=QUEUE_NAME).add_to_queue(stories_id=stories_id) return _consecutive_requeues = 0 log.info("Extracting story {}...".format(stories_id)) db.begin() try: extractor_args = PyExtractorArguments(use_cache=use_cache, use_existing=use_existing) extract_and_process_story(db=db, story=story, extractor_args=extractor_args) except Exception as ex: raise McExtractAndVectorException( "Extractor died while extracting story {}: {}".format( stories_id, ex)) db.commit() log.info("Done extracting story {}.".format(stories_id)) if __name__ == '__main__': app = JobBroker(queue_name=QUEUE_NAME) app.start_worker(handler=run_extract_and_vector)
log.info(f"Fetching story stats for story {stories_id}...") try: get_and_store_story_stats(db=db, story=story) except McFacebookSoftFailureException as ex: # On soft errors, just raise the exception further as we have reason to believe that the request will succeed on # other stories in the job queue log.error(f"Error while fetching stats for story {stories_id}: {ex}") raise ex except McFacebookHardFailureException as ex: # On hard errors, stop the whole worker as we most likely can't continue without a developer having a look into # what's happening fatal_error( f"Fatal error while fetching stats for story {stories_id}: {ex}") except Exception as ex: # On unknown exceptions, also go for sys.exit(1) as we don't really know what happened as they shouldn't be # thrown anyway fatal_error( f"Unknown exception while fetching stats for story {stories_id}: {ex}" ) log.info(f"Done fetching story stats for story {stories_id}.") if __name__ == '__main__': app = JobBroker(queue_name='MediaWords::Job::Facebook::FetchStoryStats') app.start_worker(handler=run_facebook_fetch_story_stats)
if isinstance(test_id, bytes): test_id = decode_object_from_bytes_if_needed(test_id) if isinstance(x, bytes): x = decode_object_from_bytes_if_needed(x) if isinstance(y, bytes): y = decode_object_from_bytes_if_needed(y) test_id = int(test_id) x = int(x) y = int(y) log.info(f"Test ID {test_id}: adding {x} and {y}...") # In this time we should be able to add another job and make sure that it gets locked out from running time.sleep(10) return x + y if __name__ == '__main__': app = JobBroker(queue_name='TestPythonWorkerLock') app.start_worker( handler=run_job, lock=JobLock( lock_type='TestPythonWorkerLock', lock_arg='test_id', ), )
db = connect_to_db() log.info("Updating tags for story ID %d..." % stories_id) story = db.find_by_id(table='stories', object_id=stories_id) if story is None: raise McCLIFFUpdateStoryTagsJobException( "Story with ID %d was not found." % stories_id) cliff = CLIFFTagger() try: cliff.update_tags_for_story(db=db, stories_id=stories_id) except Exception as ex: raise McCLIFFUpdateStoryTagsJobException( "Unable to process story ID %s with CLIFF: %s" % ( stories_id, str(ex), )) # log.info("Adding story ID %d to NYTLabels fetch queue..." % stories_id) # JobBroker(queue_name='MediaWords::Job::NYTLabels::FetchAnnotation').add_to_queue(stories_id=stories_id) log.info("Marking story ID %d as processed..." % stories_id) mark_as_processed(db=db, stories_id=stories_id) log.info("Finished updating tags for story ID %d" % stories_id) if __name__ == '__main__': app = JobBroker(queue_name='MediaWords::Job::CLIFF::UpdateStoryTags') app.start_worker(handler=run_cliff_update_story_tags)
if stories_id is None: raise McNYTLabelsFetchAnnotationJobException("'stories_id' is None.") stories_id = int(stories_id) db = connect_to_db() log.info("Fetching annotation for story ID %d..." % stories_id) story = db.find_by_id(table='stories', object_id=stories_id) if story is None: raise McNYTLabelsFetchAnnotationJobException("Story with ID %d was not found." % stories_id) nytlabels = NYTLabelsAnnotatorFetcher() try: nytlabels.annotate_and_store_for_story(db=db, stories_id=stories_id) except Exception as ex: raise McNYTLabelsFetchAnnotationJobException( "Unable to process story $stories_id with NYTLabels: %s" % str(ex) ) log.info("Adding story ID %d to the update story tags queue..." % stories_id) JobBroker(queue_name='MediaWords::Job::NYTLabels::UpdateStoryTags').add_to_queue(stories_id=stories_id) log.info("Finished fetching annotation for story ID %d" % stories_id) if __name__ == '__main__': app = JobBroker(queue_name='MediaWords::Job::NYTLabels::FetchAnnotation') app.start_worker(handler=run_nytlabels_fetch_annotation)
add_to_queue_at ) VALUES ( %(podcast_episodes_id)s, NOW() + INTERVAL %(add_to_queue_interval)s ) """, { 'podcast_episodes_id': episode.podcast_episodes_id, 'add_to_queue_interval': add_to_queue_interval, }) except McPodcastSubmitOperationSoftException as ex: # Soft exceptions log.error( f"Unable to submit podcast episode for story {stories_id}: {ex}") raise ex except Exception as ex: # Hard and other exceptions fatal_error( f"Fatal / unknown error while submitting podcast episode for story {stories_id}: {ex}" ) log.info( f"Done submitting story's {stories_id} podcast episode for transcription" ) if __name__ == '__main__': app = JobBroker(queue_name='MediaWords::Job::Podcast::SubmitOperation') app.start_worker(handler=run_podcast_submit_operation)
#!/usr/bin/env python3

from mediawords.db import connect_to_db
from mediawords.job import JobBroker
from mediawords.util.log import create_logger
from mediawords.util.perl import decode_object_from_bytes_if_needed

from sitemap_fetch_media_pages.media import fetch_sitemap_pages_for_media_id

log = create_logger(__name__)


def run_sitemap_fetch_media_pages(media_id: int) -> None:
    """Job handler: fetch all of a media source's pages (news stories and not) from its XML sitemap.

    :param media_id: Media ID; a ``bytes`` value is first passed through
        ``decode_object_from_bytes_if_needed()``, and the result is then converted with ``int()``.
    """
    raw_id = media_id
    if isinstance(raw_id, bytes):
        raw_id = decode_object_from_bytes_if_needed(raw_id)
    numeric_media_id = int(raw_id)

    # Convert the ID before connecting, so a bad ID fails without opening a database connection
    fetch_sitemap_pages_for_media_id(db=connect_to_db(), media_id=numeric_media_id)


if __name__ == '__main__':
    app = JobBroker(
        queue_name='MediaWords::Job::Sitemap::FetchMediaPages',
    )
    app.start_worker(handler=run_sitemap_fetch_media_pages)