Example #1
log = create_logger(__name__)


async def _start_workflow(stories_id: int) -> None:
    log.info(f"Starting a workflow for story {stories_id}...")

    client = workflow_client()
    workflow: PodcastTranscribeWorkflow = client.new_workflow_stub(
        cls=PodcastTranscribeWorkflow,
        workflow_options=WorkflowOptions(workflow_id=str(stories_id)),
    )

    # Fire and forget as the workflow will do everything (including adding an extraction job) itself
    await WorkflowClient.start(workflow.transcribe_episode, stories_id)

    log.info(f"Started a workflow for story {stories_id}...")


def run_podcast_fetch_episode(stories_id: int) -> None:
    if isinstance(stories_id, bytes):
        stories_id = decode_object_from_bytes_if_needed(stories_id)
    stories_id = int(stories_id)

    asyncio.run(_start_workflow(stories_id=stories_id))


if __name__ == '__main__':
    app = JobBroker(queue_name='MediaWords::Job::Podcast::TranscribeEpisode')
    app.start_worker(handler=run_podcast_fetch_episode)
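
For contrast, a minimal sketch of a blocking variant (assuming the same workflow_client() helper and stub classes as above): awaiting the stub method directly, instead of handing it to WorkflowClient.start(), waits for the workflow to finish before returning.

async def _run_workflow_and_wait(stories_id: int) -> None:
    client = workflow_client()
    workflow: PodcastTranscribeWorkflow = client.new_workflow_stub(
        cls=PodcastTranscribeWorkflow,
        workflow_options=WorkflowOptions(workflow_id=str(stories_id)),
    )

    # Awaiting the stub call itself (rather than passing it to WorkflowClient.start())
    # blocks until the workflow completes
    await workflow.transcribe_episode(stories_id)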
Example #2

def run_cliff_fetch_annotation(stories_id: int) -> None:
    if isinstance(stories_id, bytes):
        stories_id = decode_object_from_bytes_if_needed(stories_id)

    if stories_id is None:
        raise McCLIFFFetchAnnotationJobException("'stories_id' is None.")

    stories_id = int(stories_id)

    db = connect_to_db()

    log.info("Fetching annotation for story ID %d..." % stories_id)

    story = db.find_by_id(table='stories', object_id=stories_id)
    if story is None:
        raise McCLIFFFetchAnnotationJobException("Story with ID %d was not found." % stories_id)

    cliff = CLIFFAnnotatorFetcher()
    try:
        cliff.annotate_and_store_for_story(db=db, stories_id=stories_id)
    except Exception as ex:
        raise McCLIFFFetchAnnotationJobException("Unable to process story $stories_id with CLIFF: %s" % str(ex))

    log.info("Adding story ID %d to the update story tags queue..." % stories_id)
    JobBroker(queue_name='MediaWords::Job::CLIFF::UpdateStoryTags').add_to_queue(stories_id=stories_id)

    log.info("Finished fetching annotation for story ID %d" % stories_id)


if __name__ == '__main__':
    app = JobBroker(queue_name='MediaWords::Job::CLIFF::FetchAnnotation')
    app.start_worker(handler=run_cliff_fetch_annotation)
Example #3
    """Generate word2vec model for a given snapshot."""

    # MC_REWRITE_TO_PYTHON: remove after Python rewrite
    if isinstance(snapshots_id, bytes):
        snapshots_id = decode_object_from_bytes_if_needed(snapshots_id)

    if snapshots_id is None:
        raise McWord2vecGenerateSnapshotModelException(
            "'snapshots_id' is None.")

    snapshots_id = int(snapshots_id)

    db = connect_to_db()

    log.info("Generating word2vec model for snapshot %d..." % snapshots_id)

    sentence_iterator = SnapshotSentenceIterator(db=db,
                                                 snapshots_id=snapshots_id)
    model_store = SnapshotDatabaseModelStore(db=db, snapshots_id=snapshots_id)
    train_word2vec_model(sentence_iterator=sentence_iterator,
                         model_store=model_store)

    log.info("Finished generating word2vec model for snapshot %d." %
             snapshots_id)


if __name__ == '__main__':
    app = JobBroker(
        queue_name='MediaWords::Job::Word2vec::GenerateSnapshotModel')
    app.start_worker(handler=run_word2vec_generate_snapshot_model)
Example #4
        # FIXME could be passed as an argument
        topics_id = db.query("""
            SELECT topics_id
            FROM timespans
            WHERE timespans_id = %(timespans_id)s
        """, {
            'timespans_id': timespans_id,
        }).flat()[0]

        log.info(f"Generating maps for topic {topics_id}, timespan {timespans_id}")
        generate_and_store_maps(
            db=db,
            topics_id=topics_id,
            timespans_id=timespans_id,
            memory_limit_mb=_memory_limit_mb,
        )


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Run topics map worker.")
    parser.add_argument("-m", "--memory_limit_mb", type=int, required=True,
                        help="Memory limit (MB) for Java subprocess")
    args = parser.parse_args()

    _memory_limit_mb = args.memory_limit_mb
    assert _memory_limit_mb, "Memory limit is not set (no idea what to set -Xmx to)."

    app = JobBroker(queue_name=QUEUE_NAME)
    app.start_worker(handler=run_job)
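
A hypothetical invocation of this worker (the script name is assumed): python3 topics_map_worker.py --memory_limit_mb 1024, where the value is handed down to the Java subprocess as its -Xmx heap limit.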
Example #5

            db=db,
            podcast_episode_transcript_fetches_id=podcast_episode_transcript_fetches_id,
        )

        if stories_id:
            JobBroker(
                queue_name='MediaWords::Job::ExtractAndVector').add_to_queue(
                    stories_id=stories_id)

    except McPodcastFetchTranscriptSoftException as ex:
        # Soft exceptions
        log.error(
            f"Unable to fetch transcript for fetch ID {podcast_episode_transcript_fetches_id}: {ex}"
        )
        raise ex

    except Exception as ex:
        # Hard and other exceptions
        fatal_error((f"Fatal / unknown error while fetching transcript "
                     f"for ID {podcast_episode_transcript_fetches_id}: {ex}"))

    log.info(
        f"Done fetching transcript for ID {podcast_episode_transcript_fetches_id}"
    )


if __name__ == '__main__':
    app = JobBroker(queue_name='MediaWords::Job::Podcast::FetchTranscript')
    app.start_worker(handler=run_podcast_fetch_transcript)
Example #6

    db = connect_to_db()

    log.info("Updating tags for story ID %d..." % stories_id)

    story = db.find_by_id(table='stories', object_id=stories_id)
    if story is None:
        raise McNYTLabelsTagsFromAnnotationJobException(
            "Story with ID %d was not found." % stories_id)

    nytlabels = NYTLabelsTagsFromAnnotation()
    try:
        nytlabels.update_tags_for_story(db=db, stories_id=stories_id)
    except Exception as ex:
        raise McNYTLabelsTagsFromAnnotationJobException(
            "Unable to process story ID %d with NYTLabels: %s" % (
                stories_id,
                str(ex),
            ))

    log.info("Marking story ID %d as processed..." % stories_id)
    mark_as_processed(db=db, stories_id=stories_id)

    log.info("Finished updating tags for story ID %d" % stories_id)


if __name__ == '__main__':
    app = JobBroker(
        queue_name='MediaWords::Job::NYTLabels::FetchAnnotationAndTag')
    app.start_worker(handler=run_nytlabels_tags_from_annotation)
Example #7
    stories_id = int(stories_id)

    db = connect_to_db()

    log.info("Updating tags for story ID %d..." % stories_id)

    story = db.find_by_id(table='stories', object_id=stories_id)
    if story is None:
        raise McNYTLabelsUpdateStoryTagsJobException(
            "Story with ID %d was not found." % stories_id)

    nytlabels = NYTLabelsTagger()
    try:
        nytlabels.update_tags_for_story(db=db, stories_id=stories_id)
    except Exception as ex:
        raise McNYTLabelsUpdateStoryTagsJobException(
            "Unable to process story ID %d with NYTLabels: %s" % (
                stories_id,
                str(ex),
            ))

    log.info("Marking story ID %d as processed..." % stories_id)
    mark_as_processed(db=db, stories_id=stories_id)

    log.info("Finished updating tags for story ID %d" % stories_id)


if __name__ == '__main__':
    app = JobBroker(queue_name='MediaWords::Job::NYTLabels::UpdateStoryTags')
    app.start_worker(handler=run_nytlabels_update_story_tags)
Example #8

    try:
        if not fetch_topic_url_update_state(
                db=db,
                topics_id=topics_id,
                topic_fetch_urls_id=topic_fetch_urls_id,
                domain_timeout=domain_timeout):
            JobBroker(queue_name=QUEUE_NAME).add_to_queue(
                topic_fetch_urls_id=topic_fetch_urls_id)

            _consecutive_requeues += 1
            if _consecutive_requeues > REQUEUES_UNTIL_SLEEP:
                log.info("sleeping after %d consecutive retries ..." %
                         _consecutive_requeues)
                time.sleep(1)

    except Exception as ex:
        # Error has already been logged by fetch_topic_url_update_state(), so here we only need to reset the
        # consecutive requeue counter
        log.error(f"Fetching URL for ID {topic_fetch_urls_id} failed: {ex}")
        _consecutive_requeues = 0

    log.info(
        f"Finished fetch for topic {topics_id}, topic_fetch_url {topic_fetch_urls_id}"
    )


if __name__ == '__main__':
    app = JobBroker(queue_name=QUEUE_NAME)
    app.start_worker(handler=run_topics_fetch_link)
Example #9

    db = connect_to_db()

    log.info("Updating tags for story ID %d..." % stories_id)

    story = db.find_by_id(table='stories', object_id=stories_id)
    if story is None:
        raise McCLIFFTagsFromAnnotationJobException(
            "Story with ID %d was not found." % stories_id)

    cliff = CLIFFTagsFromAnnotation()
    try:
        cliff.update_tags_for_story(db=db, stories_id=stories_id)
    except Exception as ex:
        raise McCLIFFTagsFromAnnotationJobException(
            "Unable to process story ID %s with CLIFF: %s" % (
                stories_id,
                str(ex),
            ))

    log.info("Adding story ID %d to NYTLabels fetch queue..." % stories_id)
    JobBroker(queue_name='MediaWords::Job::NYTLabels::FetchAnnotationAndTag'
              ).add_to_queue(stories_id=stories_id)

    log.info("Finished updating tags for story ID %d" % stories_id)


if __name__ == '__main__':
    app = JobBroker(queue_name='MediaWords::Job::CLIFF::FetchAnnotationAndTag')
    app.start_worker(handler=run_cliff_tags_from_annotation)
Example #10
    if topics_id is None:
        raise McExtractStoryLinksJobException("'topics_id' is None.")

    stories_id = int(stories_id)
    topics_id = int(topics_id)

    db = connect_to_db()

    log.info("Start fetching extracting links for stories_id %d topics_id %d" %
             (stories_id, topics_id))

    try:
        extract_links_for_topic_story(db=db,
                                      stories_id=stories_id,
                                      topics_id=topics_id)

    except Exception as ex:
        log.error("Error while processing story {}: {}".format(stories_id, ex))
        raise McExtractStoryLinksJobException(
            "Unable to process story {}: {}".format(stories_id,
                                                    traceback.format_exc()))

    log.info(
        "Finished extracting links for stories_id %d, topics_id %d" %
        (stories_id, topics_id))


if __name__ == '__main__':
    app = JobBroker(queue_name='MediaWords::Job::TM::ExtractStoryLinks')
    app.start_worker(handler=run_topics_extract_story_links)
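
A minimal alternative sketch for the except block above: Python 3 exception chaining via "raise ... from ex" keeps the original traceback attached as __cause__, without embedding traceback.format_exc() into the message.

    try:
        extract_links_for_topic_story(db=db,
                                      stories_id=stories_id,
                                      topics_id=topics_id)
    except Exception as ex:
        log.error("Error while processing story {}: {}".format(stories_id, ex))
        # The original traceback remains reachable via the exception's __cause__
        raise McExtractStoryLinksJobException(
            "Unable to process story {}".format(stories_id)) from ex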
Example #11
log = create_logger(__name__)


class McFetchTwitterUrlsJobException(Exception):
    """Exceptions dealing with job setup and routing."""
    pass


def run_topics_fetch_twitter_urls(topic_fetch_urls_ids: list):
    """Fetch a set of twitter urls from the twitter api and add each as a topic story if it matches.

    All of the interesting logic is in mediawords.tm.fetch_twitter_urls."""
    if topic_fetch_urls_ids is None:
        raise McFetchTwitterUrlsJobException("'topic_fetch_urls_ids' is None.")

    log.info("Start fetch twitter urls for %d topic_fetch_urls" %
             len(topic_fetch_urls_ids))

    db = connect_to_db()

    fetch_twitter_urls_update_state(db=db,
                                    topic_fetch_urls_ids=topic_fetch_urls_ids)

    log.info("Finished fetching twitter urls")


if __name__ == '__main__':
    app = JobBroker(queue_name='MediaWords::Job::TM::FetchTwitterUrls')
    app.start_worker(handler=run_topics_fetch_twitter_urls)
Example #12

        JobBroker(queue_name=QUEUE_NAME).add_to_queue(stories_id=stories_id)

        return

    _consecutive_requeues = 0

    log.info("Extracting story {}...".format(stories_id))

    db.begin()

    try:
        extractor_args = PyExtractorArguments(use_cache=use_cache,
                                              use_existing=use_existing)
        extract_and_process_story(db=db,
                                  story=story,
                                  extractor_args=extractor_args)

    except Exception as ex:
        raise McExtractAndVectorException(
            "Extractor died while extracting story {}: {}".format(
                stories_id, ex))

    db.commit()

    log.info("Done extracting story {}.".format(stories_id))


if __name__ == '__main__':
    app = JobBroker(queue_name=QUEUE_NAME)
    app.start_worker(handler=run_extract_and_vector)
    log.info(f"Fetching story stats for story {stories_id}...")

    try:
        get_and_store_story_stats(db=db, story=story)

    except McFacebookSoftFailureException as ex:
        # On soft errors, re-raise the exception: requests for other stories in the job queue are still
        # likely to succeed
        log.error(f"Error while fetching stats for story {stories_id}: {ex}")
        raise ex

    except McFacebookHardFailureException as ex:
        # On hard errors, stop the whole worker as we most likely can't continue without a developer having a look into
        # what's happening
        fatal_error(
            f"Fatal error while fetching stats for story {stories_id}: {ex}")

    except Exception as ex:
        # On unknown exceptions, also exit with sys.exit(1): they shouldn't be thrown in the first place,
        # so we don't really know what happened
        fatal_error(
            f"Unknown exception while fetching stats for story {stories_id}: {ex}"
        )

    log.info(f"Done fetching story stats for story {stories_id}.")


if __name__ == '__main__':
    app = JobBroker(queue_name='MediaWords::Job::Facebook::FetchStoryStats')
    app.start_worker(handler=run_facebook_fetch_story_stats)
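
A minimal sketch of the soft/hard failure split this handler relies on (the two subclass names appear above; the shared base class here is a hypothetical illustration):

class McFacebookException(Exception):
    """Hypothetical common base class for Facebook fetcher errors."""

class McFacebookSoftFailureException(McFacebookException):
    """Transient, story-specific failure: re-raise it and let other queued stories proceed."""

class McFacebookHardFailureException(McFacebookException):
    """Systemic failure: abort the whole worker so a developer can investigate."""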
Example #14
    if isinstance(test_id, bytes):
        test_id = decode_object_from_bytes_if_needed(test_id)

    if isinstance(x, bytes):
        x = decode_object_from_bytes_if_needed(x)

    if isinstance(y, bytes):
        y = decode_object_from_bytes_if_needed(y)

    test_id = int(test_id)
    x = int(x)
    y = int(y)

    log.info(f"Test ID {test_id}: adding {x} and {y}...")

    # During this sleep we should be able to add another job and verify that the lock keeps it from running
    time.sleep(10)

    return x + y


if __name__ == '__main__':
    app = JobBroker(queue_name='TestPythonWorkerLock')
    app.start_worker(
        handler=run_job,
        lock=JobLock(
            lock_type='TestPythonWorkerLock',
            lock_arg='test_id',
        ),
    )
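
A minimal usage sketch (argument values hypothetical): enqueue two jobs sharing a test_id; with the JobLock above keyed on lock_arg='test_id', the second job is held back while the first one sleeps.

broker = JobBroker(queue_name='TestPythonWorkerLock')
broker.add_to_queue(test_id=1, x=2, y=3)
broker.add_to_queue(test_id=1, x=4, y=5)  # locked out until the first job's 10-second sleep ends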
Example #15

    db = connect_to_db()

    log.info("Updating tags for story ID %d..." % stories_id)

    story = db.find_by_id(table='stories', object_id=stories_id)
    if story is None:
        raise McCLIFFUpdateStoryTagsJobException(
            "Story with ID %d was not found." % stories_id)

    cliff = CLIFFTagger()
    try:
        cliff.update_tags_for_story(db=db, stories_id=stories_id)
    except Exception as ex:
        raise McCLIFFUpdateStoryTagsJobException(
            "Unable to process story ID %s with CLIFF: %s" % (
                stories_id,
                str(ex),
            ))

    # log.info("Adding story ID %d to NYTLabels fetch queue..." % stories_id)
    # JobBroker(queue_name='MediaWords::Job::NYTLabels::FetchAnnotation').add_to_queue(stories_id=stories_id)
    log.info("Marking story ID %d as processed..." % stories_id)
    mark_as_processed(db=db, stories_id=stories_id)

    log.info("Finished updating tags for story ID %d" % stories_id)


if __name__ == '__main__':
    app = JobBroker(queue_name='MediaWords::Job::CLIFF::UpdateStoryTags')
    app.start_worker(handler=run_cliff_update_story_tags)
Example #16
    if stories_id is None:
        raise McNYTLabelsFetchAnnotationJobException("'stories_id' is None.")

    stories_id = int(stories_id)

    db = connect_to_db()

    log.info("Fetching annotation for story ID %d..." % stories_id)

    story = db.find_by_id(table='stories', object_id=stories_id)
    if story is None:
        raise McNYTLabelsFetchAnnotationJobException("Story with ID %d was not found." % stories_id)

    nytlabels = NYTLabelsAnnotatorFetcher()
    try:
        nytlabels.annotate_and_store_for_story(db=db, stories_id=stories_id)
    except Exception as ex:
        raise McNYTLabelsFetchAnnotationJobException(
            "Unable to process story ID %d with NYTLabels: %s" % (stories_id, str(ex))
        )

    log.info("Adding story ID %d to the update story tags queue..." % stories_id)
    JobBroker(queue_name='MediaWords::Job::NYTLabels::UpdateStoryTags').add_to_queue(stories_id=stories_id)

    log.info("Finished fetching annotation for story ID %d" % stories_id)


if __name__ == '__main__':
    app = JobBroker(queue_name='MediaWords::Job::NYTLabels::FetchAnnotation')
    app.start_worker(handler=run_nytlabels_fetch_annotation)
Example #17
                add_to_queue_at
            ) VALUES (
                %(podcast_episodes_id)s,
                NOW() + INTERVAL %(add_to_queue_interval)s
            )
        """, {
                'podcast_episodes_id': episode.podcast_episodes_id,
                'add_to_queue_interval': add_to_queue_interval,
            })

    except McPodcastSubmitOperationSoftException as ex:
        # Soft exceptions
        log.error(
            f"Unable to submit podcast episode for story {stories_id}: {ex}")
        raise ex

    except Exception as ex:
        # Hard and other exceptions
        fatal_error(
            f"Fatal / unknown error while submitting podcast episode for story {stories_id}: {ex}"
        )

    log.info(
        f"Done submitting podcast episode of story {stories_id} for transcription"
    )


if __name__ == '__main__':
    app = JobBroker(queue_name='MediaWords::Job::Podcast::SubmitOperation')
    app.start_worker(handler=run_podcast_submit_operation)
Example #18
#!/usr/bin/env python3

from mediawords.db import connect_to_db
from mediawords.job import JobBroker
from mediawords.util.log import create_logger
from mediawords.util.perl import decode_object_from_bytes_if_needed
from sitemap_fetch_media_pages.media import fetch_sitemap_pages_for_media_id

log = create_logger(__name__)


def run_sitemap_fetch_media_pages(media_id: int) -> None:
    """Fetch all media's pages (news stories and not) from XML sitemap."""
    if isinstance(media_id, bytes):
        media_id = decode_object_from_bytes_if_needed(media_id)

    media_id = int(media_id)

    db = connect_to_db()

    fetch_sitemap_pages_for_media_id(db=db, media_id=media_id)


if __name__ == '__main__':
    app = JobBroker(queue_name='MediaWords::Job::Sitemap::FetchMediaPages')
    app.start_worker(handler=run_sitemap_fetch_media_pages)
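
Finally, a minimal enqueue-side sketch (the media ID value is hypothetical): any other process can schedule this worker by adding a job to the same queue.

from mediawords.job import JobBroker

JobBroker(queue_name='MediaWords::Job::Sitemap::FetchMediaPages').add_to_queue(media_id=12345)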