Example #1
            "Start fetching extracting links for stories_id %d topics_id %d" %
            (stories_id, topics_id))

        try:
            db = connect_to_db()
            story = db.require_by_id(table='stories', object_id=stories_id)
            topic = db.require_by_id(table='topics', object_id=topics_id)
            mediawords.tm.extract_story_links.extract_links_for_topic_story(
                db, story, topic)

        except Exception as ex:
            log.error("Error while processing story {}: {}".format(
                stories_id, ex))
            raise McExtractStoryLinksJobException(
                "Unable to process story {}: {}".format(
                    stories_id, traceback.format_exc()))

        log.info(
            "Finished extracting links for stories_id %d topics_id %d"
            % (stories_id, topics_id))

    @classmethod
    def queue_name(cls) -> str:
        """Set queue name."""
        return 'MediaWords::Job::TM::ExtractStoryLinks'


if __name__ == '__main__':
    app = JobBrokerApp(job_class=ExtractStoryLinksJob)
    app.start_worker()
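
For context: the workers in later examples (#6, #11 and #12) enqueue follow-up jobs by calling the next job class's add_to_queue() classmethod. A minimal producer-side sketch for this worker follows the same pattern; the import path and ID values below are hypothetical, and it assumes add_to_queue() forwards the same keyword arguments that the job consumes (stories_id and topics_id here).

# Hypothetical enqueue snippet: the module path and the ID values are made up,
# but the add_to_queue() call mirrors Examples #6, #11 and #12.
from mediawords.job.tm.extract_story_links import ExtractStoryLinksJob

# The worker started above by JobBrokerApp will pick this job up from the
# 'MediaWords::Job::TM::ExtractStoryLinks' queue.
ExtractStoryLinksJob.add_to_queue(stories_id=12345, topics_id=678)
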
Example #2
            snapshots_id = decode_object_from_bytes_if_needed(snapshots_id)

        if snapshots_id is None:
            raise McWord2vecGenerateSnapshotModelException(
                "'snapshots_id' is None.")

        snapshots_id = int(snapshots_id)

        db = connect_to_db()

        log.info("Generating word2vec model for snapshot %d..." % snapshots_id)

        sentence_iterator = SnapshotSentenceIterator(db=db,
                                                     snapshots_id=snapshots_id)
        model_store = SnapshotDatabaseModelStore(db=db,
                                                 snapshots_id=snapshots_id)
        train_word2vec_model(sentence_iterator=sentence_iterator,
                             model_store=model_store)

        log.info("Finished generating word2vec model for snapshot %d." %
                 snapshots_id)

    @classmethod
    def queue_name(cls) -> str:
        return 'MediaWords::Job::Word2vec::GenerateSnapshotModel'


if __name__ == '__main__':
    app = JobBrokerApp(job_class=Word2vecGenerateSnapshotModelJob)
    app.start_worker()
Example #3
            mediawords.tm.fetch_twitter_urls.fetch_twitter_urls(
                db=db, topic_fetch_urls_ids=topic_fetch_urls_ids)
        except Exception as ex:
            log.error("Error while fetching URL with ID {}: {}".format(topic_fetch_urls_ids, str(ex)))
            db.query(
                """
                update topic_fetch_urls set state = %(a)s, message = %(b)s, fetch_date = now()
                    where topic_fetch_urls_id = any(%(c)s)
                """,
                {
                    'a': mediawords.tm.fetch_link.FETCH_STATE_PYTHON_ERROR,
                    'b': traceback.format_exc(),
                    'c': topic_fetch_urls_ids
                })

        db.disconnect()

        log.info("Finished fetching twitter url")

    @classmethod
    def queue_name(cls) -> str:
        """Set queue name."""
        return 'MediaWords::Job::TM::FetchTwitterUrls'


if __name__ == '__main__':
    try:
        app = JobBrokerApp(job_class=FetchTwitterUrlsJob)
        app.start_worker()
    except BaseException as e:
        print(str(e))
Example #4
                time.sleep(1)

        except Exception as ex:
            # All non-throttled errors should get caught by the try: above,
            # but catch again here just in case.
            log.error("Error while fetching URL with ID {}: {}".format(
                topic_fetch_urls_id, str(ex)))
            cls._consecutive_requeues = 0
            update = {
                'state': mediawords.tm.fetch_link.FETCH_STATE_PYTHON_ERROR,
                'fetch_date': datetime.datetime.now(),
                'message': traceback.format_exc(),
            }
            db.update_by_id('topic_fetch_urls', topic_fetch_urls_id, update)

        db.disconnect()

        log.info("Finished fetch for topic_fetch_url %d" % topic_fetch_urls_id)

    @classmethod
    def queue_name(cls) -> str:
        """Set queue name."""
        return 'MediaWords::Job::TM::FetchLink'


if __name__ == '__main__':
    try:
        app = JobBrokerApp(job_class=FetchLinkJob)
        app.start_worker()
    except BaseException as e:
        print(str(e))
Example #5
        log.info("Updating tags for story ID %d..." % stories_id)

        story = db.find_by_id(table='stories', object_id=stories_id)
        if story is None:
            raise McNYTLabelsUpdateStoryTagsJobException(
                "Story with ID %d was not found." % stories_id)

        nytlabels = NYTLabelsAnnotator()
        try:
            nytlabels.update_tags_for_story(db=db, stories_id=stories_id)
        except Exception as ex:
            raise McNYTLabelsUpdateStoryTagsJobException(
                "Unable to process story ID %d with NYTLabels: %s" % (
                    stories_id,
                    str(ex),
                ))

        log.info("Marking story ID %d as processed..." % stories_id)
        mark_as_processed(db=db, stories_id=stories_id)

        log.info("Finished updating tags for story ID %d" % stories_id)

    @classmethod
    def queue_name(cls) -> str:
        return 'MediaWords::Job::NYTLabels::UpdateStoryTags'


if __name__ == '__main__':
    app = JobBrokerApp(job_class=NYTLabelsUpdateStoryTagsJob)
    app.start_worker()
Example #6
        db = connect_to_db()

        log.info("Fetching annotation for story ID %d..." % stories_id)

        story = db.find_by_id(table='stories', object_id=stories_id)
        if story is None:
            raise McCLIFFFetchAnnotationJobException(
                "Story with ID %d was not found." % stories_id)

        cliff = CLIFFAnnotator()
        try:
            cliff.annotate_and_store_for_story(db=db, stories_id=stories_id)
        except Exception as ex:
            raise McCLIFFFetchAnnotationJobException(
                "Unable to process story $stories_id with CLIFF: %s" % str(ex))

        log.info("Adding story ID %d to the update story tags queue..." %
                 stories_id)
        CLIFFUpdateStoryTagsJob.add_to_queue(stories_id=stories_id)

        log.info("Finished fetching annotation for story ID %d" % stories_id)

    @classmethod
    def queue_name(cls) -> str:
        return 'MediaWords::Job::CLIFF::FetchAnnotation'


if __name__ == '__main__':
    app = JobBrokerApp(job_class=CLIFFFetchAnnotationJob)
    app.start_worker()
Example #7
        ExtractAndVectorJob._consecutive_requeues = 0

        log.info("Extracting story {}...".format(stories_id))

        db.begin()

        try:
            extractor_args = PyExtractorArguments(use_cache=use_cache)
            extract_and_process_story(db=db,
                                      story=story,
                                      extractor_args=extractor_args)

        except Exception as ex:
            raise McExtractAndVectorException(
                "Extractor died while extracting story {}: {}".format(
                    stories_id, ex))

        db.commit()

        log.info("Done extracting story {}.".format(stories_id))

    @classmethod
    def queue_name(cls) -> str:
        return 'MediaWords::Job::ExtractAndVector'


if __name__ == '__main__':
    app = JobBrokerApp(job_class=ExtractAndVectorJob)
    app.start_worker()
Example #8
class FetchMediaPages(AbstractJob):
    """

    Fetch all media's pages (news stories and not) from XML sitemap.

    Start this worker script by running:

        ./script/run_in_env.sh ./mediacloud/mediawords/job/sitemap/fetch_media_pages.py

    """
    @classmethod
    def run_job(cls, media_id: int) -> None:
        if isinstance(media_id, bytes):
            media_id = decode_object_from_bytes_if_needed(media_id)

        media_id = int(media_id)

        db = connect_to_db()

        fetch_sitemap_pages_for_media_id(db=db, media_id=media_id)

    @classmethod
    def queue_name(cls) -> str:
        return 'MediaWords::Job::Sitemap::FetchMediaPages'


if __name__ == '__main__':
    app = JobBrokerApp(job_class=FetchMediaPages)
    app.start_worker()
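
Since run_job() here is self-contained (it decodes its argument and opens its own database connection), a one-off sitemap fetch for a single media source could presumably also be run directly, bypassing the queue; a hedged sketch, with a made-up media_id:

# Hypothetical direct invocation, e.g. for debugging; in normal operation the
# work is dispatched through the 'MediaWords::Job::Sitemap::FetchMediaPages' queue.
FetchMediaPages.run_job(media_id=1)
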
Example #9
    Start this worker script by running:

        ./script/run_in_env.sh ./mediacloud/mediawords/job/similarweb/update_audience_data.py

    """
    @classmethod
    def run_job(cls, media_id: int) -> None:
        if isinstance(media_id, bytes):
            media_id = decode_object_from_bytes_if_needed(media_id)

        media_id = int(media_id)

        db = connect_to_db()
        similarweb_client = get_similarweb_client()

        log.info(
            "Collecting audience data for media ID {}...".format(media_id))
        update(db, media_id, similarweb_client)

        log.info("Finished collecting audience data for media ID {}".format(
            media_id))

    @classmethod
    def queue_name(cls) -> str:
        return 'MediaWords::Job::SimilarWeb::UpdateAudienceData'


if __name__ == '__main__':
    app = JobBrokerApp(job_class=SimilarWebUpdateAudienceDataJob)
    app.start_worker()
Example #10
                db=db, topic_fetch_urls_ids=topic_fetch_urls_ids)
        except Exception as ex:
            log.error("Error while fetching URL with ID {}: {}".format(
                topic_fetch_urls_ids, str(ex)))
            db.query(
                """
                update topic_fetch_urls set state = %(a)s, message = %(b)s, fetch_date = now()
                    where topic_fetch_urls_id = any(%(c)s)
                """, {
                    'a': mediawords.tm.fetch_link.FETCH_STATE_PYTHON_ERROR,
                    'b': traceback.format_exc(),
                    'c': topic_fetch_urls_ids
                })

        db.disconnect()

        log.info("Finished fetching twitter url")

    @classmethod
    def queue_name(cls) -> str:
        """Set queue name."""
        return 'MediaWords::Job::TM::FetchTwitterUrls'


if __name__ == '__main__':
    try:
        app = JobBrokerApp(job_class=FetchTwitterUrlsJob)
        app.start_worker()
    except BaseException as e:
        print(str(e))
Example #11
        log.info("Fetching annotation for story ID %d..." % stories_id)

        story = db.find_by_id(table='stories', object_id=stories_id)
        if story is None:
            raise McNYTLabelsFetchAnnotationJobException(
                "Story with ID %d was not found." % stories_id)

        nytlabels = NYTLabelsAnnotator()
        try:
            nytlabels.annotate_and_store_for_story(db=db,
                                                   stories_id=stories_id)
        except Exception as ex:
            raise McNYTLabelsFetchAnnotationJobException(
                "Unable to process story $stories_id with NYTLabels: %s" %
                str(ex))

        log.info("Adding story ID %d to the update story tags queue..." %
                 stories_id)
        NYTLabelsUpdateStoryTagsJob.add_to_queue(stories_id=stories_id)

        log.info("Finished fetching annotation for story ID %d" % stories_id)

    @classmethod
    def queue_name(cls) -> str:
        return 'MediaWords::Job::NYTLabels::FetchAnnotation'


if __name__ == '__main__':
    app = JobBrokerApp(job_class=NYTLabelsFetchAnnotationJob)
    app.start_worker()
Example #12
        log.info("Updating tags for story ID %d..." % stories_id)

        story = db.find_by_id(table='stories', object_id=stories_id)
        if story is None:
            raise McCLIFFUpdateStoryTagsJobException(
                "Story with ID %d was not found." % stories_id)

        cliff = CLIFFAnnotator()
        try:
            cliff.update_tags_for_story(db=db, stories_id=stories_id)
        except Exception as ex:
            raise McCLIFFUpdateStoryTagsJobException(
                "Unable to process story ID %s with CLIFF: %s" % (
                    stories_id,
                    str(ex),
                ))

        log.info("Adding story ID %d to NYTLabels fetch queue..." % stories_id)
        NYTLabelsFetchAnnotationJob.add_to_queue(stories_id=stories_id)

        log.info("Finished updating tags for story ID %d" % stories_id)

    @classmethod
    def queue_name(cls) -> str:
        return 'MediaWords::Job::CLIFF::UpdateStoryTags'


if __name__ == '__main__':
    app = JobBrokerApp(job_class=CLIFFUpdateStoryTagsJob)
    app.start_worker()
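
Taken together, the examples share a single skeleton: a subclass of AbstractJob that implements run_job() and queue_name(), started as a worker by JobBrokerApp under if __name__ == '__main__'; jobs chain into pipelines by calling the next class's add_to_queue(), as the CLIFF/NYTLabels examples do (#6 enqueues #12, #12 enqueues #11, #11 enqueues #5). The condensed sketch below restates that pattern; the import paths are assumptions (the examples omit their imports), and the class, exception and queue names are placeholders rather than real Media Cloud jobs.

# Sketch of the common worker skeleton shared by Examples #1-#12.
# NOTE: the import paths are assumed; ExampleJob, McExampleJobException and
# 'MediaWords::Job::Example' are placeholders, not part of the codebase.
from mediawords.db import connect_to_db
from mediawords.job import AbstractJob, JobBrokerApp
from mediawords.util.log import create_logger
from mediawords.util.perl import decode_object_from_bytes_if_needed

log = create_logger(__name__)


class McExampleJobException(Exception):
    """Placeholder exception, following the Mc...Exception naming used above."""
    pass


class ExampleJob(AbstractJob):
    """Placeholder worker illustrating the structure the examples share."""

    @classmethod
    def run_job(cls, stories_id: int) -> None:
        # Broker arguments may arrive as bytes, so decode and cast first
        # (see Examples #2 and #8).
        if isinstance(stories_id, bytes):
            stories_id = decode_object_from_bytes_if_needed(stories_id)
        stories_id = int(stories_id)

        db = connect_to_db()

        log.info("Processing story ID %d..." % stories_id)

        try:
            # The actual per-job work against `db` would go here.
            pass
        except Exception as ex:
            # Wrap failures in a job-specific exception, as the examples above do.
            raise McExampleJobException(
                "Unable to process story ID %d: %s" % (stories_id, str(ex)))

        log.info("Finished processing story ID %d" % stories_id)

    @classmethod
    def queue_name(cls) -> str:
        return 'MediaWords::Job::Example'


if __name__ == '__main__':
    app = JobBrokerApp(job_class=ExampleJob)
    app.start_worker()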