Example #1
0
    def add_stories_from_feed(self, db: DatabaseHandler, download: dict,
                              content: str) -> List[int]:
        """
        Handle feeds of type 'web_page' by just creating a story to associate with the content.

        Web page feeds are feeds that consist of a web page that we download once a week and add as a story.
        """
        download = decode_object_from_bytes_if_needed(download)
        content = decode_object_from_bytes_if_needed(content)

        feeds_id = download['feeds_id']

        feed = db.find_by_id(table='feeds', object_id=feeds_id)

        title = html_title(html=content, fallback='(no title)')
        title += '[' + sql_now() + ']'

        guid = f"{str(int(time.time()))}:{download['url']}"[0:1024]

        new_story = {
            'url': download['url'],
            'guid': guid,
            'media_id': feed['media_id'],
            'publish_date': sql_now(),
            'title': title,
        }

        story = add_story(db=db, story=new_story, feeds_id=feeds_id)
        if not story:
            raise McCrawlerFetcherSoftError(f"Failed to add story {new_story}")

        db.query(
            """
            UPDATE downloads
            SET stories_id = %(stories_id)s,
                type = 'content'
            WHERE downloads_id = %(downloads_id)s
        """, {
                'stories_id': story['stories_id'],
                'downloads_id': download['downloads_id'],
            })

        # A webpage that was just fetched is also a story
        story_ids = [
            story['stories_id'],
        ]

        return story_ids
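
A hypothetical caller sketch for the method above: the handler class name, the download row and the HTML content are assumptions for illustration; only add_stories_from_feed() itself comes from the example.

db = connect_to_db()
handler = DownloadFeedWebPageHandler()  # assumed name of the class that defines add_stories_from_feed()

# 'downloads_id' is assumed to identify an already-fetched 'web_page' download
download = db.require_by_id(table='downloads', object_id=downloads_id)
content = "<html><head><title>Weekly page</title></head><body>...</body></html>"

story_ids = handler.add_stories_from_feed(db=db, download=download, content=content)
log.info(f"Added stories: {story_ids}")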
Example #2
0
    def fetch_download(self, db: DatabaseHandler, download: dict) -> Optional[Response]:
        download = decode_object_from_bytes_if_needed(download)

        url = self._download_url(download=download)
        if not is_http_url(url):
            raise McCrawlerFetcherSoftError(f"URL is not HTTP(s): {url}")

        download['download_time'] = sql_now()
        download['state'] = 'fetching'

        try:
            db.update_by_id(table='downloads', object_id=download['downloads_id'], update_hash=download)
        except McTupleAlreadyMovedError as ex:
            # Some attempts to set the download's row to "fetching" fail with:
            #
            #   "tuple to be locked was already moved to another partition due to concurrent update"
            #
            # If that happens, we assume that some other fetcher instance somehow got to the download first and do
            # nothing
            log.warning(f"Some other fetcher got to download {download['downloads_id']} first: {ex}")
            return None
        except Exception as ex:
            # Raise further on misc. errors
            raise ex

        ua = UserAgent()
        response = ua.get_follow_http_html_redirects(url)

        return response
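
A hedged sketch of how a caller might treat the Optional return above: a None response signals that another fetcher instance claimed the download first. The 'fetcher' and 'download' variables are assumed to be in scope.

response = fetcher.fetch_download(db=db, download=download)
if response is None:
    log.info(f"Download {download['downloads_id']} was claimed by another fetcher; skipping.")
else:
    pass  # store / further process the fetched response here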
Example #3
0
def _add_user_story(db: DatabaseHandler, topic: dict, user: dict, topic_fetch_urls: list) -> dict:
    """Generate a story based on the given user, as returned by the twitter api."""
    content = f"{user['name']} ({user['screen_name']}): {user['description']}"
    title = f"{user['name']} ({user['screen_name']}) | Twitter"
    tweet_date = sql_now()
    url = f"https://twitter.com/{user['screen_name']}"

    story = generate_story(db=db, url=url, content=content, title=title, publish_date=tweet_date)
    add_to_topic_stories(db=db, story=story, topic=topic, link_mined=True)

    for topic_fetch_url in topic_fetch_urls:
        topic_fetch_url = _log_tweet_added(db, topic_fetch_url, story)
        try_update_topic_link_ref_stories_id(db, topic_fetch_url)

    # twitter user pages are undateable because there is never a consistent version of the page
    undateable_tag = _get_undateable_tag(db)

    stories_id = story['stories_id']
    tags_id = undateable_tag['tags_id']

    db.query("""
        INSERT INTO public.stories_tags_map (stories_id, tags_id)
        VALUES (%(stories_id)s, %(tags_id)s)
        ON CONFLICT (stories_id, tags_id) DO NOTHING
    """, {
        'stories_id': stories_id,
        'tags_id': tags_id,
    })

    return story
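
A self-contained illustration of the story fields that the function above derives from a Twitter API user dict; the user values below are invented, only the string formatting mirrors the code.

user = {
    'name': 'Jane Doe',
    'screen_name': 'janedoe',
    'description': 'Reporter covering climate policy.',
}

content = f"{user['name']} ({user['screen_name']}): {user['description']}"
title = f"{user['name']} ({user['screen_name']}) | Twitter"
url = f"https://twitter.com/{user['screen_name']}"

print(content)  # Jane Doe (janedoe): Reporter covering climate policy.
print(title)    # Jane Doe (janedoe) | Twitter
print(url)      # https://twitter.com/janedoe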
Example #4
0
    def __log_request(request: Request) -> None:
        """Log HTTP request."""
        # FIXME use Python's logging facilities

        if request is None:
            raise McRequestException("Request is None.")

        url = request.url()
        if url is None:
            raise McRequestException("URL is None.")
        if len(url) == 0:
            raise McRequestException("URL is empty.")

        config = py_get_config()

        http_request_log_path = os.path.join(config['mediawords']['data_dir'],
                                             'logs', 'http_request.log')

        with open(http_request_log_path, 'a') as f:

            while True:
                try:
                    fcntl.flock(f, fcntl.LOCK_EX | fcntl.LOCK_NB)
                    break
                except IOError as e:
                    # raise on unrelated IOErrors
                    if e.errno != errno.EAGAIN:
                        raise
                    else:
                        log.warning("Waiting for HTTP request log lock...")
                        time.sleep(0.1)

            f.write("%s %s\n" % (
                sql_now(),
                url,
            ))

            # Doesn't write the "invalidating blacklist url <...>" message because it's apparent from the URL itself

            fcntl.flock(f, fcntl.LOCK_UN)

        # Processes from various users (web service, workers, ...) will want to write to the same file
        try:
            os.chmod(http_request_log_path, 0o666)
        except PermissionError as ex:
            # Web server process might attempt to chmod the file without the appropriate permissions
            log.debug("Failed to chmod %s: %s" % (
                http_request_log_path,
                str(ex),
            ))
            pass
Example #5
0
    def __log_request(request: Request) -> None:
        """Log HTTP request."""
        # FIXME use Python's logging facilities

        if request is None:
            raise McRequestException("Request is None.")

        url = request.url()
        if url is None:
            raise McRequestException("URL is None.")
        if len(url) == 0:
            raise McRequestException("URL is empty.")

        log.debug("HTTP request: %s %s\n" % (sql_now(), url,))
Example #6
0
    def fetch_download(self, db: DatabaseHandler, download: dict) -> Response:
        download = decode_object_from_bytes_if_needed(download)

        download['download_time'] = sql_now()
        download['state'] = 'fetching'

        db.update_by_id(table='downloads', object_id=download['downloads_id'], update_hash=download)

        ua = UserAgent()
        url_with_credentials = self._api_request_url_with_signature_from_config(api_url=download['url'])
        request = Request(method='GET', url=url_with_credentials)
        response = ua.request(request)

        return response
Example #7
0
    def update_job_state_message(self, db: DatabaseHandler, message: str) -> None:
        """
        Update the message field for the current "job_states" row.

        This is a public method that is intended to be used by code running anywhere up the stack from run() to
        publish messages updating the progress of a long-running job.
        """
        message = decode_object_from_bytes_if_needed(message)

        # Verify that it exists I guess?
        db.require_by_id(table='job_states', object_id=self.__job_states_id)

        job_state = db.update_by_id(table='job_states', object_id=self.__job_states_id, update_hash={
            'message': message,
            'last_updated': sql_now(),
        })

        self.__update_table_state(db=db, job_state=job_state)
Example #8
0
    def fetch_download(self, db: DatabaseHandler, download: dict) -> Response:
        download = decode_object_from_bytes_if_needed(download)

        url = self._download_url(download=download)
        if not is_http_url(url):
            raise McCrawlerFetcherSoftError(f"URL is not HTTP(s): {url}")

        download['download_time'] = sql_now()
        download['state'] = 'fetching'

        db.update_by_id(table='downloads',
                        object_id=download['downloads_id'],
                        update_hash=download)

        ua = UserAgent()
        response = ua.get_follow_http_html_redirects(url)

        return response
Example #9
0
    def __log_request(request: Request) -> None:
        """Log HTTP request."""
        # FIXME use Python's logging facilities

        if request is None:
            raise McRequestException("Request is None.")

        url = request.url()
        if url is None:
            raise McRequestException("URL is None.")
        if len(url) == 0:
            raise McRequestException("URL is empty.")

        config = py_get_config()

        http_request_log_path = os.path.join(config['mediawords']['data_dir'], 'logs', 'http_request.log')

        with open(http_request_log_path, encoding='utf-8', mode='a') as f:

            while True:
                try:
                    fcntl.flock(f, fcntl.LOCK_EX | fcntl.LOCK_NB)
                    break
                except IOError as e:
                    # raise on unrelated IOErrors
                    if e.errno != errno.EAGAIN:
                        raise
                    else:
                        log.warning("Waiting for HTTP request log lock...")
                        time.sleep(0.1)

            f.write("%s %s\n" % (sql_now(), url,))

            # Doesn't write the "invalidating blacklist url <...>" message because it's apparent from the URL itself

            fcntl.flock(f, fcntl.LOCK_UN)

        # Processes from various users (web service, workers, ...) will want to write to the same file
        try:
            os.chmod(http_request_log_path, 0o666)
        except PermissionError as ex:
            # Web server process might attempt to chmod the file without the appropriate permissions
            log.debug("Failed to chmod %s: %s" % (http_request_log_path, str(ex),))
            pass
Example #10
0
    def update_job_state(self, db: DatabaseHandler, state: str, message: Optional[str] = ''):
        """
        Update the state and message fields of the "job_states" table for the currently active "job_states_id".

        "job_states_id" is set and unset in method run() below, so this must be called from code running from within
        the run() implementation of the subclass.
        """
        state = decode_object_from_bytes_if_needed(state)
        message = decode_object_from_bytes_if_needed(message)

        log.debug(f"{self.__queue_name} state: {state}")

        job_state = db.update_by_id(table='job_states', object_id=self.__job_states_id, update_hash={
            'state': state,
            'last_updated': sql_now(),
            'message': message,
        })

        self.__update_table_state(db=db, job_state=job_state)
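
A hypothetical run() body showing how a job subclass might combine update_job_state() (above) with update_job_state_message() from Example #7 to report progress; the per-story loop and the state names are assumptions for illustration.

def run(self, db: DatabaseHandler, stories_ids: list) -> None:
    # Illustrative only: everything except the two update_* calls is assumed.
    self.update_job_state(db=db, state='running')
    for i, stories_id in enumerate(stories_ids, start=1):
        # ... per-story work would go here ...
        self.update_job_state_message(db=db, message=f"Processed {i} of {len(stories_ids)} stories")
    self.update_job_state(db=db, state='completed', message='Done')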
Example #11
0
def _add_user_story(db: DatabaseHandler, topic: dict, user: dict, topic_fetch_urls: list) -> dict:
    """Generate a story based on the given user, as returned by the twitter api."""
    content = '%s (%s): %s' % (user['name'], user['screen_name'], user['description'])
    title = '%s (%s) | Twitter' % (user['name'], user['screen_name'])
    tweet_date = sql_now()
    url = 'https://twitter.com/%s' % user['screen_name']

    story = generate_story(db=db, url=url, content=content, title=title, publish_date=tweet_date)
    add_to_topic_stories(db=db, story=story, topic=topic, link_mined=True)

    for topic_fetch_url in topic_fetch_urls:
        topic_fetch_url = _log_tweet_added(db, topic_fetch_url, story)
        try_update_topic_link_ref_stories_id(db, topic_fetch_url)

    # twitter user pages are undateable because there is never a consistent version of the page
    undateable_tag = _get_undateable_tag(db)
    db.query(
        "insert into stories_tags_map (stories_id, tags_id) values (%(a)s, %(b)s)",
        {'a': story['stories_id'], 'b': undateable_tag['tags_id']})

    return story
Example #12
0
def copy_story_to_new_medium(db: DatabaseHandler, topic: dict, old_story: dict, new_medium: dict) -> dict:
    """Copy story to new medium.

    Copy the given story, assigning the new media_id and copying over the download, extracted text, and so on.
    Return the new story.
    """

    story = db.create(
        'stories',
        {
            'url': old_story['url'],
            'media_id': new_medium['media_id'],
            'guid': old_story['guid'],
            'publish_date': old_story['publish_date'],
            'collect_date': sql_now(),
            'description': old_story['description'],
            'title': old_story['title']
        },
    )
    add_to_topic_stories(db=db, story=story, topic=topic, valid_foreign_rss_story=True)

    for old_story_tag in db.query(
        """
        SELECT tags_id
        FROM stories_tags_map
        WHERE stories_id = %(stories_id)s
        ORDER BY tags_id
        """,
        {'stories_id': old_story['stories_id']},
    ).hashes():
        stories_id = story['stories_id']
        tags_id = old_story_tag['tags_id']

        db.query("""
            INSERT INTO stories_tags_map (stories_id, tags_id)
            VALUES (%(stories_id)s, %(tags_id)s)
            ON CONFLICT (stories_id, tags_id) DO NOTHING
        """, {
            'stories_id': stories_id,
            'tags_id': tags_id,
        })

    feed = get_spider_feed(db, new_medium)
    db.create('feeds_stories_map', {'feeds_id': feed['feeds_id'], 'stories_id': story['stories_id']})

    old_download = db.query(
        """
            SELECT *
            FROM downloads
            WHERE stories_id = %(stories_id)s
            ORDER BY downloads_id
            LIMIT 1
        """,
        {
            'stories_id': old_story['stories_id'],
        }
    ).hash()
    download = create_download_for_new_story(db, story, feed)

    if old_download is not None:
        try:
            content = fetch_content(db, old_download)
            download = store_content(db, download, content)
        except (McDBIDownloadsException, McAmazonS3StoreException):
            download_update = dict([(f, old_download[f]) for f in ['state', 'error_message', 'download_time']])
            db.update_by_id('downloads', download['downloads_id'], download_update)

        db.query(
            """
                INSERT INTO download_texts (
                    downloads_id,
                    download_text,
                    download_text_length
                )
                    SELECT
                        %(downloads_id)s,
                        dt.download_text,
                        dt.download_text_length
                    FROM download_texts AS dt
                    WHERE dt.downloads_id = %(downloads_id)s
            """,
            {
                'downloads_id': download['downloads_id'],
            },
        )

    # noinspection SqlInsertValues
    db.query(
        """
            INSERT INTO story_sentences (
                stories_id,
                sentence_number,
                sentence,
                media_id,
                publish_date,
                language
            )
                SELECT
                    %(new_stories_id)s,
                    sentence_number,
                    sentence,
                    media_id,
                    publish_date,
                    language
                FROM story_sentences
                WHERE stories_id = %(old_stories_id)s
        """,
        {
            'old_stories_id': old_story['stories_id'],
            'new_stories_id': int(story['stories_id']),
        },
    )

    return story
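
A hedged usage sketch for copy_story_to_new_medium(): 'topic', 'old_story' and 'new_medium' are assumed to be rows already loaded from the database; the assertions just restate what the function copies.

new_story = copy_story_to_new_medium(db=db, topic=topic, old_story=old_story, new_medium=new_medium)

assert new_story['media_id'] == new_medium['media_id']  # the copy belongs to the new medium
assert new_story['url'] == old_story['url']              # url, guid, title and publish_date are copied over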
Example #13
0
def test_cliff_tagger():
    db = connect_to_db()

    media = db.create(table='media',
                      insert_hash={
                          'name': "test medium",
                          'url': "url://test/medium",
                      })

    story = db.create(table='stories',
                      insert_hash={
                          'media_id': media['media_id'],
                          'url': 'url://story/a',
                          'guid': 'guid://story/a',
                          'title': 'story a',
                          'description': 'description a',
                          'publish_date': sql_now(),
                          'collect_date': sql_now(),
                          'full_text_rss': True,
                      })
    stories_id = story['stories_id']

    db.create(table='story_sentences',
              insert_hash={
                  'stories_id': stories_id,
                  'sentence_number': 1,
                  'sentence': 'I hope that the CLIFF annotator is working.',
                  'media_id': media['media_id'],
                  'publish_date': sql_now(),
                  'language': 'en'
              })

    store = CLIFFAnnotatorStore()
    store.store_annotation_for_story(db=db,
                                     stories_id=stories_id,
                                     annotation=sample_cliff_response())

    cliff = CLIFFTagger()
    cliff.update_tags_for_story(db=db, stories_id=stories_id)

    story_tags = db.query(
        """
        SELECT
            tags.tag AS tags_name,
            tags.label AS tags_label,
            tags.description AS tags_description,
            tag_sets.name AS tag_sets_name,
            tag_sets.label AS tag_sets_label,
            tag_sets.description AS tag_sets_description
        FROM stories_tags_map
            INNER JOIN tags
                ON stories_tags_map.tags_id = tags.tags_id
            INNER JOIN tag_sets
                ON tags.tag_sets_id = tag_sets.tag_sets_id
        WHERE stories_tags_map.stories_id = %(stories_id)s
        ORDER BY
            lower(tag_sets.name),
            lower(tags.tag)
    """, {
            'stories_id': stories_id
        }).hashes()

    expected_tags = expected_cliff_tags()

    assert story_tags == expected_tags
Example #14
0
def copy_story_to_new_medium(db: DatabaseHandler, topic: dict, old_story: dict,
                             new_medium: dict) -> dict:
    """Copy story to new medium.

    Copy the given story, assigning the new media_id and copying over the download, extracted text, and so on.
    Return the new story.
    """

    story = {
        'url': old_story['url'],
        'media_id': new_medium['media_id'],
        'guid': old_story['guid'],
        'publish_date': old_story['publish_date'],
        'collect_date': sql_now(),
        'description': old_story['description'],
        'title': old_story['title']
    }

    story = db.create('stories', story)
    add_to_topic_stories(db=db,
                         story=story,
                         topic=topic,
                         valid_foreign_rss_story=True)

    db.query(
        """
        insert into stories_tags_map (stories_id, tags_id)
            select %(a)s, stm.tags_id from stories_tags_map stm where stm.stories_id = %(b)s
        """, {
            'a': story['stories_id'],
            'b': old_story['stories_id']
        })

    feed = get_spider_feed(db, new_medium)
    db.create('feeds_stories_map', {
        'feeds_id': feed['feeds_id'],
        'stories_id': story['stories_id']
    })

    old_download = db.query(
        "select * from downloads where stories_id = %(a)s order by downloads_id limit 1",
        {
            'a': old_story['stories_id']
        }).hash()
    download = create_download_for_new_story(db, story, feed)

    if old_download is not None:
        try:
            content = fetch_content(db, old_download)
            download = store_content(db, download, content)
        except (McDBIDownloadsException, McAmazonS3StoreException):
            download_update = dict([
                (f, old_download[f])
                for f in ['state', 'error_message', 'download_time']
            ])
            db.update_by_id('downloads', download['downloads_id'],
                            download_update)

        db.query(
            """
            insert into download_texts (downloads_id, download_text, download_text_length)
                select %(a)s, dt.download_text, dt.download_text_length
                    from download_texts dt
                    where dt.downloads_id = %(a)s
            """, {'a': download['downloads_id']})

    # noinspection SqlInsertValues
    db.query(
        f"""
        insert into story_sentences (stories_id, sentence_number, sentence, media_id, publish_date, language)
            select {int(story['stories_id'])} as stories_id, sentence_number, sentence, media_id, publish_date, language
                from story_sentences
                where stories_id = %(b)s
        """, {'b': old_story['stories_id']})

    return story
Example #15
0
    def test_nyt_labels_annotator(self):
        media = self.db().create(table='media',
                                 insert_hash={
                                     'name': "test medium",
                                     'url': "url://test/medium",
                                 })

        story = self.db().create(table='stories',
                                 insert_hash={
                                     'media_id': media['media_id'],
                                     'url': 'url://story/a',
                                     'guid': 'guid://story/a',
                                     'title': 'story a',
                                     'description': 'description a',
                                     'publish_date': sql_now(),
                                     'collect_date': sql_now(),
                                     'full_text_rss': True,
                                 })
        stories_id = story['stories_id']

        self.db().create(table='story_sentences',
                         insert_hash={
                             'stories_id': stories_id,
                             'sentence_number': 1,
                             'sentence':
                             'I hope that the CLIFF annotator is working.',
                             'media_id': media['media_id'],
                             'publish_date': sql_now(),
                             'language': 'en'
                         })

        def __nyt_labels_sample_response(
                _: HashServer.Request) -> Union[str, bytes]:
            """Mock annotator."""
            response = ""
            response += "HTTP/1.0 200 OK\r\n"
            response += "Content-Type: application/json; charset=UTF-8\r\n"
            response += "\r\n"
            response += encode_json(self.__sample_nyt_labels_response())
            return response

        pages = {
            '/predict.json': {
                'callback': __nyt_labels_sample_response,
            }
        }

        port = random_unused_port()
        annotator_url = 'http://localhost:%d/predict.json' % port

        hs = HashServer(port=port, pages=pages)
        hs.start()

        # Inject NYTLabels credentials into configuration
        config = py_get_config()
        new_config = copy.deepcopy(config)
        new_config['nytlabels'] = {
            'enabled': True,
            'annotator_url': annotator_url,
        }
        py_set_config(new_config)

        nytlabels = NYTLabelsAnnotator()
        nytlabels.annotate_and_store_for_story(db=self.db(),
                                               stories_id=stories_id)
        nytlabels.update_tags_for_story(db=self.db(), stories_id=stories_id)

        hs.stop()

        # Reset configuration
        py_set_config(config)

        annotation_exists = self.db().query(
            """
            SELECT 1
            FROM nytlabels_annotations
            WHERE object_id = %(object_id)s
        """, {
                'object_id': stories_id
            }).hash()
        assert annotation_exists is not None

        story_tags = self.db().query(
            """
            SELECT
                tags.tag AS tags_name,
                tags.label AS tags_label,
                tags.description AS tags_description,
                tag_sets.name AS tag_sets_name,
                tag_sets.label AS tag_sets_label,
                tag_sets.description AS tag_sets_description
            FROM stories_tags_map
                INNER JOIN tags
                    ON stories_tags_map.tags_id = tags.tags_id
                INNER JOIN tag_sets
                    ON tags.tag_sets_id = tag_sets.tag_sets_id
            WHERE stories_tags_map.stories_id = %(stories_id)s
            ORDER BY tags.tag COLLATE "C", tag_sets.name COLLATE "C"
        """, {
                'stories_id': stories_id
            }).hashes()

        expected_tags = self.__expected_tags()

        assert story_tags == expected_tags
Example #16
0
def test_add_stale_feeds():
    db = connect_to_db()

    medium = create_test_medium(db, 'foo')

    pending_feeds = []

    feed = {
        'media_id': medium['media_id'],
        'name': 'null last download',
        'url': 'http://null last download',
        'type': 'syndicated',
        'active': True,
        'last_attempted_download_time': None
    }
    feed = db.create('feeds', feed)
    pending_feeds.append(feed)

    feed = {
        'media_id': medium['media_id'],
        'name': 'recent last download',
        'url': 'http://recent last download',
        'type': 'syndicated',
        'active': True,
        'last_attempted_download_time': sql_now()
    }
    db.create('feeds', feed)

    feed = {
        'media_id': medium['media_id'],
        'name': 'recent last new story',
        'url': 'http://recent last new story',
        'type': 'syndicated',
        'active': True,
        'last_attempted_download_time': sql_now(),
        'last_new_story_time': sql_now()
    }
    db.create('feeds', feed)

    feed = {
        'media_id': medium['media_id'],
        'name': '5 minute new story',
        'url': 'http://5 minute new story',
        'type': 'syndicated',
        'active': True,
        'last_attempted_download_time': get_sql_date_from_epoch(int(time.time()) - 300),
        'last_new_story_time': get_sql_date_from_epoch(int(time.time()) - 300),
    }
    feed = db.create('feeds', feed)
    pending_feeds.append(feed)

    feed = {
        'media_id': medium['media_id'],
        'name': 'old last download',
        'url': 'http://old last download',
        'type': 'syndicated',
        'active': True,
        'last_attempted_download_time': get_sql_date_from_epoch(int(time.time()) - (86400 * 10))
    }
    feed = db.create('feeds', feed)
    pending_feeds.append(feed)

    _add_stale_feeds(db)

    num_pending_downloads = db.query(
        "select count(*) from downloads where state = 'pending'").flat()[0]
    assert num_pending_downloads == len(pending_feeds)

    for feed in pending_feeds:
        exists = db.query(
            "select * from downloads where state = 'pending' and feeds_id = %(a)s",
            {
                'a': feed['feeds_id']
            }).hash()
        assert exists, "download for feed %s added" % feed['name']
Example #17
0
def test_cliff_annotator():

    db = connect_to_db()

    media = db.create(table='media', insert_hash={
        'name': "test medium",
        'url': "url://test/medium",
    })

    story = db.create(table='stories', insert_hash={
        'media_id': media['media_id'],
        'url': 'url://story/a',
        'guid': 'guid://story/a',
        'title': 'story a',
        'description': 'description a',
        'publish_date': sql_now(),
        'collect_date': sql_now(),
        'full_text_rss': True,
    })
    stories_id = story['stories_id']

    db.create(table='story_sentences', insert_hash={
        'stories_id': stories_id,
        'sentence_number': 1,
        'sentence': 'I hope that the CLIFF annotator is working.',
        'media_id': media['media_id'],
        'publish_date': sql_now(),
        'language': 'en'
    })

    def __cliff_sample_response(_: HashServer.Request) -> Union[str, bytes]:
        """Mock annotator."""
        response = ""
        response += "HTTP/1.0 200 OK\r\n"
        response += "Content-Type: application/json; charset=UTF-8\r\n"
        response += "\r\n"
        response += encode_json(sample_cliff_response())
        return response

    pages = {
        '/cliff/parse/text': {
            'callback': __cliff_sample_response,
        }
    }

    port = random_unused_port()
    annotator_url = 'http://localhost:%d/cliff/parse/text' % port

    hs = HashServer(port=port, pages=pages)
    hs.start()

    class TestCLIFFFetcherConfig(CLIFFFetcherConfig):
        @staticmethod
        def annotator_url() -> str:
            return annotator_url

    cliff = CLIFFAnnotatorFetcher(fetcher_config=TestCLIFFFetcherConfig())
    cliff.annotate_and_store_for_story(db=db, stories_id=stories_id)

    hs.stop()

    annotation_exists = db.query("""
        SELECT 1
        FROM cliff_annotations
        WHERE object_id = %(object_id)s
    """, {'object_id': stories_id}).hash()
    assert annotation_exists is not None
Example #18
0
def test_sql_now():
    assert sql_now() == datetime.datetime.today().strftime('%Y-%m-%d %H:%M:%S')
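
The assertion above pins down what sql_now() returns: the current local time formatted as an SQL-style timestamp string. A minimal sketch of such a helper (the real implementation may differ):

import datetime


def sql_now() -> str:
    """Current local time as an SQL-style 'YYYY-MM-DD HH:MM:SS' string."""
    return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')


print(sql_now())  # e.g. '2021-06-01 13:45:02'

Returning a plain string keeps the value directly usable in the insert_hash / update_hash dicts seen throughout the examples above.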
Example #19
0
    def test_add_stale_feeds(self) -> None:
        """Test _add_stale_feeds()."""
        db = self.db()

        medium = mediawords.test.db.create.create_test_medium(db, 'foo')

        pending_feeds = []

        feed = {
            'media_id': medium['media_id'],
            'name': 'null last download',
            'url': 'http://null last download',
            'type': 'syndicated',
            'active': True,
            'last_attempted_download_time': None
        }
        feed = db.create('feeds', feed)
        pending_feeds.append(feed)

        feed = {
            'media_id': medium['media_id'],
            'name': 'recent last download',
            'url': 'http://recent last download',
            'type': 'syndicated',
            'active': True,
            'last_attempted_download_time': sql_now()
        }
        feed = db.create('feeds', feed)

        feed = {
            'media_id': medium['media_id'],
            'name': 'recent last new story',
            'url': 'http://recent last new story',
            'type': 'syndicated',
            'active': True,
            'last_attempted_download_time': sql_now(),
            'last_new_story_time': sql_now()
        }
        feed = db.create('feeds', feed)

        feed = {
            'media_id': medium['media_id'],
            'name': '5 minute new story',
            'url': 'http://5 minute new story',
            'type': 'syndicated',
            'active': True,
            'last_attempted_download_time': get_sql_date_from_epoch(time.time() - 300),
            'last_new_story_time': get_sql_date_from_epoch(time.time() - 300),
        }
        feed = db.create('feeds', feed)
        pending_feeds.append(feed)

        feed = {
            'media_id': medium['media_id'],
            'name': 'old last download',
            'url': 'http://old last download',
            'type': 'syndicated',
            'active': True,
            'last_attempted_download_time': get_sql_date_from_epoch(time.time() - (86400 * 10))
        }
        feed = db.create('feeds', feed)
        pending_feeds.append(feed)

        mediawords.crawler.provider._add_stale_feeds(db)

        num_pending_downloads = db.query("select count(*) from downloads where state = 'pending'").flat()[0]
        assert num_pending_downloads == len(pending_feeds)

        for feed in pending_feeds:
            exists = db.query(
                "select * from downloads where state = 'pending' and feeds_id = %(a)s",
                {'a': feed['feeds_id']}).hash()
            assert exists, "download for feed %s added" % feed['name']
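
The test above also relies on get_sql_date_from_epoch() to build timestamps relative to now. A minimal sketch, assuming the helper simply converts Unix epoch seconds into the same SQL-style string (the real implementation may differ):

import datetime
import time


def get_sql_date_from_epoch(epoch: int) -> str:
    """Convert Unix epoch seconds to an SQL-style 'YYYY-MM-DD HH:MM:SS' string."""
    return datetime.datetime.fromtimestamp(epoch).strftime('%Y-%m-%d %H:%M:%S')


print(get_sql_date_from_epoch(int(time.time()) - 300))  # five minutes ago, as used in the test above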
Example #20
0
    def test_nyt_labels_annotator(self):
        media = self.db().create(table='media', insert_hash={
            'name': "test medium",
            'url': "url://test/medium",
        })

        story = self.db().create(table='stories', insert_hash={
            'media_id': media['media_id'],
            'url': 'url://story/a',
            'guid': 'guid://story/a',
            'title': 'story a',
            'description': 'description a',
            'publish_date': sql_now(),
            'collect_date': sql_now(),
            'full_text_rss': True,
        })
        stories_id = story['stories_id']

        self.db().create(table='story_sentences', insert_hash={
            'stories_id': stories_id,
            'sentence_number': 1,
            'sentence': 'I hope that the CLIFF annotator is working.',
            'media_id': media['media_id'],
            'publish_date': sql_now(),
            'language': 'en'
        })

        def __nyt_labels_sample_response(_: HashServer.Request) -> Union[str, bytes]:
            """Mock annotator."""
            response = ""
            response += "HTTP/1.0 200 OK\r\n"
            response += "Content-Type: application/json; charset=UTF-8\r\n"
            response += "\r\n"
            response += encode_json(self.__sample_nyt_labels_response())
            return response

        pages = {
            '/predict.json': {
                'callback': __nyt_labels_sample_response,
            }
        }

        port = random_unused_port()
        annotator_url = 'http://localhost:%d/predict.json' % port

        hs = HashServer(port=port, pages=pages)
        hs.start()

        # Inject NYTLabels credentials into configuration
        config = py_get_config()
        new_config = copy.deepcopy(config)
        new_config['nytlabels'] = {
            'enabled': True,
            'annotator_url': annotator_url,
        }
        py_set_config(new_config)

        nytlabels = NYTLabelsAnnotator()
        nytlabels.annotate_and_store_for_story(db=self.db(), stories_id=stories_id)
        nytlabels.update_tags_for_story(db=self.db(), stories_id=stories_id)

        hs.stop()

        # Reset configuration
        py_set_config(config)

        annotation_exists = self.db().query("""
            SELECT 1
            FROM nytlabels_annotations
            WHERE object_id = %(object_id)s
        """, {'object_id': stories_id}).hash()
        assert annotation_exists is not None

        story_tags = self.db().query("""
            SELECT
                tags.tag AS tags_name,
                tags.label AS tags_label,
                tags.description AS tags_description,
                tag_sets.name AS tag_sets_name,
                tag_sets.label AS tag_sets_label,
                tag_sets.description AS tag_sets_description
            FROM stories_tags_map
                INNER JOIN tags
                    ON stories_tags_map.tags_id = tags.tags_id
                INNER JOIN tag_sets
                    ON tags.tag_sets_id = tag_sets.tag_sets_id
            WHERE stories_tags_map.stories_id = %(stories_id)s
            ORDER BY tags.tag COLLATE "C", tag_sets.name COLLATE "C"
        """, {'stories_id': stories_id}).hashes()

        expected_tags = self.__expected_tags()

        assert story_tags == expected_tags
Example #21
0
    def test_tagging(self):
        db = connect_to_db()

        media = db.create(table='media',
                          insert_hash={
                              'name': "test medium",
                              'url': "url://test/medium",
                          })

        story = db.create(table='stories',
                          insert_hash={
                              'media_id': media['media_id'],
                              'url': 'url://story/a',
                              'guid': 'guid://story/a',
                              'title': 'story a',
                              'description': 'description a',
                              'publish_date': sql_now(),
                              'collect_date': sql_now(),
                              'full_text_rss': True,
                          })
        stories_id = story['stories_id']

        db.create(table='story_sentences',
                  insert_hash={
                      'stories_id': stories_id,
                      'sentence_number': 1,
                      'sentence':
                      'I hope that the CLIFF annotator is working.',
                      'media_id': media['media_id'],
                      'publish_date': sql_now(),
                      'language': 'en'
                  })

        def __cliff_sample_response(
                _: HashServer.Request) -> Union[str, bytes]:
            """Mock annotator."""
            response = ""
            response += "HTTP/1.0 200 OK\r\n"
            response += "Content-Type: application/json; charset=UTF-8\r\n"
            response += "\r\n"
            response += encode_json(sample_cliff_response())
            return response

        pages = {
            '/cliff/parse/text': {
                'callback': __cliff_sample_response,
            }
        }

        port = random_unused_port()
        annotator_url = 'http://localhost:%d/cliff/parse/text' % port

        hs = HashServer(port=port, pages=pages)
        hs.start()

        class TestCLIFFFetcherConfig(CLIFFTagsFromAnnotationConfig):
            @staticmethod
            def annotator_url() -> str:
                return annotator_url

        cliff = CLIFFTagsFromAnnotation(tagger_config=TestCLIFFFetcherConfig())
        cliff.update_tags_for_story(db=db, stories_id=stories_id)

        hs.stop()

        story_tags = db.query(
            """
            SELECT
                tags.tag AS tags_name,
                tags.label AS tags_label,
                tags.description AS tags_description,
                tag_sets.name AS tag_sets_name,
                tag_sets.label AS tag_sets_label,
                tag_sets.description AS tag_sets_description
            FROM stories_tags_map
                INNER JOIN tags
                    ON stories_tags_map.tags_id = tags.tags_id
                INNER JOIN tag_sets
                    ON tags.tag_sets_id = tag_sets.tag_sets_id
            WHERE stories_tags_map.stories_id = %(stories_id)s
            ORDER BY
                lower(tag_sets.name),
                lower(tags.tag)
        """, {
                'stories_id': stories_id
            }).hashes()

        expected_tags = expected_cliff_tags()

        assert story_tags == expected_tags
Example #22
0
    def test_nyt_labels_annotator(self):

        db = connect_to_db()

        media = db.create(table='media', insert_hash={
            'name': "test medium",
            'url': "url://test/medium",
        })

        story = db.create(table='stories', insert_hash={
            'media_id': media['media_id'],
            'url': 'url://story/a',
            'guid': 'guid://story/a',
            'title': 'story a',
            'description': 'description a',
            'publish_date': sql_now(),
            'collect_date': sql_now(),
            'full_text_rss': True,
        })
        stories_id = story['stories_id']

        db.create(table='story_sentences', insert_hash={
            'stories_id': stories_id,
            'sentence_number': 1,
            'sentence': 'I hope that the CLIFF annotator is working.',
            'media_id': media['media_id'],
            'publish_date': sql_now(),
            'language': 'en'
        })

        store = NYTLabelsAnnotatorStore()
        store.store_annotation_for_story(db=db, stories_id=stories_id, annotation=sample_nytlabels_response())

        nytlabels = NYTLabelsTagger()
        nytlabels.update_tags_for_story(db=db, stories_id=stories_id)

        annotation_exists = db.query("""
            SELECT 1
            FROM nytlabels_annotations
            WHERE object_id = %(object_id)s
        """, {'object_id': stories_id}).hash()
        assert annotation_exists is not None

        story_tags = db.query("""
            SELECT
                tags.tag AS tags_name,
                tags.label AS tags_label,
                tags.description AS tags_description,
                tag_sets.name AS tag_sets_name,
                tag_sets.label AS tag_sets_label,
                tag_sets.description AS tag_sets_description
            FROM stories_tags_map
                INNER JOIN tags
                    ON stories_tags_map.tags_id = tags.tags_id
                INNER JOIN tag_sets
                    ON tags.tag_sets_id = tag_sets.tag_sets_id
            WHERE stories_tags_map.stories_id = %(stories_id)s
            ORDER BY tags.tag COLLATE "C", tag_sets.name COLLATE "C"
        """, {'stories_id': stories_id}).hashes()

        expected_tags = expected_nytlabels_tags()

        assert story_tags == expected_tags
Example #23
0
def test_sql_now():
    assert sql_now() == datetime.datetime.today().strftime('%Y-%m-%d %H:%M:%S')