Example #1
def test_get_sql_date_from_epoch():
    assert get_sql_date_from_epoch(int(
        time.time())) == datetime.datetime.today().strftime(
            '%Y-%m-%d %H:%M:%S')
    assert get_sql_date_from_epoch(0) == datetime.datetime.fromtimestamp(
        0).strftime('%Y-%m-%d %H:%M:%S')
    # noinspection PyTypeChecker
    assert get_sql_date_from_epoch(
        'badger') == datetime.datetime.fromtimestamp(0).strftime(
            '%Y-%m-%d %H:%M:%S')
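The get_sql_date_from_epoch() helper itself is not shown on this page. Based purely on the assertions above, it appears to convert a Unix timestamp into a local-time 'YYYY-MM-DD HH:MM:SS' string and to fall back to epoch 0 for invalid input. A minimal sketch under those assumptions (the real Media Cloud implementation may differ):

import datetime


def get_sql_date_from_epoch(epoch) -> str:
    # Sketch only: behaviour inferred from the test above, not the actual implementation.
    try:
        epoch = int(epoch)
    except (TypeError, ValueError):
        # The test passes 'badger' and expects the epoch-0 date back.
        epoch = 0
    return datetime.datetime.fromtimestamp(epoch).strftime('%Y-%m-%d %H:%M:%S')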
Example #2
def _create_child_download_for_story(db: DatabaseHandler, story: dict, parent_download: dict) -> None:
    """Create a pending download for the story's URL."""
    story = decode_object_from_bytes_if_needed(story)
    parent_download = decode_object_from_bytes_if_needed(parent_download)

    download = {
        'feeds_id': parent_download['feeds_id'],
        'stories_id': story['stories_id'],
        'parent': parent_download['downloads_id'],
        'url': story['url'],
        'host': get_url_host(story['url']),
        'type': 'content',
        'sequence': 1,
        'state': 'pending',
        'priority': parent_download['priority'],
        'extracted': False,
    }

    content_delay = db.query("""
        SELECT content_delay
        FROM media
        WHERE media_id = %(media_id)s
    """, {'media_id': story['media_id']}).flat()[0]
    if content_delay:
        # Delay the download of content by this many hours. This is useful for sources that are likely to
        # significantly change content in the hours after it is first published.
        now = int(datetime.datetime.now(datetime.timezone.utc).timestamp())
        download_at_timestamp = now + (content_delay * 60 * 60)
        download['download_time'] = get_sql_date_from_epoch(download_at_timestamp)

    db.create(table='downloads', insert_hash=download)
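The content_delay column is expressed in hours, which is why the code above multiplies it by 60 * 60 before adding it to the current epoch. A brief worked example with a hypothetical delay of six hours (the real value comes from the media row):

import datetime

content_delay = 6  # hypothetical value, in hours
now = int(datetime.datetime.now(datetime.timezone.utc).timestamp())
download_at_timestamp = now + (content_delay * 60 * 60)  # 6 * 3600 = 21600 seconds later
# download['download_time'] = get_sql_date_from_epoch(download_at_timestamp)
# i.e. the pending download is scheduled roughly six hours from now.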
Example #3
    def publish_date_sql(self) -> Optional[str]:
        """Return item publication date as a PostgreSQL-formatted string in a local timezone."""
        postgresql_date = None

        published_tuple = self._parsed_publish_date()
        if published_tuple:
            # FIXME unfortunately, Perl's implementation would make the timezone vanish, so dates & times would get
            # stored in the machine's timezone in PostgreSQL (which is set to America/New_York in production). We
            # haven't added a timezone to the stories.publish_date column yet, so we have to keep the present buggy
            # behavior here.
            timestamp = int(calendar.timegm(published_tuple))
            postgresql_date = get_sql_date_from_epoch(timestamp)

        return postgresql_date
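A small illustration of the behaviour the FIXME above describes: calendar.timegm() interprets the parsed time tuple as UTC, but get_sql_date_from_epoch() (per the tests on this page) formats the resulting epoch in the machine's local timezone, so the original timezone information is gone by the time the string reaches stories.publish_date. This is only a sketch of that effect, not the production code:

import calendar
import datetime
import time

# Pretend the feed item was published at 12:00 UTC.
published_tuple = time.strptime('2021-06-01 12:00:00', '%Y-%m-%d %H:%M:%S')
timestamp = calendar.timegm(published_tuple)  # epoch for 12:00 UTC

# On a machine set to America/New_York this prints '2021-06-01 08:00:00',
# so the stored value depends on the server's timezone setting.
print(datetime.datetime.fromtimestamp(timestamp).strftime('%Y-%m-%d %H:%M:%S'))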
Example #4
def _create_child_download_for_story(db: DatabaseHandler, story: dict,
                                     parent_download: dict) -> None:
    """Create a pending download for the story's URL."""
    story = decode_object_from_bytes_if_needed(story)
    parent_download = decode_object_from_bytes_if_needed(parent_download)

    download = {
        'feeds_id': parent_download['feeds_id'],
        'stories_id': story['stories_id'],
        'parent': parent_download['downloads_id'],
        'url': story['url'],
        'host': get_url_host(story['url']),
        'type': 'content',
        'sequence': 1,
        'state': 'pending',
        'priority': parent_download['priority'],
        'extracted': False,
    }

    content_delay = db.query("""
        SELECT content_delay
        FROM media
        WHERE media_id = %(media_id)s
    """, {'media_id': story['media_id']}).flat()[0]
    if content_delay:
        # Delay the download of content by this many hours. This is useful for sources that are likely to
        # significantly change content in the hours after it is first published.
        now = int(datetime.datetime.now(datetime.timezone.utc).timestamp())
        download_at_timestamp = now + (content_delay * 60 * 60)
        download['download_time'] = get_sql_date_from_epoch(
            download_at_timestamp)

    db.create(table='downloads', insert_hash=download)
Example #5
def test_add_stale_feeds():
    db = connect_to_db()

    medium = create_test_medium(db, 'foo')

    pending_feeds = []

    feed = {
        'media_id': medium['media_id'],
        'name': 'null last download',
        'url': 'http://null last download',
        'type': 'syndicated',
        'active': True,
        'last_attempted_download_time': None
    }
    feed = db.create('feeds', feed)
    pending_feeds.append(feed)

    feed = {
        'media_id': medium['media_id'],
        'name': 'recent last download',
        'url': 'http://recent last download',
        'type': 'syndicated',
        'active': True,
        'last_attempted_download_time': sql_now()
    }
    db.create('feeds', feed)

    feed = {
        'media_id': medium['media_id'],
        'name': 'recent last new story',
        'url': 'http://recent last new story',
        'type': 'syndicated',
        'active': True,
        'last_attempted_download_time': sql_now(),
        'last_new_story_time': sql_now()
    }
    db.create('feeds', feed)

    feed = {
        'media_id': medium['media_id'],
        'name': '5 minute new story',
        'url': 'http://5 minute new story',
        'type': 'syndicated',
        'active': True,
        'last_attempted_download_time': get_sql_date_from_epoch(int(time.time()) - 300),
        'last_new_story_time': get_sql_date_from_epoch(int(time.time()) - 300),
    }
    feed = db.create('feeds', feed)
    pending_feeds.append(feed)

    feed = {
        'media_id': medium['media_id'],
        'name': 'old last download',
        'url': 'http://old last download',
        'type': 'syndicated',
        'active': True,
        'last_attempted_download_time': get_sql_date_from_epoch(int(time.time()) - (86400 * 10)),
    }
    feed = db.create('feeds', feed)
    pending_feeds.append(feed)

    _add_stale_feeds(db)

    num_pending_downloads = db.query(
        "select count(*) from downloads where state = 'pending'").flat()[0]
    assert num_pending_downloads == len(pending_feeds)

    for feed in pending_feeds:
        exists = db.query(
            "select * from downloads where state = 'pending' and feeds_id = %(a)s",
            {'a': feed['feeds_id']}).hash()
        assert exists, "download for feed %s added" % feed['name']
Example #6
def test_get_sql_date_from_epoch():
    assert get_sql_date_from_epoch(int(time.time())) == datetime.datetime.today().strftime('%Y-%m-%d %H:%M:%S')
    assert get_sql_date_from_epoch(0) == datetime.datetime.fromtimestamp(0).strftime('%Y-%m-%d %H:%M:%S')
    # noinspection PyTypeChecker
    assert get_sql_date_from_epoch('badger') == datetime.datetime.fromtimestamp(0).strftime('%Y-%m-%d %H:%M:%S')
Example #7
    def _get_stories_from_univision_feed(cls, content: str, media_id: int) -> List[Dict[str, Any]]:
        """Parse the feed. Return a (non-db-backed) story dict for each story found in the feed."""
        content = decode_object_from_bytes_if_needed(content)
        if isinstance(media_id, bytes):
            media_id = decode_object_from_bytes_if_needed(media_id)

        media_id = int(media_id)

        if not content:
            raise McCrawlerFetcherSoftError("Feed content is empty or undefined.")

        try:
            feed_json = decode_json(content)
        except Exception as ex:
            raise McCrawlerFetcherSoftError(f"Unable to decode Univision feed JSON: {ex}")

        try:
            # Intentionally raise exception on KeyError:
            if not feed_json['status'] == 'success':
                raise McCrawlerFetcherSoftError(f"Univision feed response is not 'success': {content}")
        except Exception as ex:
            raise McCrawlerFetcherSoftError(f"Unable to verify Univision feed status: {ex}")

        try:
            # Intentionally raise an exception if 'data' or 'items' is missing:
            feed_items = feed_json.get('data', None).get('items', None)
        except Exception as ex:
            raise McCrawlerFetcherSoftError(f"Univision feed response does not have 'data'/'items' key: {ex}")

        stories = []

        for item in feed_items:
            url = item.get('url', None)
            if not url:
                # Some items in the feed don't have their URLs set
                log.warning(f"'url' for item is not set: {item}")
                continue

            # sic -- we take "uid" (without "g") and call it "guid" (with "g")
            guid = item.get('uid', None)
            if not guid:
                raise McCrawlerFetcherSoftError(f"Item does not have its 'uid' set: {item}")

            title = item.get('title', '(no title)')
            description = item.get('description', '')

            try:
                # Intentionally raise exception on KeyError:
                str_publish_date = item['publishDate']
                publish_timestamp = str2time_21st_century(str_publish_date)
                publish_date = get_sql_date_from_epoch(publish_timestamp)
            except Exception as ex:
                # Die for good because Univision's dates should be pretty predictable
                raise McCrawlerFetcherSoftError(f"Unable to parse item's {item} publish date: {ex}")

            log.debug(f"Story found in Univision feed: URL '{url}', title '{title}', publish date '{publish_date}'")
            stories.append({
                'url': url,
                'guid': guid,
                'media_id': media_id,
                'publish_date': publish_date,
                'title': title,
                'description': description,
            })

        return stories
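Taken together, the keys accessed above imply a feed payload of roughly the following shape. Only the key structure is read off the code; the concrete values and the publishDate format are made-up illustrations:

# Hypothetical Univision feed payload matching the keys the parser reads:
example_feed_json = {
    'status': 'success',
    'data': {
        'items': [
            {
                'url': 'https://www.univision.com/noticias/example-story',
                'uid': 'abc123',  # stored as the story's 'guid'
                'title': 'Example story',
                'description': 'Example description',
                'publishDate': '2021-06-01 12:00:00',  # format assumed; must be parseable by str2time_21st_century()
            },
        ],
    },
}
# Given the JSON-encoded payload and a media_id, the method above would return one
# story dict per item, with publish_date converted via str2time_21st_century()
# followed by get_sql_date_from_epoch().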
Example #8
    def test_add_stale_feeds(self) -> None:
        """Test _add_stale_feeds()."""
        db = self.db()

        medium = mediawords.test.db.create.create_test_medium(db, 'foo')

        pending_feeds = []

        feed = {
            'media_id': medium['media_id'],
            'name': 'null last download',
            'url': 'http://null last download',
            'type': 'syndicated',
            'active': True,
            'last_attempted_download_time': None
        }
        feed = db.create('feeds', feed)
        pending_feeds.append(feed)

        feed = {
            'media_id': medium['media_id'],
            'name': 'recent last download',
            'url': 'http://recent last download',
            'type': 'syndicated',
            'active': True,
            'last_attempted_download_time': sql_now()
        }
        feed = db.create('feeds', feed)

        feed = {
            'media_id': medium['media_id'],
            'name': 'recent last new story',
            'url': 'http://recent last new story',
            'type': 'syndicated',
            'active': True,
            'last_attempted_download_time': sql_now(),
            'last_new_story_time': sql_now()
        }
        feed = db.create('feeds', feed)

        feed = {
            'media_id': medium['media_id'],
            'name': '5 minute new story',
            'url': 'http://5 minute new story',
            'type': 'syndicated',
            'active': True,
            'last_attempted_download_time': get_sql_date_from_epoch(time.time() - 300),
            'last_new_story_time': get_sql_date_from_epoch(time.time() - 300),
        }
        feed = db.create('feeds', feed)
        pending_feeds.append(feed)

        feed = {
            'media_id': medium['media_id'],
            'name': 'old last download',
            'url': 'http://old last download',
            'type': 'syndicated',
            'active': True,
            'last_attempted_download_time': get_sql_date_from_epoch(time.time() - (86400 * 10))
        }
        feed = db.create('feeds', feed)
        pending_feeds.append(feed)

        mediawords.crawler.provider._add_stale_feeds(db)

        num_pending_downloads = db.query("select count(*) from downloads where state = 'pending'").flat()[0]
        assert num_pending_downloads == len(pending_feeds)

        for feed in pending_feeds:
            exists = db.query(
                "select * from downloads where state = 'pending' and feeds_id = %(a)s",
                {'a': feed['feeds_id']}).hash()
            assert exists, "download for feed %s added" % feed['name']