示例#1
0
def test_http_hash_server_multiple_servers():
    """Test running multiple hash servers at the same time."""

    ports = [random_unused_port(), random_unused_port()]
    base_urls = ['http://localhost:%d' % port for port in ports]

    # noinspection PyTypeChecker,PyUnusedLocal
    def __sleep_forever_callback(request: HashServer.Request) -> Union[str, bytes]:
        time.sleep(9999)

    pages = {
        '/simple-page': 'Works!',
        '/sleep-forever': {'callback': __sleep_forever_callback},
    }

    # Both servers share the same page map but listen on different ports.
    servers = [HashServer(port=port, pages=pages) for port in ports]
    for server in servers:
        assert server

    for server in servers:
        server.start()

    for port in ports:
        assert tcp_port_is_open(port=port)

    for base_url in base_urls:
        # The hanging endpoint must trip the client-side timeout on each server.
        timed_out = False
        try:
            requests.get('%s/sleep-forever' % base_url, timeout=1)
        except requests.exceptions.Timeout:
            timed_out = True
        assert timed_out is True

        # A hanging handler must not block the simple page on the same server.
        assert str(requests.get('%s/simple-page' % base_url).text) == 'Works!'

    for server in servers:
        server.stop()

    # Stopping the servers must release both ports.
    for port in ports:
        assert tcp_port_is_open(port=port) is False
示例#2
0
    def setUp(self) -> None:
        """Start an HTTP server that serves 1 MB of random bytes as a fake MP3."""
        super().setUp()

        self.__mock_data = os.urandom(1024 * 1024)

        # noinspection PyUnusedLocal
        def __mp3_callback(request: HashServer.Request) -> Union[str, bytes]:
            # Hand-craft the raw HTTP response so the headers look exactly like
            # a real MP3 download (audio/mpeg with a correct Content-Length).
            headers = (
                "HTTP/1.0 200 OK\r\n"
                "Content-Type: audio/mpeg\r\n"
                f"Content-Length: {len(self.__mock_data)}\r\n"
                "\r\n"
            )
            return headers.encode('utf-8') + self.__mock_data

        port = random_unused_port()
        pages = {'/test.mp3': {'callback': __mp3_callback}}

        self.__hs = HashServer(port=port, pages=pages)
        self.__hs.start()

        self.__url = f"http://127.0.0.1:{port}/test.mp3"

        # Scratch location that the tests download the "MP3" into.
        self.__temp_dir = tempfile.mkdtemp('test')
        self.__dest_file = os.path.join(self.__temp_dir, 'test.mp3')
    def test_extract_article_html_from_page_html_connection_errors(self):
        """Try extracting with connection errors."""

        # Use multiprocessing.Value() because request might be handled in a fork
        self.is_first_response = multiprocessing.Value('i', 1)

        pages = {
            '/extract': {
                # Class-level callback; judging by its name and the final assert,
                # it presumably fails on the first request and succeeds afterwards
                # (defined elsewhere — confirm against the class body).
                'callback': self.__extract_but_initially_fail,
            }
        }
        port = random_unused_port()

        hs = HashServer(port=port, pages=pages)
        hs.start()

        class MockExtractorCommonConfig(CommonConfig):
            """Mock configuration which points to our unstable extractor."""
            def extractor_api_url(self) -> str:
                return f'http://localhost:{port}/extract'

        # The initial extractor request fails; the client is expected to retry
        # transparently and still produce a usable response.
        extractor_response = extract_article_html_from_page_html(
            content='whatever', config=MockExtractorCommonConfig())

        hs.stop()

        assert extractor_response
        assert 'extracted_html' in extractor_response
        assert 'extractor_version' in extractor_response

        assert extractor_response[
            'extracted_html'] == self.expected_extracted_text

        # The flag is cleared by the callback once the first (failing) request
        # has been served, proving the failure path was actually exercised.
        assert not self.is_first_response.value, "Make sure the initial extractor call failed."
示例#4
0
    def setUp(self):
        """Build the crawl starting URLs from a freshly reserved local port."""
        super().setUp()

        self.TEST_HTTP_SERVER_PORT = random_unused_port()
        self.TEST_HTTP_SERVER_URL = f'http://localhost:{self.TEST_HTTP_SERVER_PORT}'

        self.STARTING_URL_WITHOUT_CRUFT = f'{self.TEST_HTTP_SERVER_URL}/first'
        self.STARTING_URL = self.STARTING_URL_WITHOUT_CRUFT + self.CRUFT
示例#5
0
    def setUp(self):
        """Build the crawl starting URLs from a freshly reserved local port."""
        super().setUp()

        self.TEST_HTTP_SERVER_PORT = random_unused_port()
        self.TEST_HTTP_SERVER_URL = f'http://localhost:{self.TEST_HTTP_SERVER_PORT}'

        self.STARTING_URL_WITHOUT_CRUFT = f'{self.TEST_HTTP_SERVER_URL}/first'
        self.STARTING_URL = self.STARTING_URL_WITHOUT_CRUFT + self.CRUFT
def test_http_hash_server_multiple_servers():
    """Test running multiple hash servers at the same time."""

    ports = [random_unused_port(), random_unused_port()]
    base_urls = ['http://localhost:%d' % port for port in ports]

    # noinspection PyTypeChecker,PyUnusedLocal
    def __sleep_forever_callback(request: HashServer.Request) -> Union[str, bytes]:
        time.sleep(9999)

    pages = {
        '/simple-page': 'Works!',
        '/sleep-forever': {'callback': __sleep_forever_callback},
    }

    # Two independent servers sharing the same page map.
    servers = [HashServer(port=port, pages=pages) for port in ports]
    for server in servers:
        assert server

    for server in servers:
        server.start()

    for port in ports:
        assert tcp_port_is_open(port=port)

    for base_url in base_urls:
        # The hanging endpoint must trip the client-side timeout.
        timed_out = False
        try:
            requests.get('%s/sleep-forever' % base_url, timeout=1)
        except requests.exceptions.Timeout:
            timed_out = True
        assert timed_out is True

        # The slow handler must not block the simple page of the same server.
        assert str(requests.get('%s/simple-page' % base_url).text) == 'Works!'

    for server in servers:
        server.stop()

    # Stopping the servers must release both ports.
    for port in ports:
        assert tcp_port_is_open(port=port) is False
示例#7
0
def test_wait_for_tcp_port_to_close():
    """Closed port → helper succeeds; open port → helper gives up; closed again → succeeds."""
    port = random_unused_port()

    # Nothing is listening yet, so the port already counts as closed.
    assert wait_for_tcp_port_to_close(port=port, retries=2) is True

    # Hold the port open with a listening socket.
    listener = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    listener.bind(('localhost', port))
    listener.listen()
    assert wait_for_tcp_port_to_close(port=port, retries=2) is False

    # Release the port; the helper must now see it as closed.
    listener.close()
    assert wait_for_tcp_port_to_close(port=port, retries=2) is True
示例#8
0
    def setUp(self) -> None:
        """Connect to the test DB, start a hash server, and create a test story stack."""
        self.db = connect_to_db()

        self.port = random_unused_port()

        # Serve whatever pages the subclass declares via hashserver_pages()
        # (defined elsewhere on the class — confirm against subclass).
        self.__hs = HashServer(port=self.port, pages=self.hashserver_pages())
        self.__hs.start()

        # One test medium 'A' containing a single feed 'B'.
        self.media = create_test_story_stack(db=self.db,
                                             data={'A': {
                                                 'B': [1]
                                             }})
        self.feed = self.media['A']['feeds']['B']
示例#9
0
def test_tcp_port_is_open():
    """A free port reads closed, reads open while a socket listens, then closed again."""
    port = random_unused_port()
    assert tcp_port_is_open(port) is False

    # Occupy the port with a listening socket.
    listener = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    listener.bind(('localhost', port))
    listener.listen()
    assert tcp_port_is_open(port) is True

    # Releasing the socket makes the port read as closed again.
    listener.close()
    assert tcp_port_is_open(port) is False
示例#10
0
def test_wait_for_tcp_port_to_close():
    """Closed port → helper succeeds; open port → helper gives up; closed again → succeeds."""
    port = random_unused_port()

    # Port starts out closed, so the helper returns immediately.
    assert wait_for_tcp_port_to_close(port=port, retries=2) is True

    # Hold the port open with a listening socket.
    listener = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    listener.bind(('localhost', port))
    listener.listen()
    assert wait_for_tcp_port_to_close(port=port, retries=2) is False

    # Release the port; the helper must succeed again.
    listener.close()
    assert wait_for_tcp_port_to_close(port=port, retries=2) is True
示例#11
0
def test_tcp_port_is_open():
    """A free port reads closed, reads open while a socket listens, then closed again."""
    port = random_unused_port()
    assert tcp_port_is_open(port) is False

    # Occupy the port with a listening socket.
    listener = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    listener.bind(('localhost', port))
    listener.listen()
    assert tcp_port_is_open(port) is True

    # Releasing the socket makes the port read as closed again.
    listener.close()
    assert tcp_port_is_open(port) is False
示例#12
0
def test_http_hash_server_stop():
    """Test if HTTP hash server gets stopped properly (including children)."""
    port = random_unused_port()
    base_url = 'http://localhost:%d' % port

    # noinspection PyTypeChecker,PyUnusedLocal
    def __hang_callback(request: HashServer.Request) -> Union[str, bytes]:
        time.sleep(9999)

    pages = {
        '/simple-page': 'Works!',
        '/sleep-forever': {'callback': __hang_callback},
    }

    server = HashServer(port=port, pages=pages)
    assert server

    server.start()
    assert tcp_port_is_open(port=port)

    # The hanging endpoint leaves a busy child process behind.
    timed_out = False
    try:
        requests.get('%s/sleep-forever' % base_url, timeout=1)
    except requests.exceptions.Timeout:
        timed_out = True
    assert timed_out is True

    assert str(requests.get('%s/simple-page' % base_url).text) == 'Works!'

    # Restart the server with the same port, make sure it works again, i.e. the server gets stopped properly, kills all
    # its children and releases the port
    server.stop()
    assert tcp_port_is_open(port=port) is False

    server = HashServer(port=port, pages=pages)
    assert server

    server.start()
    assert tcp_port_is_open(port=port) is True

    assert str(requests.get('%s/simple-page' % base_url).text) == 'Works!'

    server.stop()
def test_http_hash_server_stop():
    """Test if HTTP hash server gets stopped properly (including children)."""
    port = random_unused_port()
    base_url = 'http://localhost:%d' % port

    # noinspection PyTypeChecker,PyUnusedLocal
    def __hang_callback(request: HashServer.Request) -> Union[str, bytes]:
        time.sleep(9999)

    pages = {
        '/simple-page': 'Works!',
        '/sleep-forever': {'callback': __hang_callback},
    }

    server = HashServer(port=port, pages=pages)
    assert server

    server.start()
    assert tcp_port_is_open(port=port)

    # The hanging endpoint leaves a busy child process behind.
    timed_out = False
    try:
        requests.get('%s/sleep-forever' % base_url, timeout=1)
    except requests.exceptions.Timeout:
        timed_out = True
    assert timed_out is True

    assert str(requests.get('%s/simple-page' % base_url).text) == 'Works!'

    # Restart the server with the same port, make sure it works again, i.e. the server gets stopped properly, kills all
    # its children and releases the port
    server.stop()
    assert tcp_port_is_open(port=port) is False

    server = HashServer(port=port, pages=pages)
    assert server

    server.start()
    assert tcp_port_is_open(port=port) is True

    assert str(requests.get('%s/simple-page' % base_url).text) == 'Works!'

    server.stop()
示例#14
0
def test_run_fetcher():
    """Run the download fetcher once (non-daemon) and verify a queued download succeeds."""
    db = connect_to_db()

    medium = create_test_medium(db=db, label='foo')
    feed = create_test_feed(db=db, label='foo', medium=medium)
    story = create_test_story(db=db, label='foo', feed=feed)

    # Local HTTP server serving the content that the fetcher will download.
    port = random_unused_port()
    pages = {
        '/foo': 'foo',
        '/bar': 'bar',
    }

    hs = HashServer(port=port, pages=pages)
    hs.start()

    # A pending 'content' download pointing at our local server.
    download = db.create(table='downloads',
                         insert_hash={
                             'state': 'pending',
                             'feeds_id': feed['feeds_id'],
                             'stories_id': story['stories_id'],
                             'type': 'content',
                             'sequence': 1,
                             'priority': 1,
                             'url': f"http://localhost:{port}/foo",
                             'host': 'localhost',
                         })

    # The fetcher pulls work from queued_downloads, so enqueue our download there.
    db.query("""
        INSERT INTO queued_downloads (downloads_id)
        SELECT downloads_id FROM downloads
    """)

    # no_daemon=True presumably makes the fetcher drain the queue and exit
    # instead of running forever — confirm against run_fetcher()'s docs.
    run_fetcher(no_daemon=True)

    # The fetcher should have fetched the page and marked the download done.
    test_download = db.find_by_id(table='downloads',
                                  object_id=download['downloads_id'])
    assert test_download['state'] == 'success'
    # NOTE(review): hs is never .stop()ped here; consider stopping it to release the port.
示例#15
0
    def setUp(self):
        """Reserve a free local port and build the base URL used by the tests."""
        super().setUp()

        self.__test_port = random_unused_port()
        self.__test_url = f'http://localhost:{self.__test_port}'
示例#16
0
    def setUp(self):
        """Reserve a free local port and build the base URL used by the tests."""
        super().setUp()

        self.__test_port = random_unused_port()
        self.__test_url = f'http://localhost:{self.__test_port}'
示例#17
0
async def test_workflow():
    """End-to-end run of the podcast transcription workflow against a local MP3 server."""
    db = connect_to_db()

    test_medium = create_test_medium(db=db, label='test')
    test_feed = create_test_feed(db=db, label='test', medium=test_medium)

    # 'label' is important as it will be stored in both stories.title and stories.description, which in turn will be
    # used to guess the probable language of the podcast episode
    test_story = create_test_story(db=db,
                                   label='keeping up with Kardashians',
                                   feed=test_feed)

    stories_id = test_story['stories_id']

    with open(TEST_MP3_PATH, mode='rb') as f:
        test_mp3_data = f.read()

    # noinspection PyUnusedLocal
    def __mp3_callback(request: HashServer.Request) -> Union[str, bytes]:
        # Hand-crafted raw HTTP response so the enclosure looks like a real MP3 download.
        response = "".encode('utf-8')
        response += "HTTP/1.0 200 OK\r\n".encode('utf-8')
        response += "Content-Type: audio/mpeg\r\n".encode('utf-8')
        response += f"Content-Length: {len(test_mp3_data)}\r\n".encode('utf-8')
        response += "\r\n".encode('utf-8')
        response += test_mp3_data
        return response

    port = random_unused_port()
    pages = {
        '/test.mp3': {
            'callback': __mp3_callback,
        }
    }

    hs = HashServer(port=port, pages=pages)
    hs.start()

    # Not localhost as this might get fetched from a remote worker
    mp3_url = hs.page_url('/test.mp3')

    # Enclosure row that the workflow will discover and fetch.
    db.insert(table='story_enclosures',
              insert_hash={
                  'stories_id': stories_id,
                  'url': mp3_url,
                  'mime_type': 'audio/mpeg',
                  'length': len(test_mp3_data),
              })

    client = workflow_client()

    # Start worker
    factory = WorkerFactory(client=client, namespace=client.namespace)
    worker = factory.new_worker(task_queue=TASK_QUEUE)

    # Use an activities implementation with random GCS prefixes set
    activities = _RandomPrefixesPodcastTranscribeActivities()

    worker.register_activities_implementation(
        activities_instance=activities,
        activities_cls_name=PodcastTranscribeActivities.__name__,
    )
    worker.register_workflow_implementation_type(
        impl_cls=PodcastTranscribeWorkflowImpl)
    factory.start()

    # Initialize workflow instance
    workflow: PodcastTranscribeWorkflow = client.new_workflow_stub(
        cls=PodcastTranscribeWorkflow,
        workflow_options=WorkflowOptions(
            workflow_id=str(stories_id),

            # By default, if individual activities of the workflow fail, they will get restarted pretty much
            # indefinitely, and so this test might run for days (or rather just timeout on the CI). So we cap the
            # workflow so that if it doesn't manage to complete in X minutes, we consider it as failed.
            workflow_run_timeout=timedelta(minutes=5),
        ),
    )

    # Wait for the workflow to complete
    await workflow.transcribe_episode(stories_id)

    # The workflow is expected to have stored the transcript as a single
    # successful 'content' download for the story.
    downloads = db.select(table='downloads', what_to_select='*').hashes()
    assert len(downloads) == 1
    first_download = downloads[0]
    assert first_download['stories_id'] == stories_id
    assert first_download['type'] == 'content'
    assert first_download['state'] == 'success'

    download_content = fetch_content(db=db, download=first_download)

    # It's what gets said in the sample MP3 file
    assert 'Kim Kardashian' in download_content

    # Initiate the worker shutdown in the background while we do the GCS cleanup so that the stop_workers_faster()
    # doesn't have to wait that long
    await worker.stop(background=True)

    log.info("Cleaning up GCS...")
    GCSStore(bucket_config=activities.config.raw_enclosures()).delete_object(
        object_id=str(stories_id))
    GCSStore(
        bucket_config=activities.config.transcoded_episodes()).delete_object(
            object_id=str(stories_id))
    GCSStore(bucket_config=activities.config.transcripts()).delete_object(
        object_id=str(stories_id))
    log.info("Cleaned up GCS")

    log.info("Stopping workers...")
    await stop_worker_faster(worker)
    log.info("Stopped workers")
示例#18
0
def test_random_unused_port():
    """A port reported as unused must not be accepting connections."""
    assert tcp_port_is_open(random_unused_port()) is False
示例#19
0
def test_http_hash_server_multiple_clients():
    """Test running hash server with multiple clients."""

    port = random_unused_port()

    # noinspection PyTypeChecker,PyUnusedLocal
    def __callback_timeout(request: HashServer.Request) -> Union[str, bytes]:
        # Sends headers and part of the body, then stalls so the client times out.
        r = ""
        r += "HTTP/1.0 200 OK\r\n"
        r += "Content-Type: text/html; charset=UTF-8\r\n"
        r += "\r\n"
        r += "And now we wait"
        time.sleep(10)
        return str.encode(r)

    pages = {
        '/a': '𝘛𝘩𝘪𝘴 𝘪𝘴 𝘱𝘢𝘨𝘦 𝘈.',
        '/timeout': {
            'callback': __callback_timeout
        },
        # '/does-not-exist': '404',
        '/b': '𝕿𝖍𝖎𝖘 𝖎𝖘 𝖕𝖆𝖌𝖊 𝕭.',
        '/c': '𝕋𝕙𝕚𝕤 𝕚𝕤 𝕡𝕒𝕘𝕖 ℂ.',
    }

    hs = HashServer(port=port, pages=pages)
    assert hs

    hs.start()

    assert tcp_port_is_open(port=port)

    base_url = 'http://localhost:%d' % port

    # Fire all five requests concurrently; a stalled handler must not block the rest.
    session = FuturesSession(max_workers=10)

    future_a = session.get('%s/a' % base_url, timeout=2)
    future_timeout = session.get('%s/timeout' % base_url, timeout=2)
    future_404 = session.get('%s/does-not-exist' % base_url, timeout=2)
    future_b = session.get('%s/b' % base_url, timeout=2)
    future_c = session.get('%s/c' % base_url, timeout=2)

    response_a = future_a.result()

    # The stalled handler surfaces as a client-side timeout.
    with pytest.raises(requests.Timeout):
        future_timeout.result()

    response_404 = future_404.result()
    response_b = future_b.result()
    response_c = future_c.result()

    assert response_b.status_code == 200
    assert response_b.text == '𝕿𝖍𝖎𝖘 𝖎𝖘 𝖕𝖆𝖌𝖊 𝕭.'

    assert response_c.status_code == 200
    assert response_c.text == '𝕋𝕙𝕚𝕤 𝕚𝕤 𝕡𝕒𝕘𝕖 ℂ.'

    assert response_404.status_code == 404

    assert response_a.status_code == 200
    assert response_a.text == '𝘛𝘩𝘪𝘴 𝘪𝘴 𝘱𝘢𝘨𝘦 𝘈.'

    hs.stop()
示例#20
0
def test_cliff_annotator():
    """Annotate a story against a mock CLIFF HTTP endpoint and check the annotation is stored."""

    db = connect_to_db()

    media = db.create(table='media', insert_hash={
        'name': "test medium",
        'url': "url://test/medium",
    })

    story = db.create(table='stories', insert_hash={
        'media_id': media['media_id'],
        'url': 'url://story/a',
        'guid': 'guid://story/a',
        'title': 'story a',
        'description': 'description a',
        'publish_date': sql_now(),
        'collect_date': sql_now(),
        'full_text_rss': True,
    })
    stories_id = story['stories_id']

    db.create(table='story_sentences', insert_hash={
        'stories_id': stories_id,
        'sentence_number': 1,
        'sentence': 'I hope that the CLIFF annotator is working.',
        'media_id': media['media_id'],
        'publish_date': sql_now(),
        'language': 'en'
    })

    def __cliff_sample_response(_: HashServer.Request) -> Union[str, bytes]:
        """Mock annotator."""
        # Raw HTTP response carrying the canned CLIFF JSON payload.
        response = ""
        response += "HTTP/1.0 200 OK\r\n"
        response += "Content-Type: application/json; charset=UTF-8\r\n"
        response += "\r\n"
        response += encode_json(sample_cliff_response())
        return response

    pages = {
        '/cliff/parse/text': {
            'callback': __cliff_sample_response,
        }
    }

    port = random_unused_port()
    annotator_url = 'http://localhost:%d/cliff/parse/text' % port

    hs = HashServer(port=port, pages=pages)
    hs.start()

    class TestCLIFFFetcherConfig(CLIFFFetcherConfig):
        # Point the fetcher at the mock endpoint instead of a real CLIFF service.
        @staticmethod
        def annotator_url() -> str:
            return annotator_url

    cliff = CLIFFAnnotatorFetcher(fetcher_config=TestCLIFFFetcherConfig())
    cliff.annotate_and_store_for_story(db=db, stories_id=stories_id)

    hs.stop()

    # The fetcher should have persisted an annotation row keyed by the story ID.
    annotation_exists = db.query("""
        SELECT 1
        FROM cliff_annotations
        WHERE object_id = %(object_id)s
    """, {'object_id': stories_id}).hash()
    assert annotation_exists is not None
示例#21
0
    def test_tagging(self):
        """Tag a story using a mock CLIFF endpoint and compare against the expected tags."""
        db = connect_to_db()

        media = db.create(table='media',
                          insert_hash={
                              'name': "test medium",
                              'url': "url://test/medium",
                          })

        story = db.create(table='stories',
                          insert_hash={
                              'media_id': media['media_id'],
                              'url': 'url://story/a',
                              'guid': 'guid://story/a',
                              'title': 'story a',
                              'description': 'description a',
                              'publish_date': sql_now(),
                              'collect_date': sql_now(),
                              'full_text_rss': True,
                          })
        stories_id = story['stories_id']

        db.create(table='story_sentences',
                  insert_hash={
                      'stories_id': stories_id,
                      'sentence_number': 1,
                      'sentence':
                      'I hope that the CLIFF annotator is working.',
                      'media_id': media['media_id'],
                      'publish_date': sql_now(),
                      'language': 'en'
                  })

        def __cliff_sample_response(
                _: HashServer.Request) -> Union[str, bytes]:
            """Mock annotator."""
            # Raw HTTP response carrying the canned CLIFF JSON payload.
            response = ""
            response += "HTTP/1.0 200 OK\r\n"
            response += "Content-Type: application/json; charset=UTF-8\r\n"
            response += "\r\n"
            response += encode_json(sample_cliff_response())
            return response

        pages = {
            '/cliff/parse/text': {
                'callback': __cliff_sample_response,
            }
        }

        port = random_unused_port()
        annotator_url = 'http://localhost:%d/cliff/parse/text' % port

        hs = HashServer(port=port, pages=pages)
        hs.start()

        class TestCLIFFFetcherConfig(CLIFFTagsFromAnnotationConfig):
            # Point the tagger at the mock endpoint instead of a real CLIFF service.
            @staticmethod
            def annotator_url() -> str:
                return annotator_url

        cliff = CLIFFTagsFromAnnotation(tagger_config=TestCLIFFFetcherConfig())
        cliff.update_tags_for_story(db=db, stories_id=stories_id)

        hs.stop()

        # Collect every tag applied to the story, sorted deterministically so the
        # comparison against the expected fixture is order-stable.
        story_tags = db.query(
            """
            SELECT
                tags.tag AS tags_name,
                tags.label AS tags_label,
                tags.description AS tags_description,
                tag_sets.name AS tag_sets_name,
                tag_sets.label AS tag_sets_label,
                tag_sets.description AS tag_sets_description
            FROM stories_tags_map
                INNER JOIN tags
                    ON stories_tags_map.tags_id = tags.tags_id
                INNER JOIN tag_sets
                    ON tags.tag_sets_id = tag_sets.tag_sets_id
            WHERE stories_tags_map.stories_id = %(stories_id)s
            ORDER BY
                lower(tag_sets.name),
                lower(tags.tag)
        """, {
                'stories_id': stories_id
            }).hashes()

        expected_tags = expected_cliff_tags()

        assert story_tags == expected_tags
示例#22
0
def test_fetch_and_store_episode():
    """Fetch a podcast episode MP3 over local HTTP, store it, and verify the episode row."""
    db = connect_to_db()

    test_medium = create_test_medium(db=db, label='test')
    test_feed = create_test_feed(db=db, label='test', medium=test_medium)

    # 'label' is important as it will be stored in both stories.title and stories.description, which in turn will be
    # used to guess the probable language of the podcast episode
    test_story = create_test_story(db=db,
                                   label='keeping up with Kardashians',
                                   feed=test_feed)

    stories_id = test_story['stories_id']

    with open(TEST_MP3_PATH, mode='rb') as f:
        test_mp3_data = f.read()

    # noinspection PyUnusedLocal
    def __mp3_callback(request: HashServer.Request) -> Union[str, bytes]:
        # Hand-crafted raw HTTP response so the enclosure looks like a real MP3 download.
        response = "".encode('utf-8')
        response += "HTTP/1.0 200 OK\r\n".encode('utf-8')
        response += "Content-Type: audio/mpeg\r\n".encode('utf-8')
        response += f"Content-Length: {len(test_mp3_data)}\r\n".encode('utf-8')
        response += "\r\n".encode('utf-8')
        response += test_mp3_data
        return response

    port = random_unused_port()
    pages = {
        '/test.mp3': {
            'callback': __mp3_callback,
        }
    }

    hs = HashServer(port=port, pages=pages)
    hs.start()

    mp3_url = f'http://127.0.0.1:{port}/test.mp3'

    story_enclosure = db.insert(table='story_enclosures',
                                insert_hash={
                                    'stories_id': stories_id,
                                    'url': mp3_url,
                                    'mime_type': 'audio/mpeg',
                                    'length': len(test_mp3_data),
                                })

    conf = RandomPathPrefixConfig()
    fetch_and_store_episode(db=db, stories_id=stories_id, config=conf)

    episodes = db.select(table='podcast_episodes', what_to_select='*').hashes()
    # BUG FIX: the original `assert len(episodes)` passed for any non-zero count even
    # though the message states exactly one episode is expected; also dropped the
    # f-prefix from a string with no placeholders.
    assert len(episodes) == 1, "Only one episode is expected."

    episode = episodes[0]
    assert episode['stories_id'] == stories_id
    assert episode['story_enclosures_id'] == story_enclosure[
        'story_enclosures_id']
    assert episode[
        'gcs_uri'] == f"gs://{conf.gc_storage_bucket_name()}/{conf.gc_storage_path_prefix()}/{stories_id}"
    assert episode['duration'] > 0
    assert episode['codec'] == 'MP3'
    assert episode['sample_rate'] == 44100
    assert episode['bcp47_language_code'] == 'en-US'

    # Try removing test object
    gcs = GCSStore(config=conf)
    gcs.delete_object(object_id=str(stories_id))
示例#23
0
def test_random_unused_port():
    """A port reported as unused must not be accepting connections."""
    assert tcp_port_is_open(random_unused_port()) is False
示例#24
0
def test_http_hash_server():
    """Exercise HashServer end to end: static pages, redirects, auth, callbacks, POST."""
    port = random_unused_port()
    base_url = 'http://localhost:%d' % port

    def __simple_callback(request: HashServer.Request) -> Union[str, bytes]:
        # Echoes request metadata back as JSON so the client can verify what
        # the server saw (method, URL, params, cookies).
        r = ""
        r += "HTTP/1.0 200 OK\r\n"
        r += "Content-Type: application/json; charset=UTF-8\r\n"
        r += "\r\n"
        r += json.dumps({
            'name': 'callback',
            'method': request.method(),
            'url': request.url(),
            'content-type': request.content_type(),
            'params': request.query_params(),
            'cookies': request.cookies(),
        })
        return str.encode(r)

    # noinspection PyUnusedLocal
    def __callback_cookie_redirect(request: HashServer.Request) -> str:
        # 302 redirect that also sets a cookie, to test both at once.
        r = ""
        r += "HTTP/1.0 302 Moved Temporarily\r\n"
        r += "Content-Type: text/html; charset=UTF-8\r\n"
        r += "Location: /check_cookie\r\n"
        r += "Set-Cookie: test_cookie=I'm a cookie and I know it!\r\n"
        r += "\r\n"
        r += "Redirecting to the cookie check page..."
        return r

    def __callback_post(request: HashServer.Request) -> Union[str, bytes]:
        # Echoes the POST body back as JSON.
        r = ""
        r += "HTTP/1.0 200 OK\r\n"
        r += "Content-Type: application/json; charset=UTF-8\r\n"
        r += "\r\n"
        r += json.dumps({
            'name': 'callback_post',
            'post_data': request.content(),
        })
        return str.encode(r)

    # Page map deliberately mixes str and bytes keys/values to test both forms.
    pages = {
        '/': 'home',
        '/foo': b'foo',
        '/bar': 'bar ąą',
        '/foo-bar': {
            b'redirect': b'/bar'
        },
        '/localhost': {
            'redirect': "http://localhost:%d/" % port
        },
        b'/127-foo': {
            b'redirect': "http://127.0.0.1:%d/foo" % port
        },
        '/auth': {
            b'auth': b'foo:bar',
            b'content': b"foo bar \xf0\x90\x28\xbc"
        },
        '/404': {
            b'content': b'not found',
            b'http_status_code': 404
        },
        '/callback': {
            b'callback': __simple_callback
        },

        # Test setting cookies, redirects
        '/callback_cookie_redirect': {
            'callback': __callback_cookie_redirect
        },

        # POST data
        '/callback_post': {
            'callback': __callback_post
        },
    }

    hs = HashServer(port=port, pages=pages)
    assert hs

    hs.start()

    assert tcp_port_is_open(port=port)

    # Static pages and redirects (both same-host and cross-host).
    assert str(requests.get('%s/' % base_url).text) == 'home'
    assert str(requests.get('%s/foo' % base_url).text) == 'foo'
    assert str(requests.get('%s/bar' % base_url).text) == 'bar ąą'
    assert str(requests.get('%s/foo-bar' % base_url).text) == 'bar ąą'
    assert str(requests.get('%s/localhost' % base_url).text) == 'home'
    assert str(requests.get('%s/127-foo' % base_url).text) == 'foo'

    # Path normalization
    assert str(requests.get('%s//' % base_url).text) == 'home'
    assert str(requests.get('%s///' % base_url).text) == 'home'
    assert str(requests.get('%s/something/../' % base_url).text) == 'home'
    assert str(requests.get('%s/something/..//' % base_url).text) == 'home'
    assert str(requests.get('%s/something/..///' % base_url).text) == 'home'
    assert str(requests.get('%s/foo/' % base_url).text) == 'foo'
    assert str(requests.get('%s/foo//' % base_url).text) == 'foo'
    assert str(requests.get('%s/foo///' % base_url).text) == 'foo'
    assert str(requests.get('%s/foo' % base_url).text) == 'foo'
    assert str(requests.get('%s/bar/../foo' % base_url).text) == 'foo'
    assert str(requests.get('%s/bar/../foo/' % base_url).text) == 'foo'
    assert str(requests.get('%s/bar/../foo//' % base_url).text) == 'foo'
    assert str(requests.get('%s/bar/../foo///' % base_url).text) == 'foo'

    # Callback pages: the echoed JSON must reflect query params and cookies.
    response_json = requests.get('%s/callback?a=b&c=d' % base_url,
                                 cookies={
                                     'cookie_name': 'cookie_value'
                                 }).json()
    assert response_json == {
        'name': 'callback',
        'method': 'GET',
        'url': 'http://localhost:%d/callback?a=b&c=d' % port,
        'content-type': None,
        'params': {
            'a': 'b',
            'c': 'd',
        },
        'cookies': {
            'cookie_name': 'cookie_value',
        },
    }

    # allow_redirects=False so we can inspect the 302 itself.
    response = requests.get('%s/callback_cookie_redirect' % base_url,
                            allow_redirects=False)
    assert response.status_code == 302
    assert response.headers['Location'] == '/check_cookie'

    response = requests.get("%s/404" % base_url)
    assert response.status_code == HTTPStatus.NOT_FOUND.value
    assert 'Not Found' in response.reason

    # HTTP basic auth: no credentials and wrong credentials are both rejected.
    auth_url = "%s/auth" % base_url

    assert requests.get(auth_url).status_code == HTTPStatus.UNAUTHORIZED
    assert requests.get(auth_url,
                        auth=('foo',
                              'foo')).status_code == HTTPStatus.UNAUTHORIZED

    response = requests.get(auth_url, auth=('foo', 'bar'))
    assert response.status_code == HTTPStatus.OK
    assert response.content == b"foo bar \xf0\x90\x28\xbc"

    # page_url() strips the query string and rejects unknown paths.
    assert urls_are_equal(url1=hs.page_url('/callback?a=b&c=d'),
                          url2='http://localhost:%d/callback' % port)
    with pytest.raises(McHashServerException):
        hs.page_url('/does-not-exist')

    response_json = requests.post('%s/callback_post' % base_url,
                                  data='abc=def').json()
    assert response_json == {
        'name': 'callback_post',
        'post_data': 'abc=def',
    }

    hs.stop()
def test_http_hash_server():
    """End-to-end test of HashServer: static pages, redirects, path
    normalization, request callbacks, cookies, HTTP basic auth, custom
    status codes and POST data."""
    port = random_unused_port()
    base_url = 'http://localhost:%d' % port

    def __simple_callback(request: HashServer.Request) -> Union[str, bytes]:
        """Return a raw HTTP response whose JSON body echoes back the
        request's method, URL, content type, query params and cookies."""
        r = ""
        r += "HTTP/1.0 200 OK\r\n"
        r += "Content-Type: application/json; charset=UTF-8\r\n"
        r += "\r\n"
        r += json.dumps({
            'name': 'callback',
            'method': request.method(),
            'url': request.url(),
            'content-type': request.content_type(),
            'params': request.query_params(),
            'cookies': request.cookies(),
        })
        return str.encode(r)

    # noinspection PyUnusedLocal
    def __callback_cookie_redirect(request: HashServer.Request) -> str:
        """Return a raw 302 response that sets a cookie while redirecting."""
        r = ""
        r += "HTTP/1.0 302 Moved Temporarily\r\n"
        r += "Content-Type: text/html; charset=UTF-8\r\n"
        r += "Location: /check_cookie\r\n"
        r += "Set-Cookie: test_cookie=I'm a cookie and I know it!\r\n"
        r += "\r\n"
        r += "Redirecting to the cookie check page..."
        return r

    def __callback_post(request: HashServer.Request) -> Union[str, bytes]:
        """Return a raw HTTP response echoing the POSTed body back as JSON."""
        r = ""
        r += "HTTP/1.0 200 OK\r\n"
        r += "Content-Type: application/json; charset=UTF-8\r\n"
        r += "\r\n"
        r += json.dumps({
            'name': 'callback_post',
            'post_data': request.content(),
        })
        return str.encode(r)

    # Keys and values mix str and bytes (and non-ASCII text), presumably to
    # exercise HashServer's input normalization — confirm against HashServer.
    pages = {
        '/': 'home',
        '/foo': b'foo',
        '/bar': 'bar ąą',
        '/foo-bar': {b'redirect': b'/bar'},
        '/localhost': {'redirect': "http://localhost:%d/" % port},
        b'/127-foo': {b'redirect': "http://127.0.0.1:%d/foo" % port},
        '/auth': {b'auth': b'foo:bar', b'content': b"foo bar \xf0\x90\x28\xbc"},
        '/404': {b'content': b'not found', b'http_status_code': 404},
        '/callback': {b'callback': __simple_callback},

        # Test setting cookies, redirects
        '/callback_cookie_redirect': {'callback': __callback_cookie_redirect},

        # POST data
        '/callback_post': {'callback': __callback_post},
    }

    hs = HashServer(port=port, pages=pages)
    assert hs

    hs.start()

    assert tcp_port_is_open(port=port)

    # Static pages; redirects are followed by requests by default, so the
    # redirect pages must resolve to their targets' contents.
    assert str(requests.get('%s/' % base_url).text) == 'home'
    assert str(requests.get('%s/foo' % base_url).text) == 'foo'
    assert str(requests.get('%s/bar' % base_url).text) == 'bar ąą'
    assert str(requests.get('%s/foo-bar' % base_url).text) == 'bar ąą'
    assert str(requests.get('%s/localhost' % base_url).text) == 'home'
    assert str(requests.get('%s/127-foo' % base_url).text) == 'foo'

    # Path normalization
    assert str(requests.get('%s//' % base_url).text) == 'home'
    assert str(requests.get('%s///' % base_url).text) == 'home'
    assert str(requests.get('%s/something/../' % base_url).text) == 'home'
    assert str(requests.get('%s/something/..//' % base_url).text) == 'home'
    assert str(requests.get('%s/something/..///' % base_url).text) == 'home'
    assert str(requests.get('%s/foo/' % base_url).text) == 'foo'
    assert str(requests.get('%s/foo//' % base_url).text) == 'foo'
    assert str(requests.get('%s/foo///' % base_url).text) == 'foo'
    assert str(requests.get('%s/foo' % base_url).text) == 'foo'
    assert str(requests.get('%s/bar/../foo' % base_url).text) == 'foo'
    assert str(requests.get('%s/bar/../foo/' % base_url).text) == 'foo'
    assert str(requests.get('%s/bar/../foo//' % base_url).text) == 'foo'
    assert str(requests.get('%s/bar/../foo///' % base_url).text) == 'foo'

    # The callback must echo back the query params and cookies we sent.
    response_json = requests.get('%s/callback?a=b&c=d' % base_url, cookies={'cookie_name': 'cookie_value'}).json()
    assert response_json == {
        'name': 'callback',
        'method': 'GET',
        'url': 'http://localhost:%d/callback?a=b&c=d' % port,
        'content-type': None,
        'params': {
            'a': 'b',
            'c': 'd',
        },
        'cookies': {
            'cookie_name': 'cookie_value',
        },
    }

    # Don't follow the redirect so that the raw 302 can be inspected.
    response = requests.get('%s/callback_cookie_redirect' % base_url, allow_redirects=False)
    assert response.status_code == 302
    assert response.headers['Location'] == '/check_cookie'

    response = requests.get("%s/404" % base_url)
    assert response.status_code == HTTPStatus.NOT_FOUND.value
    assert 'Not Found' in response.reason

    auth_url = "%s/auth" % base_url

    # Missing credentials and a wrong password must both be rejected.
    assert requests.get(auth_url).status_code == HTTPStatus.UNAUTHORIZED
    assert requests.get(auth_url, auth=('foo', 'foo')).status_code == HTTPStatus.UNAUTHORIZED

    # \xf0\x90\x28\xbc is not valid UTF-8, so compare raw bytes, not .text.
    response = requests.get(auth_url, auth=('foo', 'bar'))
    assert response.status_code == HTTPStatus.OK
    assert response.content == b"foo bar \xf0\x90\x28\xbc"

    # page_url() strips the query string and raises for unregistered paths.
    assert urls_are_equal(url1=hs.page_url('/callback?a=b&c=d'), url2='http://localhost:%d/callback' % port)
    with pytest.raises(McHashServerException):
        hs.page_url('/does-not-exist')

    response_json = requests.post('%s/callback_post' % base_url, data='abc=def').json()
    assert response_json == {
        'name': 'callback_post',
        'post_data': 'abc=def',
    }

    hs.stop()
def test_http_hash_server_multiple_clients():
    """Test running hash server with multiple clients."""

    server_port = random_unused_port()

    # noinspection PyTypeChecker,PyUnusedLocal
    def __callback_timeout(request: HashServer.Request) -> Union[str, bytes]:
        """Build a complete response, then stall long enough for the
        client-side timeout to fire before anything is sent back."""
        raw_response = (
            "HTTP/1.0 200 OK\r\n"
            "Content-Type: text/html; charset=UTF-8\r\n"
            "\r\n"
            "And now we wait"
        )
        time.sleep(10)
        return raw_response.encode()

    # '/does-not-exist' is deliberately left unregistered so it 404s.
    pages = {
        '/a': '𝘛𝘩𝘪𝘴 𝘪𝘴 𝘱𝘢𝘨𝘦 𝘈.',
        '/timeout': {'callback': __callback_timeout},
        '/b': '𝕿𝖍𝖎𝖘 𝖎𝖘 𝖕𝖆𝖌𝖊 𝕭.',
        '/c': '𝕋𝕙𝕚𝕤 𝕚𝕤 𝕡𝕒𝕘𝕖 ℂ.',
    }

    hash_server = HashServer(port=server_port, pages=pages)
    assert hash_server

    hash_server.start()

    assert tcp_port_is_open(port=server_port)

    root_url = 'http://localhost:%d' % server_port

    # Issue all requests concurrently; the stalled '/timeout' request must
    # not block the fast ones from being served.
    session = FuturesSession(max_workers=10)
    futures = {
        path: session.get('%s/%s' % (root_url, path), timeout=2)
        for path in ('a', 'timeout', 'does-not-exist', 'b', 'c')
    }

    response_a = futures['a'].result()

    with pytest.raises(requests.Timeout):
        futures['timeout'].result()

    response_404 = futures['does-not-exist'].result()
    response_b = futures['b'].result()
    response_c = futures['c'].result()

    assert response_b.status_code == 200
    assert response_b.text == '𝕿𝖍𝖎𝖘 𝖎𝖘 𝖕𝖆𝖌𝖊 𝕭.'

    assert response_c.status_code == 200
    assert response_c.text == '𝕋𝕙𝕚𝕤 𝕚𝕤 𝕡𝕒𝕘𝕖 ℂ.'

    assert response_404.status_code == 404

    assert response_a.status_code == 200
    assert response_a.text == '𝘛𝘩𝘪𝘴 𝘪𝘴 𝘱𝘢𝘨𝘦 𝘈.'

    hash_server.stop()
Example #27
0
    def setUpClass(cls) -> None:
        """Pick one random free port for the whole test class and derive the
        server's base URL from it."""
        super().setUpClass()

        cls.PORT = random_unused_port()
        cls.URL = 'http://localhost:%d' % cls.PORT
 def __init__(self, pages: Dict[str, Any]):
     """Start a HashServer that serves *pages* on a freshly picked free port."""
     free_port = random_unused_port()
     self.__port = free_port
     self.__hs = HashServer(port=free_port, pages=pages)
     self.__hs.start()
Example #29
0
    def test_nyt_labels_annotator(self):
        """Run the NYTLabels annotator against a mocked HTTP endpoint and
        verify that the raw annotation gets stored and tags get applied."""
        # Minimal medium + story + sentence fixture for the annotator.
        media = self.db().create(table='media', insert_hash={
            'name': "test medium",
            'url': "url://test/medium",
        })

        story = self.db().create(table='stories', insert_hash={
            'media_id': media['media_id'],
            'url': 'url://story/a',
            'guid': 'guid://story/a',
            'title': 'story a',
            'description': 'description a',
            'publish_date': sql_now(),
            'collect_date': sql_now(),
            'full_text_rss': True,
        })
        stories_id = story['stories_id']

        self.db().create(table='story_sentences', insert_hash={
            'stories_id': stories_id,
            'sentence_number': 1,
            'sentence': 'I hope that the CLIFF annotator is working.',
            'media_id': media['media_id'],
            'publish_date': sql_now(),
            'language': 'en'
        })

        def __nyt_labels_sample_response(_: HashServer.Request) -> Union[str, bytes]:
            """Mock annotator."""
            # Return a canned raw HTTP response with a sample JSON payload.
            response = ""
            response += "HTTP/1.0 200 OK\r\n"
            response += "Content-Type: application/json; charset=UTF-8\r\n"
            response += "\r\n"
            response += encode_json(self.__sample_nyt_labels_response())
            return response

        pages = {
            '/predict.json': {
                'callback': __nyt_labels_sample_response,
            }
        }

        port = random_unused_port()
        annotator_url = 'http://localhost:%d/predict.json' % port

        hs = HashServer(port=port, pages=pages)
        hs.start()

        # Inject NYTLabels credentials into configuration
        config = py_get_config()
        new_config = copy.deepcopy(config)
        new_config['nytlabels'] = {
            'enabled': True,
            'annotator_url': annotator_url,
        }
        py_set_config(new_config)

        nytlabels = NYTLabelsAnnotator()
        nytlabels.annotate_and_store_for_story(db=self.db(), stories_id=stories_id)
        nytlabels.update_tags_for_story(db=self.db(), stories_id=stories_id)

        hs.stop()

        # Reset configuration
        py_set_config(config)

        # The raw annotation must have been stored for the story.
        annotation_exists = self.db().query("""
            SELECT 1
            FROM nytlabels_annotations
            WHERE object_id = %(object_id)s
        """, {'object_id': stories_id}).hash()
        assert annotation_exists is not None

        # Collect the tags applied to the story, sorted deterministically
        # (COLLATE "C") so they compare stably against the expected list.
        story_tags = self.db().query("""
            SELECT
                tags.tag AS tags_name,
                tags.label AS tags_label,
                tags.description AS tags_description,
                tag_sets.name AS tag_sets_name,
                tag_sets.label AS tag_sets_label,
                tag_sets.description AS tag_sets_description
            FROM stories_tags_map
                INNER JOIN tags
                    ON stories_tags_map.tags_id = tags.tags_id
                INNER JOIN tag_sets
                    ON tags.tag_sets_id = tag_sets.tag_sets_id
            WHERE stories_tags_map.stories_id = %(stories_id)s
            ORDER BY tags.tag COLLATE "C", tag_sets.name COLLATE "C"
        """, {'stories_id': stories_id}).hashes()

        expected_tags = self.__expected_tags()

        assert story_tags == expected_tags
Example #30
0
def test_http_hash_server():
    """Test HashServer with an older callback API in which callbacks receive
    (params, cookies) dicts and return the raw HTTP response as a string."""
    port = random_unused_port()
    base_url = 'http://localhost:%d' % port

    def __simple_callback(params: dict, cookies: dict) -> str:
        """Return a raw HTTP response echoing params and cookies as JSON."""
        r = ""
        r += "HTTP/1.0 200 OK\r\n"
        r += "Content-Type: application/json; charset=UTF-8\r\n"
        r += "\r\n"
        r += json.dumps({
            'name': 'callback',
            'params': params,
            'cookies': cookies,
        })
        return r

    # noinspection PyUnusedLocal
    def __callback_cookie_redirect(params: dict, cookies: dict) -> str:
        """Return a raw 302 response that sets a cookie while redirecting."""
        r = ""
        r += "HTTP/1.0 302 Moved Temporarily\r\n"
        r += "Content-Type: text/html; charset=UTF-8\r\n"
        r += "Location: /check_cookie\r\n"
        r += "Set-Cookie: test_cookie=I'm a cookie and I know it!\r\n"
        r += "\r\n"
        r += "Redirecting to the cookie check page..."
        return r

    pages = {
        '/': 'home',
        '/foo': 'foo',
        '/bar': 'bar',
        '/foo-bar': {
            'redirect': '/bar'
        },
        '/localhost': {
            'redirect': "http://localhost:%d/" % port
        },
        '/127-foo': {
            'redirect': "http://127.0.0.1:%d/foo" % port
        },
        '/auth': {
            'auth': 'foo:bar',
            'content': 'foo bar'
        },
        '/404': {
            'content': 'not found',
            'http_status_code': 404
        },
        '/callback': {
            'callback': __simple_callback
        },

        # Test setting cookies, redirects
        '/callback_cookie_redirect': {
            'callback': __callback_cookie_redirect
        },
    }

    hs = HashServer(port=port, pages=pages)
    assert hs

    hs.start()

    assert tcp_port_is_open(port=port)

    # Static pages; redirect pages must resolve to their targets' contents
    # because requests follows redirects by default.
    assert str(requests.get('%s/' % base_url).text) == 'home'
    assert str(requests.get('%s/foo' % base_url).text) == 'foo'
    assert str(requests.get('%s/bar' % base_url).text) == 'bar'
    assert str(requests.get('%s/foo-bar' % base_url).text) == 'bar'
    assert str(requests.get('%s/localhost' % base_url).text) == 'home'
    assert str(requests.get('%s/127-foo' % base_url).text) == 'foo'

    # The callback must echo back the query params and cookies we sent.
    response_json = requests.get('%s/callback?a=b&c=d' % base_url,
                                 cookies={
                                     'cookie_name': 'cookie_value'
                                 }).json()
    assert response_json == {
        'name': 'callback',
        'params': {
            'a': 'b',
            'c': 'd',
        },
        'cookies': {
            'cookie_name': 'cookie_value',
        },
    }

    # Don't follow the redirect so that the raw 302 can be inspected.
    response = requests.get('%s/callback_cookie_redirect' % base_url,
                            allow_redirects=False)
    assert response.status_code == 302
    assert response.headers['Location'] == '/check_cookie'

    response = requests.get("%s/404" % base_url)
    assert response.status_code == HTTPStatus.NOT_FOUND.value
    assert 'Not Found' in response.reason

    auth_url = "%s/auth" % base_url

    # Missing credentials and a wrong password must both be rejected.
    assert requests.get(auth_url).status_code == HTTPStatus.UNAUTHORIZED
    assert requests.get(auth_url,
                        auth=('foo',
                              'foo')).status_code == HTTPStatus.UNAUTHORIZED

    response = requests.get(auth_url, auth=('foo', 'bar'))
    assert response.status_code == HTTPStatus.OK
    assert response.text == 'foo bar'

    # page_url() strips the query string and raises for unregistered paths.
    assert hs.page_url(
        '/callback?a=b&c=d') == 'http://localhost:%d/callback' % port
    assert_raises(McHashServerException, hs.page_url, '/does-not-exist')

    hs.stop()
Example #31
0
    def test_nyt_labels_annotator(self):
        """Run the NYTLabels annotator against a mocked HTTP endpoint and
        verify that the raw annotation gets stored and tags get applied."""
        # Minimal medium + story + sentence fixture for the annotator.
        media = self.db().create(table='media',
                                 insert_hash={
                                     'name': "test medium",
                                     'url': "url://test/medium",
                                 })

        story = self.db().create(table='stories',
                                 insert_hash={
                                     'media_id': media['media_id'],
                                     'url': 'url://story/a',
                                     'guid': 'guid://story/a',
                                     'title': 'story a',
                                     'description': 'description a',
                                     'publish_date': sql_now(),
                                     'collect_date': sql_now(),
                                     'full_text_rss': True,
                                 })
        stories_id = story['stories_id']

        self.db().create(table='story_sentences',
                         insert_hash={
                             'stories_id': stories_id,
                             'sentence_number': 1,
                             'sentence':
                             'I hope that the CLIFF annotator is working.',
                             'media_id': media['media_id'],
                             'publish_date': sql_now(),
                             'language': 'en'
                         })

        def __nyt_labels_sample_response(
                _: HashServer.Request) -> Union[str, bytes]:
            """Mock annotator."""
            # Return a canned raw HTTP response with a sample JSON payload.
            response = ""
            response += "HTTP/1.0 200 OK\r\n"
            response += "Content-Type: application/json; charset=UTF-8\r\n"
            response += "\r\n"
            response += encode_json(self.__sample_nyt_labels_response())
            return response

        pages = {
            '/predict.json': {
                'callback': __nyt_labels_sample_response,
            }
        }

        port = random_unused_port()
        annotator_url = 'http://localhost:%d/predict.json' % port

        hs = HashServer(port=port, pages=pages)
        hs.start()

        # Inject NYTLabels credentials into configuration
        config = py_get_config()
        new_config = copy.deepcopy(config)
        new_config['nytlabels'] = {
            'enabled': True,
            'annotator_url': annotator_url,
        }
        py_set_config(new_config)

        nytlabels = NYTLabelsAnnotator()
        nytlabels.annotate_and_store_for_story(db=self.db(),
                                               stories_id=stories_id)
        nytlabels.update_tags_for_story(db=self.db(), stories_id=stories_id)

        hs.stop()

        # Reset configuration
        py_set_config(config)

        # The raw annotation must have been stored for the story.
        annotation_exists = self.db().query(
            """
            SELECT 1
            FROM nytlabels_annotations
            WHERE object_id = %(object_id)s
        """, {
                'object_id': stories_id
            }).hash()
        assert annotation_exists is not None

        # Collect the tags applied to the story, sorted deterministically
        # (COLLATE "C") so they compare stably against the expected list.
        story_tags = self.db().query(
            """
            SELECT
                tags.tag AS tags_name,
                tags.label AS tags_label,
                tags.description AS tags_description,
                tag_sets.name AS tag_sets_name,
                tag_sets.label AS tag_sets_label,
                tag_sets.description AS tag_sets_description
            FROM stories_tags_map
                INNER JOIN tags
                    ON stories_tags_map.tags_id = tags.tags_id
                INNER JOIN tag_sets
                    ON tags.tag_sets_id = tag_sets.tag_sets_id
            WHERE stories_tags_map.stories_id = %(stories_id)s
            ORDER BY tags.tag COLLATE "C", tag_sets.name COLLATE "C"
        """, {
                'stories_id': stories_id
            }).hashes()

        expected_tags = self.__expected_tags()

        assert story_tags == expected_tags