Example #1
    def test_preprocess_all_article_content(self):
        """Article content is preprocessed and uploaded to database."""

        preprocessor = preprocessing.BagOfWordsPreprocessor()
        preprocessor.preprocess_all_article_content()

        # Scenario 1
        # The daily mail article already has preprocessed content so no new rows will be added
        expected_daily_mail = pd.DataFrame(data={
            'id': ['3587c1cb3b85d116d9573897437fc4db'],
            'processed_content': 'some preprocessed content'
        })

        # Retrieve the table to see if it was populated correctly
        db_connection = postgresql.DatabaseConnection()
        db_connection._create_connection()

        with db_connection._conn.cursor() as curs:
            curs.execute('SELECT * FROM daily_mail.article_content_bow_preprocessed;')

            table_tuples = curs.fetchall()
            actual_daily_mail = pd.DataFrame(table_tuples, columns=['id', 'processed_content'])

            db_connection._conn.commit()

        pd.testing.assert_frame_equal(actual_daily_mail, expected_daily_mail)

        # Scenario 2
        # The guardian article has not been preprocessed yet and will be added
        expected_guardian = pd.DataFrame(data={
            'id': ['e8c5e312fae36c43d965a0e3da84e68d'],
            'processed_content': 'margaret thatcher britain female prime minister resign 22 november 1990'
        })

        # Retrieve the table to see if it was populated correctly
        db_connection = postgresql.DatabaseConnection()
        db_connection._create_connection()

        with db_connection._conn.cursor() as curs:
            curs.execute('SELECT * FROM the_guardian.article_content_bow_preprocessed;')

            table_tuples = curs.fetchall()
            actual_guardian = pd.DataFrame(table_tuples, columns=['id', 'processed_content'])

            # Only retain the first 10 words to compare against
            actual_guardian['processed_content'] = actual_guardian['processed_content'].apply(
                lambda x: ' '.join(x.split()[:10])
            )

            # Tidy up and delete the newly inserted rows
            # (the staging data contains no preprocessed Guardian rows, so the table can simply be truncated)
            curs.execute("TRUNCATE TABLE the_guardian.article_content_bow_preprocessed;")

            db_connection._conn.commit()

        pd.testing.assert_frame_equal(actual_guardian, expected_guardian)
Example #2
def test_upload_dataframe():
    """Dataframe is successfully written to a permanent postgres table."""

    db_connection = postgresql.DatabaseConnection()

    # Create dataframe and upload to postgres
    expected_df = pd.DataFrame(data={'col1': [1, 2], 'col2': [3, 4]})
    db_connection.upload_dataframe(
        dataframe=expected_df,
        schema='testing_schema',
        table_name='uploaded_dataframe',
        index=False,
    )

    # Retrieve the table and see if it was uploaded correctly
    db_connection._create_connection()
    with db_connection._conn.cursor() as cursor:
        cursor.execute('SELECT * FROM testing_schema.uploaded_dataframe;')

        table_tuples = cursor.fetchall()
        actual_df = pd.DataFrame(table_tuples, columns=['col1', 'col2'])

        # Tidy up
        cursor.execute('DROP TABLE testing_schema.uploaded_dataframe;')
        db_connection._conn.commit()

    pd.testing.assert_frame_equal(left=actual_df, right=expected_df)
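
The upload_dataframe call above mirrors the pandas DataFrame.to_sql interface (schema, table name, index flag). As a point of reference, here is a minimal sketch of what such a helper typically wraps, assuming SQLAlchemy is available; the connection URL and helper name are illustrative, not the project's actual implementation.

import pandas as pd
import sqlalchemy


def upload_dataframe(dataframe: pd.DataFrame, schema: str, table_name: str, index: bool = False) -> None:
    """Hypothetical sketch: write a dataframe to <schema>.<table_name>, appending if the table already exists."""
    engine = sqlalchemy.create_engine('postgresql://user:password@localhost:5432/example_db')  # illustrative URL
    dataframe.to_sql(name=table_name, con=engine, schema=schema, index=index, if_exists='append')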
Example #3
def test_check_database_is_live(monkeypatch):
    """Exception is raised if database is not alive and accepting connections after pausing and retrying once more."""

    # SCENARIO 1: Connection succeeds on first attempt
    db_connection_succeed_first_time = postgresql.DatabaseConnection()
    assert db_connection_succeed_first_time._initial_database_response_success

    # SCENARIO 2: Connection does not succeed at all

    # Avoid having to wait before retrying
    def mock_sleep(secs: int = None):
        """Mock how the sleep function is called but do not actually delay code running."""
        pass

    monkeypatch.setattr(time, 'sleep', mock_sleep)

    with pytest.raises(psycopg2.OperationalError):
        postgresql.DatabaseConnection(database='incorrect_database')
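
Monkeypatching time.sleep above only makes sense against a pause-and-retry connection pattern. A minimal standalone sketch of that pattern follows; it is hypothetical and the real DatabaseConnection internals may differ.

import time

import psycopg2


def connect_with_one_retry(dsn: str, pause_seconds: int = 5):
    """Hypothetical sketch: attempt a connection, pause and retry once, then re-raise the error."""
    for attempt in range(2):
        try:
            return psycopg2.connect(dsn)
        except psycopg2.OperationalError:
            if attempt == 0:
                time.sleep(pause_seconds)  # patched out by mock_sleep in the test above
            else:
                raise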
Example #4
def test_record_columnists_recent_article_links():
    """The URLs for Daily Mail columnists are successfully extracted and stored in database."""

    # Inspect database table before it is populated
    db_connection = postgresql.DatabaseConnection()
    db_connection._create_connection()

    with db_connection._conn.cursor() as curs:
        curs.execute('SELECT * FROM daily_mail.columnist_article_links;')

        table_tuples = curs.fetchall()
        table_before_extracting = pd.DataFrame(
            table_tuples, columns=['columnist', 'article_id', 'url'])

        db_connection._conn.commit()

    # Run scraper and inspect database table after it is populated
    article_downloader = daily_mail.ArticleDownloader()
    article_downloader.record_columnists_recent_article_links()

    with db_connection._conn.cursor() as curs:
        curs.execute('SELECT * FROM daily_mail.columnist_article_links;')

        table_tuples = curs.fetchall()
        table_after_extracting = pd.DataFrame(
            table_tuples, columns=['columnist', 'article_id', 'url'])

        # Tidy up and return table to its original form
        original_data = pd.read_csv(
            'Docker/db/staging_data/daily_mail.columnist_article_links.csv')
        original_urls = original_data['url'].values

        curs.execute(
            query='DELETE FROM daily_mail.columnist_article_links WHERE url NOT IN %(original_urls)s',
            vars={'original_urls': tuple(original_urls)})

        db_connection._conn.commit()

    # The website page evolves over time, so perform checks which do not depend on the actual URLs

    # New rows have been added
    assert table_after_extracting.shape[0] > table_before_extracting.shape[0]

    # Appropriate URLs have been scraped
    assert all(
        url.startswith('https://www.dailymail.co.uk/')
        for url in table_after_extracting['url'].values)

    # Only links for the relevant columnists (stored in daily_mail.columnists) have been pulled
    df_expected_columnists = pd.read_csv(
        'Docker/db/staging_data/daily_mail.columnists.csv')
    expected_columnists = df_expected_columnists['columnist'].unique().tolist()

    assert table_after_extracting['columnist'].unique().tolist() == expected_columnists
Example #5
def test_is_value_already_in_table(value, expected_output):
    """Value is correctly identified as existing or not in a database table."""

    db_connection = postgresql.DatabaseConnection()

    assert db_connection.is_value_already_in_table(
        value=value,
        table_name='testing_table',
        schema='testing_schema',
        column='example_string'
    ) == expected_output
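
The value and expected_output arguments above presumably come from a pytest.mark.parametrize decorator that this snippet does not capture. A plausible reconstruction is shown below; the concrete parameter values are assumptions based on the testing_schema.testing_table rows seen in the other examples.

import pytest


@pytest.mark.parametrize('value, expected_output', [
    ('First value', True),          # row assumed to exist in testing_schema.testing_table
    ('Non-existent value', False),  # row assumed not to exist
])
def test_is_value_already_in_table(value, expected_output):
    ...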
Example #6
def test_get_dataframe_from_query():
    """All data from a table can be retrieved by providing a query."""

    db_connection = postgresql.DatabaseConnection()

    expected = pd.DataFrame(columns=['example_integer', 'example_string'], data=[[1, 'First value']])

    actual = db_connection.get_dataframe(
        query="SELECT example_integer, example_string FROM testing_schema.testing_table WHERE example_integer = 1;"
    )

    pd.testing.assert_frame_equal(left=actual, right=expected)
Example #7
    def __init__(self, use_existing_vocab: bool = True):
        """
        Initialise attributes of class.

        Parameters
        ----------
        use_existing_vocab : bool (default True)
            Whether to re-fit the vectoriser using all of the article texts (False) or to use an existing vocabulary
            produced by a previous run (True, default).
        """

        self._db_connection = postgresql.DatabaseConnection()
        self._use_existing_vocab = use_existing_vocab
Example #8
def test_get_dataframe_from_query_with_parameters():
    """All data from a table can be retrieved by providing a parameterised query."""

    db_connection = postgresql.DatabaseConnection()

    expected = pd.DataFrame(columns=['example_integer', 'example_string'], data=[[1, 'First value']])

    actual = db_connection.get_dataframe(
        query=("SELECT example_integer, example_string "
               "FROM testing_schema.testing_table WHERE example_integer = %(id)s;"),
        query_params={'id': 1}
    )

    pd.testing.assert_frame_equal(left=actual, right=expected)
Example #9
    def test_analyse_and_overwrite_existing_vocabulary(self, monkeypatch):
        """Vocabulary is extracted from preprocessed version of articles and stored to database."""

        # Establish encoder which loads a mock version of article content
        def mock_preprocessed_content():
            """Mock representation of preprocessed articles."""

            return pd.DataFrame(
                data={
                    'id': ['article_1', 'article_2'],
                    'processed_content': ['some words', 'some more words']
                })

        tfidf_encoder = encoding.TfidfEncoder()
        monkeypatch.setattr(tfidf_encoder,
                            '_load_all_articles_bow_preprocessed_content',
                            mock_preprocessed_content)

        tfidf_encoder._analyse_and_overwrite_existing_vocabulary(
            mock_preprocessed_content()['processed_content'].values)

        # Query the table to see if the new rows were inserted correctly
        expected_vocabulary = pd.DataFrame(data={
            'word': ['more', 'some', 'words'],
            'feature_matrix_index': [0, 1, 2]
        })

        db_connection = postgresql.DatabaseConnection()
        db_connection._create_connection()

        with db_connection._conn.cursor() as cursor:
            cursor.execute("SELECT * FROM encoded_articles.tfidf_vocabulary;")

            table_tuples = cursor.fetchall()
            actual_vocabulary = pd.DataFrame(
                data=table_tuples, columns=['word', 'feature_matrix_index'])

            # Tidy up and revert to original version of table
            cursor.execute("TRUNCATE TABLE encoded_articles.tfidf_vocabulary;")

            cursor.execute("""
                COPY encoded_articles.tfidf_vocabulary
                FROM '/staging_data/encoded_articles.tfidf_vocabulary.csv'
                WITH CSV HEADER;
                """)

            db_connection._conn.commit()
        db_connection._close_connection()

        pd.testing.assert_frame_equal(actual_vocabulary, expected_vocabulary)
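
The expected word/feature_matrix_index pairs above match what scikit-learn assigns when fitting a vectoriser on the two mock texts: the vocabulary_ attribute maps each term to its (alphabetically ordered) column in the feature matrix. A small standalone illustration follows, assuming the encoder uses a TfidfVectorizer or similar internally.

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
vectorizer.fit(['some words', 'some more words'])

# vocabulary_ maps terms to feature-matrix columns: {'more': 0, 'some': 1, 'words': 2}
print(sorted(vectorizer.vocabulary_.items(), key=lambda item: item[1]))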
Example #10
def test_get_dataframe_from_table_name():
    """All data from a table can be retrieved simply by providing a schema and table name."""

    db_connection = postgresql.DatabaseConnection()

    expected = pd.DataFrame(
        columns=['example_integer', 'example_string', 'example_timestamp'],
        data=[[1, 'First value', '2020-01-21T01:53:00Z'], [2, 'Second value', '2020-07-16T03:31:00Z']]
    )

    expected['example_timestamp'] = pd.to_datetime(expected['example_timestamp'], format='%Y-%m-%dT%H:%M:%SZ')

    actual = db_connection.get_dataframe(schema="testing_schema", table_name="testing_table")

    pd.testing.assert_frame_equal(left=actual, right=expected)
Example #11
def test_upload_new_data_only_to_existing_table_inserts_new_rows_only():
    """Only new rows are inserted into an existing table."""

    db_connection = postgresql.DatabaseConnection()

    # Create a dataframe with one row whose example_integer index already exists in the target table and so should
    # not be inserted (1), and one with a new index (99) which should go in.
    # Also list the columns in a different order to make sure this is handled
    rows_to_upload = pd.DataFrame(
        data={
            'example_string': ["Won't be inserted", "Will be inserted"],
            'example_integer': [1, 99],
            'example_timestamp': [datetime.datetime(2020, 11, 10, 15, 20, 37, 0),
                                  datetime.datetime(2035, 6, 10, 19, 3, 4, 0)]
        }
    )

    db_connection.upload_new_data_only_to_existing_table(
        dataframe=rows_to_upload,
        table_name='testing_table',
        schema='testing_schema',
        id_column='example_integer'
    )

    # Retrieve the table and see if it was inserted into correctly
    expected_df = pd.DataFrame(
        data={
            'example_integer': [1, 2, 99],
            'example_string': ["First value", "Second value", "Will be inserted"],
            'example_timestamp': [datetime.datetime(2020, 1, 21, 1, 53, 0, 0),
                                  datetime.datetime(2020, 7, 16, 3, 31, 0, 0),
                                  datetime.datetime(2035, 6, 10, 19, 3, 4, 0)]
        }
    )

    db_connection._create_connection()
    with db_connection._conn.cursor() as cursor:
        cursor.execute('SELECT * FROM testing_schema.testing_table;')

        table_tuples = cursor.fetchall()
        actual_df = pd.DataFrame(table_tuples, columns=['example_integer', 'example_string', 'example_timestamp'])

        # Tidy up
        cursor.execute('DELETE FROM testing_schema.testing_table WHERE example_integer = 99;')
        db_connection._conn.commit()

    pd.testing.assert_frame_equal(left=actual_df, right=expected_df)
Example #12
def test_execute_database_operation():
    """SQL command successfully executed on database."""

    db_connection = postgresql.DatabaseConnection()

    # Mock data which will be inserted into permanent testing table as a new row
    example_integer = 3
    example_string = 'Third value'
    example_timestamp = datetime.datetime(2020, 11, 10, 15, 20, 37, 0)

    db_connection.execute_database_operation(
        sql_command=("INSERT INTO testing_schema.testing_table(example_integer, example_string, example_timestamp) "
                     "VALUES (%(example_integer)s, %(example_string)s, %(example_timestamp)s)"),
        params={
            'example_integer': example_integer,
            'example_string': example_string,
            'example_timestamp': example_timestamp
        }
    )

    # Outline what the new row should look like
    expected_df = pd.DataFrame(
        data=[[example_integer, example_string, example_timestamp]],
        columns=['example_integer', 'example_string', 'example_timestamp']
    )

    # Query the table to see if the new row was inserted correctly
    db_connection._create_connection()
    with db_connection._conn.cursor() as cursor:
        cursor.execute(
            query="SELECT * FROM testing_schema.testing_table WHERE example_integer = %(example_integer)s;",
            vars={'example_integer': example_integer}
        )

        table_tuples = cursor.fetchall()
        actual_df = pd.DataFrame(data=table_tuples, columns=['example_integer', 'example_string', 'example_timestamp'])

        # Tidy up
        db_connection.execute_database_operation(
            sql_command="DELETE FROM testing_schema.testing_table WHERE example_integer = %(example_integer)s",
            params={'example_integer': example_integer}
        )

    db_connection._close_connection()

    pd.testing.assert_frame_equal(left=actual_df, right=expected_df)
Example #13
def test_upload_new_data_only_to_existing_table_raises_exception_with_different_columns():
    """Exception is raised with helpful message if columns in dataframe are not identical to the target table."""

    db_connection = postgresql.DatabaseConnection()

    # Scenario 1: column exists in the local dataframe but not the target table
    scenario_1_rows_to_upload = pd.DataFrame(
        data={
            'example_string': ["New row"],
            'example_integer': [6],
            'example_timestamp': [datetime.datetime(2020, 11, 10, 15, 20, 37, 0)],
            'non_existent_column_in_target_table': [99]
        }
    )

    with pytest.raises(
            ValueError,
            match="The column names in the dataframe are not identical to that of the target table."
    ):
        db_connection.upload_new_data_only_to_existing_table(
            dataframe=scenario_1_rows_to_upload,
            table_name='testing_table',
            schema='testing_schema',
            id_column='example_integer'
        )

    # Scenario 2: column exists in the target table but not the local dataframe
    # example_timestamp column no longer included
    scenario_2_rows_to_upload = pd.DataFrame(
        data={
            'example_string': ["New row"],
            'example_integer': [6],
            # 'example_timestamp': [datetime.datetime(2020, 11, 10, 15, 20, 37, 0)],
        }
    )

    with pytest.raises(
            ValueError,
            match="The column names in the dataframe are not identical to that of the target table."
    ):
        db_connection.upload_new_data_only_to_existing_table(
            dataframe=scenario_2_rows_to_upload,
            table_name='testing_table',
            schema='testing_schema',
            id_column='example_integer'
        )
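
Together with the previous example, these tests pin down the expected behaviour of upload_new_data_only_to_existing_table: the dataframe's columns must match the target table exactly, and only rows whose id_column value is absent from the table are appended. Below is a minimal sketch of that check-then-anti-join pattern, reusing the DatabaseConnection methods shown in other examples; it is hypothetical, not the project's actual implementation.

import pandas as pd


def upload_new_data_only_to_existing_table(db_connection, dataframe: pd.DataFrame,
                                           table_name: str, schema: str, id_column: str) -> None:
    """Hypothetical sketch: append only the rows whose id_column value is not already in <schema>.<table_name>."""
    existing_columns = db_connection._get_column_names_existing_table(table_name=table_name, schema=schema)
    if set(dataframe.columns) != set(existing_columns):
        raise ValueError('The column names in the dataframe are not identical to that of the target table.')

    existing_rows = db_connection.get_dataframe(schema=schema, table_name=table_name)
    new_rows = dataframe[~dataframe[id_column].isin(existing_rows[id_column])]

    if not new_rows.empty:
        # Reorder the columns to match the target table before appending
        db_connection.upload_dataframe(dataframe=new_rows[list(existing_columns)], schema=schema,
                                       table_name=table_name, index=False)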
Example #14
def test_get_column_names_existing_table():
    """The column names of an existing table are successfully retrieved."""

    db_connection = postgresql.DatabaseConnection()

    expected_column_names = ['example_integer', 'example_string', 'example_timestamp']

    actual_column_names = db_connection._get_column_names_existing_table(
        table_name='testing_table',
        schema='testing_schema'
    )

    assert list(actual_column_names) == expected_column_names

    # Appropriate exception is raised if column names cannot be retrieved
    with pytest.raises(expected_exception=pd.io.sql.DatabaseError, match='Execution failed on sql'):
        db_connection._get_column_names_existing_table(table_name='non_existent_table', schema='testing_schema')
Example #15
def test_record_columnists_recent_article_content():
    """
    The content of articles whose links are stored in the daily_mail.columnist_article_links table,
    is extracted and stored in daily_mail.article_content
    """

    article_downloader = daily_mail.ArticleDownloader()
    article_downloader.record_columnists_recent_article_content()

    # Inspect the target table to check it was populated correctly with the link from
    # daily_mail.columnist_article_links which has not been scraped yet
    db_connection = postgresql.DatabaseConnection()
    db_connection._create_connection()

    with db_connection._conn.cursor() as curs:
        curs.execute('SELECT * FROM daily_mail.article_content;')

        table_tuples = curs.fetchall()
        actual_table = pd.DataFrame(table_tuples,
                                    columns=['id', 'url', 'title', 'content'])

        db_connection._conn.commit()

        # Tidy up and return table to its original form
        original_data = pd.read_csv(
            'Docker/db/staging_data/daily_mail.article_content.csv')
        original_urls = original_data['url'].values

        curs.execute(
            query='DELETE FROM daily_mail.article_content WHERE url NOT IN %(original_urls)s',
            vars={'original_urls': tuple(original_urls)})

        db_connection._conn.commit()

    script_directory = os.path.dirname(os.path.abspath(__file__))
    expected_table = pd.read_csv(
        f'{script_directory}/daily_mail__record_columnists_recent_article_content__expected_output.csv'
    )

    pd.testing.assert_frame_equal(actual_table, expected_table)
Example #16
def test_record_columnist_home_pages(monkeypatch):
    """Columnist names and their home page are pulled correctly and stored in postgres."""

    # Set up downloader but overwrite crawler with mock data
    article_downloader = i_news.ArticleDownloader()
    monkeypatch.setattr(requests, 'get', mock_all_columnists_homepage)

    article_downloader.record_columnist_home_pages()

    # Retrieve the table to see if it was populated correctly
    db_connection = postgresql.DatabaseConnection()
    db_connection._create_connection()

    with db_connection._conn.cursor() as curs:
        curs.execute('SELECT * FROM i_news.columnists;')

        table_tuples = curs.fetchall()
        actual_columnists = pd.DataFrame(table_tuples, columns=['columnist', 'homepage'])

        # Tidy up and return table to its original form
        curs.execute("TRUNCATE TABLE i_news.columnists;")
        curs.execute("COPY i_news.columnists FROM '/staging_data/i_news.columnists.csv' WITH CSV HEADER;")

        db_connection._conn.commit()

    db_connection._close_connection()

    expected_columnists = pd.DataFrame(data={
        'columnist': ['Fiona Mountford', 'Sarah Carson', 'Ayesha Hazarika', 'Alexander McCall Smith', 'Simon Kelner',
                      'Poorna Bell'],
        'homepage': ['https://inews.co.uk/author/fiona-mountford',
                     'https://inews.co.uk/author/sarah-carson',
                     'https://inews.co.uk/author/ayesha-hazarika',
                     'https://inews.co.uk/author/alexander-mccallsmith',
                     'https://inews.co.uk/author/simon-kelner',
                     'https://inews.co.uk/author/poorna-bell']
    })

    pd.testing.assert_frame_equal(actual_columnists, expected_columnists)
Example #17
def test_get_min_or_max_from_column():
    """The minimum or maximum value from a column is returned."""

    db_connection = postgresql.DatabaseConnection()

    common_args = {'table_name': 'testing_table', 'schema': 'testing_schema', 'column': 'example_integer'}

    # Expects correct inputs
    with pytest.raises(ValueError, match="The `min_or_max` argument must be either 'min' or 'max'."):
        db_connection.get_min_or_max_from_column(**common_args, min_or_max='standard_dev')

    # Retrieves the minimum
    expected_min = 1

    actual_min = db_connection.get_min_or_max_from_column(**common_args, min_or_max='min')

    assert actual_min == expected_min

    # Retrieves the maximum
    expected_max = 2
    actual_max = db_connection.get_min_or_max_from_column(**common_args, min_or_max='max')

    assert actual_max == expected_max
Example #18
def test_get_dataframe_demands_correct_arguments():
    """Exceptions or warnings should be raised if an incorrect combination of arguments is provided."""

    db_connection = postgresql.DatabaseConnection()

    # Scenario 1: Both query and table name are provided. Only one should be provided to either execute a query or
    # retrieve a full table.
    mock_query = "SELECT * FROM schema.table;"
    mock_table_name = "table"

    with pytest.raises(expected_exception=ValueError, match="Only one of `table_name` or `query` can be used."):
        db_connection.get_dataframe(query=mock_query, table_name=mock_table_name)

    # Scenario 2: Table name is provided but not the schema in which it resides.
    with pytest.raises(expected_exception=ValueError, match="Both a `schema` and `table_name` must be provided."):
        db_connection.get_dataframe(table_name=mock_table_name)

    # Scenario 3: Query parameters are provided but no query to use them in.
    with pytest.raises(
            expected_exception=ValueError,
            match="`query_params` have been provided but no `query` to use them in"
    ):
        db_connection.get_dataframe(query_params={'mock_arg': 'mock_value'})
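
The three scenarios above define get_dataframe's argument contract: query and table_name are mutually exclusive, a table_name needs a schema, and query_params need a query. A hedged sketch of the validation-and-dispatch logic they imply follows, delegating to pandas.read_sql; the connection argument and function shape are illustrative and the project's real method may differ.

import pandas as pd


def get_dataframe(connection, query: str = None, query_params: dict = None,
                  schema: str = None, table_name: str = None) -> pd.DataFrame:
    """Hypothetical sketch: return a dataframe from either a (parameterised) query or a schema + table name."""
    if query is not None and table_name is not None:
        raise ValueError('Only one of `table_name` or `query` can be used.')
    if table_name is not None and schema is None:
        raise ValueError('Both a `schema` and `table_name` must be provided.')
    if query_params is not None and query is None:
        raise ValueError('`query_params` have been provided but no `query` to use them in.')

    if table_name is not None:
        query = f'SELECT * FROM {schema}.{table_name};'

    return pd.read_sql(sql=query, con=connection, params=query_params)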
Example #19
    def __init__(
        self,
        batch_size: int = 1,
        number_of_processors: int = 1,
    ):
        """
        Initialise attributes of class.

        Parameters
        ----------
        batch_size : int (default 1)
            The number of texts to process at one time.

        number_of_processors : int (default 1)
            Number of processors used to process texts in parallel. If set to -1, it will use all available CPUs
            (equivalent of `multiprocessing.cpu_count()`).
        """

        self._batch_size = batch_size
        self._number_of_processors = number_of_processors
        self._spacy_nlp = spacy.load(
            name='en_core_web_sm',
            disable=['ner', 'parser', 'tagger', 'textcat'])
        self._db_connection = postgresql.DatabaseConnection()

        self._daily_mail_db = {
            'schema': 'daily_mail',
            'raw_content': 'recent_article_content',
            'processed_content': 'recent_article_content_bow_processed'
        }

        self._guardian_db = {
            'schema': 'the_guardian',
            'raw_content': 'article_content',
            'processed_content': 'article_content_bow_processed'
        }
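
The _batch_size and _number_of_processors attributes map naturally onto the batch_size and n_process arguments of spaCy's nlp.pipe. A hedged sketch of how a preprocessing step would typically consume them is shown below; it is illustrative only, since the class's actual methods are not part of this snippet.

import multiprocessing

import spacy


def preprocess_texts(texts, batch_size: int = 1, number_of_processors: int = 1):
    """Hypothetical sketch: lemmatise texts in parallel batches with spaCy, dropping stop words and punctuation."""
    if number_of_processors == -1:
        number_of_processors = multiprocessing.cpu_count()

    nlp = spacy.load(name='en_core_web_sm', disable=['ner', 'parser', 'tagger', 'textcat'])

    processed_texts = []
    for doc in nlp.pipe(texts, batch_size=batch_size, n_process=number_of_processors):
        processed_texts.append(' '.join(token.lemma_.lower() for token in doc
                                        if not token.is_stop and not token.is_punct))
    return processed_texts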
Example #20
def test_record_opinion_articles_content():
    """
    The content of articles that have not already been pulled is collected and saved to postgres.
    """

    article_downloader = the_guardian.ArticleDownloader()

    article_downloader.record_opinion_articles_content(number_of_articles=1)

    # Expected data should have been processed from most recent publication backwards
    expected_content = pd.DataFrame(
        data={
            'id': [
                'e8c5e312fae36c43d965a0e3da84e68d',
                '052015a6d57893adfa4be70521b1ad3b'
            ],
            'guardian_id': [
                'politics/1990/nov/23/past.conservatives',
                'world/2002/feb/25/race.uk'
            ],
            'web_publication_timestamp': [
                datetime.datetime(1990, 11, 23, 16, 47, 0, 0),
                datetime.datetime(2002, 2, 25, 1, 53, 0, 0)
            ],
            'api_url': [
                'https://content.guardianapis.com/politics/1990/nov/23/past.conservatives',
                'https://content.guardianapis.com/world/2002/feb/25/race.uk'
            ],
            # Include the first 89 characters of data we already have, and what we expect to see from the next article
            'content': [
                "• Margaret Thatcher, Britain's first female prime minister, resigned on 22 November 1990.",
                "About every three months I am accused of being an anti-semite. It is not difficult to pre"
            ]
        })

    db_connection = postgresql.DatabaseConnection()

    # Retrieve the table to see if it was populated correctly
    db_connection._create_connection()
    with db_connection._conn.cursor() as curs:
        curs.execute('SELECT * FROM the_guardian.article_content;')

        table_tuples = curs.fetchall()
        actual_content = pd.DataFrame(table_tuples,
                                      columns=[
                                          'id', 'guardian_id',
                                          'web_publication_timestamp',
                                          'api_url', 'content'
                                      ])

        # The content of articles are naturally a very large string, so only take the first characters to compare
        # against
        actual_content['content'] = actual_content['content'].apply(
            lambda x: x[:89])

        # Tidy up and revert to original table by deleting newly inserted rows
        df_article_metadata = pd.read_csv(
            'Docker/db/staging_data/the_guardian.article_metadata.csv')
        df_article_content = pd.read_csv(
            'Docker/db/staging_data/the_guardian.article_content.csv')
        new_rows_pulled = ~df_article_metadata['id'].isin(
            df_article_content['id'].values)
        new_ids_pulled = df_article_metadata.loc[new_rows_pulled, 'id'].values

        curs.execute(
            query="DELETE FROM the_guardian.article_content WHERE id IN %(new_ids_pulled)s;",
            vars={'new_ids_pulled': tuple(new_ids_pulled)})

        db_connection._conn.commit()
    db_connection._close_connection()

    pd.testing.assert_frame_equal(actual_content, expected_content)
Example #21
    def test_store_most_similar_articles(self, similarity_threshold):
        """Appropriately similar articles are picked up and saved."""

        tfidf_encoder = encoding.TfidfEncoder()

        # Check how article pairs are found and saved to database
        tfidf_encoder.store_most_similar_articles(
            similarity_threshold=similarity_threshold)

        # The tf-idf representation of the data can be found in
        # Docker/db/staging_data/encoded_articles.tfidf_representation.csv and is stored in the table
        # encoded_articles.tfidf_representation

        # Using that mock data:
        # articles 1 and 2 should be identical
        # articles 3 and 4 are similar but not identical
        # article 5 is not similar to any other article

        db_connection = postgresql.DatabaseConnection()
        db_connection._create_connection()

        with db_connection._conn.cursor() as cursor:
            cursor.execute(
                "SELECT * FROM encoded_articles.tfidf_similar_articles;")

            table_tuples = cursor.fetchall()
            actual_article_pairs = pd.DataFrame(
                data=table_tuples,
                columns=['id', 'similar_article_id', 'similarity_score'])

            # Tidy up and revert to original version of table
            cursor.execute(
                "TRUNCATE TABLE encoded_articles.tfidf_similar_articles;")

            db_connection._conn.commit()

        # Check whether pairs found match what we expect
        if similarity_threshold == 0.5:

            expected_article_pairs = pd.DataFrame(
                columns=['id', 'similar_article_id', 'similarity_score'],
                data=[
                    ['article_1_d36c525d1679623119fd7a', 'article_2_4b2a76b9719d911017c592', 1.0],
                    ['article_2_4b2a76b9719d911017c592', 'article_1_d36c525d1679623119fd7a', 1.0],
                    ['article_3_8350295550de7d587bc323', 'article_4_1702e282b59c30e3789ad4', 0.70710],
                    ['article_4_1702e282b59c30e3789ad4', 'article_3_8350295550de7d587bc323', 0.70710],
                ])

        # similarity_threshold == 0.8
        else:
            expected_article_pairs = pd.DataFrame(
                columns=['id', 'similar_article_id', 'similarity_score'],
                data=[
                    ['article_1_d36c525d1679623119fd7a', 'article_2_4b2a76b9719d911017c592', 1.0],
                    ['article_2_4b2a76b9719d911017c592', 'article_1_d36c525d1679623119fd7a', 1.0],
                ])

        # The id columns are stored as CHAR(32) in the database, so strip any space padding before comparing
        actual_article_pairs['id'] = actual_article_pairs['id'].str.strip()
        actual_article_pairs['similar_article_id'] = actual_article_pairs['similar_article_id'].str.strip()

        pd.testing.assert_frame_equal(actual_article_pairs,
                                      expected_article_pairs)
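
The similarity_threshold argument and the two branches in the assertions (0.5 and 0.8) imply a pytest.mark.parametrize decorator that this snippet does not show. A plausible reconstruction follows; the surrounding class name is an assumption.

import pytest


class TestTfidfEncoder:

    @pytest.mark.parametrize('similarity_threshold', [0.5, 0.8])
    def test_store_most_similar_articles(self, similarity_threshold):
        ...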
Example #22
def test_record_columnist_home_pages(monkeypatch):
    """Columnist names and their home page are pulled correctly and stored in postgres."""

    # Set up downloader but overwrite crawler with mock data
    article_downloader = daily_mail.ArticleDownloader()
    monkeypatch.setattr(requests, 'get', mock_all_columnists_homepage)

    article_downloader.record_columnist_home_pages()

    # Retrieve the table to see if it was populated correctly
    db_connection = postgresql.DatabaseConnection()
    db_connection._create_connection()

    with db_connection._conn.cursor() as curs:
        curs.execute('SELECT * FROM daily_mail.columnists;')

        table_tuples = curs.fetchall()
        actual_columnists = pd.DataFrame(table_tuples,
                                         columns=['columnist', 'homepage'])

        # Tidy up and return table to its original form
        curs.execute("TRUNCATE TABLE daily_mail.columnists;")
        curs.execute(
            "COPY daily_mail.columnists FROM '/staging_data/daily_mail.columnists.csv' WITH CSV HEADER;"
        )

        db_connection._conn.commit()

    db_connection._close_connection()

    expected_columnists = pd.DataFrame(
        data={
            'columnist': [
                'Baz Bamigboye', 'Craig Brown', 'Alex Brummer',
                'Stephen Glover', 'Richard Kay', 'Ephraim Hardcastle',
                'Sebastian Shakespeare', 'Max Hastings', 'Dominic Lawson',
                'Richard Littlejohn', 'Peter Mckay', 'Jan Moir', 'Bel Mooney',
                'Andrew Pierce', 'Amanda Platell', 'Martin Samuel',
                'Ruth Sunderland', 'Tom Utley', 'Sarah Vine', 'Peter Hitchens',
                'Liz Jones', 'Black Dog', 'Oliver Holt'
            ],
            'homepage': [
                'https://www.dailymail.co.uk/tvshowbiz/columnist-1000601/Baz-Bamigboye-Daily-Mail.html',
                'https://www.dailymail.co.uk/home/books/columnist-1003951/Craig-Brown-Daily-Mail.html',
                'https://www.dailymail.co.uk/news/columnist-1001421/Alex-Brummer-Daily-Mail.html',
                'https://www.dailymail.co.uk/news/columnist-244/Stephen-Glover-Daily-Mail.html',
                'https://www.dailymail.co.uk/news/columnist-230/Richard-Kay-Daily-Mail.html',
                'https://www.dailymail.co.uk/news/columnist-250/Ephraim-Hardcastle-Daily-Mail.html',
                'https://www.dailymail.co.uk/news/columnist-1092116/Sebastian-Shakespeare-Daily-Mail.html',
                'https://www.dailymail.co.uk/news/columnist-464/Max-Hastings-Daily-Mail.html',
                'https://www.dailymail.co.uk/columnists/columnist-1083636/Dominic-Lawson-Daily-Mail.html',
                'https://www.dailymail.co.uk/news/columnist-322/Richard-Littlejohn-Daily-Mail.html',
                'https://www.dailymail.co.uk/news/columnist-227/Peter-McKay-Daily-Mail.html',
                'https://www.dailymail.co.uk/debate/columnist-1012602/Jan-Moir-Daily-Mail.html',
                'https://www.dailymail.co.uk/femail/columnist-465/Bel-Mooney-Daily-Mail.html',
                'https://www.dailymail.co.uk/news/columnist-1041755/Andrew-Pierce-The-Mail-Sunday.html',
                'https://www.dailymail.co.uk/news/columnist-463/Amanda-Platell-The-Daily-Mail.html',
                'https://www.dailymail.co.uk/sport/columnist-1020688/Martin-Samuel-Sport-Daily-Mail.html',
                'https://www.dailymail.co.uk/columnists/columnist-1072434/Ruth-Sunderland-Daily-Mail.html',
                'https://www.dailymail.co.uk/news/columnist-1000961/Tom-Utley-Daily-Mail.html',
                'https://www.dailymail.co.uk/debate/columnist-1082216/Sarah-Vine-Daily-Mail.html',
                'https://www.dailymail.co.uk/debate/columnist-224/Peter-Hitchens-The-Mail-Sunday.html',
                'https://www.dailymail.co.uk/mailonsunday/columnist-1074669/Liz-Jones-Column-The-Mail-Sunday.html',
                'https://www.dailymail.co.uk/mailonsunday/columnist-249/Black-Dog-The-Mail-Sunday.html',
                'https://www.dailymail.co.uk/sport/columnist-1098989/Oliver-Holt-Mail-Sunday.html'
            ]
        })

    pd.testing.assert_frame_equal(actual_columnists, expected_columnists)
Example #23
    def __init__(self):

        self._base_url = 'https://inews.co.uk/'
        self._columnist_section_url = 'https://inews.co.uk/category/opinion'
        self._db_connection = postgresql.DatabaseConnection()
Example #24
    def __init__(self):

        self._base_url = 'https://www.dailymail.co.uk'
        self._columnist_section_url = 'https://www.dailymail.co.uk/columnists/index.html'
        self._db_connection = postgresql.DatabaseConnection()
Example #25
    def __init__(self):

        self._api_key = os.getenv('GUARDIAN_API_KEY')
        self._db_connection = postgresql.DatabaseConnection()
        self._opinion_section_url = 'https://content.guardianapis.com/commentisfree/commentisfree'
Example #26
def test_record_opinion_articles_metadata(monkeypatch,
                                          publication_start_timestamp):
    """
    Downloader iterates through pages and saves them to disk.
    """
    def mock_api_call(url: str, params: dict) -> Dict[str, Any]:
        """Mock functionality of making Guardian API call."""
        return {
            'response': {
                'status': 'ok',
                'userTier': 'developer',
                'total': 100,
                'startIndex': 1,
                'pageSize': 2,
                'currentPage': 1,
                'pages': 1,
                'orderBy': 'newest',
                'tag': {
                    'id': 'commentisfree/commentisfree',
                    'type': 'blog',
                    'sectionId': 'commentisfree',
                    'sectionName': 'Opinion',
                    'webTitle': 'Opinion',
                    'webUrl': 'https://www.theguardian.com/commentisfree/commentisfree',
                    'apiUrl': 'https://content.guardianapis.com/commentisfree/commentisfree'
                },
                'results': [{
                    'id': 'commentisfree/2020/oct/04/johnson-is-a-poor-prime-minister',
                    'type': 'article',
                    'sectionId': 'commentisfree',
                    'sectionName': 'Opinion',
                    'webPublicationDate': '2020-10-04T10:35:19Z',
                    'webTitle': 'Are Tory MPs really so surprised that Boris Johnson is a poor prime minister?',
                    'webUrl': 'https://www.theguardian.com/commentisfree/2020/oct/04/poor-prime-minister',
                    'apiUrl': 'https://content.guardianapis.com/commentisfree/2020/oct/04/poor-prime-minister',
                    'isHosted': False,
                    'pillarId': 'pillar/opinion',
                    'pillarName': 'Opinion'
                }, {
                    'id': 'commentisfree/2020/oct/04/university-in-a-pandemic',
                    'type': 'article',
                    'sectionId': 'commentisfree',
                    'sectionName': 'Opinion',
                    'webPublicationDate': '2020-10-04T07:30:45Z',
                    'webTitle': 'Up close the trials of university life in a pandemic. We should have done better.',
                    'webUrl': 'https://www.theguardian.com/commentisfree/2020/oct/04/university-in-a-pandemic',
                    'apiUrl': 'https://content.guardianapis.com/commentisfree/2020/oct/04/university-in-a-pandemic',
                    'isHosted': False,
                    'pillarId': 'pillar/opinion',
                    'pillarName': 'Opinion'
                }]
            }
        }

    # Set up downloader but overwrite API call with mock data
    article_downloader = the_guardian.ArticleDownloader()
    monkeypatch.setattr(article_downloader, "_call_api_and_display_exceptions",
                        mock_api_call)

    article_downloader.record_opinion_articles_metadata(
        publication_start_timestamp)

    expected_metadata = pd.DataFrame({
        'id': [
            'e8c5e312fae36c43d965a0e3da84e68d',
            '052015a6d57893adfa4be70521b1ad3b',
            '7d2669e5a86f5a5eb16862f691482fe3',
            '069738f52edca2125142e0952dbbfcc0'
        ],
        'guardian_id': [
            'politics/1990/nov/23/past.conservatives',
            'world/2002/feb/25/race.uk',
            'commentisfree/2020/oct/04/johnson-is-a-poor-prime-minister',
            'commentisfree/2020/oct/04/university-in-a-pandemic'
        ],
        'content_type': ['article', 'article', 'article', 'article'],
        'section_id':
        ['commentisfree', 'commentisfree', 'commentisfree', 'commentisfree'],
        'section_name': ['Opinion', 'Opinion', 'Opinion', 'Opinion'],
        'web_publication_timestamp': [
            datetime.datetime(1990, 11, 23, 16, 47, 0, 0),
            datetime.datetime(2002, 2, 25, 1, 53, 0, 0),
            datetime.datetime(2020, 10, 4, 10, 35, 19, 0),
            datetime.datetime(2020, 10, 4, 7, 30, 45, 0),
        ],
        'web_title': [
            'The Thatcher Years | Hugo Young', 'Gary Younge: Terms of abuse',
            'Are Tory MPs really so surprised that Boris Johnson is a poor prime minister?',
            'Up close the trials of university life in a pandemic. We should have done better.'
        ],
        'web_url': [
            'https://www.theguardian.com/politics/1990/nov/23/past.conservatives',
            'https://www.theguardian.com/world/2002/feb/25/race.uk',
            'https://www.theguardian.com/commentisfree/2020/oct/04/poor-prime-minister',
            'https://www.theguardian.com/commentisfree/2020/oct/04/university-in-a-pandemic'
        ],
        'api_url': [
            'https://content.guardianapis.com/politics/1990/nov/23/past.conservatives',
            'https://content.guardianapis.com/world/2002/feb/25/race.uk',
            'https://content.guardianapis.com/commentisfree/2020/oct/04/poor-prime-minister',
            'https://content.guardianapis.com/commentisfree/2020/oct/04/university-in-a-pandemic'
        ],
        'pillar_id': [
            'pillar/opinion', 'pillar/opinion', 'pillar/opinion',
            'pillar/opinion'
        ],
        'pillar_name': ['Opinion', 'Opinion', 'Opinion', 'Opinion']
    })

    db_connection = postgresql.DatabaseConnection()

    # Retrieve the table to see if it was populated correctly
    db_connection._create_connection()
    with db_connection._conn.cursor() as curs:
        curs.execute('SELECT * FROM the_guardian.article_metadata;')

        table_tuples = curs.fetchall()
        actual_metadata = pd.DataFrame(table_tuples,
                                       columns=[
                                           'id', 'guardian_id', 'content_type',
                                           'section_id', 'section_name',
                                           'web_publication_timestamp',
                                           'web_title', 'web_url', 'api_url',
                                           'pillar_id', 'pillar_name'
                                       ])

        # Tidy up and revert to original table by deleting newly inserted rows
        df_article_metadata_original = pd.read_csv(
            'Docker/db/staging_data/the_guardian.article_metadata.csv')
        new_rows_pulled = ~expected_metadata['id'].isin(
            df_article_metadata_original['id'].values)
        new_ids_pulled = expected_metadata.loc[new_rows_pulled, 'id'].values

        curs.execute(
            query="DELETE FROM the_guardian.article_metadata WHERE id IN %(new_ids_pulled)s;",
            vars={'new_ids_pulled': tuple(new_ids_pulled)})

        db_connection._conn.commit()
    db_connection._close_connection()

    pd.testing.assert_frame_equal(actual_metadata, expected_metadata)
Example #27
    def test_encode_articles(self, use_existing_vocab):
        """
        Articles are represented with a tf-idf matrix either using a pre-existing dictionary or one which has been
        rebuilt.
        """

        db_connection = postgresql.DatabaseConnection()
        db_connection._create_connection()

        # Clear existing staging data
        with db_connection._conn.cursor() as cursor:
            cursor.execute(
                "TRUNCATE TABLE encoded_articles.tfidf_representation;")
            db_connection._conn.commit()

        # Load what we are expecting to see, which is the tf-idf representation of the only article which has already
        # been bag-of-words preprocessed in the staging data: daily_mail.article_content_bow_preprocessed
        script_directory = os.path.dirname(os.path.abspath(__file__))

        # Load the expected tf-idf representation of the mock article. The csv files have been calculated following
        # the sklearn documentation here:
        # https://scikit-learn.org/stable/modules/feature_extraction.html#tfidf-term-weighting

        # Scenario 1: Using an existing vocabulary (pulled from existing table)
        if use_existing_vocab:
            expected_tfidf = pd.read_csv(
                f'{script_directory}/test_encode_articles__expected_using_existing_vocab.csv'
            )

        # Scenario 2: Creating a new vocabulary from scratch
        else:
            expected_tfidf = pd.read_csv(
                f'{script_directory}/test_encode_articles__expected_not_using_existing_vocab.csv'
            )

        # Encode articles and check the output
        tfidf_encoder = encoding.TfidfEncoder(
            use_existing_vocab=use_existing_vocab)
        tfidf_encoder.encode_articles()

        with db_connection._conn.cursor() as cursor:
            cursor.execute(
                "SELECT * FROM encoded_articles.tfidf_representation;")

            table_tuples = cursor.fetchall()
            actual_tfidf = pd.DataFrame(data=table_tuples,
                                        columns=list(expected_tfidf.columns))

            db_connection._conn.commit()

        # Tidy up and revert to original version of tables
        with db_connection._conn.cursor() as cursor:

            # tf-idf matrix representation of each article
            cursor.execute(
                "TRUNCATE TABLE encoded_articles.tfidf_representation;")
            cursor.execute("""
                COPY encoded_articles.tfidf_representation
                FROM '/staging_data/encoded_articles.tfidf_representation.csv'
                WITH CSV HEADER;
                """)

            # Vocabulary table
            cursor.execute('TRUNCATE TABLE encoded_articles.tfidf_vocabulary;')
            cursor.execute("""
                COPY encoded_articles.tfidf_vocabulary
                FROM '/staging_data/encoded_articles.tfidf_vocabulary.csv'
                WITH CSV HEADER;
                """)

            db_connection._conn.commit()
        db_connection._close_connection()

        # The encoded vector is loaded from the csv as a string, so convert it back to an array
        expected_tfidf_vector = np.array(
            ast.literal_eval(expected_tfidf['encoded'][0]))

        assert actual_tfidf.index.equals(expected_tfidf.index)

        np.testing.assert_almost_equal(actual=actual_tfidf['encoded'][0],
                                       desired=expected_tfidf_vector,
                                       decimal=6)