def test_run_single_thread_override(self):
    """run() with the default thread count hands the whole package list to one worker."""
    retriever = PyPiMetadataRetriever(db_path=self.temp_db_path)
    retriever.package_list = ['a', 'b', 'c']
    worker_target = 'pypianalyser.pypi_metadata_retriever.PyPiMetadataRetriever._threaded_process'
    with patch(worker_target) as mock_worker:
        retriever.run()
    # A single call means no chunking happened.
    mock_worker.assert_called_once_with(retriever.package_list)
def test_truncate_description(self):
    """Both the description and summary fields are cut to the configured length."""
    retriever = PyPiMetadataRetriever(trunc_description=10, db_path=self.temp_db_path)
    metadata = {'info': {'description': 'A' * 1000, 'summary': 'B' * 1000}}
    retriever._truncate_description(metadata)
    # Truncation mutates the dict in place.
    self.assertEqual(10, len(metadata['info']['description']))
    self.assertEqual(10, len(metadata['info']['summary']))
def test_run_multi_threaded(self):
    """121 packages over 3 threads must be split into chunks of 40, 40 and 41."""
    retriever = PyPiMetadataRetriever(thread_count=3, db_path=self.temp_db_path)
    chunks = [list('A' * 40), list('B' * 40), list('C' * 41)]
    retriever.package_list = [name for chunk in chunks for name in chunk]
    worker_target = 'pypianalyser.pypi_metadata_retriever.PyPiMetadataRetriever._threaded_process'
    with patch(worker_target) as mock_worker:
        retriever.run()
    # Each worker should have received exactly its own chunk.
    for chunk in chunks:
        mock_worker.assert_any_call(chunk)
def test_truncate_releases(self):
    """Only the newest trunc_releases versions survive, ordered newest first."""
    retriever = PyPiMetadataRetriever(trunc_releases=2, db_path=self.temp_db_path)
    metadata = {'releases': {'1.0.1': {}, '2.1.0': {}, '1.5.2': {}}}
    retriever._truncate_releases(metadata)
    self.assertEqual(2, len(metadata['releases']))
    # Version-sorted: 2.1.0 beats 1.5.2 beats the dropped 1.0.1.
    self.assertListEqual(['2.1.0', '1.5.2'], list(metadata['releases'].keys()))
def test_truncate_releases_fallback(self):
    """When version comparison raises, releases are ordered by upload_time instead."""
    retriever = PyPiMetadataRetriever(trunc_releases=1, db_path=self.temp_db_path)
    metadata = {
        'releases': {
            '1.0.1': [{'upload_time': '2016-02-19T13:08:33'}],
            '2.1.0': {},
            '2.2.1dev': [{'upload_time': '2018-02-19T13:08:33'}],
        }
    }
    # Force the primary (version-based) sort to fail so the fallback path runs.
    with patch('pypianalyser.pypi_metadata_retriever.LooseVersion',
               side_effect=TypeError):
        retriever._truncate_releases(metadata)
    self.assertEqual(1, len(metadata['releases']))
    # The release with the most recent upload_time is the one kept.
    self.assertListEqual(['2.2.1dev'], list(metadata['releases'].keys()))
def test_calculate_package_list(self):
    """Regex filter, the 404 list and already-stored packages are all applied."""
    retriever = PyPiMetadataRetriever(db_path=self.temp_db_path, max_packages=2,
                                      package_regex='^(aaa-.*)|(ccc-.*)',
                                      file_404='404.txt')
    index_names = [
        'aaa-123', 'aaa-456', 'aaa-789',
        'bbb-123', 'bbb-456', 'bbb-789',
        'ccc-123', 'ccc-456', 'ccc-789',
    ]
    known_404 = ['aaa-456']
    mock_db = MagicMock()
    # ccc-456/ccc-789 are "already in the database" and must be skipped.
    mock_db.get_package_names.return_value = ['ccc-456', 'ccc-789']
    module = 'pypianalyser.pypi_metadata_retriever.'
    with patch(module + 'get_package_list', return_value=index_names), \
            patch(module + 'read_file_lines_into_list', return_value=known_404), \
            patch(module + 'PyPiAnalyserSqliteHelper', return_value=mock_db):
        result = retriever.calculate_package_list()
    # bbb-* fail the regex, aaa-456 is a known 404, ccc-* are already stored,
    # and max_packages=2 caps the remainder.
    self.assertListEqual(['aaa-123', 'aaa-789'], result)
# NOTE(review): the doubled "test_test_" prefix looks like a typo, but the name
# is kept so the test id reported by the runner stays stable.
def test_test_threaded_process_commit_to_db(self):
    """A successfully fetched package's metadata is committed to the database."""
    retriever = PyPiMetadataRetriever(db_path=self.temp_db_path)
    metadata = {'info': {'description': 'A' * 1000, 'summary': 'B' * 1000}}
    mock_db = MagicMock()
    module = 'pypianalyser.pypi_metadata_retriever.'
    with patch(module + 'get_metadata_for_package', return_value=metadata), \
            patch(module + 'PyPiAnalyserSqliteHelper', return_value=mock_db):
        retriever._open_db()
        retriever._threaded_process(['a'])
    mock_db.commit_package_to_db.assert_called_once_with(metadata)
def main():
    """CLI entry point: parse arguments, then dry-run or download PyPi metadata."""
    parser = argparse.ArgumentParser(
        'Script to download PyPi metadata into an SQLite database for easy querying.')
    parser.add_argument(
        '-td', '--trunc_descriptions', type=int, default=500,
        help='Truncate the description field to X characters to reduce the size of the '
             'database. Use -1 for no truncation. Default is 500')
    parser.add_argument(
        '-tr', '--trunc_releases', type=int, default=2,
        help='Specify the maximum number of releases to store in the database for each '
             'package. In many cases you may only be interested in the latest one or two '
             'releases. Use -1 for no truncation. Default is 2')
    parser.add_argument(
        '-t', '--threads', type=int, default=5,
        help='Number of threads to spawn to download the metadata. Default is 5')
    parser.add_argument(
        '-db', '--database_path', default='pypi_metadata.sqlite',
        help='Name or path of the database to store the metadata in. If the database '
             'already exists with entries then it will be read and only packages that are '
             'missing from the database will be retrieved. This allows you to download the '
             'PyPi mirror metadata over a few runs rather than a single one. Default is '
             'pypi_metadata.sqlite')
    parser.add_argument(
        '-m', '--max_packages', type=int,
        help='Maximum number of packages to retrieve the metadata for. Using this allows '
             'you download the metadata over a series of runs rather than spamming PyPi '
             'and your network.')
    parser.add_argument(
        '-pr', '--package_regex',
        help='Specify a regex to match package names against. Only those that match will '
             'be retrieved. NOTE: all package names are normalized before this, whereby '
             'characters a lowercased and underscores are replaced with hyphens. '
             'E.g. ^robotframework-.*')
    parser.add_argument(
        '-404', '--file_404_list', default='404.txt',
        help='Path to a file to store a list of package names that returned a HTTP 404. '
             'This usually means that the package no longer exists in PyPi. The file is '
             'useful for doing future runs. Default: 404.txt')
    parser.add_argument(
        '--dry_run', action='store_true',
        help='Dry run mode that gives insight into the package metadata that will be '
             'retrieved. This mode obtains the package list from the index, removes any '
             'packages that are already present the database (if it exists). If the list '
             'of package URLs that returned 404 on previous runs exists, these will also '
             'be removed from the set. Finally the regex will be applied to the remaining '
             'packages. If this list is less than 100 then its printed to the console. The '
             'entire list will be written out to a file called dry_run_package_list.txt')
    parser.add_argument('-v', '--verbose', action='store_true', help='Verbose mode.')
    args = parser.parse_args()

    retriever = PyPiMetadataRetriever(args.trunc_descriptions, args.trunc_releases,
                                      args.threads, args.database_path,
                                      args.max_packages, args.package_regex,
                                      args.file_404_list, args.verbose)
    if not args.dry_run:
        retriever.run()
        return

    # Dry run: compute the would-be work list and dump it to a file.
    package_list = retriever.calculate_package_list()
    with open('dry_run_package_list.txt', 'w', encoding='utf-8') as fp:
        fp.write(u'\n'.join(package_list))
    logger.info('Dry run has calculated {} packages that would be processed. This list has been output to '
                'dry_run_package_list.txt'.format(len(package_list)))
def test_threaded_process_exception_reported_on_404(self):
    """An Exception404 raised while fetching metadata is routed to _report_404."""
    retriever = PyPiMetadataRetriever(db_path=self.temp_db_path)
    module = 'pypianalyser.pypi_metadata_retriever.'
    with patch(module + 'get_metadata_for_package', side_effect=Exception404('err')), \
            patch(module + 'PyPiMetadataRetriever._report_404') as mock_report:
        retriever._threaded_process(['a'])
    # The failing package name itself is what gets reported.
    mock_report.assert_called_once_with('a')