示例#1
0
 def test_run_single_thread_override(self):
     """Check that ``run()`` processes a three-package list in a single call.

     No ``thread_count`` is passed to the constructor here; the test requires
     ``_threaded_process`` to be invoked exactly once, with the full list.
     """
     retriever = PyPiMetadataRetriever(db_path=self.temp_db_path)
     retriever.package_list = ['a', 'b', 'c']
     worker_path = ('pypianalyser.pypi_metadata_retriever.'
                    'PyPiMetadataRetriever._threaded_process')
     with patch(worker_path) as worker_mock:
         retriever.run()
     # The mock keeps its call record after the patch has been undone.
     worker_mock.assert_called_once_with(retriever.package_list)
示例#2
0
 def test_truncate_description(self):
     """Check that ``_truncate_description`` shortens both text fields in place.

     With ``trunc_description=10``, the 1000-character ``description`` and
     ``summary`` values must each be exactly 10 characters long afterwards.
     """
     limit = 10
     retriever = PyPiMetadataRetriever(trunc_description=limit,
                                       db_path=self.temp_db_path)
     metadata = {'info': {'description': 'A' * 1000, 'summary': 'B' * 1000}}

     retriever._truncate_description(metadata)

     # The call's return value is not used; the dict passed in is inspected.
     info = metadata['info']
     for field in ('description', 'summary'):
         self.assertEqual(limit, len(info[field]))
示例#3
0
    def test_run_multi_threaded(self):
        """Check how ``run()`` splits 121 packages when ``thread_count=3``.

        The package list is 40 'A's, 40 'B's and 41 'C's; the test requires
        ``_threaded_process`` to have been called with each of those three
        runs as its own chunk.
        """
        retriever = PyPiMetadataRetriever(thread_count=3,
                                          db_path=self.temp_db_path)
        chunks = [['A'] * 40, ['B'] * 40, ['C'] * 41]
        retriever.package_list = chunks[0] + chunks[1] + chunks[2]

        worker_path = ('pypianalyser.pypi_metadata_retriever.'
                       'PyPiMetadataRetriever._threaded_process')
        with patch(worker_path) as worker_mock:
            retriever.run()
            for chunk in chunks:
                worker_mock.assert_any_call(chunk)
示例#4
0
 def test_truncate_releases(self):
     """Check that ``_truncate_releases`` keeps only the two newest versions.

     With ``trunc_releases=2`` and releases 1.0.1, 2.1.0 and 1.5.2, the
     remaining keys must be ``['2.1.0', '1.5.2']`` in that order.
     """
     retriever = PyPiMetadataRetriever(trunc_releases=2,
                                       db_path=self.temp_db_path)
     metadata = {'releases': {'1.0.1': {}, '2.1.0': {}, '1.5.2': {}}}

     retriever._truncate_releases(metadata)

     kept = metadata['releases']
     self.assertEqual(2, len(kept))
     self.assertListEqual(['2.1.0', '1.5.2'], list(kept.keys()))
示例#5
0
 def test_truncate_releases_fallback(self):
     """Check ``_truncate_releases`` when ``LooseVersion`` raises ``TypeError``.

     ``LooseVersion`` is patched to fail on every call. With
     ``trunc_releases=1`` the only release left must be '2.2.1dev', which is
     the fixture entry with the latest ``upload_time``.
     """
     retriever = PyPiMetadataRetriever(trunc_releases=1,
                                       db_path=self.temp_db_path)
     metadata = {
         'releases': {
             '1.0.1': [{'upload_time': '2016-02-19T13:08:33'}],
             '2.1.0': {},
             '2.2.1dev': [{'upload_time': '2018-02-19T13:08:33'}],
         }
     }

     version_path = 'pypianalyser.pypi_metadata_retriever.LooseVersion'
     with patch(version_path, side_effect=TypeError):
         retriever._truncate_releases(metadata)

     kept = metadata['releases']
     self.assertEqual(1, len(kept))
     self.assertListEqual(['2.2.1dev'], list(kept.keys()))
示例#6
0
    def test_calculate_package_list(self):
        """Check the list ``calculate_package_list`` returns for a mocked index.

        Fixture: the index lists nine packages (aaa-*, bbb-*, ccc-*), the
        404 file lists 'aaa-456', the database already holds 'ccc-456' and
        'ccc-789', the regex is '^(aaa-.*)|(ccc-.*)' and ``max_packages`` is
        2. The expected result is ``['aaa-123', 'aaa-789']``.
        """
        retriever = PyPiMetadataRetriever(db_path=self.temp_db_path,
                                          max_packages=2,
                                          package_regex='^(aaa-.*)|(ccc-.*)',
                                          file_404='404.txt')

        index_names = [
            'aaa-123', 'aaa-456', 'aaa-789',
            'bbb-123', 'bbb-456', 'bbb-789',
            'ccc-123', 'ccc-456', 'ccc-789',
        ]
        names_404 = ['aaa-456']
        db_helper = MagicMock()
        db_helper.get_package_names.return_value = ['ccc-456', 'ccc-789']

        module = 'pypianalyser.pypi_metadata_retriever'
        index_patch = patch(module + '.get_package_list',
                            return_value=index_names)
        file_patch = patch(module + '.read_file_lines_into_list',
                           return_value=names_404)
        db_patch = patch(module + '.PyPiAnalyserSqliteHelper',
                         return_value=db_helper)
        with index_patch, file_patch, db_patch:
            calculated = retriever.calculate_package_list()

        self.assertListEqual(['aaa-123', 'aaa-789'], calculated)
示例#7
0
 def test_test_threaded_process_commit_to_db(self):
     """Check that ``_threaded_process`` commits fetched metadata to the DB.

     ``get_metadata_for_package`` and ``PyPiAnalyserSqliteHelper`` are both
     patched. After ``_open_db()`` and processing package 'a', the helper's
     ``commit_package_to_db`` must have been called exactly once with the
     metadata dict returned by the patched fetch.
     """
     retriever = PyPiMetadataRetriever(db_path=self.temp_db_path)
     fetched = {'info': {'description': 'A' * 1000, 'summary': 'B' * 1000}}
     db_helper = MagicMock()

     module = 'pypianalyser.pypi_metadata_retriever'
     fetch_patch = patch(module + '.get_metadata_for_package',
                         return_value=fetched)
     db_patch = patch(module + '.PyPiAnalyserSqliteHelper',
                      return_value=db_helper)
     with fetch_patch, db_patch:
         retriever._open_db()
         retriever._threaded_process(['a'])
         db_helper.commit_package_to_db.assert_called_once_with(fetched)
示例#8
0
def main():
    """Command-line entry point for the PyPi metadata retriever.

    Parses the command-line options, builds a ``PyPiMetadataRetriever`` from
    them and then either:

    * with ``--dry_run``: calculates the package list, writes it (one name per
      line, UTF-8) to ``dry_run_package_list.txt`` and logs how many packages
      were found; or
    * otherwise: calls ``retriever.run()``.
    """
    # The sentence is passed as ``description``. The first positional parameter
    # of ArgumentParser is ``prog``, so passing it positionally put the whole
    # sentence into the usage line in place of the program name.
    parser = argparse.ArgumentParser(
        description='Script to download PyPi metadata into an SQLite database for easy querying.')
    parser.add_argument('-td', '--trunc_descriptions',
                        help='Truncate the description field to X characters to reduce the size of the database. Use '
                             '-1 for no truncation. Default is 500',
                        type=int,
                        default=500)
    parser.add_argument('-tr', '--trunc_releases',
                        help='Specify the maximum number of releases to store in the database for each package. In many'
                             ' cases you may only be interested in the latest one or two releases. Use -1 for no '
                             'truncation. Default is 2',
                        type=int,
                        default=2)
    parser.add_argument('-t', '--threads',
                        help='Number of threads to spawn to download the metadata. Default is 5',
                        type=int,
                        default=5)
    parser.add_argument('-db', '--database_path',
                        help='Name or path of the database to store the metadata in. If the database already exists '
                             'with entries then it will be read and only packages that are missing from the database '
                             'will be retrieved. This allows you to download the PyPi mirror metadata over a few runs '
                             'rather than a single one. Default is pypi_metadata.sqlite',
                        default='pypi_metadata.sqlite')
    parser.add_argument('-m', '--max_packages',
                        help='Maximum number of packages to retrieve the metadata for. Using this allows you download'
                             ' the metadata over a series of runs rather than spamming PyPi and your network.',
                        type=int)
    parser.add_argument('-pr', '--package_regex',
                        help='Specify a regex to match package names against. Only those that match will be retrieved. '
                             'NOTE: all package names are normalized before this, whereby characters a lowercased and '
                             'underscores are replaced with hyphens. E.g. ^robotframework-.*')
    parser.add_argument('-404', '--file_404_list',
                        help='Path to a file to store a list of package names that returned a HTTP 404. This usually'
                             ' means that the package no longer exists in PyPi. The file is useful for doing future '
                             'runs. Default: 404.txt',
                        default='404.txt')
    # NOTE(review): this help text says lists shorter than 100 entries are
    # printed to the console. main() itself only writes the file and logs the
    # count - confirm whether calculate_package_list() does the printing.
    parser.add_argument('--dry_run', action='store_true',
                        help='Dry run mode that gives insight into the package metadata that will be retrieved. This '
                             'mode obtains the package list from the index, removes any packages that are already '
                             'present the database (if it exists). If the list of package URLs that returned 404 on '
                             'previous runs exists, these will also be removed from the set. Finally the regex will be '
                             'applied to the remaining packages. If this list is less than 100 then its printed to the'
                             ' console. The entire list will be written out to a file called dry_run_package_list.txt')
    parser.add_argument('-v', '--verbose', action='store_true',
                        help='Verbose mode.')

    parsed_args = parser.parse_args()
    # Arguments are passed positionally, in the order the constructor was
    # originally called with.
    retriever = PyPiMetadataRetriever(parsed_args.trunc_descriptions,
                                      parsed_args.trunc_releases,
                                      parsed_args.threads,
                                      parsed_args.database_path,
                                      parsed_args.max_packages,
                                      parsed_args.package_regex,
                                      parsed_args.file_404_list,
                                      parsed_args.verbose)

    if parsed_args.dry_run:
        package_list = retriever.calculate_package_list()
        # Explicit UTF-8 so the output does not depend on the platform's
        # default encoding.
        with open('dry_run_package_list.txt', 'w', encoding='utf-8') as fp:
            fp.write(u'\n'.join(package_list))

        logger.info('Dry run has calculated {} packages that would be processed. This list has been output to '
                    'dry_run_package_list.txt'.format(len(package_list)))
    else:
        retriever.run()
示例#9
0
 def test_threaded_process_exception_reported_on_404(self):
     """Check that a 404 during the metadata fetch is passed to ``_report_404``.

     ``get_metadata_for_package`` is patched to raise ``Exception404``; after
     processing package 'a', ``_report_404`` must have been called exactly
     once with that package name.
     """
     retriever = PyPiMetadataRetriever(db_path=self.temp_db_path)

     module = 'pypianalyser.pypi_metadata_retriever'
     fetch_patch = patch(module + '.get_metadata_for_package',
                         side_effect=Exception404('err'))
     report_patch = patch(module + '.PyPiMetadataRetriever._report_404')
     with fetch_patch, report_patch as report_mock:
         retriever._threaded_process(['a'])
         report_mock.assert_called_once_with('a')