def main():
    """Compute and store README metrics for every package with a README.

    Each non-empty README is rendered from Markdown to HTML and parsed with
    BeautifulSoup. Two heuristic metrics are then saved on the package's
    ``ReadmeAnalysis`` record, which is created if it does not exist yet
    and updated in place otherwise:

    * ``word_count``: number of runs of word characters in the rendered text.
    * ``code_count``: number of ``<pre>`` elements in the rendered HTML.
    """
    # Compiled once, outside the loop. The raw string avoids the invalid
    # '\w' escape sequence that a plain string literal would contain.
    word_pattern = re.compile(r'\w+')

    for p in Package.select().where(Package.readme != ''):
        html = markdown.markdown(p.readme)
        soup = BeautifulSoup(html, 'html.parser')

        # This is a heuristic for word count.
        # It will not be precisely correct, depending on your definition of
        # a word. For example, a path like 'com.app.example' is split into
        # three words here.
        word_count = len(word_pattern.findall(soup.text))

        # Another heuristic. As it's typical that inline code examples occur
        # in <pre> blocks, especially in formatted markdown, we count code
        # blocks based on the appearance of <pre> tags.
        block_count = len(soup.find_all('pre'))

        try:
            analysis = ReadmeAnalysis.get(ReadmeAnalysis.package == p)
        except ReadmeAnalysis.DoesNotExist:
            # First analysis for this package: insert a new record.
            ReadmeAnalysis.create(
                package=p,
                code_count=block_count,
                word_count=word_count,
            )
            logging.debug("Created README analysis for package %s", p.name)
        else:
            # A record already exists: overwrite its metrics.
            analysis.code_count = block_count
            analysis.word_count = word_count
            analysis.save()
            logging.debug("Updated README analysis for package %s", p.name)
def main():
    """Compute and store README metrics for every package with a README.

    The raw README text is scanned with two regex heuristics, and the
    results are saved on the package's ``ReadmeAnalysis`` record, which is
    created if it does not exist yet and updated in place otherwise:

    * ``word_count``: number of runs of word characters in the README.
    * ``code_count``: number of ``::`` markers followed, after the rest of
      their line, by two consecutive newlines.

    Logs an info message once all READMEs have been processed.
    """
    # Both patterns are compiled once, outside the loop. Raw strings avoid
    # the invalid '\w' escape sequence and the doubled backslashes.
    word_pattern = re.compile(r'\w+')
    code_block_pattern = re.compile(r'::.*\n\n')

    for p in Package.select().where(Package.readme != ''):
        # This is a heuristic for word count.
        # It will not be precisely correct, depending on your definition of
        # a word. For example, a path like 'com.app.example' is split into
        # three words here.
        word_count = len(word_pattern.findall(p.readme))

        # Another heuristic.
        # In reStructuredText (reST), code blocks are introduced by ending a
        # paragraph with a special marker ::. The block must be indented and
        # separated from the surrounding paragraphs by blank lines. Thus,
        # there must be at least two newline characters after the special
        # marker ::.
        # This may prove to be a broken heuristic. In that case, consider
        # using Sphinx: http://www.sphinx-doc.org/en/stable/index.html.
        block_count = len(code_block_pattern.findall(p.readme))

        try:
            analysis = ReadmeAnalysis.get(ReadmeAnalysis.package == p)
        except ReadmeAnalysis.DoesNotExist:
            # First analysis for this package: insert a new record.
            ReadmeAnalysis.create(
                package=p,
                code_count=block_count,
                word_count=word_count,
            )
            logging.debug("Created README analysis for package %s", p.name)
        else:
            # A record already exists: overwrite its metrics.
            analysis.code_count = block_count
            analysis.word_count = word_count
            analysis.save()
            logging.debug("Updated README analysis for package %s", p.name)

    logging.info("Finished analyzing READMEs.")
if __name__ == '__main__':
    # Command-line entry point. Every option is a boolean switch; the
    # selected fetch steps run in the order they are checked below.
    parser = argparse.ArgumentParser(description="Download package stats for PyPI")
    for flag, help_text in (
        ('--package-list', "fetch list of all packages on PyPI"),
        ('--pypi-data', "fetch PyPI data (READMES and downloads)"),
        ('--update', "only update existing data (currently only for --pypi-data)"),
    ):
        parser.add_argument(flag, action='store_true', help=help_text)
    args = parser.parse_args()

    if args.package_list:
        # Tables are created before the package list is fetched.
        create_tables()
        fetch_package_list()

    if args.pypi_data:
        # With --update, select packages whose description is not the empty
        # string; otherwise select packages whose README is still NULL.
        selection = (
            (Package.description != '')
            if args.update
            else (Package.readme >> None)
        )
        packages = Package.select().where(selection)
        fetch_pypi_data(packages)
help="how many package names to fetch" ) parser.add_argument( '--github-readmes', action='store_true', help="fetch Github READMEs" ) parser.add_argument( '--github-stats', action='store_true', help="fetch Github stats" ) args = parser.parse_args() if args.package_list: create_tables() fetch_package_list() if args.npm_data: if args.update: packages = Package.select().where(Package.description != '') else: packages = Package.select().where(Package.readme >> None).order_by(fn.Random()) fetch_npm_data(packages) if args.lib_packages: create_tables() fetch_packagenames_from_libraryio(args.lib_package_count) if args.github_readmes: fetch_github_readmes(Package.select()) if args.github_stats: fetch_github_stats(Package.select())