def __init__(self, dsn: str, querymgr: QueryManager, readonly: bool = True, autocommit: bool = False, application_name: Optional[str] = None) -> None:
    self._db = psycopg2.connect(dsn, application_name=application_name)
    self._db.set_session(readonly=readonly, autocommit=autocommit)
    querymgr.inject_queries(self, self._db)
def __init__(self, dsn: str, querymgr: QueryManager, readonly: bool = True, autocommit: bool = False, application_name: str | None = None) -> None:
    self._db = psycopg2.connect(dsn, application_name=application_name)
    self._db.set_session(
        readonly=readonly,
        autocommit=autocommit
    )  # type: ignore  # broken typing stubs for psycopg2
    querymgr.inject_queries(self, self._db)
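# A minimal usage sketch (not taken from the source) of the constructor above,
# mirroring how the command-line tools later in this section wire QueryManager
# and Database together.  The SQL directory path, DSN and application name are
# placeholder values.
from repology.database import Database
from repology.querymgr import QueryManager

querymgr = QueryManager('sql.d')
database = Database('dbname=repology', querymgr, readonly=True, application_name='example-readonly-client')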
def ProcessDatabase(options, logger, repoproc, repositories_updated):
    logger.Log('connecting to database')

    db_logger = logger.GetIndented()

    querymgr = QueryManager(options.sql_dir)
    database = Database(options.dsn, querymgr, readonly=False, application_name='repology-update')

    if options.initdb:
        db_logger.Log('(re)initializing database schema')
        database.create_schema()

        db_logger.Log('committing changes')
        database.commit()

    if options.database:
        db_logger.Log('clearing the database')
        database.update_start()

        package_queue = []
        num_pushed = 0
        start_time = timer()

        def PackageProcessor(packageset):
            nonlocal package_queue, num_pushed, start_time
            FillPackagesetVersions(packageset)
            package_queue.extend(packageset)

            if len(package_queue) >= 10000:
                database.add_packages(package_queue)
                num_pushed += len(package_queue)
                package_queue = []
                db_logger.Log(' pushed {} packages, {:.2f} packages/second'.format(num_pushed, num_pushed / (timer() - start_time)))

        db_logger.Log('pushing packages to database')
        repoproc.StreamDeserializeMulti(processor=PackageProcessor, reponames=options.reponames)

        # process what's left in the queue
        database.add_packages(package_queue)

        if options.fetch and options.update and options.parse:
            db_logger.Log('recording repo updates')
            database.mark_repositories_updated(repositories_updated)
        else:
            db_logger.Log('not recording repo updates, need --fetch --update --parse')

        db_logger.Log('updating views')
        database.update_finish()

        db_logger.Log('committing changes')
        database.commit()

    logger.Log('database processing complete')
def main() -> int:
    options = parse_arguments()

    querymgr = QueryManager(options.sql_dir)
    database = Database(options.dsn, querymgr, readonly=True, application_name='repology-benchmark')

    reference: Dict[str, float] = {}
    if options.load:
        try:
            with open(options.load, 'rb') as reffile:
                reference = pickle.load(reffile)
        except (OSError, pickle.UnpicklingError):
            pass

    results = []

    for num, (method, name, kwargs) in enumerate(queries):
        if not check_keywords(name, options.keywords):
            continue
        print('===> {}/{}: "{}"\n'.format(num + 1, len(queries), name), file=sys.stderr, end='')
        results.append((name, run_single_query(database, method, kwargs, options)))
        sys.stderr.flush()

    for name, delta in results:
        change = ''
        if name in reference:
            if max(delta, reference[name]) / min(delta, reference[name]) < (1 + options.epsilon):
                change = ' no change'
            elif delta > reference[name]:
                change = ' \033[0;91m{:.1f}% slower\033[0m'.format(100.0 * delta / reference[name] - 100.0)
            else:
                change = ' \033[0;92m{:.1f}% faster\033[0m'.format(100.0 * reference[name] / delta - 100.0)
            change += ' (was {:.2f}ms)'.format(reference[name] * 1000.0)

        print('{:>50s} {:.2f}ms{}'.format(name, delta * 1000.0, change), file=sys.stderr)

    if options.save:
        reference = {name: delta for name, delta in results}
        with open(options.save, 'wb') as reffile:
            pickle.dump(reference, reffile)

    return 0
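# A worked example (illustrative numbers only, not measurements from the
# source) of the timing comparison performed in main() above: with
# reference = 0.010 s, delta = 0.012 s and epsilon = 0.05, the ratio
# max/min = 1.2 exceeds 1 + epsilon, and since delta > reference the query
# would be reported as "20.0% slower (was 10.00ms)".
reference_time, delta_time, epsilon = 0.010, 0.012, 0.05
assert max(delta_time, reference_time) / min(delta_time, reference_time) >= 1 + epsilon
assert round(100.0 * delta_time / reference_time - 100.0, 1) == 20.0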
def get_query_manager(self) -> QueryManager:
    return QueryManager(self.options.sql_dir)
def main() -> int:
    options = parse_arguments()

    querymgr = QueryManager(options.sql_dir)
    database = Database(options.dsn, querymgr, readonly=True, application_name='repology-gensitemap')

    urls: List[str] = []
    if options.main:
        urls = ['/', '/news', '/statistics', '/about', '/api/v1', '/repositories/']
        urls.extend(('/maintainer/' + name for name in database.get_all_maintainer_names()))
        urls.extend(('/repository/' + name for name in database.get_all_repository_names()))
    elif options.metapackages:
        links_per_metapackage = 3

        print('Guessing threshold for important metapackages', file=sys.stderr)

        num_repos = 1
        while True:
            num_metapackages = database.get_all_metapackage_names_by_min_spread_count(num_repos)

            num_urls_total = len(urls) + num_metapackages * links_per_metapackage

            print('Threshold = {}, {} metapackages, {} total urls'.format(num_repos, num_metapackages, num_urls_total), file=sys.stderr)

            if num_urls_total <= options.max_urls:
                print(' Looks good', file=sys.stderr)
                break

            if num_repos > 20:
                print(' Giving up, will truncate metapackage list', file=sys.stderr)
                break

            num_repos += 1

        # get most important packages
        for name in database.get_all_metapackage_names_by_min_spread(num_repos, (options.max_urls - len(urls)) // links_per_metapackage):
            urls.append('/project/' + name + '/versions')
            urls.append('/project/' + name + '/packages')
            urls.append('/project/' + name + '/information')

        # fill the remaining space with less important packages
        for name in database.get_all_metapackage_names_by_spread(num_repos - 1, (options.max_urls - len(urls)) // links_per_metapackage):
            urls.append('/project/' + name + '/versions')
            urls.append('/project/' + name + '/packages')
            urls.append('/project/' + name + '/information')
    else:
        print('Please specify output mode', file=sys.stderr)

    shuffle(urls)

    # write XML
    print('Writing XML', file=sys.stderr)

    print('<?xml version="1.0" encoding="UTF-8"?>')
    print('<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">')

    for url in urls:
        print('<url><loc>' + html.escape(options.www_home + url) + '</loc><changefreq>daily</changefreq></url>')

    print('</urlset>')

    return 0
# repology is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with repology.  If not, see <http://www.gnu.org/licenses/>.

import flask

from repology.config import config
from repology.database import Database
from repology.querymgr import QueryManager

__all__ = [
    'get_db',
]

_querymgr = QueryManager(config['SQL_DIR'])


def get_db():
    # XXX: this is not really a persistent DB connection!
    if not hasattr(flask.g, 'database'):
        flask.g.database = Database(config['DSN'], _querymgr, readonly=False, autocommit=True, application_name='repology-app')

    return flask.g.database
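# A minimal usage sketch (assumption, not part of the source), continuing from
# the module above: get_db() is expected to be called from within a Flask
# request context, so each request lazily creates one autocommit connection on
# flask.g.  The application object and the route below are hypothetical
# placeholders.
app = flask.Flask(__name__)


@app.route('/example')
def example() -> str:
    db = get_db()  # first call within a request creates flask.g.database
    return 'database connection: {!r}'.format(db)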
def Main():
    options = ParseArguments()

    logger = FileLogger(options.logfile) if options.logfile else StderrLogger()

    querymgr = QueryManager(options.sql_dir)
    database = Database(options.dsn, querymgr, readonly=True, autocommit=True, application_name='repology-linkchecker/reader')

    readqueue = multiprocessing.Queue(10)
    writequeue = multiprocessing.Queue(10)

    writer = multiprocessing.Process(target=LinkUpdatingWorker, args=(writequeue, options, querymgr, logger))
    writer.start()

    processpool = [multiprocessing.Process(target=LinkProcessingWorker, args=(readqueue, writequeue, i, options, logger)) for i in range(options.jobs)]
    for process in processpool:
        process.start()

    # base logger already passed to workers, may append prefix here
    logger = logger.GetPrefixed('master: ')

    prev_url = None
    while True:
        # Get pack of links
        logger.Log('Requesting pack of urls')
        urls = database.get_links_for_check(
            after=prev_url,
            prefix=options.prefix,  # no limit by default
            limit=options.packsize,
            recheck_age=datetime.timedelta(seconds=options.age * 60 * 60 * 24),
            unchecked_only=options.unchecked,
            checked_only=options.checked,
            failed_only=options.failed,
            succeeded_only=options.succeeded
        )
        if not urls:
            logger.Log(' No more urls to process')
            break

        # Get another pack of urls with the last hostname to ensure
        # that all urls for one hostname get into a same large pack
        match = re.match('([a-z]+://[^/]+/)', urls[-1])
        if match:
            urls += database.get_links_for_check(
                after=urls[-1],
                prefix=match.group(1),
                recheck_age=datetime.timedelta(seconds=options.age * 60 * 60 * 24),
                unchecked_only=options.unchecked,
                checked_only=options.checked,
                failed_only=options.failed,
                succeeded_only=options.succeeded
            )

        # Process
        if options.maxpacksize and len(urls) > options.maxpacksize:
            logger.Log('Skipping {} urls ({}..{}), exceeds max pack size'.format(len(urls), urls[0], urls[-1]))
        else:
            readqueue.put(urls)
            logger.Log('Enqueued {} urls ({}..{})'.format(len(urls), urls[0], urls[-1]))

        prev_url = urls[-1]

    logger.Log('Waiting for child processes to exit')

    # close workers
    for process in processpool:
        readqueue.put(None)
    for process in processpool:
        process.join()

    # close writer
    writequeue.put(None)
    writer.join()

    logger.Log('Done')

    return 0
class Database:
    def __init__(self, dsn, queriesdir, readonly=True, autocommit=False):
        self.db = psycopg2.connect(dsn)
        self.db.set_session(readonly=readonly, autocommit=autocommit)
        self.querymgr = QueryManager(queriesdir, self.db)

    def Request(self, query, *args):
        with self.db.cursor() as cursor:
            cursor.execute(query, args)

    def RequestSingleValue(self, query, *args):
        with self.db.cursor() as cursor:
            cursor.execute(query, args)
            row = cursor.fetchone()

            if row is None:
                return None

            return row[0]

    def RequestSingleAsDict(self, query, *args):
        with self.db.cursor() as cursor:
            cursor.execute(query, args)
            row = cursor.fetchone()

            if row is None:
                return None

            names = [desc.name for desc in cursor.description]

            return dict(zip(names, row))

    def RequestManyAsSingleColumnArray(self, query, *args):
        with self.db.cursor() as cursor:
            cursor.execute(query, args)

            return [row[0] for row in cursor.fetchall()]

    def RequestManyAsDictOfDicts(self, query, *args):
        with self.db.cursor() as cursor:
            cursor.execute(query, args)

            names = [desc.name for desc in cursor.description]

            return {row[0]: dict(zip(names[1:], row[1:])) for row in cursor.fetchall()}

    def RequestManyAsDicts(self, query, *args):
        with self.db.cursor() as cursor:
            cursor.execute(query, args)

            names = [desc.name for desc in cursor.description]

            return [dict(zip(names, row)) for row in cursor.fetchall()]

    def RequestManyAsPackages(self, query, *args):
        with self.db.cursor() as cursor:
            cursor.execute(query, args)

            names = [desc.name for desc in cursor.description]

            return [Package(**dict(zip(names, row))) for row in cursor.fetchall()]

    def CreateSchema(self):
        self.querymgr.create_schema()

    def Clear(self):
        self.querymgr.clear()

    def AddPackages(self, packages):
        with self.db.cursor() as cursor:
            cursor.executemany(
                """
                INSERT INTO packages(
                    repo, family, subrepo,
                    name, effname,
                    version, origversion, versionclass,
                    maintainers, category, comment, homepage, licenses, downloads,
                    flags, shadow, verfixed,
                    flavors,
                    extrafields
                ) VALUES (
                    %s, %s, %s,
                    %s, %s,
                    %s, %s, %s,
                    %s, %s, %s, %s, %s, %s,
                    %s, %s, %s,
                    %s,
                    %s
                )
                """,
                [
                    (
                        package.repo, package.family, package.subrepo,
                        package.name, package.effname,
                        package.version, package.origversion, package.versionclass,
                        package.maintainers, package.category, package.comment,
                        package.homepage, package.licenses, package.downloads,
                        package.flags, package.shadow, package.verfixed,
                        package.flavors,
                        json.dumps(package.extrafields),
                    ) for package in packages
                ])

    def MarkRepositoriesUpdated(self, reponames):
        with self.db.cursor() as cursor:
            cursor.executemany(
                """
                INSERT INTO repositories (
                    name, last_update
                ) VALUES (
                    %s, now()
                ) ON CONFLICT (name)
                DO UPDATE SET last_update = now()
                """,
                [[name] for name in reponames])

    def UpdateViews(self):
        self.querymgr.update_views()

    def Commit(self):
        self.db.commit()

    def GetMetapackage(self, names):
        return self.RequestManyAsPackages(
            """
            SELECT
                repo, family, subrepo,
                name, effname,
                version, origversion, versionclass,
                maintainers, category, comment, homepage, licenses, downloads,
                flags, shadow, verfixed,
                flavors,
                extrafields
            FROM packages
            WHERE effname {}
            """.format('= ANY (%s)' if isinstance(names, list) else '= %s'),
            names)

    def GetMetapackages(self, request, limit=500):
        request.Limit(limit)

        query, args = request.GetQuery()

        return self.RequestManyAsPackages(
            """
            SELECT
                repo, family, subrepo,
                name, effname,
                version, origversion, versionclass,
                maintainers, category, comment, homepage, licenses, downloads,
                flags, shadow, verfixed,
                flavors,
                extrafields
            FROM packages
            WHERE effname IN (
                {}
            )
            """.format(query),
            *args)

    def GetRelatedMetapackages(self, name, limit=500):
        return self.RequestManyAsPackages(
            """
            SELECT
                repo, family, subrepo,
                name, effname,
                version, origversion, versionclass,
                maintainers, category, comment, homepage, licenses, downloads,
                flags, shadow, verfixed,
                flavors,
                extrafields
            FROM packages
            WHERE effname IN (
                WITH RECURSIVE r AS (
                    SELECT
                        effname,
                        url
                    FROM url_relations
                    WHERE effname = %s
                    UNION
                    SELECT
                        url_relations.effname,
                        url_relations.url
                    FROM url_relations
                    JOIN r ON url_relations.effname = r.effname OR url_relations.url = r.url
                )
                SELECT DISTINCT
                    effname
                FROM r
                ORDER BY effname
                LIMIT %s
            )
            """,
            name,
            limit)

    def GetPackagesCount(self):
        return self.RequestSingleValue('SELECT num_packages FROM statistics LIMIT 1')

    def GetMetapackagesCount(self):
        return self.RequestSingleValue('SELECT num_metapackages FROM statistics LIMIT 1')

    def GetMaintainersCount(self):
        return self.RequestSingleValue('SELECT num_maintainers FROM statistics LIMIT 1')

    def GetMaintainersRange(self):
        # could use min/max here, but these are slower on pgsql 9.6
        return (
            self.RequestSingleValue('SELECT maintainer FROM maintainers ORDER BY maintainer LIMIT 1'),
            self.RequestSingleValue('SELECT maintainer FROM maintainers ORDER BY maintainer DESC LIMIT 1')
        )

    def GetMaintainers(self, bound=None, reverse=False, search=None, limit=500):
        where = []
        tail = ''
        args = []

        order = 'maintainer'

        if bound:
            if reverse:
                where.append('maintainer <= %s')
                order = 'maintainer DESC'
                args.append(bound)
            else:
                where.append('maintainer >= %s')
                args.append(bound)

        if search:
            where.append('maintainer LIKE %s')
            args.append('%' + search + '%')

        if limit:
            tail = 'LIMIT %s'
            args.append(limit)

        return self.RequestManyAsDicts(
            """
            SELECT
                *
            FROM (
                SELECT
                    maintainer,
                    num_packages,
                    num_metapackages,
                    num_metapackages_outdated
                FROM maintainers
                {}
                ORDER BY {}
                {}
            ) AS tmp
            ORDER BY maintainer
            """.format('WHERE ' + ' AND '.join(where) if where else '', order, tail),
            *args)

    def GetMaintainerInformation(self, maintainer):
        return self.RequestSingleAsDict(
            """
            SELECT
                num_packages,
                num_packages_newest,
                num_packages_outdated,
                num_packages_ignored,
                num_packages_unique,
                num_packages_devel,
                num_packages_legacy,
                num_packages_incorrect,
                num_packages_untrusted,
                num_packages_noscheme,
                num_packages_rolling,
                num_metapackages,
                num_metapackages_outdated,
                repository_package_counts,
                repository_metapackage_counts,
                category_metapackage_counts
            FROM maintainers
            WHERE maintainer = %s
            """,
            maintainer)

    def GetMaintainerMetapackages(self, maintainer, limit=1000):
        return self.RequestManyAsSingleColumnArray(
            """
            SELECT
                effname
            FROM maintainer_metapackages
            WHERE maintainer = %s
            ORDER BY effname
            LIMIT %s
            """,
            maintainer,
            limit)

    def GetMaintainerSimilarMaintainers(self, maintainer, limit=100):
        # this obscure request needs some clarification
        #
        # what we calculate as score here is actually Jaccard index
        # (see wikipedia) for two sets (of metapackages maintained by
        # two maintainers)
        #
        # let M = set of metapackages for maintainer passed to this function
        # let C = set of metapackages for other maintainer we test for similarity
        #
        # score = |M⋂C| / |M⋃C| = |M⋂C| / (|M| + |C| - |M⋂C|)
        #
        # - num_metapackages_common is |M⋂C|
        # - num_metapackages is |C|
        # - sub-select just gets |M|
        # - the divisor thus is |M⋃C| = |M| + |C| - |M⋂C|
        #
        # (a worked numeric example follows after this class)
        return self.RequestManyAsDicts(
            """
            SELECT
                maintainer,
                num_metapackages_common AS count,
                100.0 * num_metapackages_common / (
                    num_metapackages - num_metapackages_common + (
                        SELECT num_metapackages FROM maintainers WHERE maintainer = %s
                    )
                ) AS match
            FROM (
                SELECT
                    maintainer,
                    count(*) AS num_metapackages_common
                FROM maintainer_metapackages
                WHERE
                    maintainer != %s AND
                    effname IN (
                        SELECT effname FROM maintainer_metapackages WHERE maintainer = %s
                    )
                GROUP BY maintainer
            ) AS intersecting_counts
            INNER JOIN maintainers USING(maintainer)
            ORDER BY match DESC
            LIMIT %s
            """,
            maintainer,
            maintainer,
            maintainer,
            limit)

    def GetRepositories(self):
        return self.RequestManyAsDicts(
            """
            SELECT
                name,
                num_packages,
                num_packages_newest,
                num_packages_outdated,
                num_packages_ignored,
                num_packages_unique,
                num_packages_devel,
                num_packages_legacy,
                num_packages_incorrect,
                num_packages_untrusted,
                num_packages_noscheme,
                num_packages_rolling,
                num_metapackages,
                num_metapackages_unique,
                num_metapackages_newest,
                num_metapackages_outdated,
                num_metapackages_comparable,
                last_update at time zone 'UTC' AS last_update_utc,
                now() - last_update AS since_last_update,
                num_problems,
                num_maintainers
            FROM repositories
            """)

    def GetRepository(self, repo):
        return self.RequestSingleAsDict(
            """
            SELECT
                num_packages,
                num_packages_newest,
                num_packages_outdated,
                num_packages_ignored,
                num_packages_unique,
                num_packages_devel,
                num_packages_legacy,
                num_packages_incorrect,
                num_packages_untrusted,
                num_packages_noscheme,
                num_packages_rolling,
                num_metapackages,
                num_metapackages_unique,
                num_metapackages_newest,
                num_metapackages_outdated,
                num_metapackages_comparable,
                last_update at time zone 'UTC' AS last_update_utc,
                now() - last_update AS since_last_update,
                num_problems,
                num_maintainers
            FROM repositories
            WHERE name = %s
            """,
            repo)

    def GetRepositoriesHistoryAgo(self, seconds=60 * 60 * 24):
        return self.RequestSingleAsDict(
            """
            SELECT
                ts AS timestamp,
                now() - ts AS timedelta,
                snapshot
            FROM repositories_history
            WHERE ts IN (
                SELECT
                    ts
                FROM repositories_history
                WHERE ts < now() - INTERVAL %s
                ORDER BY ts DESC
                LIMIT 1
            )
            """,
            datetime.timedelta(seconds=seconds))

    def GetRepositoriesHistoryPeriod(self, seconds=60 * 60 * 24, repo=None):
        repopath = ''
        repoargs = ()

        if repo:
            repopath = '#>%s'
            repoargs = ('{' + repo + '}', )

        return self.RequestManyAsDicts(
            """
            SELECT
                ts AS timestamp,
                now() - ts AS timedelta,
                snapshot{} AS snapshot
            FROM repositories_history
            WHERE ts >= now() - INTERVAL %s
            ORDER BY ts
            """.format(repopath),
            *repoargs,
            datetime.timedelta(seconds=seconds))

    def GetStatisticsHistoryPeriod(self, seconds=60 * 60 * 24):
        return self.RequestManyAsDicts(
            """
            SELECT
                ts AS timestamp,
                now() - ts AS timedelta,
                snapshot
            FROM statistics_history
            WHERE ts >= now() - INTERVAL %s
            ORDER BY ts
            """,
            datetime.timedelta(seconds=seconds))

    def Query(self, query, *args):
        with self.db.cursor() as cursor:
            cursor.execute(query, args)
            return cursor.fetchall()

    def SnapshotHistory(self):
        self.querymgr.snapshot_history()

    def ExtractLinks(self):
        self.querymgr.extract_links()

    def GetLinksForCheck(self, after=None, prefix=None, recheck_age=None, limit=None, unchecked_only=False, checked_only=False, failed_only=False, succeeded_only=False):
        conditions = []
        args = []

        # reduce the noise while linkchecker code doesn't support other schemas
        conditions.append('(url LIKE %s OR url LIKE %s)')
        args.append('http://%')
        args.append('https://%')

        if after is not None:
            conditions.append('url > %s')
            args.append(after)

        if prefix is not None:
            conditions.append('url LIKE %s')
            args.append(prefix + '%')

        if recheck_age is not None:
            conditions.append('(last_checked IS NULL OR last_checked <= now() - INTERVAL %s)')
            args.append(datetime.timedelta(seconds=recheck_age))

        if unchecked_only:
            conditions.append('last_checked IS NULL')

        if checked_only:
            conditions.append('last_checked IS NOT NULL')

        if failed_only:
            conditions.append('status != 200')

        if succeeded_only:
            conditions.append('status = 200')

        conditions_expr = ''
        limit_expr = ''

        if conditions:
            conditions_expr = 'WHERE ' + ' AND '.join(conditions)

        if limit:
            limit_expr = 'LIMIT %s'
            args.append(limit)

        return self.RequestManyAsSingleColumnArray(
            """
            SELECT
                url
            FROM links
            {}
            ORDER BY url
            {}
            """.format(conditions_expr, limit_expr),
            *args)

    linkcheck_status_timeout = -1
    linkcheck_status_too_many_redirects = -2
    linkcheck_status_unknown_error = -3
    linkcheck_status_cannot_connect = -4
    linkcheck_status_invalid_url = -5
    linkcheck_status_dns_error = -6

    def UpdateLinkStatus(self, url, status, redirect=None, size=None, location=None):
        success = status == 200

        self.Request(
            """
            UPDATE links
            SET
                last_checked = now(),
                last_success = CASE WHEN %s THEN now() ELSE last_success END,
                last_failure = CASE WHEN %s THEN now() ELSE last_failure END,
                status = %s,
                redirect = %s,
                size = %s,
                location = %s
            WHERE url = %s
            """,
            success,
            not success,
            status,
            redirect,
            size,
            location,
            url)

    def GetMetapackageLinkStatuses(self, name):
        return self.RequestManyAsDictOfDicts(
            """
            SELECT
                url,
                last_checked,
                last_success,
                last_failure,
                status,
                redirect,
                size,
                location
            FROM links
            WHERE url in (
                -- this additional wrap seem to fix query planner somehow
                -- to use index scan on links instead of seq scan, which
                -- makes the query 100x faster; XXX: recheck with postgres 10
                -- or report this?
                SELECT DISTINCT url from (
                    SELECT unnest(downloads) as url FROM packages WHERE effname = %s
                    UNION
                    SELECT homepage as url FROM packages WHERE homepage IS NOT NULL and effname = %s
                ) AS tmp
            )
            """,
            name,
            name)

    def GetProblemsCount(self, repo=None, effname=None, maintainer=None):
        where_expr = ''
        args = []
        conditions = []

        if repo:
            conditions.append('repo = %s')
            args.append(repo)
        if effname:
            conditions.append('effname = %s')
            args.append(effname)
        if maintainer:
            conditions.append('maintainer = %s')
            args.append(maintainer)

        if conditions:
            where_expr = 'WHERE ' + ' AND '.join(conditions)

        return self.RequestSingleValue(
            """
            SELECT count(*)
            FROM problems
            {}
            """.format(where_expr),
            *args)

    def GetProblems(self, repo=None, effname=None, maintainer=None, limit=None):
        # XXX: eliminate duplication with GetProblemsCount()
        where_expr = ''
        limit_expr = ''
        args = []
        conditions = []

        if repo:
            conditions.append('repo = %s')
            args.append(repo)
        if effname:
            conditions.append('effname = %s')
            args.append(effname)
        if maintainer:
            conditions.append('maintainer = %s')
            args.append(maintainer)

        if conditions:
            where_expr = 'WHERE ' + ' AND '.join(conditions)

        if limit:
            limit_expr = 'LIMIT %s'
            args.append(limit)

        return self.RequestManyAsDicts(
            """
            SELECT
                repo,
                name,
                effname,
                maintainer,
                problem
            FROM problems
            {}
            ORDER by repo, effname, maintainer
            {}
            """.format(where_expr, limit_expr),
            *args)

    def AddReport(self, effname, need_verignore, need_split, need_merge, comment):
        self.Request(
            """
            INSERT INTO reports (
                created,
                effname,
                need_verignore,
                need_split,
                need_merge,
                comment
            ) VALUES (
                now(),
                %s,
                %s,
                %s,
                %s,
                %s
            )
            """,
            effname,
            need_verignore,
            need_split,
            need_merge,
            comment)

    def GetReportsCount(self, effname):
        return self.RequestSingleValue('SELECT count(*) FROM reports WHERE effname = %s', effname)

    def GetReports(self, effname):
        return self.RequestManyAsDicts(
            """
            SELECT
                id,
                now() - created AS created_ago,
                effname,
                need_verignore,
                need_split,
                need_merge,
                comment,
                reply,
                accepted
            FROM reports
            WHERE effname = %s
            ORDER BY created DESC
            """,
            effname)
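# A worked example (illustrative numbers, not real data) of the Jaccard-index
# score computed by GetMaintainerSimilarMaintainers above: if the requested
# maintainer M has 40 metapackages, a candidate maintainer C has 30, and they
# share 20, then score = |M ⋂ C| / |M ⋃ C| = 20 / (40 + 30 - 20) = 0.4, which
# the query reports as a "match" value of 40.0 (percent).
num_m, num_c, num_common = 40, 30, 20
assert 100.0 * num_common / (num_c - num_common + num_m) == 40.0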