import depresolve.depdata as depdata from deptools import spectuples_to_specstring depdata.ensure_data_loaded() new_deps = dict() for distkey in depdata.dependencies_by_dist: my_deps = depdata.dependencies_by_dist[distkey] new_deps[distkey] = [] for dep in my_deps: # for every one of its dependencies, satisfying_packagename = dep[0] spectuples = dep[1] specstring = '' # Catch case where there are new style dependencies among the old.... if type(spectuples) in [list, tuple]: specstring = spectuples_to_specstring(spectuples) else: assert type(spectuples) in [unicode, str], 'Unexpected dep format!' specstring = spectuples # It's already a specstring. new_deps[distkey].append([satisfying_packagename, specstring]) depdata.dependencies_by_dist = new_deps depdata.write_data_to_files()
def main(): # Some defaults: n_sdists_to_process = 0 # debug; max packages to explore during debug - overriden by --n=N argument. conflict_model = 3 no_skip = False careful_skip = False use_local_index = False use_local_index_old = False #run_all_conflicting = False # Files and directories. assert(os.path.exists(WORKING_DIRECTORY)), 'Working dir does not exist...??' # Ensure that appropriate directory for downloaded distros exists. # This would be terrible to duplicate if scraping a large number of packages. # One such sdist cache per system! Gets big. if not os.path.exists(TEMPDIR_FOR_DOWNLOADED_DISTROS): os.makedirs(TEMPDIR_FOR_DOWNLOADED_DISTROS) logger.info("scrape_deps_and_detect_conflicts - Version 0.5") distkeys_to_inspect_not_normalized = [] # not-yet-normalized user input, potentially filled with distkeys to check, from arguments distkeys_to_inspect = [] # list after argument normalization # Argument processing. # If we have arguments coming in, treat those as the packages to inspect. if len(sys.argv) > 1: for arg in sys.argv[1:]: if arg.startswith("--n="): n_sdists_to_process = int(arg[4:]) elif arg == "--cm1": conflict_model = 1 elif arg == "--cm2": conflict_model = 2 elif arg == "--cm3": conflict_model = 3 elif arg == "--noskip": no_skip = True elif arg == '--carefulskip': careful_skip = True elif arg == "--local-old": # without ='<directory>' means we pull alphabetically from local PyPI # mirror at /srv/pypi/ # Parse .tar.gz files as they appear in bandersnatch version <= 1.8 # For newer versions of bandersnatch, the sdist files are stored # differently (not in project-based directories) and so the argument # --local should be used instead. use_local_index_old = True elif arg == "--local": # without ='<directory>' means we pull from local PyPI mirror at # /srv/pypi/ # Parse .tar.gz files as they appear in bandersnatch version 1.11 # For bandersnatch 1.11, the sdist files are stored differently than in # <1.8. They are no longer kept in project-based directories). # If you are using a version of bandersnatch <=1.8, the argument # --local-old should be used instead. use_local_index = True #elif arg == '--conflicting': # # Operate locally and run on the distkeys provided in the indicated # # file, each on its own line. # use_local_index = True # run_all_conflicting = True else: distkeys_to_inspect_not_normalized.append(arg) # e.g. 'motorengine(0.7.4)' # For simplicity right now, I'll use one mode or another, not both. # Last arg has it if both. # Normalize any input distkeys we were given. for distkey in distkeys_to_inspect_not_normalized: assert '(' in distkey and distkey.endswith(')'), 'Invalid input.' distkey = depdata.normalize_distkey(distkey) distkeys_to_inspect.append(distkey) # Were we not given any distkeys to inspect? if not distkeys_to_inspect:# and not run_all_conflicting: if not use_local_index and not use_local_index_old: # If we're not using a local index, we have nothing to do. raise ValueError('You neither specified distributions to scrape nor ' '(alternatively) indicated that they should be chosen from a local ' 'mirror.') elif use_local_index_old: # If we were told to work with a local mirror, but weren't given specific # sdists to inspect, we'll scan everything in # BANDERSNATCH_MIRROR_SDIST_DIR until we have n_sdists_to_process sdists. # There is a better way to do this, but I'll leave this as is for now. # Ensure that the local PyPI mirror directory exists first. if not os.path.exists(BANDERSNATCH_MIRROR_SDIST_DIR): raise Exception('--- Exception. Expecting a bandersnatched mirror of ' 'PyPI at ' + BANDERSNATCH_MIRROR_SDIST_DIR + ' but that directory ' 'does not exist.') i = 0 for dir, subdirs, files in os.walk(BANDERSNATCH_MIRROR_SDIST_DIR): for fname in files: if is_sdist(fname): tarfilename_full = os.path.join(dir, fname) # Deduce package names and versions from sdist filename. distkey = get_distkey_from_full_filename(tarfilename_full) distkeys_to_inspect.append(distkey) i += 1 # awkward control structures, but saving debug run time. tidy later if i >= n_sdists_to_process: break if i >= n_sdists_to_process: break else: # use_local_index (modern bandersnatch version) assert use_local_index, 'Programming error.' # # sdists live here: /srv/pypi/web/packages/??/??/*/*.tar.gz # # Can implement this such that it checks those places. # for name1 in os.listdir(BANDERSNATCH_NEW_MIRROR_SDIST_DIR): # if len(name1) != 2: # continue # for name2 in os.listdir(os.path.join( # BANDERSNATCH_NEW_MIRROR_SDIST_DIR, name1)): # if len(name2) != 2: # continue # for name3 in os.listdir(os.path.join( # BANDERSNATCH_NEW_MIRROR_SDIST_DIR, name1, name2)): # if len(name3) != 60: # continue # for fname in os.listdir(): # #.... No, this is not going to unambiguously get me the package name # # in the way that it used to in older versions of bandersnatch. # # Rather than dealing with unexpected naming consequences, I'll go # # with the following even more annoying hack.... # A dictionary of all versions of all packages on the mirror, # collected out-of-band (via xml-rpc at same time as mirroring occurred). vbp_mirror = json.load(open('data/versions_by_package.json', 'r')) i = 0 for package in vbp_mirror: if i >= n_sdists_to_process: break for version in vbp_mirror[package]: if i >= n_sdists_to_process: break distkey = depdata.distkey_format(package, version) distkeys_to_inspect.append(distkey) i += 1 # We should now have distkeys to inspect (unless run_all_conflicting is True). # Load the dependencies, conflicts, and blacklist databases. # The blacklist is a list of runs that resulted in errors or runs that were # manually added because, for example, they hang seemingly forever or take an # inordinate length of time. depdata.ensure_data_loaded([conflict_model]) # Alias depdata.conflicts_db to the relevant conflicts db. (Ugly) depdata.set_conflict_model_legacy(conflict_model) # should remove this #if run_all_conflicting: # distkeys_to_inspect = [distkey for distkey in depdata.conflicts_3_db if # depdata.conflicts_3_db[distkey]] n_inspected = 0 n_successfully_processed = 0 last_wrote_at = 0 # Now take all of the distkeys ( e.g. 'python-twitter(0.2.1)' ) indicated and # run on them. for distkey in distkeys_to_inspect: # To avoid losing too much data, make sure we at least write data to disk # about every 100 successfully processed or 10000 inspected dists. Avoid # writing repeatedly in edge cases (e.g. when we write after 100 # successfully processed and then have to keep writing for every skip that # occurs after that. progress = n_inspected + n_successfully_processed * 100 if progress > last_wrote_at + 10000: last_wrote_at = progress logger.info("Writing early.") depdata.write_data_to_files([conflict_model]) # The skip conditions. # If dist is in the blacklist for the same version of python we're running. blacklisted = distkey in depdata.blacklist \ and sys.version_info.major in depdata.blacklist[distkey] # If dist has conflict info saved already already_in_conflicts = distkey in depdata.conflicts_db # Do we have dep info for the dist? Not a skip condition, but part of # careful_skip tests. already_in_dependencies = distkey in depdata.dependencies_by_dist # If we're not in no_skip mode, perform the skip checks. # Skip checks. If the dist is blacklisted or we already have dependency # data, then skip it - unless we're in careful skip mode and we don't # have dependency data for the dist. if not no_skip and (blacklisted or already_in_conflicts): # If dist isn't blacklisted, we already have conflict info, there's no # dependency info, and careful skip is on, don't actually skip. if careful_skip and not already_in_dependencies and not blacklisted: print('--- Not skipping ' + distkey + ': ' + 'Already have conflict data, however there is no dependency info ' 'for the dist, the dist is not blacklisted, and we are in ' 'careful_skip mode.') else: # Skip, since we don't have a reason not to. n_inspected += 1 print('--- SKIP -- ' + distkey + ': ' + 'Blacklisted. '*blacklisted + 'Already have conflict data. '*already_in_conflicts + '(Finished ' + str(n_inspected) + ' out of ' + str(len(distkeys_to_inspect)) + ')') continue # If we didn't skip, process the dist. packagename = depdata.get_packname(distkey) version_string = depdata.get_version(distkey) #assert(distkey.rfind(')') == len(distkey) - 1) formatted_requirement = packagename + "==" + version_string exitcode = None assert(conflict_model in [1, 2, 3]) # Construct the argument list. # Include argument to pass to pip to tell it not to prod users about our # strange pip version (lest they follow that instruction and install a # standard pip version): pip_arglist = [ 'install', '-d', TEMPDIR_FOR_DOWNLOADED_DISTROS, '--disable-pip-version-check', '--find-dep-conflicts', str(conflict_model), '--quiet'] if use_local_index: pip_arglist.extend(['-i', BANDERSNATCH_MIRROR_INDEX_DIR]) pip_arglist.append(formatted_requirement) # With arg list constructed, call pip.main with it to run a modified pip # install attempt (will not install). # This assumes that we're dealing with my pip fork version 8.0.0.dev0seb). print('--- Sending ' + distkey + ' to pip.') logger.debug('Scraper says: before pip call, len(deps) is ' + str(len(depdata.dependencies_by_dist))) # Call pip, with a 5 minute timeout. exitcode = None # scoping paranoia try: exitcode = _call_pip_with_timeout(pip_arglist) except timeout.TimeoutException as e: # This catch is not likely. See below logger.warning('pip timed out on dist ' + distkey + '(5min)!' ' Will treat as error. Exception follows: ' + str(e.args)) # Set the exit code to something other than 2 or 0 and it'll be treated # like any old pip error below, resulting in a blacklist. exitcode = 1000 # However, unfortunately, we cannot assume that pip will let that exception # pass up to us. It seems to take the signal, stop and clean up, and then # return exit code 2. This is fine, except that then we can't really # blacklist the process. I'd have to add a timer here, detect something # very close to the timeout, and guess that it timed out. /: That sucks. # In any case, we'll not learn that it's a process that times out, but # we'll just look at it as a possible conflict case. (The data recorded # will not list it as a conflict. Hopefully, that data is not corrupted. # It's unlikely that it would have been, though, so I judge this OK.) # Process the output of the pip command. if exitcode == 2: print('--- X SDist ' + distkey + ' : pip errored out (code=' + str(exitcode) + '). Possible DEPENDENCY CONFLICT. Result recorded in ' 'conflicts_<...>.json. (Finished ' + str(n_inspected) + ' out of ' + str(len(distkeys_to_inspect)) + ')') elif exitcode == 0: print('--- . SDist ' + distkey + ' : pip completed successfully. ' 'No dependency conflicts observed. (Finished ' + str(n_inspected) + ' out of ' + str(len(distkeys_to_inspect)) + ')') else: print('--- . SDist ' + distkey + ': pip errored out (code=' + str(exitcode) + '), but it seems to have been unrelated to any dep ' 'conflict.... (Finished ' + str(n_inspected) + ' out of ' + str(len(distkeys_to_inspect)) + ')') # Store in the list of failing packages along with the python version # we're running. (sys.version_info.major yields int 2 or 3) # Contents are to eventually be a list of the major versions in which it # fails. We should never get here if the dist is already in the blacklist # for this version of python, but let's keep going even if so. if distkey in depdata.blacklist and sys.version_info.major in \ depdata.blacklist[distkey] and not no_skip: logger.warning(' WARNING! This should not happen! ' + distkey + ' was' 'already in the blacklist for python ' + str(sys.version_info.major) + ', thus it should not have been run unless we have --noskip on ' '(which it is not)!') else: # Either the dist is not in the blacklist or it's not in the blacklist # for this version of python. (Sensible) if distkey not in depdata.blacklist: # depdata.blacklist[distkey] = [sys.version_info.major] logger.info(" Added entry to blacklist for " + distkey) else: assert(no_skip or sys.version_info.major not in depdata.blacklist[distkey]) depdata.blacklist[distkey].append(sys.version_info.major) logger.info(" Added additional entry to blacklist for " + distkey) # end of exit code processing n_inspected += 1 n_successfully_processed += 1 # end of for each tarfile/sdist # We're done with all packages. Write the collected data back to file. logger.debug("Writing.") depdata.write_data_to_files([conflict_model])
for distkey in depdata.dependencies_by_dist: my_deps = depdata.dependencies_by_dist[distkey] new_deps[distkey] = [] for dep in my_deps: # for every one of its dependencies, satisfying_packagename = dep[0] spectuples = dep[1] specstring = '' # Catch case where there are new style dependencies among the old.... if type(spectuples) in [list, tuple]: specstring = spectuples_to_specstring(spectuples) else: assert type(spectuples) in [unicode, str], 'Unexpected dep format!' specstring = spectuples # It's already a specstring. new_deps[distkey].append([satisfying_packagename, specstring]) depdata.dependencies_by_dist = new_deps depdata.write_data_to_files()