import os
import re

# `url`, `get_pages`, `Version`, and `VersionList` are assumed to be
# helpers provided by the surrounding package.


def find_versions_of_archive(archive_url, **kwargs):
    list_url = kwargs.get('list_url', None)
    list_depth = kwargs.get('list_depth', 1)

    if not list_url:
        list_url = os.path.dirname(archive_url)

    # This creates a regex from the URL with a capture group for the
    # version part of the URL.  The capture group is converted to a
    # generic wildcard, so we can use this to extract things on a page
    # that look like archive URLs.
    url_regex = url.wildcard_version(archive_url)

    # We'll be a bit more liberal and just look for the archive part,
    # not the full path.
    archive_regex = os.path.basename(url_regex)

    # Grab some web pages to scrape.
    page_map = get_pages(list_url, depth=list_depth)

    # Build a version list from all the matches we find.
    versions = VersionList()
    for site, page in page_map.items():
        # Extract versions from matches.
        matches = re.finditer(archive_regex, page)
        version_strings = set(m.group(1) for m in matches)
        for v in version_strings:
            versions.add(Version(v))

    return versions
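# A minimal usage sketch for the function above, not taken from the
# original source: the URL and listing page are hypothetical, standing
# in for a project whose release tarballs sit under one directory
# listing.
#
#     found = find_versions_of_archive(
#         'http://example.com/downloads/foo-8.2.1.tar.gz',
#         list_url='http://example.com/downloads',
#         list_depth=2)
#
# Each foo-<version>.tar.gz link found on the listing pages would
# contribute one Version to the returned VersionList.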
def find_versions_of_archive(archive_url, **kwargs):
    list_url = kwargs.get('list_url', None)
    list_depth = kwargs.get('list_depth', 1)
    wildcard = kwargs.get('wildcard', None)

    if not list_url:
        list_url = os.path.dirname(archive_url)

    # By default, derive the version-matching regex from the version
    # parsed out of the archive URL itself.
    if not wildcard:
        wildcard = url.parse_version(archive_url).wildcard()

    versions = VersionList()

    # Match just the archive name, not the full path, and grab some
    # web pages to scrape.
    url_regex = os.path.basename(url.wildcard_version(archive_url))
    page_map = get_pages(list_url, depth=list_depth)

    for site, page in page_map.items():
        # Collect every substring on the page that matches the
        # archive pattern.
        strings = re.findall(url_regex, page)
        for s in strings:
            # Pull the version out of each candidate with the
            # wildcard regex.
            match = re.search(wildcard, s)
            if match:
                v = match.group(0)
                versions.add(Version(v))

    return versions
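# A hypothetical call to the revised function, showing the new
# `wildcard` override it adds over the earlier revision.  The regex
# below is illustrative only; it matches dotted version numbers such
# as 8.2.1 inside an archive name.
#
#     found = find_versions_of_archive(
#         'http://example.com/downloads/foo-8.2.1.tar.gz',
#         wildcard=r'\d+\.\d+(\.\d+)*')
#
# When no wildcard is supplied, the regex is derived from the version
# parsed out of archive_url, so existing callers behave as before.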