def Fetch(self, statepath, update=True, logger=NoopLogger()):
    if os.path.isfile(statepath) and not update:
        logger.Log('no update requested, skipping')
        return

    # Get and parse repomd.xml
    repomd_url = self.url + 'repodata/repomd.xml'
    logger.Log('fetching metadata from ' + repomd_url)
    repomd_content = Fetch(repomd_url, check_status=True).text
    repomd_xml = xml.etree.ElementTree.fromstring(repomd_content)

    # locate the primary metadata file referenced from repomd.xml
    repodata_url = self.url + repomd_xml.find(
        '{http://linux.duke.edu/metadata/repo}data[@type="primary"]/'
        '{http://linux.duke.edu/metadata/repo}location'
    ).attrib['href']

    logger.Log('fetching ' + repodata_url)
    data = Fetch(repodata_url).content

    logger.GetIndented().Log('size is {} byte(s)'.format(len(data)))

    if repodata_url.endswith('gz'):
        logger.GetIndented().Log('decompressing with gzip')
        data = gzip.decompress(data)
    elif repodata_url.endswith('xz'):
        logger.GetIndented().Log('decompressing with xz')
        data = lzma.LZMADecompressor().decompress(data)

    logger.GetIndented().Log('size after decompression is {} byte(s)'.format(len(data)))

    logger.GetIndented().Log('saving')

    with StateFile(statepath, 'wb') as statefile:
        statefile.write(data)

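# A minimal sketch, not part of the original source: the same primary-metadata
# lookup with the repomd namespace bound to a prefix via ElementTree's
# namespaces mapping, which keeps the path readable. find_primary_href is a
# hypothetical helper name.
import xml.etree.ElementTree

def find_primary_href(repomd_content):
    """Return the location href of the 'primary' entry in a repomd.xml document."""
    namespaces = {'repo': 'http://linux.duke.edu/metadata/repo'}
    root = xml.etree.ElementTree.fromstring(repomd_content)
    location = root.find('repo:data[@type="primary"]/repo:location', namespaces)
    return location.attrib['href']
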
def Fetch(self, statepath, update=True, logger=NoopLogger()):
    if os.path.isdir(statepath) and not update:
        logger.Log('no update requested, skipping')
        return

    with StateDir(statepath) as statedir:
        for letter in ['0-9'] + list(ascii_uppercase):
            page = 1
            numpages = 1
            while True:
                logger.Log('fetching {} page {}'.format(letter, page))

                pageurl = '{}/{}/page/{}/'.format(self.url, letter, page)

                # fetch HTML
                response = Fetch(pageurl)
                response.encoding = 'utf-8'  # is not detected properly
                text = response.text

                # get number of pages, if there are more than 1 of them
                if numpages == 1:
                    for pagebutton in lxml.html.document_fromstring(text).xpath('.//nav[@class="page-selector"]/a'):
                        numpages = max(numpages, int(pagebutton.text))

                # save HTML
                with open(os.path.join(statedir, '{}-{}.html'.format(letter, page)), 'w', encoding='utf-8') as pagefile:
                    pagefile.write(text)

                # end if that was the last (or only) page
                if page >= numpages:
                    break

                # proceed with the next page
                page += 1

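# A minimal sketch, not part of the original source: the page-count discovery
# used above, isolated into a helper. It reads the numbered links from the
# site's "page-selector" navigation block and returns the largest page number
# seen (1 when there is no pagination). count_pages is a hypothetical name.
import lxml.html

def count_pages(html_text):
    numpages = 1
    for pagebutton in lxml.html.document_fromstring(html_text).xpath('.//nav[@class="page-selector"]/a'):
        numpages = max(numpages, int(pagebutton.text))
    return numpages
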
def Fetch(self, statepath, update=True, logger=NoopLogger()):
    if os.path.isdir(statepath) and not update:
        logger.Log('no update requested, skipping')
        return

    with StateDir(statepath) as statedir:
        numpage = 0
        nextpageurl = self.url + 'Packages()?$filter=IsLatestVersion'
        while True:
            logger.Log('getting ' + nextpageurl)

            text = Fetch(nextpageurl, timeout=5).text
            with open(os.path.join(statedir, '{}.xml'.format(numpage)), 'w', encoding='utf-8') as pagefile:
                pagefile.write(text)

            # parse next page
            logger.Log('parsing ' + nextpageurl)
            root = xml.etree.ElementTree.fromstring(text)

            next_link = root.find('{http://www.w3.org/2005/Atom}link[@rel="next"]')
            if next_link is None:
                break

            nextpageurl = next_link.attrib['href']
            numpage += 1

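# A minimal sketch, not part of the original source: the "follow the Atom
# rel=next link until it disappears" loop above, expressed as a generator.
# fetch_text stands in for the Fetch(...).text call used by the fetcher and is
# passed in so the sketch stays self-contained.
import xml.etree.ElementTree

ATOM_NS = '{http://www.w3.org/2005/Atom}'

def iterate_pages(first_url, fetch_text):
    url = first_url
    while url is not None:
        text = fetch_text(url)
        yield url, text
        next_link = xml.etree.ElementTree.fromstring(text).find(ATOM_NS + 'link[@rel="next"]')
        url = next_link.attrib['href'] if next_link is not None else None
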
def Fetch(self, statepath, update=True, logger=NoopLogger()):
    if os.path.isfile(statepath) and not update:
        logger.Log('no update requested, skipping')
        return

    logger.Log('fetching ' + self.url)
    data = Fetch(self.url).content

    logger.GetIndented().Log('size is {} byte(s)'.format(len(data)))

    if self.compression == 'gz':
        logger.GetIndented().Log('decompressing with gzip')
        data = gzip.decompress(data)
    elif self.compression == 'bz2':
        logger.GetIndented().Log('decompressing with bz2')
        data = bz2.decompress(data)
    elif self.compression == 'xz':
        logger.GetIndented().Log('decompressing with xz')
        data = lzma.LZMADecompressor().decompress(data)

    if self.compression:
        logger.GetIndented().Log('size after decompression is {} byte(s)'.format(len(data)))

    logger.GetIndented().Log('saving')

    with StateFile(statepath, 'wb') as statefile:
        statefile.write(data)

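# A minimal sketch, not part of the original source: the gz/bz2/xz dispatch
# above, factored into a helper that the compression-aware fetchers could
# share. lzma.decompress is used here instead of LZMADecompressor for brevity;
# for a single-stream .xz payload the result is the same.
import bz2
import gzip
import lzma

def decompress(data, compression=None):
    """Decompress raw bytes according to a compression hint ('gz', 'bz2', 'xz')."""
    if compression == 'gz':
        return gzip.decompress(data)
    if compression == 'bz2':
        return bz2.decompress(data)
    if compression == 'xz':
        return lzma.decompress(data)
    return data
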
def Fetch(self, statepath, update=True, logger=NoopLogger()):
    if os.path.isdir(statepath) and not update:
        logger.Log('no update requested, skipping')
        return

    with StateDir(statepath) as statedir:
        numpage = 1
        while True:
            url = self.url + '?page={}&per_page={}&sort=alpha'.format(numpage, self.per_page)
            logger.Log('getting ' + url)

            text = Fetch(url, timeout=self.fetch_timeout).text
            with open(os.path.join(statedir, '{}.json'.format(numpage)), 'w', encoding='utf-8') as pagefile:
                pagefile.write(text)

            # stop when an empty page is received
            if not json.loads(text)['crates']:
                logger.Log('last page detected')
                return

            numpage += 1
            time.sleep(1)

def LoadSpec(self, package, statedir, logger):
    specurl = self.giturl + '/{0}.git/plain/{0}.spec'.format(package)

    logger.GetIndented().Log('getting spec from {}'.format(specurl))

    r = Fetch(specurl, check_status=False)
    if r.status_code != 200:
        deadurl = self.giturl + '/{0}.git/plain/dead.package'.format(package)
        dr = Fetch(deadurl, check_status=False)
        if dr.status_code == 200:
            logger.GetIndented(2).Log('dead: ' + ';'.join(dr.text.split('\n')))
        else:
            logger.GetIndented(2).Log('failed: {}'.format(r.status_code))  # XXX: check .dead.package, instead throw
        return

    with open(os.path.join(statedir, package + '.spec'), 'wb') as file:
        file.write(r.content)

def Fetch(self, statepath, update=True, logger=NoopLogger()):
    if os.path.isdir(statepath) and not update:
        logger.Log('no update requested, skipping')
        return

    with StateDir(statepath) as statedir:
        packages_url = self.url + 'packages.gz'
        logger.GetIndented().Log('fetching package list from ' + packages_url)
        data = Fetch(packages_url).text  # autogunzipped?

        package_names = []

        for line in data.split('\n'):
            line = line.strip()
            if line.startswith('#') or line == '':
                continue
            package_names.append(line)

        if not package_names:
            raise RuntimeError('Empty package list received, refusing to continue')

        logger.GetIndented().Log('{} package name(s) parsed'.format(len(package_names)))

        # query package details from the RPC interface in batches
        pagesize = 100
        numpages = (len(package_names) + pagesize - 1) // pagesize  # round up to avoid an empty trailing request

        for page in range(numpages):
            ifrom = page * pagesize
            ito = ifrom + pagesize
            url = '&'.join(['arg[]=' + urllib.parse.quote(name) for name in package_names[ifrom:ito]])
            url = self.url + '/rpc/?v=5&type=info&' + url

            logger.GetIndented().Log('fetching page {}/{}'.format(page + 1, numpages))

            with open(os.path.join(statedir, '{}.json'.format(page)), 'wb') as statefile:
                statefile.write(Fetch(url, timeout=self.fetch_timeout).content)

def Fetch(self, statepath, update=True, logger=NoopLogger()):
    if os.path.isdir(statepath) and not update:
        logger.Log('no update requested, skipping')
        return

    with StateDir(statepath) as statedir:
        pages = [chr(x) for x in range(ord('a'), ord('z') + 1)]  # a..z
        pages.append('0-9')

        for page in pages:
            logger.Log('fetching page ' + page)
            pageurl = self.url + '/' + page + '.html'
            with open(os.path.join(statedir, page + '.html'), 'w', encoding='utf-8') as pagefile:
                pagefile.write(Fetch(pageurl).text)

def Fetch(self, statepath, update=True, logger=NoopLogger()):
    if os.path.isfile(statepath) and not update:
        logger.Log('no update requested, skipping')
        return

    state = {}

    if os.path.isfile(statepath):
        with open(statepath, 'r', encoding='utf-8') as oldstatefile:
            state = json.load(oldstatefile)
        logger.Log('loaded old state, {} entries'.format(len(state)))
    else:
        logger.Log('starting with empty state')

    newdata = json.loads(Fetch(self.url).text)

    if not newdata['releases']:
        raise RuntimeError('Empty freshcode package list received, refusing to go on')

    # merge new entries into the saved state; when a package is already
    # known, the entry with the higher version wins
    for entry in newdata['releases']:
        if 'name' not in entry:
            logger.Log('skipping entry with no name')
            continue

        if entry['name'] in state:
            oldentry = state[entry['name']]
            if version_compare(entry['version'], oldentry['version']) > 0:
                logger.Log('replacing entry "{}", version changed {} -> {}'.format(
                    entry['name'], oldentry['version'], entry['version']))
                state[entry['name']] = entry
        else:
            logger.Log('adding entry "{}", version {}'.format(entry['name'], entry['version']))
            state[entry['name']] = entry

    with StateFile(statepath, 'w', encoding='utf-8') as statefile:
        json.dump(state, statefile)

    logger.Log('saved new state, {} entries'.format(len(state)))

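# A minimal sketch, not part of the original source: the "newest version wins"
# merge rule used above, isolated for clarity. compare is assumed to be a
# strcmp-style version comparator (negative/zero/positive), like the
# version_compare call in the fetcher.
def merge_release(state, entry, compare):
    """Add entry to state, replacing an existing one only if its version is newer."""
    name = entry['name']
    old = state.get(name)
    if old is None or compare(entry['version'], old['version']) > 0:
        state[name] = entry
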
def Fetch(self, statepath, update=True, logger=NoopLogger()):
    if os.path.isdir(statepath) and not update:
        logger.Log('no update requested, skipping')
        return

    with StateDir(statepath) as statedir:
        page = 1
        while True:
            pageurl = self.apiurl + 'packages/?page={}'.format(page)
            logger.Log('getting page {} from {}'.format(page, pageurl))
            pagedata = json.loads(Fetch(pageurl).text)

            for package in pagedata['packages']:
                self.LoadSpec(package['name'], statedir, logger)

            page += 1
            if page > pagedata['page_total']:
                break

def Fetch(self, statepath, update=True, logger=NoopLogger()):
    if os.path.isfile(statepath) and not update:
        logger.Log('no update requested, skipping')
        return

    fetching_what = [self.url]
    if isinstance(self.post, dict):
        fetching_what.append('{} fields of form data'.format(len(self.post)))
    elif self.post:
        fetching_what.append('{} bytes of post data'.format(len(self.post)))

    if self.headers:
        fetching_what.append('{} extra headers'.format(len(self.headers)))

    logger.Log('fetching ' + ', with '.join(fetching_what))

    data = Fetch(self.url, post=self.post, headers=self.headers).content

    logger.GetIndented().Log('size is {} byte(s)'.format(len(data)))

    if self.compression == 'gz':
        logger.GetIndented().Log('decompressing with gzip')
        data = gzip.decompress(data)
    elif self.compression == 'bz2':
        logger.GetIndented().Log('decompressing with bz2')
        data = bz2.decompress(data)
    elif self.compression == 'xz':
        logger.GetIndented().Log('decompressing with xz')
        data = lzma.LZMADecompressor().decompress(data)

    if self.compression:
        logger.GetIndented().Log('size after decompression is {} byte(s)'.format(len(data)))

    logger.GetIndented().Log('saving')

    with StateFile(statepath, 'wb') as statefile:
        statefile.write(data)

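# A minimal sketch, not part of the original source, assuming the Fetch()
# helper wraps the requests library (an assumption; its implementation is not
# shown here). The post/headers/timeout keywords used by the fetchers would
# map onto requests roughly like this:
import requests

def fetch(url, post=None, headers=None, timeout=60):
    # POST when a payload is given, GET otherwise; raise on HTTP errors
    if post is not None:
        response = requests.post(url, data=post, headers=headers, timeout=timeout)
    else:
        response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()
    return response
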