def do_fetch(self, statefile, logger):
    # get and parse repomd.xml
    repomd_url = self.url + 'repodata/repomd.xml'
    logger.Log('fetching metadata from ' + repomd_url)
    repomd_content = do_http(repomd_url, check_status=True, timeout=self.fetch_timeout).text
    repomd_xml = xml.etree.ElementTree.fromstring(repomd_content)

    repodata_url = self.url + repomd_xml.find('{http://linux.duke.edu/metadata/repo}data[@type="primary"]/{http://linux.duke.edu/metadata/repo}location').attrib['href']

    logger.Log('fetching ' + repodata_url)
    data = do_http(repodata_url, timeout=self.fetch_timeout).content

    logger.GetIndented().Log('size is {} byte(s)'.format(len(data)))

    if repodata_url.endswith('gz'):
        logger.GetIndented().Log('decompressing with gzip')
        data = gzip.decompress(data)
    elif repodata_url.endswith('xz'):
        logger.GetIndented().Log('decompressing with xz')
        data = lzma.LZMADecompressor().decompress(data)

    logger.GetIndented().Log('size after decompression is {} byte(s)'.format(len(data)))

    logger.GetIndented().Log('saving')

    statefile.write(data)
def do_fetch(self, statefile, logger):
    fetching_what = [self.url]
    if isinstance(self.post, dict):
        fetching_what.append('{} fields of form data'.format(len(self.post)))
    elif self.post:
        fetching_what.append('{} bytes of post data'.format(len(self.post)))

    if self.headers:
        fetching_what.append('{} extra headers'.format(len(self.headers)))

    logger.Log('fetching ' + ', with '.join(fetching_what))

    data = do_http(self.url, data=self.post, headers=self.headers, timeout=self.fetch_timeout).content

    logger.GetIndented().Log('size is {} byte(s)'.format(len(data)))

    if self.compression == 'gz':
        logger.GetIndented().Log('decompressing with gzip')
        data = gzip.decompress(data)
    elif self.compression == 'bz2':
        logger.GetIndented().Log('decompressing with bz2')
        data = bz2.decompress(data)
    elif self.compression == 'xz':
        logger.GetIndented().Log('decompressing with xz')
        data = lzma.LZMADecompressor().decompress(data)

    if self.compression:
        logger.GetIndented().Log('size after decompression is {} byte(s)'.format(len(data)))

    logger.GetIndented().Log('saving')

    statefile.write(data)
def do_fetch(self, statedir, logger):
    for letter in ['0-9'] + list(ascii_uppercase):
        page = 1
        numpages = 1
        while True:
            logger.Log('fetching {} page {}'.format(letter, page))

            pageurl = '{}/{}/page/{}/'.format(self.url, letter, page)

            # fetch HTML
            response = do_http(pageurl, timeout=self.fetch_timeout)
            response.encoding = 'utf-8'  # is not detected properly
            text = response.text

            # get the number of pages, if there is more than one of them
            if numpages == 1:
                for pagebutton in lxml.html.document_fromstring(text).xpath('.//nav[@class="page-selector"]/a'):
                    numpages = max(numpages, int(pagebutton.text))

            # save HTML
            with open(os.path.join(statedir, '{}-{}.html'.format(letter, page)), 'w', encoding='utf-8') as pagefile:
                pagefile.write(text)

            # stop if that was the last (or only) page
            if page >= numpages:
                break

            # proceed with the next page
            page += 1

            if self.fetch_delay:
                time.sleep(self.fetch_delay)
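# A self-contained illustration of the page-count discovery used above; the
# sample markup is hypothetical, but matches the xpath the fetcher queries.
import lxml.html

_SAMPLE_NAV = '<nav class="page-selector"><a>1</a><a>2</a><a>7</a></nav>'

numpages = 1
for pagebutton in lxml.html.document_fromstring(_SAMPLE_NAV).xpath('.//nav[@class="page-selector"]/a'):
    numpages = max(numpages, int(pagebutton.text))
print(numpages)  # 7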
def _do_fetch(self, statefile: AtomicFile, persdata: PersistentData, logger: Logger) -> bool:
    # fetch and parse repomd.xml
    repomd_url = self.url + 'repodata/repomd.xml'
    logger.log('fetching metadata from ' + repomd_url)
    repomd_content = do_http(repomd_url, check_status=True, timeout=self.fetch_timeout).text
    repomd = xml.etree.ElementTree.fromstring(repomd_content)

    repomd_elt_primary = repomd.find('{http://linux.duke.edu/metadata/repo}data[@type="primary"]')
    if repomd_elt_primary is None:
        raise RuntimeError('Cannot find <primary> element in repomd.xml')

    repomd_elt_primary_location = repomd_elt_primary.find('./{http://linux.duke.edu/metadata/repo}location')
    repomd_elt_primary_checksum = repomd_elt_primary.find('./{http://linux.duke.edu/metadata/repo}open-checksum[@type="sha256"]')

    if repomd_elt_primary_checksum is None:
        logger.log('no supported checksum', Logger.WARNING)
    elif repomd_elt_primary_checksum.text == persdata.get('open-checksum-sha256'):
        logger.log('checksum not changed: {}'.format(repomd_elt_primary_checksum.text))
        return False

    if repomd_elt_primary_location is None:
        raise RuntimeError('Cannot find <location> element in repomd.xml')

    repodata_url = self.url + repomd_elt_primary_location.attrib['href']

    # fetch actual repo data
    compression = None

    if repodata_url.endswith('gz'):
        compression = 'gz'
    elif repodata_url.endswith('xz'):
        compression = 'xz'

    logger.log('fetching {}'.format(repodata_url))
    save_http_stream(repodata_url, statefile.get_file(), compression=compression, timeout=self.fetch_timeout)

    if repomd_elt_primary_checksum is not None and repomd_elt_primary_checksum.text:
        persdata['open-checksum-sha256'] = repomd_elt_primary_checksum.text
        logger.log('saving checksum: {}'.format(persdata['open-checksum-sha256']))

    logger.log('size is {} byte(s)'.format(os.path.getsize(statefile.get_path())))

    return True
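# For reference, a runnable sketch of the namespace-qualified lookups
# performed above; the sample document is hypothetical, but its shape is
# implied by the element paths the fetcher queries in repomd.xml.
import xml.etree.ElementTree

_SAMPLE_REPOMD = '''<repomd xmlns="http://linux.duke.edu/metadata/repo">
  <data type="primary">
    <location href="repodata/primary.xml.gz"/>
    <open-checksum type="sha256">0123abcd</open-checksum>
  </data>
</repomd>'''

_NS = '{http://linux.duke.edu/metadata/repo}'

root = xml.etree.ElementTree.fromstring(_SAMPLE_REPOMD)
primary = root.find(_NS + 'data[@type="primary"]')
assert primary is not None
print(primary.find('./' + _NS + 'location').attrib['href'])  # repodata/primary.xml.gz
print(primary.find('./' + _NS + 'open-checksum[@type="sha256"]').text)  # 0123abcd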
def _load_spec(self, package: str, statedir: AtomicDir, logger: Logger) -> None:
    specurl = self.giturl + '/{0}.git/plain/{0}.spec'.format(package)

    logger.get_indented().log('getting spec from {}'.format(specurl))

    r = do_http(specurl, check_status=False)
    if r.status_code != 200:
        deadurl = self.giturl + '/{0}.git/plain/dead.package'.format(package)
        dr = do_http(deadurl, check_status=False)
        if dr.status_code == 200:
            logger.get_indented(2).log('dead: ' + ';'.join(dr.text.split('\n')))
        else:
            logger.get_indented(2).log('failed: {}'.format(r.status_code))  # XXX: check .dead.package, instead throw
        return

    with open(os.path.join(statedir.get_path(), package + '.spec'), 'wb') as file:
        file.write(r.content)
def fetch(self, statepath: str, update: bool = True, logger: Logger = NoopLogger()) -> bool:
    if os.path.isfile(statepath) and not update:
        logger.log('no update requested, skipping')
        return False

    state: Dict[str, Any] = {}

    if os.path.isfile(statepath):
        with open(statepath, 'r', encoding='utf-8') as oldstatefile:
            state = json.load(oldstatefile)
        logger.log('loaded old state, {} entries'.format(len(state)))
    else:
        logger.log('starting with empty state')

    newdata = json.loads(do_http(self.url).text)

    if not newdata['releases']:
        raise RuntimeError('Empty freshcode package list received, refusing to go on')

    # add new entries in reversed order, oldest first so newest
    # have higher priority; may also compare versions here
    for entry in newdata['releases']:
        if 'name' not in entry:
            logger.log('skipping entry with no name')
            continue

        if entry['name'] in state:
            oldentry = state[entry['name']]
            if version_compare(entry['version'], oldentry['version']) > 0:
                logger.log('replacing entry "{}", version changed {} -> {}'.format(entry['name'], oldentry['version'], entry['version']))
                state[entry['name']] = entry
        else:
            logger.log('adding entry "{}", version {}'.format(entry['name'], entry['version']))
            state[entry['name']] = entry

    with AtomicFile(statepath, 'w', encoding='utf-8') as statefile:
        json.dump(state, statefile.get_file())

    logger.log('saved new state, {} entries'.format(len(state)))

    return True
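# AtomicFile is a project helper not defined in this section; a minimal
# sketch of the write-to-temporary-then-rename pattern it presumably
# implements. Internals are assumed; only the methods used above
# (get_file, get_path, context manager) are modeled.
import os

class AtomicFile:
    def __init__(self, path, mode='w', **kwargs):
        self._path = path
        self._file = open(path + '.tmp', mode, **kwargs)

    def get_file(self):
        return self._file

    def get_path(self):
        # path of the in-progress temporary file, usable while writing
        return self._file.name

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self._file.close()
        if exc_type is None:
            os.replace(self._file.name, self._path)  # atomic on POSIX
        else:
            os.remove(self._file.name)  # discard partial output on error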
def _do_fetch(self, statedir: AtomicDir, persdata: PersistentData, logger: Logger) -> bool:
    page = 1

    while True:
        pageurl = self.apiurl + 'packages/?page={}'.format(page)
        logger.log('getting page {} from {}'.format(page, pageurl))
        pagedata = json.loads(do_http(pageurl).text)

        for package in pagedata['packages']:
            self._load_spec(package['name'], statedir, logger)

        page += 1

        if page > pagedata['page_total']:
            break

    return True
def do_fetch(self, statefile, logger):
    # fetch and parse repomd.xml
    repomd_url = self.url + 'repodata/repomd.xml'
    logger.Log('fetching metadata from ' + repomd_url)
    repomd_content = do_http(repomd_url, check_status=True, timeout=self.fetch_timeout).text
    repomd_xml = xml.etree.ElementTree.fromstring(repomd_content)

    repodata_url = self.url + repomd_xml.find('{http://linux.duke.edu/metadata/repo}data[@type="primary"]/{http://linux.duke.edu/metadata/repo}location').attrib['href']

    # fetch actual repo data
    compression = None

    if repodata_url.endswith('gz'):
        compression = 'gz'
    elif repodata_url.endswith('xz'):
        compression = 'xz'

    logger.Log('fetching {}'.format(repodata_url))
    save_http_stream(repodata_url, statefile, compression=compression, timeout=self.fetch_timeout)

    logger.Log('size is {} byte(s)'.format(statefile.tell()))
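# save_http_stream is the streaming helper shared by the fetchers above; a
# minimal sketch of what it might look like, assuming requests for HTTP.
# The name and signature are taken from the call sites; the body is assumed,
# and only the 'gz'/'xz' compression values used above are handled.
import lzma
import zlib

import requests

def save_http_stream(url, outfile, compression=None, timeout=60):
    if compression == 'gz':
        decompressor = zlib.decompressobj(16 + zlib.MAX_WBITS)  # expect gzip framing
    elif compression == 'xz':
        decompressor = lzma.LZMADecompressor()
    else:
        decompressor = None

    with requests.get(url, stream=True, timeout=timeout) as response:
        response.raise_for_status()
        for chunk in response.iter_content(chunk_size=65536):
            outfile.write(decompressor.decompress(chunk) if decompressor else chunk)

    # zlib decompressors may buffer a tail; lzma's has no flush()
    if decompressor is not None and hasattr(decompressor, 'flush'):
        outfile.write(decompressor.flush())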
def fetch(*args, **kwargs):
    return do_http(*args, **kwargs)
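# do_http itself is not shown in this section; a minimal sketch assuming it
# wraps requests. Parameter names are inferred from the call sites above
# (check_status, timeout, data, headers); defaults and body are assumptions.
import requests

def do_http(url, check_status=True, timeout=60, data=None, headers=None):
    if data:
        response = requests.post(url, data=data, headers=headers, timeout=timeout)
    else:
        response = requests.get(url, headers=headers, timeout=timeout)
    if check_status:
        response.raise_for_status()
    return response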