def _do_fetch(self, statedir: AtomicDir, persdata: PersistentData, logger: Logger) -> bool:
    numpage = 0
    nextpageurl = self.url + 'Packages()?$filter=IsLatestVersion'

    while True:
        logger.log('getting ' + nextpageurl)

        text = self.do_http(nextpageurl).text
        with open(os.path.join(statedir.get_path(), '{}.xml'.format(numpage)), 'w', encoding='utf-8') as pagefile:
            pagefile.write(text)

        # parse next page
        logger.log('parsing ' + nextpageurl)
        root = xml.etree.ElementTree.fromstring(text)

        next_link = root.find('{http://www.w3.org/2005/Atom}link[@rel="next"]')
        if next_link is None:
            break

        nextpageurl = next_link.attrib['href']
        numpage += 1

    return True

def _do_fetch(self, statedir: AtomicDir, persdata: PersistentData, logger: Logger) -> bool:
    packages_url = self.url + 'packages.gz'
    logger.get_indented().log('fetching package list from ' + packages_url)
    data = self.do_http(packages_url).text  # autogunzipped?

    package_names = []

    for line in data.split('\n'):
        line = line.strip()
        if line.startswith('#') or line == '':
            continue
        package_names.append(line)

    if not package_names:
        raise RuntimeError('Empty package list received, refusing to continue')

    logger.get_indented().log('{} package name(s) parsed'.format(len(package_names)))

    for num_page, (url, num_packages) in enumerate(_split_names_into_urls(self.url + '/rpc/?v=5&type=info', package_names, self.max_api_url_length)):
        logger.get_indented().log('fetching page {} of {} package(s)'.format(num_page + 1, num_packages))

        with open(os.path.join(statedir.get_path(), '{}.json'.format(num_page)), 'wb') as statefile:
            statefile.write(self.do_http(url).content)

    return True

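# _split_names_into_urls used above is not shown in this excerpt; the following is a
# minimal sketch of what it is assumed to do (the exact argument encoding is a guess):
# batch package names into AUR RPC "info" URLs, yielding (url, names_in_url) pairs
# while keeping each URL under the given length limit.
from typing import Iterable, Iterator, Tuple
from urllib.parse import quote


def _split_names_into_urls(prefix: str, package_names: Iterable[str], maxlen: int) -> Iterator[Tuple[str, int]]:
    url_parts = [prefix]
    url_length = len(prefix)

    for name in package_names:
        newpart = '&arg[]=' + quote(name)

        # flush the current batch before the URL would grow past the limit
        if url_length + len(newpart) > maxlen and len(url_parts) > 1:
            yield ''.join(url_parts), len(url_parts) - 1
            url_parts = [prefix]
            url_length = len(prefix)

        url_parts.append(newpart)
        url_length += len(newpart)

    if len(url_parts) > 1:
        yield ''.join(url_parts), len(url_parts) - 1
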
def _do_fetch(self, statedir: AtomicDir, persdata: PersistentData, logger: Logger) -> bool:
    tarpath = os.path.join(statedir.get_path(), '.temporary.tar')

    headers = {}

    if persdata.get('last-modified'):
        headers['if-modified-since'] = persdata.get('last-modified')
        logger.log('using if-modified-since: {}'.format(headers['if-modified-since']))

    logger.log('fetching {}'.format(self.url))

    try:
        with open(tarpath, 'wb') as tarfile:
            response = save_http_stream(self.url, tarfile, headers=headers, timeout=self.fetch_timeout)
    except NotModifiedException:
        logger.log('got 304 not modified')
        return False

    # XXX: may be unportable: FreeBSD tar detects the compression type automatically,
    # which may not be the case on Linuxes
    # XXX: this extracts tarball permissions, which is not desirable and may produce
    # non-readable files and dirs (blackarch). GNU tar has --mode, BSD tar lacks it.
    # We should probably require GNU tar and handle the binary name, which may differ on BSD.
    run_subprocess(['tar', '-x', '-z', '-f', tarpath, '-C', statedir.get_path()], logger)
    os.remove(tarpath)

    if response.headers.get('last-modified'):
        persdata['last-modified'] = response.headers['last-modified']
        logger.log('storing last-modified: {}'.format(persdata['last-modified']))

    return True

def _do_fetch(self, statedir: AtomicDir, persdata: PersistentData, logger: Logger) -> bool:
    tarpath = os.path.join(statedir.get_path(), '.temporary.tar')

    headers = {}

    if persdata.get('last-modified'):
        headers['if-modified-since'] = persdata.get('last-modified')
        logger.log('using if-modified-since: {}'.format(headers['if-modified-since']))

    logger.log('fetching {}'.format(self.url))

    try:
        with open(tarpath, 'wb') as tarfile:
            response = save_http_stream(self.url, tarfile, headers=headers, timeout=self.fetch_timeout)
    except NotModifiedException:
        logger.log('got 304 not modified')
        return False

    # XXX: may be unportable: FreeBSD tar detects the compression type automatically,
    # which may not be the case on Linuxes
    run_subprocess(['tar', '-x', '-z', '-f', tarpath, '-C', statedir.get_path()], logger)
    os.remove(tarpath)

    if response.headers.get('last-modified'):
        persdata['last-modified'] = response.headers['last-modified']
        logger.log('storing last-modified: {}'.format(persdata['last-modified']))

    return True

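# save_http_stream and NotModifiedException used above are defined elsewhere in the
# codebase; the following is a minimal sketch of the contract assumed here, not the
# actual implementation: stream the response body into an already-open file object,
# raise NotModifiedException on HTTP 304 so callers can keep their previous state,
# and return the response so headers such as last-modified remain accessible.
from typing import IO, Dict, Optional

import requests


class NotModifiedException(Exception):
    pass


def save_http_stream(url: str, fileobj: IO[bytes], headers: Optional[Dict[str, str]] = None, timeout: int = 60) -> requests.Response:
    response = requests.get(url, headers=headers, stream=True, timeout=timeout)

    if response.status_code == 304:
        raise NotModifiedException()

    response.raise_for_status()

    # write the body in chunks to avoid loading large tarballs into memory
    for chunk in response.iter_content(chunk_size=65536):
        fileobj.write(chunk)

    return response
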
def _do_fetch(self, statedir: AtomicDir, persdata: PersistentData, logger: Logger) -> bool:
    numpage = 1

    while True:
        url = self.url + '?page={}&per_page={}&sort=alpha'.format(numpage, self.per_page)
        logger.log('getting ' + url)

        text = self.do_http(url).text
        with open(os.path.join(statedir.get_path(), '{}.json'.format(numpage)), 'w', encoding='utf-8') as pagefile:
            pagefile.write(text)

        # an empty page means we've reached the end
        if not json.loads(text)['crates']:
            logger.log('last page detected')
            return True

        numpage += 1

def _do_fetch(self, statedir: AtomicDir, persdata: PersistentData, logger: Logger) -> bool:
    page_counter = count()

    query = '?per_page={}&sort=alpha'.format(self.per_page)

    while query:
        url = self.url + query
        logger.log('getting ' + url)

        text = self.do_http(url).text
        with open(os.path.join(statedir.get_path(), '{}.json'.format(next(page_counter))), 'w', encoding='utf-8') as pagefile:
            pagefile.write(text)
            pagefile.flush()
            os.fsync(pagefile.fileno())

        # parse next page query; empty/None means this was the last page
        query = json.loads(text)['meta']['next_page']

    logger.log('last page detected')
    return True

def _do_fetch(self, statedir: AtomicDir, persdata: PersistentData, logger: Logger) -> bool:
    for letter in ['0-9'] + list(ascii_uppercase):
        page = 1
        numpages = 1

        while True:
            logger.log('fetching {} page {}'.format(letter, page))

            pageurl = '{}/{}/page/{}/'.format(self.url, letter, page)

            # fetch HTML
            response = self.do_http(pageurl)
            response.encoding = 'utf-8'  # encoding is not detected properly
            text = response.text

            # get the number of pages, if there is more than one
            if numpages == 1:
                for pagebutton in lxml.html.document_fromstring(text).xpath('.//nav[@class="page-selector"]/a'):  # type: ignore
                    numpages = max(numpages, int(pagebutton.text))  # type: ignore

            # save HTML
            with open(os.path.join(statedir.get_path(), '{}-{}.html'.format(letter, page)), 'w', encoding='utf-8') as pagefile:
                pagefile.write(text)
                pagefile.flush()
                os.fsync(pagefile.fileno())

            # stop if that was the last (or only) page
            if page >= numpages:
                break

            # proceed with the next page
            page += 1

    return True

def _load_spec(self, package: str, statedir: AtomicDir, logger: Logger) -> None:
    specurl = self.giturl + '/{0}.git/plain/{0}.spec'.format(package)

    logger.get_indented().log('getting spec from {}'.format(specurl))

    r = do_http(specurl, check_status=False)
    if r.status_code != 200:
        deadurl = self.giturl + '/{0}.git/plain/dead.package'.format(package)
        dr = do_http(deadurl, check_status=False)
        if dr.status_code == 200:
            logger.get_indented(2).log('dead: ' + ';'.join(dr.text.split('\n')))
        else:
            logger.get_indented(2).log('failed: {}'.format(r.status_code))  # XXX: check .dead.package, instead throw
        return

    with open(os.path.join(statedir.get_path(), package + '.spec'), 'wb') as file:
        file.write(r.content)

def _do_fetch_scroll(self, statedir: AtomicDir, logger: Logger) -> None:
    numpage = 0

    logger.log('getting page {}'.format(numpage))
    response = self._do_http('{}?scroll={}'.format(self._url, self._scroll), json=self._request_data).json()
    scroll_id = response['_scroll_id']

    while response['hits']['hits']:
        with open(os.path.join(statedir.get_path(), '{}.json'.format(numpage)), 'w', encoding='utf-8') as pagefile:
            json.dump(response['hits']['hits'], pagefile)
            pagefile.flush()
            os.fsync(pagefile.fileno())

        numpage += 1
        logger.log('getting page {}'.format(numpage))
        response = self._do_http('{}?scroll={}&scroll_id={}'.format(self._scroll_url, self._scroll, scroll_id)).json()

    try:
        self._do_http(self._scroll_url, method='DELETE', json={'scroll_id': scroll_id}).json()
    except requests.exceptions.HTTPError as e:
        # we don't care too much if removing the scroll fails, it'll time out anyway,
        # but log the failure and the server reply for visibility
        logger.log('failed to DELETE scroll, server reply follows:', severity=Logger.ERROR)
        logger.log(e.response.text, severity=Logger.ERROR)