Example #1
    def _do_fetch(self, statedir: AtomicDir, persdata: PersistentData,
                  logger: Logger) -> bool:
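        # page through the feed, saving each page as <N>.xml and following Atom rel="next" links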
        numpage = 0
        nextpageurl = self.url + 'Packages()?$filter=IsLatestVersion'
        while True:
            logger.log('getting ' + nextpageurl)

            text = self.do_http(nextpageurl).text
            with open(os.path.join(statedir.get_path(),
                                   '{}.xml'.format(numpage)),
                      'w',
                      encoding='utf-8') as pagefile:
                pagefile.write(text)

            # parse the current page to find the link to the next one
            logger.log('parsing ' + nextpageurl)
            root = xml.etree.ElementTree.fromstring(text)

            next_link = root.find(
                '{http://www.w3.org/2005/Atom}link[@rel="next"]')
            if next_link is None:
                break

            nextpageurl = next_link.attrib['href']
            numpage += 1

        return True
Example #2
    def fetch(self,
              statepath: str,
              update: bool = True,
              logger: Logger = NoopLogger()) -> bool:
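        # fetch into an atomic state dir; persistent data is kept between runs in a pickle next to it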
        if os.path.isdir(statepath) and not update:
            logger.log('no update requested, skipping')
            return False

        persdata: Dict[str, Any] = {}

        perspath = statepath + '.persdata'

        # missing or unreadable persistent data is not fatal: start from scratch
        try:
            with open(perspath, 'rb') as rpersfile:
                persdata = pickle.load(rpersfile)
        except (EOFError, FileNotFoundError, pickle.UnpicklingError):
            pass

        with AtomicDir(statepath) as statedir:
            have_changes = self._do_fetch(statedir, persdata, logger)

            if persdata:
                with AtomicFile(perspath, 'wb') as wpersfile:
                    pickle.dump(persdata, wpersfile.get_file())
                    wpersfile.get_file().flush()
                    os.fsync(wpersfile.get_file().fileno())

            if not have_changes:
                statedir.cancel()

            return have_changes
Example #3
    def fetch(self,
              statepath: str,
              update: bool = True,
              logger: Logger = NoopLogger()) -> bool:
        if os.path.isdir(statepath) and not update:
            logger.log('no update requested, skipping')
            return False

        persdata: Dict[str, Any] = {}

        perspath = statepath + '.persdata'

        if os.path.exists(perspath):
            with open(perspath, 'rb') as rpersfile:
                persdata = pickle.load(rpersfile)

        with AtomicDir(statepath) as statedir:
            have_changes = self._do_fetch(statedir, persdata, logger)

            if persdata:
                with AtomicFile(perspath, 'wb') as wpersfile:
                    pickle.dump(persdata, wpersfile.get_file())

            if not have_changes:
                statedir.cancel()

            return have_changes
Example #4
    def _do_fetch(self, statedir: AtomicDir, persdata: PersistentData, logger: Logger) -> bool:
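        # fetch the full package name list, then request package info in URL-length-limited batches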
        packages_url = self.url + 'packages.gz'
        logger.get_indented().log('fetching package list from ' + packages_url)
        data = self.do_http(packages_url).text  # autogunzipped?

        package_names = []

        for line in data.split('\n'):
            line = line.strip()
            if line.startswith('#') or line == '':
                continue
            package_names.append(line)

        if not package_names:
            raise RuntimeError('Empty package list received, refusing to continue')

        logger.get_indented().log('{} package name(s) parsed'.format(len(package_names)))

        for num_page, (url, num_packages) in enumerate(_split_names_into_urls(self.url + '/rpc/?v=5&type=info', package_names, self.max_api_url_length)):
            logger.get_indented().log('fetching page {} of {} package(s)'.format(num_page + 1, num_packages))

            with open(os.path.join(statedir.get_path(), '{}.json'.format(num_page)), 'wb') as statefile:
                statefile.write(self.do_http(url).content)

        return True
Example #5
    def fetch(self,
              statepath: str,
              update: bool = True,
              logger: Logger = NoopLogger()) -> bool:
        if not os.path.isdir(statepath):
            with AtomicDir(statepath) as statedir:
                return self._do_fetch(statedir.get_path(), logger)
        elif update:
            return self._do_update(statepath, logger)
        else:
            logger.log('no update requested, skipping')
            return False
Example #6
    def _do_fetch(self, statedir: AtomicDir, persdata: PersistentData,
                  logger: Logger) -> bool:
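        # download the tarball (honoring If-Modified-Since) and extract it into the state dir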
        tarpath = os.path.join(statedir.get_path(), '.temporary.tar')

        headers = {}

        if persdata.get('last-modified'):
            headers['if-modified-since'] = persdata.get('last-modified')
            logger.log('using if-modified-since: {}'.format(
                headers['if-modified-since']))

        logger.log('fetching {}'.format(self.url))

        try:
            with open(tarpath, 'wb') as tarfile:
                response = save_http_stream(self.url,
                                            tarfile,
                                            headers=headers,
                                            timeout=self.fetch_timeout)
        except NotModifiedException:
            logger.log('got 304 not modified')
            return False

        # XXX: may be unportable: FreeBSD tar detects the compression type automatically,
        # which may not be the case on Linux
        # XXX: this applies permissions from the tarball, which is not desirable and may
        # produce non-readable files and dirs (blackarch). GNU tar has --mode, BSD tar
        # lacks it. We should probably require GNU tar and handle the binary name, which
        # may differ on BSD.
        run_subprocess(
            ['tar', '-x', '-z', '-f', tarpath, '-C',
             statedir.get_path()], logger)
        os.remove(tarpath)

        if response.headers.get('last-modified'):
            persdata['last-modified'] = response.headers['last-modified']
            logger.log('storing last-modified: {}'.format(
                persdata['last-modified']))

        return True
Example #7
    def _parse(self, repository: RepositoryMetadata, transformer: Optional[PackageTransformer], logger: Logger) -> None:
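        # parse all sources into chunked package files inside an atomic dir; optionally enforce a minimum package count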
        logger.log('parsing started')

        if not os.path.isdir(self.parseddir):
            os.mkdir(self.parseddir)

        with AtomicDir(self._get_parsed_path(repository)) as state_dir:
            serializer = ChunkedSerializer(state_dir.get_path(), MAX_PACKAGES_PER_CHUNK)

            serializer.serialize(self._iter_parse_all_sources(repository, transformer, logger))

            if self.safety_checks and serializer.get_num_packages() < repository['minpackages']:
                raise TooLittlePackages(serializer.get_num_packages(), repository['minpackages'])

        logger.log('parsing complete, {} packages'.format(serializer.get_num_packages()))
Example #8
    def _do_fetch(self, statedir: AtomicDir, persdata: PersistentData,
                  logger: Logger) -> bool:
        tarpath = os.path.join(statedir.get_path(), '.temporary.tar')

        headers = {}

        if persdata.get('last-modified'):
            headers['if-modified-since'] = persdata.get('last-modified')
            logger.log('using if-modified-since: {}'.format(
                headers['if-modified-since']))

        logger.log('fetching {}'.format(self.url))

        try:
            with open(tarpath, 'wb') as tarfile:
                response = save_http_stream(self.url,
                                            tarfile,
                                            headers=headers,
                                            timeout=self.fetch_timeout)
        except NotModifiedException:
            logger.log('got 304 not modified')
            return False

        # XXX: may be unportable: FreeBSD tar detects the compression type automatically,
        # which may not be the case on Linux
        run_subprocess(
            ['tar', '-x', '-z', '-f', tarpath, '-C',
             statedir.get_path()], logger)
        os.remove(tarpath)

        if response.headers.get('last-modified'):
            persdata['last-modified'] = response.headers['last-modified']
            logger.log('storing last-modified: {}'.format(
                persdata['last-modified']))

        return True
Example #9
    def _do_fetch(self, statedir: AtomicDir, persdata: PersistentData, logger: Logger) -> bool:
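        # page through the API until an empty page of crates is returned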
        numpage = 1
        while True:
            url = self.url + '?page={}&per_page={}&sort=alpha'.format(numpage, self.per_page)
            logger.log('getting ' + url)

            text = self.do_http(url).text
            with open(os.path.join(statedir.get_path(), '{}.json'.format(numpage)), 'w', encoding='utf-8') as pagefile:
                pagefile.write(text)

            # an empty page means we have gone past the last one
            if not json.loads(text)['crates']:
                logger.log('last page detected')
                return True

            numpage += 1
Example #10
    def _do_fetch(self, statedir: AtomicDir, persdata: PersistentData, logger: Logger) -> bool:
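        # follow the next_page query returned in the response metadata until it is empty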
        page_counter = count()
        query = '?per_page={}&sort=alpha'.format(self.per_page)
        while query:
            url = self.url + query
            logger.log('getting ' + url)

            text = self.do_http(url).text
            with open(os.path.join(statedir.get_path(), '{}.json'.format(next(page_counter))), 'w', encoding='utf-8') as pagefile:
                pagefile.write(text)
                pagefile.flush()
                os.fsync(pagefile.fileno())

            # extract the query string for the next page (empty when there are no more pages)
            query = json.loads(text)['meta']['next_page']

        logger.log('last page detected')
        return True
Example #11
    def _do_fetch(self, statedir: AtomicDir, persdata: PersistentData,
                  logger: Logger) -> bool:
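        # iterate over alphabetical index sections (0-9, A-Z), saving every HTML page of each section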
        for letter in ['0-9'] + list(ascii_uppercase):
            page = 1
            numpages = 1
            while True:
                logger.log('fetching {} page {}'.format(letter, page))

                pageurl = '{}/{}/page/{}/'.format(self.url, letter, page)

                # fetch HTML
                response = self.do_http(pageurl)
                response.encoding = 'utf-8'  # is not detected properly
                text = response.text

                # get the total number of pages, if there is more than one
                if numpages == 1:
                    for pagebutton in lxml.html.document_fromstring(
                            text).xpath('.//nav[@class="page-selector"]/a'
                                        ):  # type: ignore
                        numpages = max(numpages,
                                       int(pagebutton.text))  # type: ignore

                # save HTML
                with open(os.path.join(statedir.get_path(),
                                       '{}-{}.html'.format(letter, page)),
                          'w',
                          encoding='utf-8') as pagefile:
                    pagefile.write(text)
                    pagefile.flush()
                    os.fsync(pagefile.fileno())

                # stop if that was the last (or only) page
                if page >= numpages:
                    break

                # proceed with the next page
                page += 1

        return True
Example #12
    def _load_spec(self, package: str, statedir: AtomicDir,
                   logger: Logger) -> None:
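        # fetch the package's .spec from the git web interface; on failure, check whether the package is dead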
        specurl = self.giturl + '/{0}.git/plain/{0}.spec'.format(package)

        logger.get_indented().log('getting spec from {}'.format(specurl))

        r = do_http(specurl, check_status=False)
        if r.status_code != 200:
            deadurl = self.giturl + '/{0}.git/plain/dead.package'.format(
                package)
            dr = do_http(deadurl, check_status=False)
            if dr.status_code == 200:
                logger.get_indented(2).log('dead: ' +
                                           ';'.join(dr.text.split('\n')))
            else:
                logger.get_indented(2).log('failed: {}'.format(
                    r.status_code))  # XXX: check .dead.package, instead throw
            return

        with open(os.path.join(statedir.get_path(), package + '.spec'),
                  'wb') as file:
            file.write(r.content)
Example #13
    def _do_fetch_scroll(self, statedir: AtomicDir, logger: Logger) -> None:
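        # page through results with a scroll-style API, then delete the scroll when done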
        numpage = 0

        logger.log('getting page {}'.format(numpage))
        response = self._do_http('{}?scroll={}'.format(self._url,
                                                       self._scroll),
                                 json=self._request_data).json()

        scroll_id = response['_scroll_id']

        while response['hits']['hits']:
            with open(os.path.join(statedir.get_path(),
                                   '{}.json'.format(numpage)),
                      'w',
                      encoding='utf-8') as pagefile:
                json.dump(response['hits']['hits'], pagefile)
                pagefile.flush()
                os.fsync(pagefile.fileno())

            numpage += 1

            logger.log('getting page {}'.format(numpage))
            response = self._do_http('{}?scroll={}&scroll_id={}'.format(
                self._scroll_url, self._scroll, scroll_id)).json()

        try:
            self._do_http(self._scroll_url,
                          method='DELETE',
                          json={
                              'scroll_id': scroll_id
                          }).json()
        except requests.exceptions.HTTPError as e:
            # we don't care too much if removing the scroll fails, it'll time out
            # on the server anyway; just log the server reply
            logger.log('failed to DELETE scroll, server reply follows:\n' +
                       e.response.text,
                       severity=Logger.ERROR)