Example #1
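This fetcher downloads a repository's repomd.xml index, resolves the location of the primary metadata file from it, fetches that file, decompresses it with gzip or xz depending on the URL suffix, and writes the result to the state file.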
    def do_fetch(self, statefile, logger):
        # Get and parse repomd.xml
        repomd_url = self.url + 'repodata/repomd.xml'
        logger.Log('fetching metadata from ' + repomd_url)
        repomd_content = fetch(repomd_url, check_status=True).text
        repomd_xml = xml.etree.ElementTree.fromstring(repomd_content)

        # locate the primary metadata file referenced by repomd.xml
        # (tags are namespace-qualified, hence the Clark notation)
        repodata_url = self.url + repomd_xml.find(
            '{http://linux.duke.edu/metadata/repo}data[@type="primary"]/{http://linux.duke.edu/metadata/repo}location'
        ).attrib['href']

        logger.Log('fetching ' + repodata_url)
        data = fetch(repodata_url).content

        logger.GetIndented().Log('size is {} byte(s)'.format(len(data)))

        if repodata_url.endswith('gz'):
            logger.GetIndented().Log('decompressing with gzip')
            data = gzip.decompress(data)
        elif repodata_url.endswith('xz'):
            logger.GetIndented().Log('decompressing with xz')
            data = lzma.LZMADecompressor().decompress(data)

        logger.GetIndented().Log(
            'size after decompression is {} byte(s)'.format(len(data)))

        logger.GetIndented().Log('saving')

        statefile.write(data)
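The inline Clark notation ({namespace}tag) in the find() call above is easy to mistype. As a minimal sketch (not part of the original fetcher; primary_location is a hypothetical name), the same lookup can use find()'s namespaces argument:

    import xml.etree.ElementTree

    NS = {'repo': 'http://linux.duke.edu/metadata/repo'}

    def primary_location(repomd_content):
        # equivalent to the Clark-notation lookup above
        root = xml.etree.ElementTree.fromstring(repomd_content)
        location = root.find('repo:data[@type="primary"]/repo:location', NS)
        return location.attrib['href']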
Example #2
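This fetcher walks alphabetical index pages ('0-9', then 'A' through 'Z'), saving each page of HTML to the state directory; the total page count per letter is read from the pagination buttons on the first page.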
    def do_fetch(self, statedir, logger):
        for letter in ['0-9'] + list(ascii_uppercase):
            page = 1
            numpages = 1
            while True:
                logger.Log('fetching {} page {}'.format(letter, page))

                pageurl = '{}/{}/page/{}/'.format(self.url, letter, page)

                # fetch HTML
                response = fetch(pageurl)
                response.encoding = 'utf-8'  # force utf-8, as it is not detected properly
                text = response.text

                # determine the total number of pages (only needed on
                # the first page of each letter)
                if numpages == 1:
                    for pagebutton in lxml.html.document_fromstring(
                            text).xpath('.//nav[@class="page-selector"]/a'):
                        numpages = max(numpages, int(pagebutton.text))

                # save HTML
                with open(os.path.join(statedir,
                                       '{}-{}.html'.format(letter, page)),
                          'w',
                          encoding='utf-8') as pagefile:
                    pagefile.write(text)

                # stop after the last (or only) page
                if page >= numpages:
                    break

                # proceed with the next page
                page += 1
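For reference, here is the page-count extraction from above isolated into a standalone function; this is a sketch under the same assumptions (the nav[@class="page-selector"] markup belongs to whatever site is being scraped):

    import lxml.html

    def count_pages(text):
        # the page count is the largest number on any pagination button;
        # stays 1 when no page selector is present
        numpages = 1
        for pagebutton in lxml.html.document_fromstring(text).xpath(
                './/nav[@class="page-selector"]/a'):
            numpages = max(numpages, int(pagebutton.text))
        return numpages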
Example #3
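A generic single-request fetcher: it performs one fetch (optionally with POST data and extra headers), decompresses the response according to the configured compression (gzip, bz2 or xz), and writes the result to the state file.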
    def do_fetch(self, statefile, logger):
        fetching_what = [self.url]
        if isinstance(self.post, dict):
            fetching_what.append('{} fields of form data'.format(len(self.post)))
        elif self.post:
            fetching_what.append('{} bytes of post data'.format(len(self.post)))

        if self.headers:
            fetching_what.append('{} extra headers'.format(len(self.headers)))

        # reads as e.g. 'fetching <url>, with N fields of form data, with M extra headers'
        logger.Log('fetching ' + ', with '.join(fetching_what))

        data = fetch(self.url, post=self.post, headers=self.headers).content

        logger.GetIndented().Log('size is {} byte(s)'.format(len(data)))

        if self.compression == 'gz':
            logger.GetIndented().Log('decompressing with gzip')
            data = gzip.decompress(data)
        elif self.compression == 'bz2':
            logger.GetIndented().Log('decompressing with bz2')
            data = bz2.decompress(data)
        elif self.compression == 'xz':
            logger.GetIndented().Log('decompressing with xz')
            data = lzma.LZMADecompressor().decompress(data)

        if self.compression:
            logger.GetIndented().Log('size after decompression is {} byte(s)'.format(len(data)))

        logger.GetIndented().Log('saving')

        statefile.write(data)
Example #4
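A helper that downloads one package's .spec file from a cgit-style 'plain' URL; when that fails, it probes for a dead.package marker to tell retired packages apart from genuine errors.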
    def _load_spec(self, package, statedir, logger):
        specurl = self.giturl + '/{0}.git/plain/{0}.spec'.format(package)

        logger.GetIndented().Log('getting spec from {}'.format(specurl))

        r = fetch(specurl, check_status=False)
        if r.status_code != 200:
            deadurl = self.giturl + '/{0}.git/plain/dead.package'.format(
                package)
            dr = fetch(deadurl, check_status=False)
            if dr.status_code == 200:
                logger.GetIndented(2).Log('dead: ' +
                                          ';'.join(dr.text.split('\n')))
            else:
                logger.GetIndented(2).Log('failed: {}'.format(
                    r.status_code))  # XXX: check dead.package first; raise here instead of just logging
            return

        with open(os.path.join(statedir, package + '.spec'), 'wb') as file:
            file.write(r.content)
Example #5
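This fetcher downloads a plain-text package list, then requests package details in batches of 100 names through an RPC endpoint (the URL shape matches the AUR RPC v5 'info' call), saving each batch as a numbered JSON file.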
    def do_fetch(self, statedir, logger):
        packages_url = self.url + 'packages.gz'
        logger.GetIndented().Log('fetching package list from ' + packages_url)
        data = fetch(packages_url).text  # the .gz is apparently decompressed transparently

        package_names = []

        for line in data.split('\n'):
            line = line.strip()
            if line.startswith('#') or line == '':
                continue
            package_names.append(line)

        if not package_names:
            raise RuntimeError(
                'Empty package list received, refusing to continue')

        logger.GetIndented().Log('{} package name(s) parsed'.format(
            len(package_names)))

        pagesize = 100
        # ceiling division, so that an exact multiple of pagesize
        # does not produce a trailing empty request
        numpages = (len(package_names) + pagesize - 1) // pagesize

        for page in range(numpages):
            ifrom = page * pagesize
            ito = (page + 1) * pagesize
            url = '&'.join([
                'arg[]=' + urllib.parse.quote(name)
                for name in package_names[ifrom:ito]
            ])
            url = self.url + '/rpc/?v=5&type=info&' + url

            logger.GetIndented().Log('fetching page {}/{}'.format(
                page + 1, numpages))

            with open(os.path.join(statedir, '{}.json'.format(page)),
                      'wb') as statefile:
                statefile.write(fetch(url, timeout=self.fetch_timeout).content)

            if self.fetch_delay:
                time.sleep(self.fetch_delay)
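The batching above can be factored into a small generator. A sketch under the same assumptions, where base_url and names are hypothetical stand-ins for self.url and package_names:

    import urllib.parse

    def batch_urls(base_url, names, pagesize=100):
        # ceiling division: an exact multiple of pagesize must not
        # produce a trailing empty batch
        numpages = (len(names) + pagesize - 1) // pagesize
        for page in range(numpages):
            chunk = names[page * pagesize:(page + 1) * pagesize]
            args = '&'.join('arg[]=' + urllib.parse.quote(name) for name in chunk)
            yield base_url + '/rpc/?v=5&type=info&' + args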
Example #6
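This fetcher walks a paginated JSON package index, calling a _load_spec helper (such as the one in Example #4) for every package on every page.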
    def do_fetch(self, statedir, logger):
        page = 1

        while True:
            pageurl = self.apiurl + 'packages/?page={}'.format(page)
            logger.Log('getting page {} from {}'.format(page, pageurl))
            pagedata = json.loads(fetch(pageurl).text)

            for package in pagedata['packages']:
                self._load_spec(package['name'], statedir, logger)

            page += 1

            if page > pagedata['page_total']:
                break
Example #7
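An incremental fetcher: it loads the previous state from a JSON file, merges in freshly fetched releases, keeping whichever entry has the newer version for each package name, and writes the merged state back.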
    def fetch(self, statepath, update=True, logger=NoopLogger()):
        if os.path.isfile(statepath) and not update:
            logger.Log('no update requested, skipping')
            return

        state = {}

        if os.path.isfile(statepath):
            with open(statepath, 'r', encoding='utf-8') as oldstatefile:
                state = json.load(oldstatefile)
            logger.Log('loaded old state, {} entries'.format(len(state)))
        else:
            logger.Log('starting with empty state')

        newdata = json.loads(fetch(self.url).text)

        if not newdata['releases']:
            raise RuntimeError(
                'Empty freshcode package list received, refusing to go on')

        # merge new entries into the state, replacing an existing entry
        # only when the incoming version is strictly newer
        for entry in newdata['releases']:
            if 'name' not in entry:
                logger.Log('skipping entry with no name')
                continue

            if entry['name'] in state:
                oldentry = state[entry['name']]

                if version_compare(entry['version'], oldentry['version']) > 0:
                    logger.Log(
                        'replacing entry "{}", version changed {} -> {}'.
                        format(entry['name'], oldentry['version'],
                               entry['version']))
                    state[entry['name']] = entry
            else:
                logger.Log('adding entry "{}", version {}'.format(
                    entry['name'], entry['version']))
                state[entry['name']] = entry

        with open(statepath, 'w', encoding='utf-8') as statefile:
            json.dump(state, statefile)

        logger.Log('saved new state, {} entries'.format(len(state)))
Example #8
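This fetcher pages through a JSON API (the 'crates' key suggests the crates.io index), saving each page to the state directory and stopping as soon as a page comes back empty.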
    def do_fetch(self, statedir, logger):
        numpage = 1
        while True:
            url = self.url + '?page={}&per_page={}&sort=alpha'.format(
                numpage, self.per_page)
            logger.Log('getting ' + url)

            text = fetch(url, timeout=self.fetch_timeout).text
            with open(os.path.join(statedir, '{}.json'.format(numpage)),
                      'w',
                      encoding='utf-8') as pagefile:
                pagefile.write(text)

            # an empty 'crates' list marks the last page
            if not json.loads(text)['crates']:
                logger.Log('last page detected')
                return

            numpage += 1

            if self.fetch_delay:
                time.sleep(self.fetch_delay)
Example #9
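This fetcher pages through an Atom/OData feed (the Packages()?$filter=IsLatestVersion query suggests a NuGet-style API), following rel="next" links until none remains.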
    def do_fetch(self, statedir, logger):
        numpage = 0
        nextpageurl = self.url + 'Packages()?$filter=IsLatestVersion'
        while True:
            logger.Log('getting ' + nextpageurl)

            text = fetch(nextpageurl, timeout=self.fetch_timeout).text
            with open(os.path.join(statedir, '{}.xml'.format(numpage)),
                      'w',
                      encoding='utf-8') as pagefile:
                pagefile.write(text)

            # parse the page to find the link to the following one
            logger.Log('parsing ' + nextpageurl)
            root = xml.etree.ElementTree.fromstring(text)

            next_link = root.find(
                '{http://www.w3.org/2005/Atom}link[@rel="next"]')
            if next_link is None:
                break

            nextpageurl = next_link.attrib['href']
            numpage += 1