Example #1
    def fetch_items(self, path, payload):
        """Return the items from github API using links pagination"""

        page = 0  # current page
        last_page = None  # last page
        url_next = urijoin(self.base_url, 'repos', self.owner, self.repository, path)

        logger.debug("Get GitHub paginated items from " + url_next)

        response = self.fetch(url_next, payload=payload)

        items = response.text
        page += 1

        if 'last' in response.links:
            last_url = response.links['last']['url']
            last_page = last_url.split('&page=')[1].split('&')[0]
            last_page = int(last_page)
            logger.debug("Page: %i/%i" % (page, last_page))

        while items:
            yield items

            items = None

            if 'next' in response.links:
                url_next = response.links['next']['url']
                response = self.fetch(url_next, payload=payload)
                page += 1

                items = response.text
                # 'last_page' is None when the response had no 'last' link
                if last_page:
                    logger.debug("Page: %i/%i", page, last_page)
Example #2
File: meetup.py Project: eduranf/perceval
    def events(self, group, from_date=DEFAULT_DATETIME):
        """Fetch the events pages of a given group."""

        date = datetime_to_utc(from_date)
        date = date.strftime("since:%Y-%m-%dT%H:%M:%S.000Z")

        resource = urijoin(group, self.REVENTS)

        # Hack required because the Meetup API does not support list
        # values with the format `?param=value1&param=value2`.
        # It only works with `?param=value1,value2`.
        # Moreover, urllib3 encodes comma characters when values
        # are given using a params dict, which does not work
        # with Meetup either.
        fixed_params = '?' + self.PFIELDS + '=' + ','.join(self.VEVENT_FIELDS)
        fixed_params += '&' + self.PSTATUS + '=' + ','.join(self.VSTATUS)
        resource += fixed_params

        params = {
            self.PORDER: self.VUPDATED,
            self.PSCROLL: date,
            self.PPAGE: self.max_items
        }

        try:
            for page in self._fetch(resource, params):
                yield page
        except requests.exceptions.HTTPError as error:
            if error.response.status_code == 410:
                msg = "Group is no longer accessible: {}".format(error)
                raise RepositoryError(cause=msg)
            else:
                raise error
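
To see why the hand-built query string is necessary, note what `requests`/`urllib3` do to commas passed through a params dict; a quick illustration with hypothetical field values:

    import requests

    req = requests.Request('GET', 'https://api.meetup.com/some-group/events',
                           params={'fields': 'event_hosts,plain_text_description'})
    print(req.prepare().url)
    # ...?fields=event_hosts%2Cplain_text_description
    # The comma arrives percent-encoded as %2C, which Meetup rejects,
    # so the snippet above appends the comma-separated values manually.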
Example #3
    def _fetch(self, resource, params):
        """Fetch a resource.

        Method to fetch and to iterate over the contents of a
        type of resource. The method returns a generator of
        pages for that resource and parameters.

        :param resource: type of the resource
        :param params: parameters to filter

        :returns: a generator of pages for the requested resource
        """
        url = urijoin(self.base_url, resource)

        params[self.PKEY] = self.api_key
        params[self.PSIGN] = 'true'

        do_fetch = True

        while do_fetch:
            logger.debug("Meetup client calls resource: %s params: %s",
                         resource, str(params))

            self.sleep_for_rate_limit()
            r = self.fetch(url, payload=params)
            self.update_rate_limit(r)

            yield r.text

            if r.links and 'next' in r.links:
                url = r.links['next']['url']
                params = {self.PKEY: self.api_key, self.PSIGN: 'true'}
            else:
                do_fetch = False
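
`sleep_for_rate_limit` and `update_rate_limit` belong to perceval's HTTP client base class and are not shown here. A rough sketch of the usual throttling idea, with illustrative names (real services expose different rate-limit headers):

    import time

    def sleep_for_rate_limit(remaining, reset_epoch, min_rate=10):
        """Wait until the rate-limit window resets when few calls remain.

        'remaining' and 'reset_epoch' would be read from the service's
        rate-limit response headers; both names are illustrative.
        """
        if remaining < min_rate:
            time.sleep(max(0, reset_epoch - time.time()))

Note also that once a 'next' link is followed, the loop resets `params` to just the key and signature, because the link URL already embeds the remaining query parameters.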
Example #4
    def _call(self, resource, params):
        """Retrive the given resource.

        :param resource: resource to retrieve
        :param params: dict with the HTTP parameters needed to retrieve
            the given resource
        """
        url = self.URL % {'base': self.base_url, 'resource': resource}

        logger.debug("Confluence client requests: %s params: %s", resource,
                     str(params))

        while True:
            r = self.fetch(url, payload=params)
            yield r.text

            # Pagination is available when 'next' link exists
            j = r.json()
            if '_links' not in j:
                break
            if 'next' not in j['_links']:
                break

            url = urijoin(self.base_url, j['_links']['next'])
            params = {}
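
Unlike the Link-header pagination of the earlier examples, Confluence advertises the next page inside the JSON body. A standalone sketch of the same cursor loop, assuming Confluence's `_links`/`next` response fields:

    import requests
    from urllib.parse import urljoin  # stand-in for perceval's urijoin

    def iter_confluence_pages(url, params=None):
        """Yield pages of a Confluence resource, following '_links.next'."""
        params = dict(params or {})
        while True:
            response = requests.get(url, params=params)
            response.raise_for_status()
            yield response.text
            next_path = response.json().get('_links', {}).get('next')
            if not next_path:
                break
            url = urljoin(url, next_path)
            params = {}  # the 'next' link already carries the query string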
Example #5
    def events(self):
        """Collect the user events"""

        payload = {'per_page': 30}

        path = urijoin("users", self.user, "events", "public")
        return self.fetch_items(path, payload)
Example #6
    def fetch_items(self, category, **kwargs):
        """Fetch the contents

        :param category: the category of items to fetch
        :param kwargs: backend arguments

        :returns: a generator of items
        """

        from_date = kwargs['from_date']

        logger.info("Fetching historical contents of '%s' from %s", self.url,
                    str(from_date))

        nhcs = 0

        contents = list(self.__fetch_contents_summary(from_date))

        for content in contents:
            cid = content['id']
            content_url = urijoin(self.origin, content['_links']['webui'])

            hcs = self.__fetch_historical_contents(cid, from_date)

            for hc in hcs:
                hc['content_url'] = content_url
                yield hc
                nhcs += 1

        logger.info("Fetch process completed: %s historical contents fetched",
                    nhcs)
Example #7
    def get_issues(self, from_date):
        """Retrieve all the issues from a given date.

        :param from_date: obtain issues updated since this date
        """
        start_at = 0

        url = urijoin(self.base_url, self.RESOURCE, self.VERSION_API, 'search')
        req = self.fetch(url,
                         payload=self.__build_payload(start_at, from_date))
        issues = req.text

        data = req.json()
        tissues = data['total']
        nissues = data['maxResults']

        start_at += min(nissues, tissues)
        self.__log_status(start_at, tissues)

        while issues:
            yield issues
            issues = None

            if data['startAt'] + nissues < tissues:
                req = self.fetch(url,
                                 payload=self.__build_payload(
                                     start_at, from_date))

                data = req.json()
                start_at += nissues
                issues = req.text
                self.__log_status(start_at, tissues)
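
JIRA returns no pagination links, so the loop above derives offsets from the `total`, `startAt`, and `maxResults` fields of the search response. The same offset arithmetic as a standalone sketch (the endpoint and field names follow JIRA's REST API; the JQL string is a placeholder):

    import requests

    def iter_jira_search(base_url, jql, page_size=100):
        """Yield pages of JIRA search results using startAt/maxResults offsets."""
        start_at = 0
        while True:
            params = {'jql': jql, 'startAt': start_at, 'maxResults': page_size}
            response = requests.get(base_url.rstrip('/') + '/rest/api/2/search',
                                    params=params)
            response.raise_for_status()
            data = response.json()
            yield response.text
            # 'maxResults' echoes the page size actually served, which the
            # server may cap below the requested value
            start_at += data['maxResults']
            if start_at >= data['total']:
                break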
Example #8
    def get_fields(self):
        """Retrieve all the fields available."""

        url = urijoin(self.base_url, self.RESOURCE, self.VERSION_API, 'field')
        req = self.fetch(url)

        return req.text
Example #9
    def _parse_archive_links(self, raw_html):
        bs = bs4.BeautifulSoup(raw_html, 'html.parser')

        candidates = [a['href'] for a in bs.find_all('a', href=True)]
        links = []

        for candidate in candidates:
            # Links from Apache's 'mod_mbox' plugin contain
            # trailing "/thread" substrings. Remove them to get
            # the links where mbox files are stored.
            if candidate.endswith(MOD_MBOX_THREAD_STR):
                candidate = candidate[:-len(MOD_MBOX_THREAD_STR)]

            # Ignore links with unrecognized extensions.
            # Check the last two extensions, so compressed archives
            # such as '.txt.gz' are recognized too.
            ext1 = os.path.splitext(candidate)[-1]
            base = candidate[:-len(ext1)] if ext1 else candidate
            ext2 = os.path.splitext(base)[-1]

            if ext1 in PIPERMAIL_TYPES or ext2 in PIPERMAIL_TYPES:
                links.append(urijoin(self.url, candidate))
            else:
                logger.debug(
                    "Ignoring %s archive because its extension was not recognized",
                    candidate)

        logger.debug("%s archives found", len(links))

        return links
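
The double `splitext` exists because pipermail archives may carry a compression suffix on top of the mbox extension; checking both extensions catches names such as the hypothetical one below:

    import os.path

    candidate = '2010-May.txt.gz'                        # hypothetical archive name
    ext1 = os.path.splitext(candidate)[-1]               # '.gz'
    ext2 = os.path.splitext(candidate[:-len(ext1)])[-1]  # '.txt'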
Example #10
    def summary(self):
        """Get Crates.io summary"""

        path = urijoin(CRATES_API_URL, SUMMARY_CATEGORY)
        raw_content = self.fetch(path)

        return raw_content
Example #11
    def crate_attribute(self, crate_id, attribute):
        """Get crate attribute"""

        path = urijoin(CRATES_API_URL, CRATES_CATEGORY, crate_id, attribute)
        raw_attribute_data = self.fetch(path)

        return raw_attribute_data
Example #12
    def crate(self, crate_id):
        """Get a crate by its ID"""

        path = urijoin(CRATES_API_URL, CRATES_CATEGORY, crate_id)
        raw_crate = self.fetch(path)

        return raw_crate
Example #13
    def crates(self, from_page=1):
        """Get crates in alphabetical order"""

        path = urijoin(CRATES_API_URL, CRATES_CATEGORY)
        raw_crates = self.__fetch_items(path, from_page)

        return raw_crates
Example #14
    def events(self, group, from_date=DEFAULT_DATETIME):
        """Fetch the events pages of a given group."""

        date = datetime_to_utc(from_date)
        date = date.strftime("since:%Y-%m-%dT%H:%M:%S.000Z")

        resource = urijoin(group, self.REVENTS)

        # Hack required because the Meetup API does not support list
        # values with the format `?param=value1&param=value2`.
        # It only works with `?param=value1,value2`.
        # Moreover, urllib3 encodes comma characters when values
        # are given using a params dict, which does not work
        # with Meetup either.
        fixed_params = '?' + self.PFIELDS + '=' + ','.join(self.VEVENT_FIELDS)
        fixed_params += '&' + self.PSTATUS + '=' + ','.join(self.VSTATUS)
        resource += fixed_params

        params = {
            self.PORDER: self.VUPDATED,
            self.PSCROLL: date,
            self.PPAGE: self.max_items
        }

        for page in self._fetch(resource, params):
            yield page
Example #15
    def get_jobs(self):
        """ Retrieve all jobs"""

        url_jenkins = urijoin(self.base_url, "api", "json")

        response = self.fetch(url_jenkins)
        return response.text
Example #16
    def fetch(self, from_date=DEFAULT_DATETIME):
        """Fetch the mbox files from the remote archiver.

        This method stores the archives in the path given during the
        initialization of this object.

        HyperKitty archives are accessed month by month and stored following
        the schema year-month. Archives are fetched from the given month
        till the current month.

        :param from_date: fetch archives that store messages
            equal or after the given date; only year and month values
            are compared

        :returns: a list of tuples, storing the links and paths of the
            fetched archives
        """
        logger.info("Downloading mboxes from '%s' to since %s", self.url,
                    str(from_date))
        logger.debug("Storing mboxes in '%s'", self.dirpath)

        # Check mailing list URL
        r = requests.get(self.url)
        r.raise_for_status()

        from_date = datetime_to_utc(from_date)
        to_end = datetime_utcnow()
        to_end += dateutil.relativedelta.relativedelta(months=1)

        months = months_range(from_date, to_end)

        fetched = []

        if not os.path.exists(self.dirpath):
            os.makedirs(self.dirpath)

        tmbox = 0

        for dts in months:
            tmbox += 1
            start, end = dts[0], dts[1]
            filename = start.strftime("%Y-%m.mbox.gz")
            filepath = os.path.join(self.dirpath, filename)

            url = urijoin(self.url, 'export', filename)

            params = {
                'start': start.strftime("%Y-%m-%d"),
                'end': end.strftime("%Y-%m-%d")
            }

            success = self._download_archive(url, params, filepath)

            if success:
                fetched.append((url, filepath))

        logger.info("%s/%s MBoxes downloaded", len(fetched), tmbox)

        return fetched
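
`months_range` is a perceval utility not shown here; judging from how the loop unpacks its items, it yields `(month_start, next_month_start)` pairs. A stand-in with that assumed behavior, for illustration only:

    import dateutil.relativedelta

    def months_range(from_date, to_date):
        """Yield (month_start, next_month_start) pairs covering the range.

        An assumed re-implementation based on how the caller above uses
        it, not perceval's actual code.
        """
        one_month = dateutil.relativedelta.relativedelta(months=1)
        start = from_date.replace(day=1, hour=0, minute=0,
                                  second=0, microsecond=0)
        while start < to_date:
            yield start, start + one_month
            start += one_month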
Example #17
    def issue_notes(self, issue_id):
        """Get the issue notes from pagination"""

        payload = {'order_by': 'updated_at', 'sort': 'asc'}

        path = urijoin("issues", str(issue_id), "notes")

        return self.fetch_items(path, payload)
Example #18
    def crate_attribute(self, crate_id, attribute):
        """Get crate attribute"""

        path = urijoin(CRATES_API_URL, CRATES_CATEGORY, crate_id, attribute)
        raw_attribute_data = self.__send_request(path,
                                                 headers=self.__set_headers())

        return raw_attribute_data
Example #19
    def get_jobs(self):
        """ Retrieve all jobs
        """
        url_jenkins = urijoin(self.url, "/api/json")

        req = requests.get(url_jenkins)
        req.raise_for_status()
        return req.text
Example #20
    def issue(self, issue_id):
        """Get the issue data by its ID"""

        path = urijoin("bugs", str(issue_id))
        url_issue = self.__get_url(path)
        raw_text = self.__send_request(url_issue)

        return raw_text
Example #21
    def issue_emojis(self, issue_id):
        """Get emojis of an issue"""

        payload = {'order_by': 'updated_at', 'sort': 'asc'}

        path = urijoin("issues", str(issue_id), "award_emoji")

        return self.fetch_items(path, payload)
Example #22
    def __init__(self, bot, bot_token, tag=None, cache=None, archive=None):
        origin = urijoin(TELEGRAM_URL, bot)

        super().__init__(origin, tag=tag, cache=cache, archive=archive)
        self.bot = bot
        self.bot_token = bot_token

        self.client = None
Example #23
    def pull_commits(self, pr_number):
        """Get pull request commits"""

        payload = {
            'per_page': 30,
        }

        commit_url = urijoin("pulls", str(pr_number), "commits")
        return self.fetch_items(commit_url, payload)
Example #24
    def comments(self, group, event_id):
        """Fetch the comments of a given event."""

        resource = urijoin(group, self.REVENTS, event_id, self.RCOMMENTS)

        params = {self.PPAGE: self.max_items}

        for page in self._fetch(resource, params):
            yield page
Example #25
    def __init__(self, channel, api_token, max_items=MAX_ITEMS,
                 tag=None, cache=None):
        origin = urijoin(SLACK_URL, channel)

        super().__init__(origin, tag=tag, cache=cache)
        self.channel = channel
        self.max_items = max_items
        self.client = SlackClient(api_token, max_items=max_items)
        self._users = {}
Example #26
    def issue_collection(self, issue_id, collection_name):
        """Get a collection list of a given issue"""

        path = urijoin("bugs", str(issue_id), collection_name)
        url_collection = self.__get_url(path)
        payload = {'ws.size': self.items_per_page, 'ws.start': 0, 'order_by': 'date_last_updated'}

        raw_items = self.__fetch_items(path=url_collection, payload=payload)

        return raw_items
Example #27
    def __init__(self, owner, repository, tag=None, cache=None):
        if owner == DOCKER_SHORTCUT_OWNER:
            owner = DOCKER_OWNER

        origin = urijoin(DOCKERHUB_URL, owner, repository)

        super().__init__(origin, tag=tag, cache=cache)
        self.owner = owner
        self.repository = repository
        self.client = DockerHubClient()
Example #28
    def get_html_question(self, question_id, page=1):
        """Retrieve a raw HTML question and all it's information.

        :param question_id: question identifier
        :param page: page to retrieve
        """
        path = urijoin(self.HTML_QUESTION, question_id)
        params = {'page': page, 'sort': self.ORDER_HTML}
        response = self.__call(path, params)
        return response
Example #29
    def repository(self, owner, repository):
        """Fetch information about a repository."""

        url = urijoin(self.base_url, self.RREPOSITORY, owner, repository)

        logger.debug("DockerHub client requests: %s", url)

        response = self.fetch(url)

        return response.text
Example #30
    def __init__(self, owner, repository, tag=None, archive=None):
        if owner == DOCKER_SHORTCUT_OWNER:
            owner = DOCKER_OWNER

        origin = urijoin(DOCKERHUB_URL, owner, repository)

        super().__init__(origin, tag=tag, archive=archive)
        self.owner = owner
        self.repository = repository
        self.client = None