def get_repositories_info(query, number_of_repos):
    """
    Sends search request by GitHub REST API. Processes response and returns
    a list of Repository objects built from a random page of search results.

    :param query: query string
    :param number_of_repos: max number of repositories to be downloaded
    :return: list of Repository objects
    :raise: ValueError if number_of_repos is not positive
    :raise: ConnectionError if network problem
    :raise: TimeoutError if no response

    Usage:
    >>> r = RepositoryDownloader()
    >>> query = RESTUtils.create_search_query(searched_phrase='tetris', language='assembly')
    >>> r.get_repositories_info(query=query)
    """
    if number_of_repos <= 0:
        # explicit validation instead of assert (asserts vanish under `python -O`)
        raise ValueError('number_of_repos must be positive')

    # GitHub defaults to 30 results per page and caps per_page at 100.
    results_per_page = min(max(number_of_repos, 30), 100)
    # From GitHub API: "Only the first 1000 search results are available"
    max_page = 1000 // results_per_page
    query_dict = {'q': query, 'per_page': results_per_page}

    # First request is only to get the total count of results.
    req = RESTUtils.make_get_request(url='https://api.github.com/search/repositories',
                                     params=query_dict)

    # Pick a random page. The last (possibly incomplete) page is omitted,
    # and the count is clamped to [1, max_page] so randint never gets an
    # empty range when total_count < results_per_page.
    pages = req.json()['total_count'] // results_per_page
    pages = max(1, min(pages, max_page))
    page = random.randint(1, pages)
    query_dict['page'] = page  # add random page to query

    req = RESTUtils.make_get_request(url='https://api.github.com/search/repositories',
                                     params=query_dict)
    logging.info('Searching random repositories. '
                 'Request {0}'.format(req.url) +
                 'Page {0}, count {1}'.format(page, number_of_repos))

    items = req.json()['items']
    # The page may hold fewer items than requested; never over-sample.
    number_of_repos = min(number_of_repos, len(items))
    items = random.sample(items, number_of_repos)  # random elements from items
    return [Repository.from_json(item) for item in items]
def save_file(self, file_json):
    """
    Downloads and saves file. Creates directory for file if needed.

    The download is performed *before* the output file is opened, so a
    failed request can no longer leave an empty or truncated file behind
    (the original opened the file first and only the timeout path cleaned up).

    :param file_json: file information in JSON (must contain 'path' and 'download_url')
    :return:
    :raise: ConnectionError if network problem
    """
    filepath = os.path.join(self._downloadDirectoryPath,
                            self._repositoryDirectoryName,
                            file_json['path'])
    os.makedirs(os.path.dirname(filepath), exist_ok=True)  # create dir if needed
    try:
        req = RESTUtils.make_get_request(file_json['download_url'])
    except TimeoutError as err:
        # Best-effort policy: a timed-out file is skipped, not fatal.
        # No file was created yet, so there is nothing to delete.
        logging.warning('Timeout while downloading file {0}'.format(filepath) +
                        'Error message: {0}'.format(str(err)))
        print('TimeoutError. {0} is skipped'.format(filepath))
        return
    except ConnectionError as err:
        # Network failure is fatal for the whole download; propagate.
        logging.error('ConnectionError while downloading file {0}'.format(filepath) +
                      'Error message: {0}'.format(str(err)))
        raise
    # Only now that the content is fully in memory do we touch the disk.
    with open(filepath, "wb") as out_file:
        out_file.write(req.content)
    logging.info('Saved file: ' + filepath)
    print(filepath)
def download_contents_from_url(self, url, number_of_files):
    """
    Downloads content from url.

    Fetches the directory listing at *url* and delegates the actual
    per-file saving to iterate_through_files_and_save_them.

    :param url: url to download from
    :param number_of_files: max number of files to be downloaded
    :return: number of downloaded files
    :raise: ConnectionError if network problem
    :raise: TimeoutError if no response
    """
    try:
        response = RESTUtils.make_get_request(url)
    except (ConnectionError, TimeoutError) as err:
        logging.error('Downloading content by REST request failed. '
                      'Error msg {0}'.format(str(err)))
        raise
    listing = response.json()
    return self.iterate_through_files_and_save_them(listing, number_of_files)