def __init__(self, api_domain, api_key=None, rate_limit=API_LIMIT):
    """
    Instantiates a new CallHub instance

    >>> callhub = CallHub("https://api-na1.callhub.io")

    With built-in rate limiting disabled:

    >>> callhub = CallHub(rate_limit=False)

    Args:
        api_domain (``str``): Domain to access the API (eg: api.callhub.io, api-na1.callhub.io);
            this varies by account
    Keyword Args:
        api_key (``str``, optional): Optional API key. If not provided, it will attempt to use
            ``os.environ['CALLHUB_API_KEY']``
        rate_limit (``dict``, optional): Enabled by default with settings that respect CallHub's
            API limits. Setting this to False disables rate limiting, or you can set your own
            limits by following the example below. Please don't abuse! :)

            >>> callhub = CallHub(rate_limit={"GENERAL": {"calls": 13, "period": 1},
            ...                               "BULK_CREATE": {"calls": 1, "period": 70}})

            - The default limits bulk_create to 1 call per 70 seconds (CallHub states their
              limit is one per 60s, but in practice a delay of exactly 60s can trip their
              rate limiter anyway)
            - The default limits all other API requests to 13 per second (CallHub support
              states their limit is 20/s, but this plays it on the safe side, because other
              rate limiters seem a little sensitive)
    """
    self.session = FuturesSession(max_workers=43)

    # Attempt 3 retries for failed connections
    adapter = requests.adapters.HTTPAdapter(max_retries=3)
    self.session.mount('https://', adapter)
    self.session.mount('http://', adapter)

    # Truncate the final '/' off of the API domain if it was provided
    if api_domain[-1] == "/":
        self.api_domain = api_domain[:-1]
    else:
        self.api_domain = api_domain

    if rate_limit:
        # Apply the general rate limit to self.session.get
        rate_limited_get = sleep_and_retry(limits(**rate_limit["GENERAL"])(FuturesSession.get))
        self.session.get = types.MethodType(rate_limited_get, self.session)

        # Apply the general rate limit to self.session.post
        rate_limited_post = sleep_and_retry(limits(**rate_limit["GENERAL"])(FuturesSession.post))
        self.session.post = types.MethodType(rate_limited_post, self.session)

        # Apply the bulk rate limit to self.bulk_create
        self.bulk_create = sleep_and_retry(limits(**rate_limit["BULK_CREATE"])(self.bulk_create))

    self.session.auth = CallHubAuth(api_key=api_key)

    # validate_api_key returns the administrator email on success
    self.admin_email = self.validate_api_key()

    # Cache for the do-not-contact number/list to id mapping
    self.dnc_cache = {}
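# A minimal, self-contained sketch of the method-rebinding pattern above, using a
# plain requests.Session instead of FuturesSession; the 2-calls-per-second limit
# and the target URL are illustrative assumptions, not CallHub's documented values.
import types
import requests
from ratelimit import limits, sleep_and_retry

session = requests.Session()
# Wrap the unbound Session.get, then bind the wrapped function back onto this instance
rate_limited_get = sleep_and_retry(limits(calls=2, period=1)(requests.Session.get))
session.get = types.MethodType(rate_limited_get, session)

for _ in range(5):
    session.get("https://httpbin.org/get")  # blocks as needed to stay under the cap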
def add_rate_limiting(self, f: Callable):
    if self.ratelimit_params:
        g = limits(**self.ratelimit_params)(f)
    else:
        g = limits(calls=self.ratelimit_calls_per_min, period=60)(f)
    g = sleep_and_retry(g)
    g = on_exception(
        expo,
        (RateLimitException, HTTPError),
        max_time=self.backoff_timeout_seconds,
        factor=4,
    )(g)
    return g
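# Standalone sketch of the composition add_rate_limiting builds: limits raises
# RateLimitException past the cap, sleep_and_retry absorbs it, and backoff retries
# HTTP errors with exponential waits. The fetch function, URL handling, and all
# numbers here are illustrative assumptions.
import requests
from requests.exceptions import HTTPError
from ratelimit import limits, sleep_and_retry, RateLimitException
from backoff import on_exception, expo

def fetch(url):
    resp = requests.get(url)
    resp.raise_for_status()  # HTTPError raised here feeds the backoff decorator
    return resp

fetch = limits(calls=30, period=60)(fetch)
fetch = sleep_and_retry(fetch)
fetch = on_exception(expo, (RateLimitException, HTTPError), max_time=120, factor=4)(fetch)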
def __init__(self, search_string='', max_number_of_requests=30, rate_limit_timeout_period=60, proxies=None):
    """
    Usage Examples
    ----------

    >>> synonym = Synonyms('mother')
    >>> results = synonym.find_synonyms()

    >>> synonym = Synonyms(search_string='mother')
    >>> results = synonym.find_synonyms()

    Parameters
    ----------
    :param search_string: string containing the variable to obtain synonyms for
    :param max_number_of_requests: maximum number of requests for a specific timeout_period
    :param rate_limit_timeout_period: the time period before a session is placed in a temporary hibernation mode
    :param proxies: dictionary of proxies to use with Python Requests
    """
    self._word = search_string
    self._proxies = proxies

    ratelimit_status = False
    self._rate_limit_status = ratelimit_status

    # Retries the requests after a certain time period has elapsed
    handler = on_exception(expo, RateLimitException, max_time=60, on_backoff=self._backoff_handler)
    # Establishes a rate limit for making requests to the synonyms repositories
    limiter = limits(calls=max_number_of_requests, period=rate_limit_timeout_period)
    self.find_synonyms = handler(limiter(self.find_synonyms))
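# Generic sketch of the handler/limiter composition used here (and in the similar
# classes below): each backoff wait is reported through on_backoff. The lookup
# function and the 5-calls-per-10-seconds cap are illustrative assumptions.
from ratelimit import limits, RateLimitException
from backoff import on_exception, expo

def _backoff_handler(details):
    print("Backing off {wait:.1f}s after {tries} tries".format(**details))

def lookup(word):
    return ['synonym-of-' + word]  # stand-in for a real repository query

handler = on_exception(expo, RateLimitException, max_time=60, on_backoff=_backoff_handler)
limiter = limits(calls=5, period=10)
lookup = handler(limiter(lookup))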
def __init__(self, client_key, client_secret, diskcache=None):
    """Initialize the client"""
    self.client_key = client_key
    self.client_secret = client_secret
    throttled_get = ratelimit.sleep_and_retry(
        ratelimit.limits(calls=1, period=1)(requests.get))
    self.requests_get = throttled_get if diskcache is None else (
        diskcache.memoize()(throttled_get))
def __init__(self, search_string='', output_format='list', max_number_of_requests=30,
             rate_limit_timeout_period=60, user_agent=None, proxies=None):
    """
    Purpose
    ----------
    This Python class is used to query multiple online repositories for the definition
    associated with a specific word.

    Usage Examples
    ----------

    >>> definition = Definitions('mother')
    >>> results = definition.find_definitions()

    >>> definition = Definitions(search_string='mother')
    >>> results = definition.find_definitions()

    Parameters
    ----------
    :param search_string: string containing the variable to obtain a definition for
    :param output_format: format to use for returned results.
           Default value: list; acceptable values: dictionary or list
    :param max_number_of_requests: maximum number of requests for a specific timeout_period
    :param rate_limit_timeout_period: the time period before a session is placed in a temporary hibernation mode
    :param user_agent: string containing either a global user agent type or a specific user agent
    :param proxies: dictionary of proxies to use with Python Requests
    """
    self._proxies = proxies
    self._word = search_string
    self._user_agent = user_agent
    self._output_format = output_format

    rate_limit_status = False
    self._rate_limit_status = rate_limit_status

    # Retries the requests after a certain time period has elapsed
    handler = on_exception(expo, RateLimitException, max_time=60, on_backoff=self._backoff_handler)
    # Establishes a rate limit for making requests to the definitions repositories
    limiter = limits(calls=max_number_of_requests, period=rate_limit_timeout_period)
    self.find_definitions = handler(limiter(self.find_definitions))
def __init__(self, filename, logger_, program_name='corpusbuilder 1.0', user_agent=None, overwrite_warc=True,
             err_threshold=10, warcinfo_record_data=None, known_bad_urls=None, max_no_of_calls_in_period=2,
             limit_period=1, proxy_url=None, allow_cookies=False):
    if known_bad_urls is not None:  # Setup the list of cached bad URLs to prevent trying to download them again
        with open(known_bad_urls, encoding='UTF-8') as fh:
            self.bad_urls = {line.strip() for line in fh}
    else:
        self.bad_urls = set()

    if not overwrite_warc:  # Find the next nonexistent WARC filename
        num = 0
        while os.path.exists(filename):
            filename2, ext = os.path.splitext(filename)  # Should be filename.warc.gz
            if ext == '.gz' and filename2.endswith('.warc'):
                filename2, ext2 = os.path.splitext(filename2)  # Should be filename.warc
                ext = ext2 + ext  # Should be .warc.gz
            filename = '{0}-{1:05d}{2}'.format(filename2, num, ext)
            num += 1

    logger_.log('INFO', 'Creating archive file: {0}'.format(filename))
    self._output_file = open(filename, 'wb')
    self._logger_ = logger_
    self._req_headers = {'Accept-Encoding': 'identity', 'User-agent': user_agent}
    self._session = Session()  # Setup session for speeding up downloads

    if proxy_url is not None:  # Set a SOCKS proxy if provided
        self._session.proxies['http'] = proxy_url
        self._session.proxies['https'] = proxy_url

    self._allow_cookies = allow_cookies

    # Setup rate limiting to prevent hammering the server
    self._requests_get = sleep_and_retry(limits(calls=max_no_of_calls_in_period,
                                                period=limit_period)(self._http_get_w_cookie_handling))

    self._error_count = 0
    self._error_threshold = err_threshold  # Set the error threshold which causes aborting, to prevent denial of service

    self._writer = WARCWriter(self._output_file, gzip=True)
    if warcinfo_record_data is None:
        # INFO RECORD
        # Some custom information about the WARC writer program and its settings
        info_headers = {'software': program_name, 'arguments': ' '.join(sys.argv[1:]),
                        'format': 'WARC File Format 1.0',
                        'conformsTo': 'http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf'}
        info_record = self._writer.create_warcinfo_record(filename, info_headers)
    else:  # Must recreate the custom headers, else they will not be copied
        custom_headers = ''.join('{0}: {1}\r\n'.format(k, v) for k, v in warcinfo_record_data[1].items()).\
            encode('UTF-8')
        info_record = self._writer.create_warc_record('', 'warcinfo', warc_headers=warcinfo_record_data[0],
                                                      payload=BytesIO(custom_headers),
                                                      length=len(custom_headers))
    self._writer.write_record(info_record)
def __init__(self, base_url, client_id, client_secret, api_json=None, token_expires_percent=5):
    # type: (str, str, str, str, int) -> None
    """[Init method used to create an Api Class for making api calls]

    :param base_url: Base URL of the API service
    :type base_url: str
    :param client_id: Client ID of the application
    :type client_id: str
    :param client_secret: Secret of the application
    :type client_secret: str
    :param api_json: API file defining all JSON limits and calls, defaults to None because it will use the default
    :type api_json: str, optional
    :param token_expires_percent: This is a percentage of time to take off the token renewal to ensure it
        doesn't run out. For instance, 5(%) of 3600 is 180.
    :type token_expires_percent: int, optional
    :raises Exception: If this cannot be configured with the parameters used
    :raises AttributeError: If the apis.json file is empty
    """
    self.base_url = base_url
    self.client_id = client_id
    self.client_secret = client_secret
    self.token_expires_percent = int(100 - token_expires_percent) / 100

    # This line is needed so pylint doesn't complain about this variable not existing.
    self.__log = self.__log

    # If the user doesn't pass an alternate API file, use the included one
    if not api_json:
        api_json = pkg_resources.resource_filename(__name__, 'apis.json')

    with open(api_json, encoding='utf-8') as api_file:
        apis = json.loads(api_file.read())

    # If the string is empty
    if not apis:
        raise AttributeError(f"File {api_json} loaded is empty")

    # Create a dict to hold details of scopes (from the json)
    self.scopes = defaultdict(dict)
    # Create a dict to cache the tokens
    self.tokens = defaultdict(dict)

    # Setup all of the calls to the apis with the limits
    for (client_scope, api) in apis.items():
        self.scopes[client_scope]["api_call"] = sleep_and_retry(
            limits(calls=api.get('limits_calls'), period=api.get('limits_period'))(self._api_call))
        # Store the token url associated with this client scope for later
        self.scopes[client_scope]["token_url"] = api.get('token_url')
def auth(self, session: Session, username: str, password: str,
         calls: Optional[int] = None, period: Optional[int] = None):
    """Create the session manager and optionally apply a rate limit"""
    self._manager = sessionManager(
        keystoneURL=self.keystoneURL,
        username=username,
        password=password,
        headers={
            "Fiware-Service": self.service,
            "Fiware-ServicePath": self.subservice,
            # Otherwise we get error 400 missing auth token
            "X-Auth-Token": "missing",
        }
    )
    if calls is not None and period is not None:
        self._manager = sleep_and_retry(limits(calls=calls, period=period)(self._manager))
def construct_ratelimit_rows(row_generator_fx, max_rows_per_minute, blocking=True):
    '''
    Case 1:
    - set a rate limit (min 1 per second)
    - option to make the function blocking

    row_generator_fx releases 1 row per call (return, not yield)
    '''
    # Note: the period is ONE_SEC even though the parameter is named per-minute
    row_generator_fx = limits(calls=max_rows_per_minute, period=ONE_SEC)(row_generator_fx)
    if blocking:
        row_generator_fx = sleep_and_retry(row_generator_fx)
    return row_generator_fx
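# Hypothetical usage of construct_ratelimit_rows: wrap a one-row-per-call producer
# so callers block once the per-period cap is hit. next_row and the cap of 10 are
# illustrative assumptions.
import itertools

_counter = itertools.count()

def next_row():
    return {"id": next(_counter)}

limited_next_row = construct_ratelimit_rows(next_row, max_rows_per_minute=10, blocking=True)
rows = [limited_next_row() for _ in range(3)]  # sleeps between calls when over the cap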
def __init__(self, url, timeout=60, requests_per_min=100000, retries=10, page_size=0,
             supports_bundled_mode=True, persistence_file_path="rate_limits.db", agent=__agent__):
    """Configuration of a SPARQL Endpoint.

    Args:
        url (str): URL of the endpoint.
        timeout (int, optional): Defines the time which the endpoint is given to respond (in seconds).
            Defaults to 60.
        requests_per_min (int, optional): Defines the maximal number of requests per minute.
            Defaults to 100000.
        retries (int, optional): Defines the number of times a query is retried. Defaults to 10.
        page_size (int, optional): Limits the page size of the results, since many endpoints have
            limitations. Defaults to 0.
        supports_bundled_mode (boolean, optional): If true, bundled mode will be used to query the
            endpoint. Defaults to True.
        persistence_file_path (str, optional): Sets the file path for the database that keeps track
            of past query activities (to comply with usage policies). Defaults to "rate_limits.db".
        agent (str, optional): The User-Agent for the HTTP request header.
            Defaults to SPARQLWrapper.__agent__.
    """
    self.url = url
    self.timeout = timeout
    self.requests_per_min = requests_per_min
    self.retries = retries
    self.page_size = page_size
    self.supports_bundled_mode = supports_bundled_mode
    self.persistence_file_path = persistence_file_path
    self.query = sleep_and_retry(
        limits(calls=requests_per_min, period=60, storage=self.persistence_file_path,
               name='"' + url + '"')(self._query))
    self.agent = agent
def __init__(self, source_language='', str_to_translate='', api_key=''):
    self._source_language = source_language
    self._str_to_translate = str_to_translate
    self._api_key = api_key

    ratelimit_status = False
    self._rate_limit_status = ratelimit_status

    # Retries the requests after a certain time period has elapsed
    handler = on_exception(expo, RateLimitException, max_time=60, on_backoff=self._backoff_handler)
    # Establishes a rate limit for making requests to the Deep translation service
    limiter = limits(calls=60, period=60)
    self.translate_word = handler(limiter(self.translate_word))
    self.reverse_translate = handler(limiter(self.reverse_translate))
def __init__(self, plan, throttle, block):
    Plan.__init__(self, plan)
    self.lock = Lock()
    scheme = (0, 0)
    self.throttling = True
    if throttle is None:
        self.throttling = False
    elif throttle == "minute":
        scheme = self.minute
    elif throttle == "daily":
        scheme = self.daily
    elif throttle == "monthly":
        scheme = self.monthly
    else:
        raise ValueError("Argument throttle must be either 'minute', 'daily' or 'monthly'")
    self.limit = limits(*scheme, raise_on_limit=block)(lambda: None)
    self.sleep = sleep_and_retry(self.limit)
    self.block = block
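# Minimal sketch of the no-op gate pattern above: decorating `lambda: None` with
# limits produces a callable that only meters time, so invoking it before real work
# throttles that work without wrapping it. The 10-per-60s scheme and do_request are
# illustrative assumptions.
from ratelimit import limits, sleep_and_retry

gate = sleep_and_retry(limits(10, 60, raise_on_limit=True)(lambda: None))

def call_api(payload):
    gate()  # blocks here until a slot in the current period is free
    return do_request(payload)  # hypothetical request function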
def decorator(func: Callable) -> Callable:
    limited = limits(calls=requests, period=seconds)(func)
    if async_:
        @wraps(func)
        async def inner(*args, **kwargs):
            while True:
                try:
                    return await limited(*args, **kwargs)
                except RateLimitException:
                    await asyncio.sleep(1)
    else:
        @wraps(func)
        def inner(*args, **kwargs):
            while True:
                try:
                    return limited(*args, **kwargs)
                except RateLimitException:
                    time.sleep(1)
    return inner
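# Hypothetical usage of the decorator above, assuming the enclosing factory is
# something like rate_limited(requests, seconds, async_) -- only the inner
# `decorator` is shown, so the outer name and signature are assumptions.
@rate_limited(requests=5, seconds=1, async_=False)
def fetch_page(url):
    return url  # stand-in for a real request

@rate_limited(requests=5, seconds=1, async_=True)
async def fetch_page_async(url):
    return url  # stand-in for a real awaitable request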
def __init__(self, source_language='', str_to_translate='', email_address=None):
    self._source_language = source_language
    self._str_to_translate = str_to_translate
    self._url_to_query = 'http://api.mymemory.translated.net/get'
    self._email_address = email_address
    self._headers = http_headers

    ratelimit_status = False
    self._rate_limit_status = ratelimit_status

    # Retries the requests after a certain time period has elapsed
    handler = on_exception(expo, RateLimitException, max_time=60, on_backoff=self._backoff_handler)
    # Establishes a rate limit for making requests to the MyMemory translation service
    limiter = limits(calls=30, period=60)
    self.translate_word = handler(limiter(self.translate_word))
    self.reverse_translate = handler(limiter(self.reverse_translate))
def __init__(self, source_language='', str_to_translate='', proxies=None):
    self._source_language = source_language
    self._str_to_translate = str_to_translate
    self._url_to_query = 'https://translate.google.com/m'
    self._proxies = proxies

    rand_user_agent = get_random_user_agent()
    http_headers = {'user-agent': rand_user_agent}
    self._headers = http_headers

    ratelimit_status = False
    self._rate_limit_status = ratelimit_status

    # Retries the requests after a certain time period has elapsed
    handler = on_exception(expo, RateLimitException, max_time=60, on_backoff=self._backoff_handler)
    # Establishes a rate limit for making requests to the Google translation service
    limiter = limits(calls=60, period=60)
    self.translate_word = handler(limiter(self.translate_word))
    self.reverse_translate = handler(limiter(self.reverse_translate))
def __init__(self, config_dict):
    self.session = requests.Session()
    self.api_key = config_dict.get('api_key')
    self.domain = config_dict.get('domain')
    self.start_date = config_dict.get('start_date')
    self.user_agent = config_dict.get('user_agent', const.USER_AGENT)
    self.rate_limit_requests = config_dict.get('rate_limit_requests', const.RATE_LIMIT_REQUESTS)
    self.rate_limit_seconds = config_dict.get('rate_limit_seconds', const.RATE_LIMIT_SECONDS)
    self.per_page = config_dict.get('per_page', const.PER_PAGE)
    self.max_retries = config_dict.get('max_retries', const.MAX_RETRIES)
    self.backoff_factor = config_dict.get('backoff_factor', const.BACKOFF_FACTOR)

    # Usually we would use Python decorators on the request method, but since we want to change
    # the arguments for the decorators dynamically during runtime based on the provided config,
    # we have to override the request method here
    self.request = limits(calls=self.rate_limit_requests, period=self.rate_limit_seconds)(self.request)
    self.request = sleep_and_retry(self.request)
    self.request = backoff.on_exception(
        backoff.expo,
        requests.exceptions.RequestException,
        max_tries=self.max_retries,
        giveup=lambda e: e.response is not None and 400 <= e.response.status_code < 500,
        factor=self.backoff_factor)(self.request)
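# Condensed sketch of the runtime-decoration idea above: because the limits come
# from config, the method is wrapped in __init__ instead of with @-syntax at class
# definition time. The class and its 2-calls-per-second limit are illustrative.
import requests
from ratelimit import limits, sleep_and_retry

class Tap:
    def __init__(self, calls, period):
        # self.request resolves to the bound method; the wrapped version shadows it
        self.request = sleep_and_retry(limits(calls=calls, period=period)(self.request))

    def request(self, url):
        return requests.get(url)

tap = Tap(calls=2, period=1)  # limits chosen at runtime, per instance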
def __init__(
    self,
    username=None,
    password=None,
    base_uri=BASE_URI,
    auth_uri=AUTH_URI,
    timeout=TIMEOUT,
    loglevel=None,
    rate_limit=True,
):
    self.base_uri = base_uri
    self.auth_uri = auth_uri
    self.username = username
    self.password = password
    self._user_id = None
    self.timeout = timeout
    self.refresh_token = None
    self.session = requests.Session()
    self._set_tokens()

    if loglevel:  # pragma: no cover
        logging.basicConfig(
            level=loglevel,
            format="%(asctime)s %(levelname)-8s %(name)-25s %(message)s",
        )
    else:
        logger.addHandler(logging.NullHandler())

    self.documents = DocumentClient(self)
    self.projects = ProjectClient(self)
    self.users = UserClient(self)
    self.organizations = OrganizationClient(self)

    if rate_limit:
        self._request = ratelimit.limits(calls=RATE_LIMIT, period=RATE_PERIOD)(self._request)
def __init__(self, base_url, client_id, client_secret, client_scope, api_json=None):
    # type: (str, str, str, str, str) -> None
    """[Init method used to create an Api Class for making api calls]

    :param base_url: Base URL of the API service
    :type base_url: str
    :param client_id: Client ID of the application
    :type client_id: str
    :param client_secret: Secret of the application
    :type client_secret: str
    :param client_scope: Client scope, must be present in the api_json file
    :type client_scope: str
    :param api_json: API file defining all JSON limits and calls, defaults to None because it will use the default
    :type api_json: str, optional
    :raises Exception: If this cannot be configured with the parameters used
    """
    self.base_url = base_url
    self.client_id = client_id
    self.client_secret = client_secret
    self.client_scope = client_scope

    # If the user doesn't pass an alternate API file, use the included one
    if not api_json:
        api_json = pkg_resources.resource_filename(__name__, 'apis.json')

    with open(api_json, encoding='utf-8') as api_file:
        apis = json.loads(api_file.read())

    if client_scope in apis:
        api = apis.get(client_scope)
        self.token_url = api.get('token_url')
        self.api_call = sleep_and_retry(
            limits(calls=api.get('limits_calls'), period=api.get('limits_period'))(self._api_call))
        self.access_token = self.get_access_token(self.token_url)
    else:
        raise Exception(f"Scope {client_scope} not in known API dict")
def __init__(self, expected_filename, _logger, warcinfo_record_data=None, program_name='WebArticleCurator',
             user_agent=None, overwrite_warc=True, err_threshold=10, known_bad_urls=None,
             max_no_of_calls_in_period=2, limit_period=1, proxy_url=None, allow_cookies=False,
             verify_request=True, stay_offline=False):
    # Store variables
    self._logger = _logger
    self._req_headers = {
        'Accept-Encoding': 'identity',
        'User-agent': user_agent
    }
    self._error_count = 0
    self._error_threshold = err_threshold  # Set the error threshold which causes aborting, to prevent denial of service

    # Setup the download function
    if not stay_offline:
        self.download_url = self._download_url
    else:
        self.download_url = self._dummy_download_url

    if known_bad_urls is not None:  # Setup the list of cached bad URLs to prevent trying to download them again
        with open(known_bad_urls, encoding='UTF-8') as fh:
            self.bad_urls = {line.strip() for line in fh}
    else:
        self.bad_urls = set()

    self.good_urls = set()

    # Setup the target file handle
    filename = self._set_target_filename(expected_filename, overwrite_warc)
    self._logger.log('INFO', 'Creating archive file:', filename)
    self._output_file = open(filename, 'wb')

    self._session = Session()  # Setup session for speeding up downloads
    if proxy_url is not None:  # Set a SOCKS proxy if provided
        self._session.proxies['http'] = proxy_url
        self._session.proxies['https'] = proxy_url

    self._allow_cookies = allow_cookies
    self._verify_request = verify_request
    if not self._verify_request:
        disable_warnings(InsecureRequestWarning)

    # Setup rate limiting to prevent hammering the server
    self._requests_get = sleep_and_retry(
        limits(calls=max_no_of_calls_in_period, period=limit_period)(self._http_get_w_cookie_handling))

    self._writer = WARCWriter(self._output_file, gzip=True, warc_version='WARC/1.1')
    if warcinfo_record_data is None:  # Use the defaults, or else the parsed custom headers will not be copied
        # INFO RECORD
        # Some custom information about the WARC writer program and its settings
        warcinfo_record_data = {
            'software': program_name,
            'arguments': ' '.join(sys.argv[1:]),
            'format': 'WARC File Format 1.1',
            'conformsTo': 'http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1-1_latestdraft.pdf'
        }
    info_record = self._writer.create_warcinfo_record(filename, warcinfo_record_data)
    self._writer.write_record(info_record)
import os

from ratelimit import sleep_and_retry, limits

from mp_api.core.settings import MAPISettings

DEFAULT_ENDPOINT = os.environ.get(
    "MP_API_ENDPOINT", "https://api.materialsproject.org/"
)


def check_limit():
    """
    Empty function for enabling global rate limiting.
    """
    return


if "api.materialsproject" in DEFAULT_ENDPOINT:
    check_limit = limits(calls=MAPISettings().requests_per_min, period=60)(check_limit)
    check_limit = sleep_and_retry(check_limit)
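# Hypothetical call site for check_limit above: invoking it before every outbound
# request makes all callers share one global budget. _get is an illustrative helper.
import requests

def _get(url):
    check_limit()  # no-op that blocks once the shared per-minute cap is reached
    return requests.get(url)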
import re

from bs4 import BeautifulSoup
from ratelimit import limits, sleep_and_retry
from requests import get
from tqdm import tqdm
from yaml import dump
from toolz.curried import comp

data_path = 'data.yaml'
country_list_url = 'https://simple.wikipedia.org/wiki/List_of_European_countries'
border_list_url = 'https://en.wikipedia.org/wiki/List_of_countries_and_territories_by_land_borders'
base_url = 'https://en.wikipedia.org/wiki/'

# Compose sleep_and_retry with a 1-call-per-second limit into one reusable decorator
wikilimiter = comp(sleep_and_retry, limits(1, 1))


@wikilimiter
def download_countries():
    country_list_page = BeautifulSoup(get(country_list_url).text, features='html.parser')
    country_table = [row.find_all('a', href=re.compile(r'^/wiki/'))[1:]
                     for row in country_list_page.find('tbody').find_all('tr')[1:]]
    return {str(a[0].string): str(a[-1]['href']).replace('/wiki/', '') for a in country_table}


@wikilimiter
def download_borders():
    border_list_page = BeautifulSoup(get(border_list_url).text, features='html.parser')
    return dict(((q := [str(a.string) for a in row.find_all('a', href=re.compile(r'^/wiki/'))
                        if str(a.string)[0].isupper()])[0], q[1:])
                for row in border_list_page.find('tbody').find_all('tr')[2:]
                if not ('overseas' in str(row) and 'excluding' not in str(row)))


@wikilimiter
def download_coords(capital):
    capital_page = BeautifulSoup(get(base_url + capital).text, features='html.parser')
                    'xml': _xml, 'df': _data_frame,
                    'csv': _csv, 'numpy': _numpy,
                    'tab': _tab, 'pipe': _pipe}
    else:
        dispatch = {'dict': _dict, 'json': _json, 'xml': _xml}
    return dispatch[response_type]


def _get_request(url_root, api_key, path, response_type, params, ssl_verify):
    """
    Helper function that requests a get response from FRED.
    """
    url = _url_builder(url_root, api_key, path, params)
    content = _fetch(url, ssl_verify)
    response = _dispatch(response_type)(content)
    return response


if _USE_JOBLIB_CACHE:
    import joblib
    one_gb = 1000000000
    location = '/tmp/joblib_cache'
    memory = joblib.Memory(location, verbose=1, bytes_limit=one_gb)
    if _THROTTLE_REQUESTS:
        from ratelimit import limits, sleep_and_retry
        period_seconds = 1
        calls_per_second = 20
        _get_request = memory.cache(
            sleep_and_retry(limits(calls=calls_per_second, period=period_seconds)(_get_request)))
    else:
        _get_request = memory.cache(_get_request)
def add_rate_limiting(self, f: Callable):
    # limits must be applied first so that sleep_and_retry, as the outer wrapper,
    # can catch the RateLimitException it raises and sleep instead of propagating
    g = limits(calls=self.ratelimit_calls_per_min, period=60)(f)
    g = sleep_and_retry(g)
    return g
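# Quick demonstration of why the wrapping order fixed above matters: sleep_and_retry
# must be outermost to catch the RateLimitException that limits raises. The ping
# function and the 2-per-second cap are illustrative assumptions.
from ratelimit import limits, sleep_and_retry

def ping():
    return "pong"

ping = sleep_and_retry(limits(calls=2, period=1)(ping))  # correct order

for _ in range(5):
    ping()  # from the third call on, this sleeps instead of raising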