def __init__(self, delay=5, user_agent='wswp', proxies=None, cache={}, timeout=60):
    self.throttle = Throttle(delay)
    self.user_agent = user_agent
    self.proxies = proxies
    self.cache = cache
    self.num_retries = None  # we will set this per request
    self.timeout = timeout

    LOGIN_URL = 'http://www.jobbole.com/wp-admin/admin-ajax.php'
    LOGIN_EMAIL = 'caicai'
    LOGIN_PASSWORD = '******'
    postdata = urllib.parse.urlencode({
        'user_login': LOGIN_EMAIL,
        'user_pass': LOGIN_PASSWORD,
        'action': 'user_login',
        'remember_me': '1',
        'redirect_url': 'http://www.jobbole.com/',
    }).encode('utf-8')
    req = urllib.request.Request(LOGIN_URL, postdata)
    req.add_header('User-Agent', 'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:55.0) Gecko/20100101 Firefox/55.0')
    # note: this proxy handler is created but never added to the opener below
    urllib.request.ProxyHandler(proxies=proxies)
    # create CookieJar
    cjar = http.cookiejar.CookieJar()
    # create opener
    opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cjar))
    # install the opener globally
    urllib.request.install_opener(opener)
    file = opener.open(req)
    data = file.read()
    file = open('3.html', 'wb')
    file.write(data)
    file.close()

def __init__(self, since=1, th_sec=5):
    if since not in [-1, 1, 3, 7, 14]:
        raise Exception('SinceLevelError: since is out of range [-1,1,3,7,14].')
    self.since = since
    self.thr = Throttle(th_sec)
    if self.since == -1:
        self.url_start = 'https://www.otodom.pl/wynajem/mieszkanie/warszawa/?'
    else:
        self.url_start = ("https://www.otodom.pl/wynajem/mieszkanie/warszawa/?search%5Bdescription%5D=1&search%5Bcreated_since%5D="
                          + str(self.since)
                          + "&search%5Bregion_id%5D=7&search%5Bsubregion_id%5D=197&search%5Bcity_id%5D=26")
    self.user_agent = ('Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) '
                       'Chrome/80.0.3987.137 Safari/537.36 OPR/67.0.3575.79')
    self.__get_max_page()
    self.links = Queue()

def main(sx_url, access_log_path=DEFAULT_ACCESS_LOG_PATH, upload_interval=DEFAULT_UPLOAD_INTERVAL):
    results = defaultdict(list)  # { date: entries }
    maybe_upload_results = Throttle(upload_all_results, upload_interval)
    for data in listen_and_parse(access_log_path):
        key = data['datetime'].date()
        results[key].append(data)
        maybe_upload_results(results, sx_url)

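# Unlike the crawler snippets elsewhere in this collection, the Throttle above
# wraps a callable and rate-limits how often it actually fires. A minimal
# sketch of such a callable-wrapping throttle is given below for reference;
# the class name and details are assumptions, not this project's implementation.
import time


class CallableThrottle:
    """Wrap func and invoke it at most once every `interval` seconds."""

    def __init__(self, func, interval):
        self.func = func
        self.interval = interval
        self.last_called = 0.0

    def __call__(self, *args, **kwargs):
        now = time.time()
        if now - self.last_called >= self.interval:
            self.last_called = now
            return self.func(*args, **kwargs)
        return None  # call skipped: invoked again too soon
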
def link_crawler(start_url, link_regex, robots_url=None, user_agent='wswp',
                 proxies=None, delay=0.0001, max_depth=999999, max_count=10000):
    """ Crawl the links depth-first. """
    i = 0
    crawl_queue = [start_url]
    result = []
    # Dict where visited URLs are stored so they are not parsed again
    seen = {}
    if not robots_url:
        robots_url = '{}/robots.txt'.format(start_url)
    rp = get_robots_parser(robots_url)
    throttle = Throttle(delay)
    while crawl_queue and i <= 10000:
        url = crawl_queue.pop()
        if rp.can_fetch(user_agent, url):
            depth = seen.get(url, 0)
            if depth == max_depth:
                print('Skipping %s due to depth' % url)
                continue
            if i > max_count:
                print('Skipping %s due to exceed limit count' % url)
                continue
            throttle.wait(url)
            html = download(url, user_agent=user_agent, proxies=proxies)
            if not html:
                continue
            i += 1
            print(i)
            # Yields a Scrapy-like item holding the url and the plain text, and
            # also saves it to a file
            yield WikiItem(html, url)
            # Filter the links to follow
            for link in get_links(html):
                if re.match('#(a-z)*', link):
                    continue
                if re.match(link_regex, link):
                    # Small patch: the local wiki needed an 'A/' prefix when
                    # resolving links; the online site worked fine without it
                    # abs_link2 = urljoin(start_url, 'A/')
                    # abs_link = urljoin(abs_link2, link)
                    abs_link = urljoin(start_url, link)
                    if abs_link not in seen and len(abs_link) < 200:
                        seen[abs_link] = depth + 1
                        crawl_queue.append(abs_link)
        else:
            print('Blocked by robots.txt:', url)

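# None of the snippets in this collection include the Throttle class itself.
# Below is a minimal sketch of the per-domain rate limiter that calls like
# throttle.wait(url) above appear to assume; each project may implement it
# differently, so treat this as illustrative rather than the actual code.
import time
from urllib.parse import urlparse


class ThrottleSketch:
    """Delay successive requests to the same domain by at least `delay` seconds."""

    def __init__(self, delay):
        self.delay = delay
        self.domains = {}  # domain -> timestamp of the last request

    def wait(self, url):
        domain = urlparse(url).netloc
        last_accessed = self.domains.get(domain)
        if self.delay > 0 and last_accessed is not None:
            sleep_secs = self.delay - (time.time() - last_accessed)
            if sleep_secs > 0:
                time.sleep(sleep_secs)
        self.domains[domain] = time.time()
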
def __init__(self, delay=DEFAULT_DELAY, user_agent=DEFAULT_AGENT,
             num_retries=DEFAULT_RETRIES, cache=DEFAULT_CACHE,
             proxies=DEFAULT_PROXIES, opener=DEFAULT_OPENER,
             timeout=DEFAULT_TIMEOUT):
    socket.setdefaulttimeout(timeout)
    self.throttle = Throttle(delay)
    self.user_agent = user_agent
    self.proxies = proxies
    self.num_retries = num_retries
    self.cache = cache
    self.opener = opener

def __init__(self, delay=5, user_agent=None, num_retries=1, proxies=None, cache=None):
    self.throttle = Throttle(delay)
    self.user_agent = user_agent
    self.proxies = proxies
    self.num_retries = num_retries
    self.cache = cache

def main_link_crawler(start_url, link_regex, robots_url=None, user_agent='bbbbbbb',
                      proxies=None, delay=3, max_depth=4, num_retries=2, cache={}):
    """ Crawl from the given start URL following links matched by link_regex.
    In the current implementation, we do not actually scrape any information.

    args:
        start_url (str): web site to start crawl
        link_regex (str): regex to match for links
    kwargs:
        robots_url (str): url of the site's robots.txt
                          (default: start_url + /robots.txt)
        user_agent (str): user agent (default: bbbbbbb)
        proxies (str): proxy url, ex 'http://IP' (default: None)
        delay (int): seconds to throttle between requests
                     to one domain (default: 3)
        max_depth (int): maximum crawl depth (to avoid traps) (default: 4)
    """
    crawl_queue = [start_url]
    # keep track of which URLs we have seen before
    seen = {}
    data = []
    if not robots_url:
        robots_url = '{}/robots.txt'.format(start_url)
    rp = get_robots_parser(robots_url)
    throttle = Throttle(delay)
    while crawl_queue:
        url = crawl_queue.pop()
        # check url passes robots.txt restrictions
        if rp.can_fetch(user_agent, url):
            depth = seen.get(url, 0)
            if depth == max_depth:
                print('Skipping %s due to depth' % url)
                continue
            throttle.wait(url)
            html = download(url, user_agent=user_agent, proxy=proxies)
            if not html:
                continue
            # filter for links matching our regular expression
            for link in get_links(html):
                if re.match(link_regex, link):
                    abs_link = urljoin(start_url, link)
                    if abs_link not in seen:
                        seen[abs_link] = depth + 1
                        crawl_queue.append(abs_link)
        else:
            print('Blocked by robots.txt:', url)
    return seen

def __init__(self, delay=1, user_agent='saint_data', cache={}):
    """
    __init__ method initializes a Downloader object

    @parameters
    delay: (int) seconds to wait between requests to the same domain
    user_agent: (str) user agent for request header
    cache: (dict) stores all downloaded pages
    """
    self.throttle = Throttle(delay)
    self.user_agent = user_agent
    self.num_retries = None  # this variable will be set later per request (in the __call__ method)
    self.cache = cache

def __init__(self, delay=5, user_agent='wswp', proxies=None, cache={}, timeout=60):
    self.throttle = Throttle(delay)
    self.user_agent = user_agent
    self.proxies = proxies
    self.cache = cache
    self.num_retries = None  # we will set this per request
    self.timeout = timeout

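# The comment above says num_retries is set per request; a plausible __call__
# for such a Downloader is sketched below. The cache entry layout
# ({'html': ..., 'code': ...}) and the self.download() helper are assumptions,
# not the original project's implementation.
def __call__(self, url, num_retries=2):
    """Return cached HTML when possible, otherwise download with throttling."""
    self.num_retries = num_retries
    result = self.cache[url] if url in self.cache else None
    if result and self.num_retries and 500 <= result.get('code', 200) < 600:
        # the cached copy recorded a server error, so download it again
        result = None
    if result is None:
        self.throttle.wait(url)
        result = self.download(url, headers={'User-Agent': self.user_agent},
                               proxies=self.proxies)
        self.cache[url] = result
    return result['html']
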
def link_crawler(start_url, link_regex, robots_url=None, user_agent='wswp',
                 proxies=None, delay=5, max_depth=5):
    """ Crawl from the given start URL following links matched by link_regex.
    In the current implementation, we do not actually scrape any information.

    args:
        start_url (str): web site to start crawl
        link_regex (str): regex to match for links
    kwargs:
        robots_url (str): url of the site's robots.txt
                          (default: start_url + /robots.txt)
        user_agent (str): user agent (default: wswp)
        proxies (dict): proxy dict w/ keys 'http' and 'https', values
                        are strs (i.e. 'http(s)://IP') (default: None)
        delay (int): seconds to throttle between requests
                     to one domain (default: 5)
        max_depth (int): maximum crawl depth (to avoid traps) (default: 5)
    """
    crawl_queue = [start_url]
    # keep track of which URLs we have seen before
    seen = {}
    if not robots_url:
        robots_url = '{}/robots.txt'.format(start_url)
    rp = get_robots_parser(robots_url)
    throttle = Throttle(delay)
    while crawl_queue:
        url = crawl_queue.pop()
        # check url passes robots.txt restrictions
        if rp.can_fetch(user_agent, url):
            depth = seen.get(url, 0)
            if depth == max_depth:
                print('Skipping %s due to depth' % url)
                continue
            throttle.wait(url)
            html = download(url, user_agent=user_agent, proxies=proxies)
            if not html:
                continue
            # TODO: add actual data scraping here
            # filter for links matching our regular expression
            for link in get_links(html):
                if re.match(link_regex, link):
                    abs_link = urljoin(start_url, link)
                    if abs_link not in seen:
                        seen[abs_link] = depth + 1
                        crawl_queue.append(abs_link)
        else:
            print('Blocked by robots.txt:', url)

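# A possible invocation of link_crawler above; the seed URL and link regex are
# illustrative placeholders, not values taken from the original project.
if __name__ == '__main__':
    link_crawler('http://example.webscraping.com', r'/(index|view)/',
                 user_agent='wswp', delay=5, max_depth=5)
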
def __init__(self, user_agent='wsap', proxies=None, delay=5, numTry=5,
             cache=None, timeout=30):
    self.user_agent = user_agent
    self.proxies = proxies
    self.delay = delay
    self.numTry = numTry
    self.cache = RedisCache()  # note: the cache argument is ignored; a RedisCache is always used
    self.throt = Throttle(delay)
    self.timeOut = timeout

def __init__(self, delay=5, user_agent='wswp', proxies=None, num_retries=1,
             cache=None, opener=None, timeout=30):
    socket.setdefaulttimeout(timeout)
    self.throttle = Throttle(delay)
    self.user_agent = user_agent
    self.proxies = proxies
    self.num_retries = num_retries
    self.cache = cache
    self.opener = opener

def link_crawler(seed_url, link_regex=None, delay=10, headers=None, max_depth=2,
                 max_urls=1, user_agent='wswp', proxy=None, num_retries=1):
    """
    :param seed_url: the seed url to start crawling from
    :param link_regex: regex used to filter which links to follow
    :return: None; prints the dict of seen urls (seed url and sub urls) with their depths
    """
    crawl_queue = Queue.deque([seed_url])  # urls still waiting to be crawled
    seen = {seed_url: 0}  # stores the depth of the seed url (0) and of every child url
    num_urls = 0  # tracks how many urls have been downloaded
    # robots file parser, to check which urls may really be fetched
    rp = get_robots(seed_url)
    # create the throttle object
    throttle = Throttle(delay)
    # request header dict
    headers = headers or {}
    if user_agent:
        headers['User-agent'] = user_agent  # add the custom user agent to the headers
    while crawl_queue:  # loop until crawl_queue has been popped empty
        url = crawl_queue.pop()  # take the most recently appended url; this is the url crawled in this iteration
        if rp.can_fetch(user_agent, url):  # check whether crawling this url is allowed
            throttle.wait(url)  # throttle the request (10 seconds by default)
            html = download(url, headers, proxy=proxy, num_retries=num_retries)  # download the page
            links = []  # stores the matched child urls
            depth = seen[url]  # depth of the url currently being crawled, taken from seen
            if depth != max_depth:  # only enqueue new links while max_depth has not been reached
                if link_regex:
                    # extend the list with the child urls that match the regex
                    links.extend(link for link in get_link(html) if re.match(link_regex, link))
                for link in links:  # iterate over the matched urls
                    link = normalize(seed_url, link)  # join the url fragment with the seed url into an absolute link
                    if link not in seen:
                        # new link: record it with the current url's depth + 1
                        seen[link] = depth + 1
                        if same_domain(seed_url, url):  # only follow links on the same domain (host + port)
                            # add the child link to the crawl queue
                            crawl_queue.append(link)
            num_urls += 1  # count the downloads
            if num_urls == max_urls:  # stop after max_urls downloads
                break
        else:
            print 'Blocked by robots.txt:', url
    print seen

def __init__(self):
    self.code_type_to_message = {}
    self.code_type_to_type = {}
    self.type_to_code_type = {}
    self.type_to_reserved_count = {}
    self.type_to_released_count = {}
    self.add_message_type(Acknowledge())
    self.add_message_type(Keepalive())
    self.add_message_type(Throttle())
    self.add_message_type(ChallengeRequest())
    self.add_message_type(ChallengeResponse())
    self.add_message_type(JoinRequest())
    self.add_message_type(JoinResponse())
    self.add_message_type(LeaveRequest())
    self.add_message_type(LeaveResponse())
    self.add_message_type(InjectRequest())
    self.add_message_type(InjectResponse())
    self.add_message_type(ModifyRequest())
    self.add_message_type(ModifyResponse())
    self.add_message_type(EjectRequest())
    self.add_message_type(EjectResponse())
    self.add_message_type(InteractRequest())
    self.add_message_type(InteractResponse())
    self.add_message_type(ExamineRequest())
    self.add_message_type(ExamineResponse())
    self.add_message_type(AttachRequest())
    self.add_message_type(AttachResponse())
    self.add_message_type(DetachRequest())
    self.add_message_type(DetachResponse())
    self.add_message_type(HandoverRequest())
    self.add_message_type(HandoverResponse())
    self.add_message_type(ListBubblesRequest())
    self.add_message_type(ListBubblesResponse())
    self.add_message_type(PerceptionEvent())
    self.add_message_type(MovementEvent())
    self.add_message_type(DisappearanceEvent())
    self.add_message_type(HandoverEvent())
    self.add_message_type(ActionEvent())
    self.add_message_type(SynchronizationBeginEvent())
    self.add_message_type(SynchronizationEndEvent())

def crawl_link(seed_url, link_regex, max_depth=2, delay=3, scrape_callback=None):
    crawl_queue = [seed_url]
    seen = {seed_url: 0}
    throttle = Throttle(delay)
    while crawl_queue:
        url = crawl_queue.pop()
        throttle.wait(url)
        html = download(url)
        if html is None:
            return
        links = []
        if scrape_callback:
            links.extend(scrape_callback(url, html) or [])  # "or []": append an empty list when the callback returns None
        # check whether the maximum depth has been reached
        depth = seen[url]
        if depth != max_depth:
            for link in get_links(html):
                if re.match(link_regex, link):
                    link = urlparse.urljoin(seed_url, link)
                    # skip links that have already been seen
                    if link not in seen:
                        seen[link] = depth + 1
                        crawl_queue.append(link)

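# crawl_link above collects scrape_callback(url, html) into a list, treating a
# None return as an empty list. A hypothetical callback matching that shape
# (purely illustrative, not part of the original project):
import re


def title_callback(url, html):
    """Report the page title and return no extra links."""
    match = re.search(r'<title>(.*?)</title>', html, re.IGNORECASE | re.DOTALL)
    print('{} -> {}'.format(url, match.group(1).strip() if match else '(no title)'))
    return []
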
import json
import threading
import time
import selenium
import os
from lxml.html import fromstring
import random

from link_crawler import download
from selenium_auth import get_driver, login
from throttle import Throttle
from helpers import jsonify

SLEEP_TIME = 2
CURRENT_DIR = os.getcwd()
throttle = Throttle(3)

# save cookies to json file to prevent multiple log ins
try:
    with open('cookies') as f_in:
        cookies = json.load(f_in)
except FileNotFoundError:
    driver = get_driver()
    login(driver)
    cookies = driver.get_cookies()
    driver.quit()
    with open('cookies', 'w') as f_out:
        json.dump(cookies, f_out)
except:
    print('Something went wrong during log in')
    raise

def rotate_log_files(options):
    with request_lock(options['lock_file']) as acquired:
        if not acquired:
            logger.warn('Not rotating, previous job still underway')
            return
        # Check we can send signals to all relevant processes
        pids_for_processes = running_processes_by_name(
            options['reopen_file_signals'].keys())
        unkillable_processes = set()
        for process_name in options['reopen_file_signals'].keys():
            pids = pids_for_processes[process_name]
            try:
                for pid in pids:
                    kill_if_running(pid, 0)
            except OSError:
                unkillable_processes.add(process_name)
        if unkillable_processes:
            logger.error('Cannot send signal to some processes, aborting: %s'
                         % ', '.join(unkillable_processes))
            return
        files_to_rotate = [
            file for file in os.listdir(options['log_directory'])
            if fnmatch.fnmatch(file, options['filename_filter'])
        ]
        rotation_suffix = datetime.datetime.now().strftime(
            options['timestamp_format'])
        filename_mapping = {
            file: file + rotation_suffix for file in files_to_rotate
        }
        # Move all files
        rotated_files = []
        for original_name, rotated_name in filename_mapping.items():
            original_path = os.path.join(options['log_directory'], original_name)
            rotated_path = os.path.join(options['log_directory'], rotated_name)
            if not os.path.exists(rotated_path):
                os.rename(original_path, rotated_path)
                rotated_files.append(rotated_name)
            else:
                logger.warning(
                    'Did not rotate file. File called %s already existed',
                    rotated_path)
        # Run kick commands
        pids_for_processes = running_processes_by_name(
            options['reopen_file_signals'].keys())
        for process_name, signal_name in options['reopen_file_signals'].items():
            signal_id = getattr(signal, 'SIG' + signal_name.upper())
            pids = pids_for_processes[process_name]
            for pid in pids:
                kill_if_running(pid, signal_id)
        throttle_file_checks = Throttle(FILE_OPEN_CHECK_INTERVAL)
        checks_without_closed_files = 0
        s3_store = S3LogStore(options)
        # Get files which have no open handles and process them as soon as we can.
        # Files with open handles wait until next time through the loop. We throttle
        # to avoid checking too often.
        # TODO: Should we also pick up and retry copying any gz files which we could not
        # copy to s3 last time around?
        open_files = rotated_files
        while open_files:
            throttle_file_checks.wait()
            closed_files, open_files = check_for_open_files(open_files)
            for ready_file in closed_files:
                try:
                    ready_path = os.path.join(options['log_directory'], ready_file)
                    compressed_path = compress_file(ready_path)
                    s3_store.store_file(compressed_path)
                    os.unlink(compressed_path)
                except:
                    logger.error('Unexpected error processing %s', ready_file,
                                 exc_info=True)
            if len(closed_files):
                checks_without_closed_files = 0
            else:
                checks_without_closed_files += 1
            if checks_without_closed_files > MAX_CHECKS_WITHOUT_FILE_CLOSED:
                logger.error('Gave up waiting for files to close. Open files: %s'
                             % ', '.join(open_files))
                return

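# rotate_log_files calls throttle_file_checks.wait() with no URL, so this
# Throttle looks like a plain fixed-interval limiter rather than the per-domain
# variant used by the crawler snippets. A minimal sketch under that assumption
# (not the project's actual class):
import time


class IntervalThrottle:
    """Block in wait() until at least `interval` seconds since the last call."""

    def __init__(self, interval):
        self.interval = interval
        self.last = None

    def wait(self):
        if self.last is not None:
            remaining = self.interval - (time.time() - self.last)
            if remaining > 0:
                time.sleep(remaining)
        self.last = time.time()
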
from flask import (abort, after_this_request, Flask, request, render_template,
                   url_for)
from flask_cache import Cache
from flask_jsonpify import jsonify
from raven.contrib.flask import Sentry
from twilio import TwilioRestException

from models import db, aggregate_stats, log_call, call_count, call_list
from political_data import PoliticalData
from cache_handler import CacheHandler
from fftf_leaderboard import FFTFLeaderboard
from access_control_decorator import crossdomain, requires_auth

try:
    from throttle import Throttle
    throttle = Throttle()
except ImportError:
    throttle = None

app = Flask(__name__)
app.config.from_object('config.ConfigProduction')
cache = Cache(app, config={'CACHE_TYPE': 'simple'})
sentry = Sentry(app)
# db.init_app(app)  # JL HACK ~ disable mysql

# Optional Redis cache, for caching Google spreadsheet campaign overrides
cache_handler = CacheHandler(app.config['REDIS_URL'])

html = self.cache[url].get("html") if self.cache[url] else None
if html is None:
    html = self.downloader.download(url)
    self.cache.collection.update_one(
        {"_id": url}, {"$set": {"html": html}}, upsert=True
    )
    self.throttle.wait(url)
else:
    print("load user html from cache {}".format(current_user))
user_dict = self.parser.parse2dict(current_user, html)
self.cache[current_user] = user_dict
if len(url_queue) < 1000 - finished:
    add_user_in_queue(user_dict)


if __name__ == "__main__":
    downloader = ZhihuDownloader()
    # phone_number = "your phone"
    # password = "******"
    mongo_cache = MongoCache(db_name="zhihu", collection_name="default")
    parser = ZhihuParser()
    saver = ZhihuSaver
    begin = Crawler(
        cache=mongo_cache,
        downloader=downloader,
        parser=parser,
        begin_url=BEGIN_URL,
        throttle=Throttle(delay=1),
    )
    begin.run()

def create_message_typecode(message_type_code):
    if message_type_code == MXP_MESSAGE_ACKNOWLEDGE:
        return Acknowledge()
    if message_type_code == MXP_MESSAGE_KEEPALIVE:
        return Keepalive()
    if message_type_code == MXP_MESSAGE_THROTTLE:
        return Throttle()
    if message_type_code == MXP_MESSAGE_CHALLENGE_REQUEST:
        return ChallengeRequest()
    if message_type_code == MXP_MESSAGE_CHALLENGE_RESPONSE:
        return ChallengeResponse()
    if message_type_code == MXP_MESSAGE_JOIN_REQUEST:
        return JoinRequest()
    if message_type_code == MXP_MESSAGE_JOIN_RESPONSE:
        return JoinResponse()
    if message_type_code == MXP_MESSAGE_LEAVE_REQUEST:
        return LeaveRequest()
    if message_type_code == MXP_MESSAGE_LEAVE_RESPONSE:
        return LeaveResponse()
    if message_type_code == MXP_MESSAGE_INJECT_REQUEST:
        return InjectRequest()
    if message_type_code == MXP_MESSAGE_INJECT_RESPONSE:
        return InjectResponse()
    if message_type_code == MXP_MESSAGE_MODIFY_REQUEST:
        return ModifyRequest()
    if message_type_code == MXP_MESSAGE_MODIFY_RESPONSE:
        return ModifyResponse()
    if message_type_code == MXP_MESSAGE_EJECT_REQUEST:
        return EjectRequest()
    if message_type_code == MXP_MESSAGE_EJECT_RESPONSE:
        return EjectResponse()
    if message_type_code == MXP_MESSAGE_INTERACT_REQUEST:
        return InteractRequest()
    if message_type_code == MXP_MESSAGE_INTERACT_RESPONSE:
        return InteractResponse()
    if message_type_code == MXP_MESSAGE_EXAMINE_REQUEST:
        return ExamineRequest()
    if message_type_code == MXP_MESSAGE_EXAMINE_RESPONSE:
        return ExamineResponse()
    if message_type_code == MXP_MESSAGE_ATTACH_REQUEST:
        return AttachRequest()
    if message_type_code == MXP_MESSAGE_ATTACH_RESPONSE:
        return AttachResponse()
    if message_type_code == MXP_MESSAGE_DETACH_REQUEST:
        return DetachRequest()
    if message_type_code == MXP_MESSAGE_DETACH_RESPONSE:
        return DetachResponse()
    if message_type_code == MXP_MESSAGE_HANDOVER_REQUEST:
        return HandoverRequest()
    if message_type_code == MXP_MESSAGE_HANDOVER_RESPONSE:
        return HandoverResponse()
    if message_type_code == MXP_MESSAGE_LIST_BUBBLES_REQUEST:
        return ListBubblesRequest()
    if message_type_code == MXP_MESSAGE_LIST_BUBBLES_RESPONSE:
        return ListBubblesResponse()
    if message_type_code == MXP_MESSAGE_PERCEPTION_EVENT:
        return PerceptionEvent()
    if message_type_code == MXP_MESSAGE_MOVEMENT_EVENT:
        return MovementEvent()
    if message_type_code == MXP_MESSAGE_DISAPPEARANCE_EVENT:
        return DisappearanceEvent()
    if message_type_code == MXP_MESSAGE_HANDOVER_EVENT:
        return HandoverEvent()
    if message_type_code == MXP_MESSAGE_ACTION_EVENT:
        return ActionEvent()
    if message_type_code == MXP_MESSAGE_SYNCHRONIZATION_BEGIN_EVENT:
        return SynchronizationBeginEvent()
    if message_type_code == MXP_MESSAGE_SYNCHRONIZATION_END_EVENT:
        return SynchronizationEndEvent()

def setUp(self):
    self.mock_time = MockTime()
    self.test_throttle = Throttle(10, time=self.mock_time)

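# Throttle(10, time=self.mock_time) suggests the throttle accepts an injected
# time source so tests can control the clock. One way such a MockTime test
# double might look; this is an assumption, not the project's actual helper.
class MockTimeSketch:
    """Fake clock: time() returns a controllable value, sleep() just advances it."""

    def __init__(self, start=0.0):
        self.current = start
        self.sleep_calls = []

    def time(self):
        return self.current

    def sleep(self, seconds):
        self.sleep_calls.append(seconds)
        self.current += seconds
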
from throttle import Throttle
from download import download, search_codes
from string import ascii_lowercase
import re
import csv
import itertools

throttle = Throttle(0)

"""
So, money.rediff.com has the simplest structure of scripcodes I found.
All scrips are categorized by their first letter. In this code I am first
finding out how many stocks there are starting with each letter.
The count of the stocks is recorded in the list 'index' below.
"""
index = []
for x in ascii_lowercase:
    throttle.wait('https://money.rediff.com')
    html = str(download('https://money.rediff.com/companies/{}'.format(x)))
    len = re.search('>Showing 1 - (.*?) of (.*?) ', html)
    index.append(int(len.group(2)))

"""
Once I have all the stocks by their first letter, I iterate through every page
in the structure to find the regex for the scripcode, which is a 6-digit number.
I know the variables look ugly, but the code is functional and will only be run
once in a blue moon. I'll improve on it later on, if I get time.
Basically, this is an unintelligent iterative crawler.
"""
ctr = 0
b = []
prod = []
for x in ascii_lowercase:
    throttle.wait('https://money.rediff.com')
    for i in itertools.count(1, 200):
        limit = index[ctr]
        if (i > limit):
            break
        b = search_codes('https://money.rediff.com/companies/{}/{}-{}'.format(

def link_crawler(start_url, link_regex, robots_url=None, user_agent='statista',
                 max_depth=-1, delay=3, proxies=None, num_retries=2, cache=None,
                 scraper_callback=None):
    #: Initialize a crawl queue with a seed url to start the crawl from
    crawl_queue = [start_url]
    #: keep track of seen urls
    seen = {}
    robots = {}
    throttle = Throttle(delay)
    #: start the crawl
    while crawl_queue:
        url = crawl_queue.pop()
        #: robots.txt
        robots_file_present = False
        if 'http' not in url:
            continue
        #: Get the domain
        domain = '{}://{}'.format(urlparse(url).scheme, urlparse(url).netloc)
        #: Get the robot parser for this domain from the robots dictionary
        robot_parser = robots.get(domain)
        #: set a default robots url and a parser for it if there isn't one
        if not robot_parser and domain not in robots:
            robots_url = '{}/robots.txt'.format(domain)
            robot_parser = get_robots_parser(robots_url)
            if not robot_parser:
                #: continue to crawl even if there are problems finding the
                #: robots.txt file
                robots_file_present = True
            # associate each domain with a corresponding parser, whether
            # present or not
            robots[domain] = robot_parser
        elif domain in robots:
            robots_file_present = True
        #: crawl only when url passes robots.txt restrictions
        if robots_file_present or robot_parser.can_fetch(user_agent, url):
            depth = seen.get(url, 0)
            if depth == max_depth:
                #: Skip link if you have crawled it more than max depth
                print('Skipping %s due to depth' % url)
                continue
            throttle.wait(url)
            html = download(url, num_retries=num_retries)
            if not html:
                continue
            if scraper_callback:
                scraper_callback(url, html)
            #: Get all links from page and filter only those matching given pattern
            for link in get_links(html):
                if re.search(link_regex, link):
                    if 'http' not in link:
                        # check if link is well formed and correct
                        if link.startswith('//'):
                            link = '{}:{}'.format(urlparse(url).scheme, link)
                        elif link.startswith('://'):
                            link = '{}{}'.format(urlparse(url).scheme, link)
                        else:
                            link = urljoin(domain, link)
                    if link not in seen:
                        seen[link] = depth + 1
                        crawl_queue.append(link)
        else:
            print('Blocked by robots.txt:', url)

date_for_load = datetime.datetime.now()
if args.d:
    try:
        args_date = datetime.datetime.strptime(args.d, "%d/%m/%Y")
        # logging.debug("args date is: ")
        # logging.debug(args_date)
        date_for_load = args_date
    except Exception as e:
        logging.error("Invalid command line parameter:")
        logging.error(args)
        logging.error(e)
        date_for_load = datetime.datetime.now()

# init delay
throttle = Throttle(config.delay)

# create new loader instances
ldr_kz_nb = Loader_KZ_NB()
ldr_kz_bai_alfa = Loader_KZ_bai_alfa()
kz_bai_halyk_cash_ldr = Loader_KZ_bai_halyk_cash()
kz_bai_halyk_cards_ldr = Loader_KZ_bai_halyk_cards()
kz_bai_kkb_cash_ldr = Loader_KZ_bai_kkb_cash()
kz_bai_kkb_cards_ldr = Loader_KZ_bai_kkb_cards()

# here is the place for adding an instance into the loaders list
loaders_list = [
    ldr_kz_nb,
    ldr_kz_bai_alfa,
    kz_bai_halyk_cash_ldr,
    kz_bai_halyk_cards_ldr,
    kz_bai_kkb_cash_ldr,
    kz_bai_kkb_cards_ldr
]
