def site_check_process_wraper(func, func_arg=(), func_kwarg=None, callback=None, Memlimit=200, external_stop=None):
    mem_pro = MemoryControlPs(func, func_arg, func_kwarg, callback, Memlimit, external_stop)
    mem_pro.start()
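
# NOTE (editorial): MemoryControlPs is imported from another module of this project and is not
# defined in this file. The class below is only an illustrative sketch of what such a wrapper
# could look like, assuming it runs `func` in a child process and terminates it once resident
# memory exceeds `mem_limit` MB or an external stop event fires. The name, parameters and the
# use of the third-party `psutil` package are assumptions, not the project's actual implementation.
import multiprocessing
import time

import psutil  # assumed dependency, used by this sketch only


class MemoryControlPsSketch:
    def __init__(self, func, func_arg=(), func_kwargs=None, callback=None,
                 mem_limit=200, external_stop_event=None):
        self._func = func
        self._func_arg = func_arg
        self._func_kwargs = func_kwargs if func_kwargs is not None else {}
        self._callback = callback
        self._mem_limit = mem_limit  # megabytes of resident memory allowed for the child
        self._external_stop_event = external_stop_event

    def start(self):
        child = multiprocessing.Process(target=self._func, args=self._func_arg,
                                        kwargs=self._func_kwargs)
        child.start()
        monitor = psutil.Process(child.pid)
        while child.is_alive():
            if self._external_stop_event is not None and self._external_stop_event.is_set():
                child.terminate()  # the caller asked the whole pipeline to stop
                break
            try:
                rss_mb = monitor.memory_info().rss / (1024 * 1024)
            except psutil.NoSuchProcess:
                break
            if rss_mb > self._mem_limit:
                child.terminate()  # the real wrapper may restart the child instead of stopping
                break
            time.sleep(1)
        child.join()
        if self._callback is not None:
            self._callback(None)  # the real wrapper presumably passes back the child's last state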
def checking_whois():
    # optinmal = self.max_prcess * self.concurrent_page/5
    optinmal = 260 * 3 / 5
    if optinmal < 10:
        worker_number = 10
    else:
        worker_number = int(optinmal)
    mem_limit = 1000
    if mem_limit < 200:
        mem_limit = 200
    stop_event = Event()
    kwargs = {"is_debug": True, "stop_event": stop_event, "max_worker": worker_number}
    whois_process_wrapper = MemoryControlPs(whois_process, func_kwargs=kwargs, mem_limit=mem_limit,
                                            external_stop_event=stop_event)
    whois_process_wrapper.start()
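
# Editorial note: with the hard-coded values above, optinmal = 260 * 3 / 5 = 156, so
# worker_number becomes 156, and the 200 MB floor on mem_limit never applies because
# mem_limit starts at 1000. The commented-out line shows the instance-based formula that
# the method version of checking_whois() below uses instead.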
class SiteCheckProcessManager(Thread, SiteCheckerController):
    MEM_MINIMUM_REQ = 100

    def __init__(self, job_name: str = "", input_Q: multiprocessing.Queue = None, max_procss=4, concurrent_page=1,
                 page_max_level=10, max_page_per_site=1000, output_delegate=None, memory_limit_per_process=100,
                 **kwargs):
        """
        :param job_name:
        :param input_Q:
        :param max_procss:
        :param concurrent_page:
        :param page_max_level:
        :param max_page_per_site:
        :param output_delegate:
        :param memory_limit_per_process: values below 100 MB are rejected (a ValueError is logged)
        :param kwargs:
        :return:
        """
        Thread.__init__(self)
        #FeedbackInterface.__init__(**kwargs)
        #super(SiteCheckProcessManager, self).__init__(**kwargs)
        #self.process_queue = multiprocessing.Queue()
        self.name = job_name
        if max_procss <= 0:
            max_procss = 1
        self.max_prcess = max_procss
        if input_Q is None:
            self.inputQueue = multiprocessing.Queue()
        else:
            self.inputQueue = input_Q
        self.outputQueue = multiprocessing.Queue()
        self._whoisQueue = multiprocessing.Queue()
        #self.output_lock = threading.RLock()
        #self.tempList = site_list  # if there is a need to add new sites during scripting, add to this list
        self.processPrfix = "Process-"
        self.threadPrfix = "Thread-"
        self.page_max_level = page_max_level
        self.max_page_per_site = max_page_per_site
        if output_delegate is None:
            self.output_delegate = self.default_delegate
        else:
            self.output_delegate = output_delegate  # delegate of type f(x: OnSiteLink)
        self.stop_event = multiprocessing.Event()
        self.finished = False
        self.pool = ThreadPool(processes=self.max_prcess)
        #self.pool = multiprocessing.Pool(processes=self.max_prcess)
        self.output_thread = None
        self.job_all = 0
        self.job_done = 0
        self.job_waiting = 0
        self.total_page_done = 0
        self.page_per_sec = 0  # need to do this
        self.average_page_per_site = 0
        self.patch_limit = self.max_prcess
        self.temp_results = []
        self.site_info = []  # collect site info after the job is done
        self.db_trash_list = []
        self.concurrent_page = concurrent_page
        self.continue_lock = threading.RLock()
        self.db_trash_lock = threading.RLock()
        self.state_lock = threading.RLock()
        self.temp_result_lock = threading.RLock()
        self.site_info_lock = threading.RLock()
        if memory_limit_per_process < SiteCheckProcessManager.MEM_MINIMUM_REQ:
            ex = ValueError("minimum memory requirement to run the crawler is 100 MB, "
                            "otherwise there is too much memory-control looping.")
            msg = "error in SiteCheckProcessManager.__init__(), with database: " + job_name
            ErrorLogger.log_error("SiteCheckProcessManager", ex, msg)
        self.memory_limit_per_process = memory_limit_per_process
        self.whois_process = None
        self.whois_queue_process = Process(target=run_queue_server)
        #self.input_iter = SiteInputIter(self.inputQueue, self, self.concurrent_page, self.page_max_level,
        #                                self.max_page_per_site, self.outputQueue, self.process_site_info)
        self.input_iter = SiteInputIter(self.inputQueue, func=site_check_process, external_stop=self.stop_event)

    def _create_all_file_dirs(self):
        try:
            FileHandler.create_file_if_not_exist(get_log_dir())
            FileHandler.create_file_if_not_exist(get_recovery_dir_path())
            FileHandler.create_file_if_not_exist(get_temp_db_dir())
            FileHandler.create_file_if_not_exist(get_task_backup_dir())
            FileHandler.create_file_if_not_exist(get_db_buffer_default_dir())
        except Exception as ex:
            ErrorLogger.log_error("SiteCheckProcessManager", ex, "_create_all_file_dirs()")

    def clear_cache(self):
        try:
            FileHandler.clear_dir(get_log_dir())
            FileHandler.clear_dir(get_recovery_dir_path())
            FileHandler.clear_dir(get_temp_db_dir())
            FileHandler.clear_dir(get_task_backup_dir())
            FileHandler.clear_dir(get_db_buffer_default_dir())
        except Exception as ex:
            ErrorLogger.log_error("SiteCheckProcessManager", ex, "clear_cache()")

    def set_system_limit(self):
        try:
            os.system('sudo -s')
            os.system('ulimit -n 204800')
            # os.system('ulimit -s 1024')
        except Exception as ex:
            print(ex)

    def get_temp_result_count(self):
        #with self.temp_result_lock:
        return len(self.temp_results)

    def get_temp_result_and_clear(self) -> []:
        with self.temp_result_lock:
            copied = self.temp_results.copy()
            self.temp_results.clear()
            return copied

    def default_delegate(self, result):
        with self.temp_result_lock:
            if isinstance(result, OnSiteLink):
                self.temp_results.append(result)
                # makes no difference
                #CsvLogger.log_to_file("ExternalSiteTemp", [(result.link, result.response_code), ])
            elif isinstance(result, str):
                self.temp_results.append(result)
            elif isinstance(result, tuple) and len(result) == 2:
                temp = OnSiteLink(result[0], result[1])
                print("new domain:", temp)
                self.temp_results.append(temp)
            else:
                pass

    def get_state(self) -> SiteCheckProcessState:
        print("get state from slave crawler")
        with self.state_lock:
            state = SiteCheckProcessState(self.job_all, self.job_done, self.job_waiting, self.total_page_done,
                                          self.average_page_per_site, self.get_temp_result_count())
        print("get state from slave crawler finished")
        return state

    def get_filter_progress(self):
        if isinstance(self.whois_process, MemoryControlPs):
            state = self.whois_process.get_last_state()
            if isinstance(state, WhoisCheckerState):
                return state.progress_count, state.data_total
            else:
                return 0, 0
        else:
            return 0, 0

    def clear_trash(self):  # run with a thread
        while not self.stop_event.is_set():
            with self.db_trash_lock:
                removed_list = []
                trash_len = len(self.db_trash_list)
                if trash_len > 0:
                    for item in self.db_trash_list:
                        if TempDBInterface.force_clear(item):
                            #print("removed trash:", item)
                            removed_list.append(item)
                    for removed_item in removed_list:
                        self.db_trash_list.remove(removed_item)
                    CsvLogger.log_to_file("job_finished",
                                          [(x, str(datetime.datetime.now())) for x in removed_list],
                                          get_task_backup_dir())
                    removed_list.clear()
            time.sleep(2)

    def put_to_input_queue(self, data: []):
        if data is not None:
            for item in data:
                self.inputQueue.put(item)
                self.job_all += 1

    def get_site_info_list_and_clear(self):
        with self.site_info_lock:
            copied = self.site_info.copy()
            self.site_info.clear()
            return copied

    def get_site_info_list_count(self):
        return len(self.site_info)

    def process_site_info(self, site_info):
        if site_info is not None:
            with self.site_info_lock:
                PrintLogger.print("finished site info: " + str(site_info.__dict__))
                self.site_info.append(site_info)

    def process_feedback(self, feedback: SiteFeedback):
        self.add_page_done(feedback.page_done)
        if feedback.finished:
            # print("should process feedback!")
            self.site_finished()
            self.process_site_info(feedback.seed_feedback)
            with self.db_trash_lock:
                self.db_trash_list.append(feedback.datasource_ref)
                self.db_trash_list.append(feedback.datasource_ref + ".ext.db")

    def add_page_done(self, number_page_done: int):  # make sure it is thread safe
        with self.state_lock:
            self.total_page_done += number_page_done
            time.sleep(0.001)

    def site_finished(self):
        # print("one more site done")
        with self.state_lock:
            self.job_done += 1
            self.average_page_per_site = self.total_page_done / self.job_done
            time.sleep(0.001)

    def set_stop(self):
        self.stop_event.set()

    def can_continue(self):
        return not self.stop_event.is_set()

    def checking_whois(self):
        optinmal = self.max_prcess * self.concurrent_page / 5
        if optinmal < 10:
            worker_number = 10
        else:
            worker_number = int(optinmal)
        mem_limit = self.memory_limit_per_process / 2
        if mem_limit < 200:
            mem_limit = 200
        self.whois_process = MemoryControlPs(whois_process,
                                             func_kwargs=WhoisChecker.get_input_parameters(
                                                 self._whoisQueue, self.outputQueue,
                                                 self.stop_event, worker_number),
                                             mem_limit=mem_limit,
                                             external_stop_event=self.stop_event)
        self.whois_process.start()

    def queue_failure_reset(self):
        manager, self.outputQueue = get_queue_client(QueueManager.MachineSettingCrawler,
                                                     QueueManager.Method_Whois_Output)
        return self.outputQueue

    def run(self):
        # self.set_system_limit()
        self._create_all_file_dirs()
        self.whois_queue_process.start()
        whois_thread = Thread(target=self.checking_whois)
        trash_clean_thread = Thread(target=self.clear_trash)
        manager, self.outputQueue = get_queue_client(QueueManager.MachineSettingCrawler,
                                                     QueueManager.Method_Whois_Output)
        # self.output_thread = outputThread(0, self.threadPrfix+"Output", self.stop_event, self.outputQueue,
        #                                   delegate=self.output_delegate, failsure_reset_queue=self.queue_failure_reset)
        self.output_thread = outputThread(threadID=0, name=self.threadPrfix + "Output", stop_event=self.stop_event,
                                          inputQ=self.outputQueue, delegate=self.output_delegate,
                                          failsure_reset_queue=self.queue_failure_reset)
        self.output_thread.start()
        trash_clean_thread.start()
        whois_thread.start()
        # self.whois_queue_process.start()
        self.input_iter.func_kwarg = SiteThreadChecker.get_input_parameter(
            full_link="",  # this parameter will be updated in self.input_iter
            max_page=self.max_page_per_site,
            max_level=self.page_max_level,
            output_queue=self._whoisQueue,
            pool_size=self.concurrent_page)
        self.input_iter.callback = self.process_feedback
        self.input_iter.Memlimit = self.memory_limit_per_process
        try:
            #print("monitor process started: pid: ", os.getpid())
            self.pool.imap(site_check_process_iter, self.input_iter, 1)
            #self.pool.imap_unordered(site_check_process_iter, self.input_iter)
            while self.can_continue():
                time.sleep(0.5)
        except Exception as ex:
            msg = "run(), with database: " + self.name
            ErrorLogger.log_error("SiteCheckProcessManager", ex, msg)
        finally:
            print("terminate miner!")
            self.pool.terminate()
            whois_thread.join()
            self.whois_queue_process.terminate()
            self.temp_results.clear()
            self.site_info.clear()
            self.finished = True
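
# Editorial usage sketch (not part of the original module): one way the manager above could be
# driven. It assumes the module's real imports and dependencies are available; the job name,
# seed URLs and limits below are made-up values for illustration only.
def _demo_run_site_check_manager():
    manager = SiteCheckProcessManager(job_name="demo_job",
                                      max_procss=2,              # small values for a local smoke test
                                      concurrent_page=2,
                                      memory_limit_per_process=200)
    manager.put_to_input_queue(["http://example.com", "http://example.org"])  # seed items
    manager.start()                       # run() wires up the whois, output and trash-cleaning workers
    time.sleep(60)                        # let it crawl for a while
    print(manager.get_state().__dict__)   # job/page counters collected under state_lock
    manager.set_stop()                    # sets the shared stop event; run() then tears everything down
    manager.join()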
class MiningMasterController(threading.Thread):
    def __init__(self, accounts: list = [], ref="", cap_slave=0, cap_slave_process=1, cap_concurrent_page=1,
                 all_job=0, offset=0, max_page_level=100, max_page_limit=1000, loopback_database=False,
                 refresh_rate=10, min_page_count=0, filters=DBFilterCollection(), crawl_matrix=CrawlMatrix(),
                 filtering_only_mode=False, filtering_offset=0, filtering_total=0):
        """
        Init a master controller.
        :param ref: database table reference
        :param cap_slave: max number of slaves
        :param all_job:
        :return:
        """
        print("MiningMasterController.__init__")
        print("setup data:")
        if isinstance(accounts, list):
            print("accounts: ")
            for item in accounts:
                print(item)
        print("ref:", ref)
        print("cap_slave:", cap_slave)
        print("cap_slave_process:", cap_slave_process)
        print("cap_concurrent_page:", cap_concurrent_page)
        if crawl_matrix is not None:
            print("crawl matrix:", crawl_matrix.__dict__)
        threading.Thread.__init__(self)
        self.state = ServerState.State_Init
        self.ref = ref  # database
        self.slaves = []
        self.auto_scale_slaves(cap_slave)
        self.cap_slave_process = cap_slave_process  # how many processes a slave can run; 0 means auto scale
        self.concurrent_page = cap_concurrent_page
        self.stop_Mining = False
        self.job_done = 0
        self.job_wait = 0
        self.job_allocated = 0
        self.job_all = all_job
        self.offset = offset
        self.max_page_level = max_page_level
        self.max_page_limit = max_page_limit
        self.start_time = time.time()
        self.end_time = time.time()
        self.loopback_database = False  # NOTE: the loopback_database argument is not applied here; the flag always starts as False
        self.refresh_rate = refresh_rate
        self.in_progress = False
        self.min_page_count = min_page_count  # only crawl sites with a page count greater than this number
        self.db_seed = None
        if filters is None:
            self.db_filters = DBFilterCollection()
            self.db_filters.external_filter.update_interval = 30
            self.db_filters.filtered_result.update_interval = 30
            self.db_filters.seed_filter.update_interval = 1200
        else:
            self.db_filters = filters
        self.filter_shadow = filters.copy_attrs()
        self.db_stats = []
        self.seed_db_update_time = time.time()
        self.external_db_update_time = time.time()
        self.filtered_db_update_time = time.time()
        self.db_update_lock = threading.RLock()
        self._seed_db_lock = RLock()
        self._external_db_lock = RLock()
        self._result_db_lock = RLock()
        self._result_bad_db_lock = RLock()
        self._redemption_db_lock = RLock()
        self.update_db_stats(force_update=True)
        self._stop_event = Event()
        # this is for filters
        self._filter_input_queue = Queue()
        self._filter_output_queue = Queue()
        self.filter_process = None
        if isinstance(crawl_matrix, CrawlMatrix) and crawl_matrix.tf == 0:
            self._filter_matrix = CrawlMatrix(tf=15, cf=15, da=15, ref_domains=10, tf_cf_deviation=0.80)
        else:
            self._filter_matrix = crawl_matrix
        self._accounts = accounts
        self._filtering_only = filtering_only_mode
        self._filtering_offset = filtering_offset
        self._filtering_total = filtering_total

    def update_db_stats(self, force_update=False):
        print("update db stats, do not interrupt!")
        if self.filter_shadow is not None:
            names = SiteSource.get_all_table_names(SiteSource.Seed)
            if len(names) > 0:
                databases = []
                fil = self.filter_shadow
                if force_update:
                    for name in names:
                        if name is not None and len(name) > 0:
                            with self._seed_db_lock:
                                seed = SeedSiteDB(name, db_filter=fil.seed_filter)
                                seed_count = seed.site_count()
                                seed.close()
                            with self._external_db_lock:
                                external = ExternalSiteDB(name, db_filter=fil.external_filter)
                                external_count = external.site_count()
                                external.close()
                            with self._result_db_lock:
                                filtered = FilteredResultDB(name, db_filter=fil.filtered_result)
                                filtered_count = filtered.site_count()
                                filtered.close()
                            with self._result_bad_db_lock:
                                filtered_bad = FilteredResultDB(name, bad_db=True, db_filter=fil.filtered_result)
                                filtered_count_bad = filtered_bad.site_count()
                                filtered_bad.close()
                            x = DatabaseStatus(name, seed_count, external_count, filtered_count, filtered_count_bad)
                            databases.append(x)
                    self.seed_db_update_time = time.time()
                    self.external_db_update_time = time.time()
                    self.filtered_db_update_time = time.time()
                    self.db_stats = databases
                    #return databases
                else:
                    time_now = time.time()
                    if len(self.db_stats) == 0:
                        for name in names:
                            self.db_stats.append(DatabaseStatus(name=name))
                    else:
                        dying_db = [x for x in self.db_stats if x.name not in names]
                        for item in dying_db:
                            self.db_stats.remove(item)
                    external_need_update = time_now - self.external_db_update_time > fil.external_filter.update_interval
                    if external_need_update:
                        self.external_db_update_time = time.time()
                    seed_need_update = time_now - self.seed_db_update_time > fil.seed_filter.update_interval
                    if seed_need_update:
                        self.seed_db_update_time = time.time()
                    filterd_need_update = time_now - self.filtered_db_update_time > fil.filtered_result.update_interval
                    if filterd_need_update:
                        self.filtered_db_update_time = time.time()
                    for name in names:  # update stats
                        db_s = next((x for x in self.db_stats if name == x.name), None)
                        if db_s is None and len(name) > 0:
                            db_s = DatabaseStatus(name)
                            self.db_stats.append(db_s)
                        if db_s is not None:
                            if seed_need_update:
                                seed = SeedSiteDB(name, db_filter=fil.seed_filter)
                                db_s.seeds = seed.site_count()
                                seed.close()
                            if external_need_update:
                                external = ExternalSiteDB(name, db_filter=fil.external_filter)
                                db_s.results = external.site_count()
                                external.close()
                            if filterd_need_update:
                                filtered = FilteredResultDB(name, db_filter=fil.filtered_result)
                                db_s.filtered = filtered.site_count()
                                filtered.close()
                                filtered_bad = FilteredResultDB(name, bad_db=True, db_filter=fil.filtered_result)
                                db_s.bad_filtered = filtered_bad.site_count()
                                filtered_bad.close()
                    #return self.db_stats
            else:
                pass
                #return []
        else:
            pass
            #return []
        print("update db stats completed")

    def remove_db(self, db_type: str, db_name: str):
        if db_type == DBType.Type_All:
            with self._seed_db_lock:
                seed = SeedSiteDB(db_name)
                seed.drop_table()
                seed.close()
            with self._external_db_lock:
                external = ExternalSiteDB(db_name)
                external.drop_table()
                external.close()
            with self._result_db_lock:
                filtered = FilteredResultDB(db_name)
                filtered.drop_table()
                filtered.close()
            with self._result_bad_db_lock:
                filtered_bad = FilteredResultDB(db_name, bad_db=True)
                filtered_bad.drop_table()
                filtered_bad.close()
        elif db_type == DBType.Type_External:
            with self._external_db_lock:
                external = ExternalSiteDB(db_name)
                external.drop_table()
                external.close()
        elif db_type == DBType.Type_Filtered_Result:
            with self._result_db_lock:
                filtered = FilteredResultDB(db_name)
                filtered.drop_table()
                filtered.close()
        elif db_type == DBType.Type_Filtered_Result_Bad:
            with self._result_bad_db_lock:
                filtered_bad = FilteredResultDB(db_name, bad_db=True)
                filtered_bad.drop_table()
                filtered_bad.close()
        self.update_db_stats(force_update=True)

    def add_seeds(self, seed):
        if isinstance(seed, MiningList):
            try:
                with self._seed_db_lock:
                    db = SeedSiteDB(seed.ref)
                    db.add_sites(seed.data)
                    db.close()
                self.update_db_stats(force_update=True)
            except Exception as ex:
                ErrorLogger.log_error("MiningMasterController.add_seeds()", ex, seed.ref)

    def get_db_stats(self):
        #print("copy db stats and send back")
        stats = MiningList(self.ref, self.db_stats)
        stats_copy = stats.copy_attrs()
        print("copy db stats completed")
        return stats_copy

    def get_filter_progress(self):
        if isinstance(self.filter_process, MemoryControlPs):
            state = self.filter_process.get_last_state()
            if isinstance(state, _FilterState):
                return state.progress, state.all_data
            else:
                return 0, 0
        else:
            return 0, 0

    def clear_host_cache(self):
        try:
            FileHandler.clear_dir(get_log_dir())
            FileHandler.clear_dir(get_recovery_dir_path())
            FileHandler.clear_dir(get_task_backup_dir())
            FileHandler.clear_dir(get_db_buffer_default_dir())
        except Exception as ex:
            ErrorLogger.log_error("MiningControllers", ex, "clear_host_cache()")

    def clear_slave_cache(self):
        if self.state == ServerState.State_Idle:
            threads = []
            for slave in self.slaves:
                if isinstance(slave, Server):
                    threads.append(MiningController(slave, cmd=ServerCommand.Com_Clear_Cache))
            if len(threads) > 0:
                for thread in threads:
                    thread.start()
                for thread in threads:
                    thread.join(30)
                threads.clear()

    def get_db_seed(self):
        return SeedSiteDB(table=self.ref, offset=0, db_filter=self.db_filters.seed_filter)

    def get_db_external(self):
        return ExternalSiteDB(table=self.ref, offset=0, db_filter=self.db_filters.external_filter)

    def get_db_filtered(self):
        return FilteredResultDB(table=self.ref, offset=0, db_filter=self.db_filters.filtered_result)

    def get_db_filtered_bad(self):
        return FilteredResultDB(table=self.ref, offset=0, bad_db=True, db_filter=self.db_filters.filtered_result)

    def get_db_redemption(self):
        return ExternalSiteDB(table="temp", db_addr=get_temp_db_dir() + "Redemption.db")

    def get_db_results(self, db_type: str, db_name: str, index: int, length: int) -> MiningList:
        try:
            if db_type == DBType.Type_Filtered_Result:
                with self._result_db_lock:
                    db = FilteredResultDB(db_name, offset=index)
                    data = db.get_next_patch(count=length, rollover=False)
                    db.close()
            elif db_type == DBType.Type_Filtered_Result_Bad:
                with self._result_bad_db_lock:
                    db = FilteredResultDB(db_name, bad_db=True, offset=index)
                    data = db.get_next_patch(count=length, rollover=False)
                    db.close()
            elif db_type == DBType.Type_External:
                with self._external_db_lock:
                    db = ExternalSiteDB(db_name, offset=index)
                    data = db.get_next_patch(count=length, rollover=False)
                    db.close()
            elif db_type == DBType.Type_Seed:
                with self._seed_db_lock:
                    db = SeedSiteDB(db_name, offset=index)
                    data = db.get_next_patch(count=length, rollover=False)
                    db.close()
            else:
                data = []
            result = MiningList(db_name, data)
            return result
        except Exception as ex:
            ErrorLogger.log_error("MiningController.get_db_results()", ex, db_name + " type:" + db_type)
            return MiningList(db_name, [])

    def auto_scale_slaves(self, count: int):
        pass

    def stop_all_slave(self):
        threads = []
        for slave in self.slaves:
            if isinstance(slave, Server):
                threads.append(MiningController(slave, cmd=ServerCommand.Com_Stop_Mining))
        if len(threads) > 0:
            for thread in threads:
                thread.start()
            for thread in threads:
                thread.join(30)
            threads.clear()

    def setup_minging_slaves(self):  # slaves should restart based on these new settings
        print("setup_minging_slaves....")
        threads = []
        for slave in self.slaves:
            if isinstance(slave, Server):
                threads.append(MiningController(slave, cmd=ServerCommand.Com_Setup,
                                                in_data=SetupData(self.name, cap2=self.cap_slave_process,
                                                                  cap3=self.concurrent_page,
                                                                  max_page_level=self.max_page_level,
                                                                  max_page_limit=self.max_page_limit)))
        if len(threads) > 0:
            for thread in threads:
                thread.start()
            for thread in threads:
                thread.join(30)
            threads.clear()
        print("setup_minging_slaves completed")

    def check_slaves_status(self, timeout=15):
        threads = []
        for slave in self.slaves:
            if isinstance(slave, Server):
                threads.append(MiningController(slave, cmd=ServerCommand.Com_Status))
        if len(threads) > 0:
            for thread in threads:
                thread.start()
            for thread in threads:
                thread.join(timeout)
            threads.clear()
        total_done = 0  # update the number of jobs done and jobs waiting
        wait = 0
        for slave in self.slaves:
            if isinstance(slave, Server):
                print(slave.status)
                total_done += slave.status.done_job
                wait += slave.status.wait_job
        if total_done > self.job_done:
            self.job_done = total_done
        self.job_wait = wait

    @staticmethod
    def is_in_list(data, target_list: []):
        if len(target_list):
            target = next((x for x in target_list if data.domain == x.domain), None)
            return True if target is not None else False
        else:
            return False

    def get_slaves_result(self) -> []:
        threads = []
        result = []
        resultList = []
        for slave in self.slaves:
            if isinstance(slave, Server):
                if isinstance(slave.status, dict):
                    print("in get_slaves_result, data type is wrong")
                    print(slave.status)
                    slave.status = Serializable.get_deserialized(slave.status)
                elif slave.status is not None and slave.status.result > 0:
                    result_holder = []
                    threads.append(MiningController(slave, cmd=ServerCommand.Com_Get_Data, out_data=result_holder))
                    result.append(result_holder)
        if len(threads) > 0:
            for thread in threads:
                thread.start()
            for thread in threads:
                thread.join(30)
            threads.clear()
        if len(result) > 0:
            for item in result:
                if isinstance(item, list) and len(item) > 0 and hasattr(item[0], "data"):
                    # data is the attribute of MiningList
                    data = getattr(item[0], "data")
                    resultList += data
        return resultList

    def is_in_slave_list(self, addr: str) -> bool:
        if addr != "":
            match = next((x for x in self.slaves if x.address.address == addr), None)
            return True if match is not None else False
        else:
            return False

    def add_slaves(self, slaves: []):
        print("adding slave...")
        if slaves is not None:
            for slave in slaves:
                if isinstance(slave, ServerAddress):
                    if slave.address == "localhost":
                        slave.address = "127.0.0.1"
                    if self.is_in_slave_list(slave.address):
                        continue
                    ser = Server(server_type=ServerType.ty_MiningSlaveSmall, address=slave)
                    self.slaves.append(ser)
                elif isinstance(slave, str):
                    print("adding slave:", slave)
                    temp = slave
                    if slave == "localhost":
                        temp = "127.0.0.1"
                    if self.is_in_slave_list(slave):
                        continue
                    ser = Server(server_type=ServerType.ty_MiningSlaveSmall,
                                 address=ServerAddress(temp, MiningTCPServer.DefaultListenPort))
                    self.slaves.append(ser)
        print("adding slave finished.")

    def remove_slaves(self, slaves: []):
        if slaves is not None:
            for slave in slaves:
                found = None
                if isinstance(slave, ServerAddress):
                    found = next(x for x in self.slaves if x.address.address == slave.address)
                elif isinstance(slave, str):
                    found = next(x for x in self.slaves if x.address.address == slave)
                if found is not None:
                    self.slaves.remove(found)

    def get_slaves(self):
        return self.slaves

    def allocate_task(self):
        try:
            threads = []
            with self._seed_db_lock:
                db_seed = SeedSiteDB(offset=self.offset, table=self.ref, db_filter=self.db_filters.seed_filter)
                for slave in self.slaves:
                    if isinstance(slave, Server) and slave.status is not None:
                        if isinstance(slave.status, dict):
                            print("in allocate_task, data type is invalid, the following data was received, need to redo")
                            print(slave.status)
                            slave.status = Serializable.get_deserialized(slave.status)
                            #raise ValueError("slave status is not valid data type")
                        if slave.status.is_server_down():
                            print("server is down, continue..")
                            continue
                        if slave.status.all_job - slave.status.done_job <= slave.status.cap_process * 4:
                            job_temp = int(slave.status.cap_process)  # give roughly a quarter hour's worth of data
                            if job_temp < 5:  # give a minimum of 5 jobs
                                job_temp = 5
                            if not self.loopback_database and job_temp + self.job_allocated > self.job_all:
                                job_temp = self.job_all - self.job_allocated
                            sites = db_seed.get_next_patch(count=job_temp, rollover=self.loopback_database)
                            print("allocate task:", len(sites))
                            self.job_allocated += len(sites)
                            ref = db_seed.tab
                            mlist = MiningList(ref, sites)
                            number_sites = len(sites)
                            if number_sites > 0:
                                # try:
                                #     CsvLogger.log_to_file(slave.address.address, [(link,) for link in sites],
                                #                           dir_path=get_task_backup_dir())
                                # except:
                                #     pass
                                # self.offset += number_sites
                                t = MiningController(slave, cmd=ServerCommand.Com_Data, in_data=mlist)
                                try:
                                    t.start()
                                    t.join(30)
                                    self.offset += number_sites
                                except Exception as inner_ex:
                                    print(inner_ex)
                            else:
                                return
                db_seed.close()
            # if len(threads) > 0:
            #     for thread in threads:
            #         thread.start()
            #     for thread in threads:
            #         thread.join()
            #     threads.clear()
        except Exception as ex:
            ErrorLogger.log_error("MiningMasterController.allocate_task()", ex, self.ref)

    def allocate_task_v1(self):
        try:
            threads = []
            with self._seed_db_lock:
                db_seed = SeedSiteDB(offset=self.offset, table=self.ref, db_filter=self.db_filters.seed_filter)
                for slave in self.slaves:
                    if isinstance(slave, Server) and slave.status is not None:
                        if isinstance(slave.status, dict):
                            print("in allocate_task, data type is invalid, the following data was received, need to redo")
                            print(slave.status)
                            slave.status = Serializable.get_deserialized(slave.status)
                            #raise ValueError("slave status is not valid data type")
                        if slave.status.all_job - slave.status.done_job <= slave.status.cap_process + 2 > 2:
                            job_temp = int(slave.status.cap_process / 2)  # give roughly a quarter hour's worth of data
                            if job_temp < 5:  # give a minimum of 5 jobs
                                job_temp = 5
                            if not self.loopback_database and job_temp + self.job_allocated > self.job_all:
                                job_temp = self.job_all - self.job_allocated
                            sites = db_seed.get_next_patch(count=job_temp, rollover=self.loopback_database)
                            print("allocate task:")
                            print(sites)
                            self.job_allocated += len(sites)
                            ref = db_seed.tab
                            mlist = MiningList(ref, sites)
                            number_sites = len(sites)
                            if number_sites > 0:
                                try:
                                    CsvLogger.log_to_file(slave.address.address, [(link,) for link in sites],
                                                          dir_path=get_task_backup_dir())
                                except:
                                    pass
                                self.offset += number_sites
                                threads.append(MiningController(slave, cmd=ServerCommand.Com_Data, in_data=mlist))
                            else:
                                return
                db_seed.close()
            if len(threads) > 0:
                for thread in threads:
                    thread.start()
                for thread in threads:
                    thread.join(30)
                threads.clear()
        except Exception as ex:
            ErrorLogger.log_error("MiningMasterController.allocate_task()", ex, self.ref)

    def process_filtering_output_results(self):
        results = []
        bad_results = []
        tuples = []
        while not self._filter_output_queue.empty():
            item = self._filter_output_queue.get()
            if isinstance(item, FilteredDomainData):
                if len(item.exception) > 0:
                    bad_results.append(item)
                else:
                    results.append(item)
                    tuples.append(item.to_tuple())
        if len(results) > 0:
            try:
                with self._result_db_lock:
                    db = self.get_db_filtered()
                    db.add_sites(results, skip_check=False)
                    db.close()
            except Exception as ex:
                ErrorLogger.log_error("MiningMasterController", ex,
                                      "process_filtering_output_results() " + self.ref)
            finally:
                CsvLogger.log_to_file("filtered_domains.csv", tuples)
        if len(bad_results) > 0:
            try:
                with self._result_bad_db_lock:
                    bad_db = self.get_db_filtered_bad()
                    bad_db.add_sites(bad_results, skip_check=False)
                    bad_db.close()
            except Exception as ex:
                ErrorLogger.log_error("MiningMasterController", ex,
                                      "process_filtering_output_results() " + self.ref)
            finally:
                CsvLogger.log_to_file("filtered_domains.csv", tuples)

    def process_result(self, result: []):
        if result is not None and len(result) > 0:
            print("processing external site and seeds results")
            external = []
            sitesfeedback = []
            redemption_list = []
            try:
                for item in result:
                    #print("item: ", str(item.__dict__))
                    if isinstance(item, ScrapeDomainData):
                        #print(item)
                        #if not MiningMasterController.is_in_list(item, external) and not all_external.is_domain_in_db(item.domain):
                        raw_data = (item.domain, item.code)
                        if item.code == ResponseCode.MightBeExpired:
                            redemption_list.append(raw_data)
                        else:
                            external.append(raw_data)
                            self._filter_input_queue.put(raw_data)  # also put into the filtering queue
                    elif isinstance(item, SeedSiteFeedback):
                        #print("update:", str(item.__dict__))
                        sitesfeedback.append(item)
                    else:
                        continue
                with self._external_db_lock:
                    all_external = self.get_db_external()
                    all_external.add_sites(external, True)
                    all_external.close()
                with self._redemption_db_lock:
                    redemption_db = self.get_db_redemption()
                    redemption_db.add_sites(redemption_list, True)
                    redemption_db.close()
                with self._seed_db_lock:
                    seed_sites = self.get_db_seed()
                    seed_sites.update_sites(sitesfeedback)
                    seed_sites.close()
            except Exception as ex:
                ErrorLogger.log_error("MiningMasterController", ex, "process_result() " + self.ref)

    def pause(self):
        self.in_progress = False
        self.stop_all_slave()
        self.stop_Mining = True

    def stop(self):
        print("external set to stop!")
        self._stop_event.set()
        self.in_progress = False
        self.stop_all_slave()

    def continue_work(self):
        self.stop_Mining = False

    def _filtering_process_wrapper(self):
        self.filter_process = MemoryControlPs(func=filtering_process,
                                              func_kwargs=FilterController.get_input_parameters(
                                                  "filtering.db", get_recovery_dir_path(),
                                                  self._filter_input_queue, self._filter_output_queue,
                                                  self._stop_event, self._filter_matrix, self._accounts,
                                                  self._filtering_only, self._filtering_offset,
                                                  self._filtering_total),
                                              external_stop_event=self._stop_event)
        self.filter_process.start()

    def run(self):  # this is the normal routine; slaves should be set up before this runs
        filter_t = threading.Thread(target=self._filtering_process_wrapper)
        if len(self.slaves) > 0 or self._filtering_only:
            filter_t.start()
        while not self._stop_event.is_set():
            self.state = ServerState.State_Idle
            print("check status")
            self.check_slaves_status()
            #time.sleep(1)
            if not self.stop_Mining and (len(self.slaves) > 0 or self._filtering_only):
                self.state = ServerState.State_Active
                if not self._filtering_only:
                    print("allocate task")
                    self.allocate_task()
                    #time.sleep(1)
                print("get and process results")
                result = self.get_slaves_result()
                self.process_result(result)
                result.clear()
                self.process_filtering_output_results()  # get filtered results into the filtered DB
                if (self.loopback_database or self.job_done < self.job_all) and len(self.slaves) > 0:
                    self.end_time = time.time()
                    self.in_progress = True
                elif self._filtering_only:
                    self.end_time = time.time()
                    self.in_progress = True
                else:
                    self.in_progress = False
                print("finished getting results")
            self.update_db_stats()
            print("update db finished")
            if self._stop_event.is_set():
                break
            time.sleep(15)
        print("should finish filtering process!")
        if filter_t.is_alive():
            filter_t.join()
        print("master server shut down!")