def get_recovery_dir_path(ref: str = ""): machine_tpye = MachineInfo.get_machine_type() if machine_tpye == MachineType.Windows: temp_loc = "D:/SQLiteDB/Temp/Recovery/" else: temp_loc = "/usr/local/DomainFinder/Recovery/" return temp_loc + ref
def get_proxy_file_path():
    machine_type = MachineInfo.get_machine_type()
    if machine_type == MachineType.Windows:
        temp_loc = "D:/SQLiteDB/Temp/Proxy/proxy_list.csv"
    else:
        temp_loc = "/usr/local/DomainFinder/Database/proxy_list.csv"
    return temp_loc
def get_marketplace_db_path(ref: str = ""):
    machine_type = MachineInfo.get_machine_type()
    if machine_type == MachineType.Windows:
        temp_loc = "D:/SQLiteDB/Temp/Marketplace/"
    else:
        temp_loc = "/usr/local/DomainFinder/Database/Marketplace/"
    return temp_loc + ref
def get_task_backup_dir(ref: str = ""):
    machine_type = MachineInfo.get_machine_type()
    if machine_type == MachineType.Windows:
        temp_loc = "D:/SQLiteDB/Temp/Task/"
    else:
        temp_loc = "/usr/local/DomainFinder/Task/"
    return temp_loc + ref
def get_db_buffer_default_dir():
    machine_type = MachineInfo.get_machine_type()
    if machine_type == MachineType.Windows:
        temp_loc = "D:/SQLiteDB/Temp/DatabaseBuf/"
    else:
        temp_loc = "/tmp/DatabaseBuf/"
    return temp_loc
def run(self):
    # print("running memory monitor")
    while True:
        if self._stop_event.is_set():
            # print("external stop")
            break
        else:
            mem = MachineInfo.get_memory_process(self._pid)
            # print("process use: ", mem, " MB")
            if mem > self._mem_limit:
                self._exceed_limit = True
                self._callback(True)
        time.sleep(self._wait)
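# Usage sketch for the monitor loop above. Only the run() body is shown in the
# original, so the constructor below is an assumption about how the monitor is
# wired up: a threading.Thread subclass holding a stop event, a process id, a
# memory limit in MB, a callback, and a polling interval in seconds.
import threading


class MemoryMonitorSketch(threading.Thread):
    def __init__(self, pid: int, mem_limit: int, callback, wait: float = 1.0):
        threading.Thread.__init__(self)
        self._pid = pid
        self._mem_limit = mem_limit    # MB
        self._callback = callback      # invoked with True once the limit is exceeded
        self._wait = wait              # seconds between polls
        self._stop_event = threading.Event()
        self._exceed_limit = False

    def stop(self):
        self._stop_event.set()  # run() above checks this flag and breaks out

# hypothetical usage:
# monitor = MemoryMonitorSketch(os.getpid(), 500, lambda exceeded: print("over limit"))
# monitor.start(); ...; monitor.stop(); monitor.join()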
def get_default_address(source_type: str):
    DB_prefix = "/usr/local/DomainFinder/Database"  # this is for Linux
    if MachineInfo.get_machine_type() == MachineType.Windows:
        DB_prefix = "D:/SQLiteDB"
    if source_type == SiteSource.Seed:
        return DB_prefix + "/SeedSitesList"
    elif source_type == SiteSource.AllExternal:
        return DB_prefix + "/ResultSitesList"
    elif source_type == SiteSource.Flitered:
        return DB_prefix + "/FilteredSitesList"
    elif source_type == SiteSource.Filtered_bad:
        return DB_prefix + "/FilteredSitesList_Bad"
    else:
        return ":memory:"
def get_spam_filter_keywords_file_path():
    keyword_file = "keywords.txt"
    if MachineInfo.get_machine_type() == MachineType.Windows:
        return 'D:/SQLiteDB/SpamFilter/' + keyword_file
    else:
        return '/usr/local/DomainFinder/SpamFilter/' + keyword_file
def get_spam_filter_anchors_file_path():
    anchor_file = "anchors.txt"
    if MachineInfo.get_machine_type() == MachineType.Windows:
        return 'D:/SQLiteDB/SpamFilter/' + anchor_file
    else:
        return '/usr/local/DomainFinder/SpamFilter/' + anchor_file
def get_download_file_path():
    if MachineInfo.get_machine_type() == MachineType.Windows:
        return 'D:/ChromeDownload/'
    else:
        return '/tmp/download/'
def get_chrome_exe_path():
    if MachineInfo.get_machine_type() == MachineType.Windows:
        return 'C:/WebDrivers/chromedriver.exe'
    else:
        return '/usr/lib/chromium-browser/chromedriver'
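# A hedged sketch of how the two helpers above might be wired into Selenium;
# the original wiring code is not shown here, so this is an assumption. It
# points the driver binary at get_chrome_exe_path() and the browser's download
# directory at get_download_file_path(), using the Selenium 2/3-era
# executable_path / chrome_options keyword arguments.
from selenium import webdriver


def make_chrome_driver_sketch():
    options = webdriver.ChromeOptions()
    # route downloads to the platform-specific folder
    options.add_experimental_option("prefs", {"download.default_directory": get_download_file_path()})
    return webdriver.Chrome(executable_path=get_chrome_exe_path(), chrome_options=options)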
def get_optimal_capacity(self) -> int:
    mem_per_process = self.maxpagelimit * 50 / 1000  # assumes roughly 0.05 MB (50 KB) per page
    total_mem = MachineInfo.get_memory()[0]
    return int(total_mem * 0.7 / mem_per_process)
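# Worked example of the capacity formula above, with illustrative numbers:
# at maxpagelimit = 1000 pages and ~0.05 MB per page, each crawler process is
# budgeted 1000 * 50 / 1000 = 50 MB; on a machine reporting 8000 MB of total
# memory, 70% is reserved for crawlers, so int(8000 * 0.7 / 50) = 112 processes.
assert int(8000 * 0.7 / (1000 * 50 / 1000)) == 112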
def get_spam_filter_bad_country_path():
    country_file = "bad_country.txt"
    if MachineInfo.get_machine_type() == MachineType.Windows:
        return 'D:/SQLiteDB/SpamFilter/' + country_file
    else:
        return '/usr/local/DomainFinder/SpamFilter/' + country_file
def get_log_dir():
    WIN_PATH = "D:/SQLiteDB/Temp/Logging/"
    LINUX_PATH = "/tmp/Logging/"
    machine_type = MachineInfo.get_machine_type()
    return WIN_PATH if machine_type == MachineType.Windows else LINUX_PATH
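# Small usage sketch for the path helpers above: each returns a Windows or
# Linux location depending on MachineInfo.get_machine_type(), and the
# directory-style helpers append an optional file name. The makedirs call is
# an assumption -- nothing above guarantees the folders exist, and the file
# name is hypothetical.
import os

recovery_file = get_recovery_dir_path("crawl_backup.db")  # e.g. /usr/local/DomainFinder/Recovery/crawl_backup.db on Linux
os.makedirs(get_log_dir(), exist_ok=True)  # ensure the logging directory exists before writing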
def send_and_receive(self):
    in_buffer = self.rfile
    out_buffer = self.wfile
    s = self.server.addtional_obj
    command = CommandProcessor.receive_command(in_buffer)
    # print("process cmd: ", command.cmd)
    if command is not None:
        reply = CommandStruct(cmd=ServerCommand.Com_ReplyOK)
        if command.cmd == ServerCommand.Com_Start:
            # print("start conversation:")
            CommandProcessor.send_command(out_buffer, reply)
        elif command.cmd == ServerCommand.Com_Stop_Mining:
            if s is not None and isinstance(s, SiteCheckProcessManager):
                s.set_stop()
            CommandProcessor.send_command(out_buffer, reply)
        elif command.cmd == ServerCommand.Com_Setup:  # test this
            data = command.data
            if isinstance(data, SetupData):
                cap2 = data.cap2
                if cap2 == 0:
                    cap2 = SlaveAutoScaler(data.max_page_level, data.max_page_limit).get_optimal_capacity()
                if isinstance(s, SiteCheckProcessManager):  # need to fix this
                    if s.is_alive():
                        s.set_stop()
                        s.join()
                # elif isinstance(s, threading.Thread):
                #     s.join(0)
                print("init new process manager with para: ", data.get_serializable(False))
                total_memory = MachineInfo.get_memory()[0]
                mem_limit_per_crawler = int(total_memory * 0.8 / cap2)
                if mem_limit_per_crawler >= SiteCheckProcessManager.MEM_MINIMUM_REQ:
                    self.server.addtional_obj = SiteCheckProcessManager(
                        data.ref, max_procss=cap2, concurrent_page=data.cap3,
                        page_max_level=data.max_page_level,
                        max_page_per_site=data.max_page_limit,
                        memory_limit_per_process=mem_limit_per_crawler)
                    self.server.addtional_obj.start()
                else:
                    reply.cmd = ServerCommand.Com_ReplyError
                    reply.data = "Not enough memory allocation for each crawler; must be at least 100 MB."
            CommandProcessor.send_command(out_buffer, reply)
        elif command.cmd == ServerCommand.Com_Clear_Cache:
            if isinstance(s, SiteCheckProcessManager):  # need to fix this
                if s.is_alive():
                    s.set_stop()
                    s.join()
                s.clear_cache()
            CommandProcessor.send_command(out_buffer, reply)
        elif command.cmd == ServerCommand.Com_Data:
            print("incoming data....")
            data = command.data
            if s is not None and isinstance(s, SiteCheckProcessManager):
                if data.data is not None and len(data.data) > 0:
                    s.put_to_input_queue(data.data)
            else:
                reply.cmd = ServerCommand.Com_ReplyError
                reply.data = "something is wrong with data"
            CommandProcessor.send_command(out_buffer, reply)
        elif command.cmd == ServerCommand.Com_Get_Data:  # test this
            if s is not None and isinstance(s, SiteCheckProcessManager):
                rawdata = []
                if s.get_temp_result_count() > 0:
                    rawdata += [ScrapeDomainData(x.link, x.response_code)
                                for x in s.get_temp_result_and_clear()
                                if isinstance(x, OnSiteLink)]
                # print("sending back:")
                # print(rawdata)
                if s.get_site_info_list_count() > 0:
                    rawdata += [info for info in s.get_site_info_list_and_clear()
                                if isinstance(info, SeedSiteFeedback)]
                data = MiningList(s.name, rawdata)
                reply.data = data
            else:
                reply.cmd = ServerCommand.Com_ReplyError
                reply.data = "something is wrong with return data"
            CommandProcessor.send_command(out_buffer, reply)
        elif command.cmd == ServerCommand.Com_Status:
            # print("send back status!")
            CPU = MachineInfo.get_cpu(1)
            MEM = MachineInfo.get_memory()
            NET = MachineInfo.get_network(1)
            if s is not None and isinstance(s, SiteCheckProcessManager):
                manager_state = s.get_state()
                filter_progress = s.get_filter_progress()
                status = ServerStatus(
                    wait_job=manager_state.job_wait, done_job=manager_state.job_done,
                    all_job=manager_state.job_all,
                    total_page_done=manager_state.total_page_done,
                    page_per_site=manager_state.average_page,
                    result=manager_state.result_count,
                    cpu_cores=CPU[0], cpu_percent=CPU[1],
                    toal_memory=MEM[0], memory_percent=MEM[1],
                    net_recieved=NET[0], net_send=NET[1],
                    cap_slave=1, cap_process=s.max_prcess,
                    cap_concurrent_page=s.concurrent_page,
                    filter_done=filter_progress[0], filter_total=filter_progress[1])
                reply.data = status
            else:
                reply.cmd = ServerCommand.Com_ReplyError
                reply.data = "something is wrong with the crawler."
            CommandProcessor.send_command(out_buffer, reply)
        elif command.cmd == ServerCommand.Com_Stop:
            # print("end conversation:")
            return
        else:
            reply.cmd = ServerCommand.Com_ReplyError
            reply.data = "command is not valid, please try again"
            CommandProcessor.send_command(out_buffer, reply)
        # print("finished cmd", command.cmd)
        self.send_and_receive()
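# Client-side counterpart sketch (an assumption -- only the server handler is
# shown above): the handler recurses on send_and_receive() until it sees
# Com_Stop, so a client would open one connection, send Com_Start, issue
# commands, and finish with Com_Stop so the handler returns. 'sock_in' and
# 'sock_out' stand for the file-like read/write buffers of a connected socket.
def client_conversation_sketch(sock_in, sock_out):
    CommandProcessor.send_command(sock_out, CommandStruct(cmd=ServerCommand.Com_Start))
    CommandProcessor.receive_command(sock_in)  # expect Com_ReplyOK
    CommandProcessor.send_command(sock_out, CommandStruct(cmd=ServerCommand.Com_Status))
    status_reply = CommandProcessor.receive_command(sock_in)  # ServerStatus in reply.data
    CommandProcessor.send_command(sock_out, CommandStruct(cmd=ServerCommand.Com_Stop))  # ends the conversation
    return status_reply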
def get_default_archive_dir():
    DB_prefix = "/usr/local/DomainFinder/Archive/"  # this is for Linux
    if MachineInfo.get_machine_type() == MachineType.Windows:
        DB_prefix = "D:/SQLiteDB/Archive/"
    return DB_prefix
def get_temp_db_dir():
    DB_prefix = "/usr/local/DomainFinder/Database/"  # this is for Linux
    if MachineInfo.get_machine_type() == MachineType.Windows:
        DB_prefix = "D:/SQLiteDB/"
    return DB_prefix
def send_and_receive(self):
    in_buffer = self.rfile
    out_buffer = self.wfile
    s = self.server.addtional_obj
    command = CommandProcessor.receive_command(in_buffer)
    # print("process cmd: ", command.cmd)
    if command is not None and isinstance(s, MiningMasterController):
        reply = CommandStruct(cmd=ServerCommand.Com_ReplyOK)
        if command.cmd == ServerCommand.Com_Start:
            # print("start conversation:")
            CommandProcessor.send_command(out_buffer, reply)
        elif command.cmd == ServerCommand.Com_Stop:
            # print("end conversation:")
            return  # exit point
        elif command.cmd == ServerCommand.Com_Get_DB_DATA:
            data = command.data
            if isinstance(data, DBRequestFields):
                try:
                    reply.data = s.get_db_results(db_type=data.db_type, db_name=data.db_name,
                                                  index=data.index, length=data.length)
                except Exception as ex:
                    ErrorLogger.log_error("MasterRequestHandler.send_and_receive()", ex,
                                          "cmd = ServerCommand.Com_Get_DB_DATA")
                    reply.cmd = ServerCommand.Com_ReplyError
                    reply.data = "Get DB data failed"
            else:
                ErrorLogger.log_error("MasterRequestHandler.send_and_receive()",
                                      Exception("wrong data type received."),
                                      "cmd = ServerCommand.Com_Get_DB_DATA")
                reply.cmd = ServerCommand.Com_ReplyError
                reply.data = "Get DB data failed"
            CommandProcessor.send_command(out_buffer, reply)
        elif command.cmd == ServerCommand.Com_Remove_DB:
            data = command.data
            if isinstance(data, DBRequestFields):
                try:
                    s.remove_db(db_type=data.db_type, db_name=data.db_name)
                except Exception as ex:
                    ErrorLogger.log_error("MasterRequestHandler.send_and_receive()", ex,
                                          "cmd = ServerCommand.Com_Remove_DB")
                    reply.cmd = ServerCommand.Com_ReplyError
                    reply.data = "Remove DB failed"
            else:
                ErrorLogger.log_error("MasterRequestHandler.send_and_receive()",
                                      Exception("wrong data type received."),
                                      "cmd = ServerCommand.Com_Remove_DB")
                reply.cmd = ServerCommand.Com_ReplyError
                reply.data = "Remove DB failed"
            CommandProcessor.send_command(out_buffer, reply)
        elif command.cmd == ServerCommand.Com_Start_Filter:
            data = command.data
            try:
                if s.is_alive():
                    s.stop()
                    s.join()
                if isinstance(data, FilteringSetupData):  # MatrixFilterControl.FilteringSetupData
                    self.server.addtional_obj = MiningMasterController(
                        ref=data.ref, accounts=data.accounts,
                        crawl_matrix=data.crawl_matrix,
                        filtering_only_mode=True,
                        filtering_offset=data.offset,
                        filtering_total=data.total)
                    self.server.addtional_obj.start()
            except Exception as ex:
                print(ex)
                ErrorLogger.log_error("MasterRequestHandler.send_and_receive()", ex,
                                      "cmd = ServerCommand.Com_Start_Filter()")
                reply.cmd = ServerCommand.Com_ReplyError
                reply.data = "Com_Start_Filter failed"
            CommandProcessor.send_command(out_buffer, reply)
        elif command.cmd == ServerCommand.Com_Setup:  # test this
            data = command.data
            try:
                if s.is_alive():
                    s.stop()
                    s.join()
                if isinstance(data, SetupData):
                    self.server.addtional_obj = MiningMasterController(
                        ref=data.ref, accounts=data.accounts,
                        cap_slave=data.cap, cap_slave_process=data.cap2,
                        cap_concurrent_page=data.cap3,
                        all_job=data.total, offset=data.offset,
                        max_page_level=data.max_page_level,
                        max_page_limit=data.max_page_limit,
                        loopback_database=data.loopback,
                        refresh_rate=data.refresh_rate,
                        filters=data.db_filter,
                        crawl_matrix=data.crawl_matrix)
                    if data.addtional_data is not None and isinstance(data.addtional_data, SlaveOperationData):
                        self.add_slaves(self.server.addtional_obj, data.addtional_data)
                    self.server.addtional_obj.setup_minging_slaves()
                    self.server.addtional_obj.start()
                else:
                    raise NotImplementedError("other data type is not implemented.")
            except Exception as ex:
                print(ex)
                ErrorLogger.log_error("MasterRequestHandler.send_and_receive()", ex,
                                      "cmd = ServerCommand.Com_Setup()")
                reply.cmd = ServerCommand.Com_ReplyError
                reply.data = "Setup failed"
            CommandProcessor.send_command(out_buffer, reply)
        elif command.cmd == ServerCommand.Com_Clear_Cache:
            try:
                if s.is_alive():
                    s.stop()
                    s.join()
                s.clear_host_cache()
                s.clear_slave_cache()
            except Exception as ex:
                print(ex)
                ErrorLogger.log_error("MasterRequestHandler.send_and_receive()", ex,
                                      "cmd = ServerCommand.Com_Clear_Cache()")
                reply.cmd = ServerCommand.Com_ReplyError
                reply.data = "Clear cache failed"
            CommandProcessor.send_command(out_buffer, reply)
        elif command.cmd == ServerCommand.Com_Add_Seed:
            data = command.data
            if isinstance(data, MiningList):
                s.add_seeds(data)
            else:
                reply.cmd = ServerCommand.Com_ReplyError
                reply.data = "Add seed failed, format is wrong in server handler."
            CommandProcessor.send_command(out_buffer, reply)
        elif command.cmd == ServerCommand.Com_Add_Slave:  # test this
            try:
                data = command.data
                if isinstance(data, SlaveOperationData):
                    self.add_slaves(s, data)
                else:
                    raise NotImplementedError("other data type is not implemented.")
            except Exception as ex:
                print(ex)
                reply.cmd = ServerCommand.Com_ReplyError
                reply.data = "Add slave failed"
            CommandProcessor.send_command(out_buffer, reply)
        elif command.cmd == ServerCommand.Com_Del_Slave:  # test this
            try:
                data = command.data
                if isinstance(data, SlaveOperationData):
                    self.remove_slaves(s, data)
                else:
                    raise NotImplementedError("other data type is not implemented.")
            except Exception as ex:
                print(ex)
                reply.cmd = ServerCommand.Com_ReplyError
                reply.data = "Remove slave failed"
            CommandProcessor.send_command(out_buffer, reply)
        elif command.cmd == ServerCommand.Com_Begin_Mining:
            # not implemented; use Com_Setup to begin mining
            reply.cmd = ServerCommand.Com_ReplyError
            reply.data = "Begin mining is not implemented; use Com_Setup instead."
            CommandProcessor.send_command(out_buffer, reply)
        elif command.cmd == ServerCommand.Com_Stop_Mining:  # test this
            try:
                EC2 = EC2Controller("")
                addrs = [slave.address.address for slave in s.slaves if isinstance(slave, Server)]
                s.pause()
                # s.slaves.clear()
                # if s.isAlive:
                #     s.join(0)
                # self.server.addtional_obj = MiningMasterController()
                EC2.shut_down_machines_list(addrs)
            except Exception:
                reply.cmd = ServerCommand.Com_ReplyError
                reply.data = "Stop mining failed"
            CommandProcessor.send_command(out_buffer, reply)
        elif command.cmd == ServerCommand.Com_Status:  # test this
            try:
                CPU = MachineInfo.get_cpu(1)
                MEM = MachineInfo.get_memory()
                NET = MachineInfo.get_network(1)
                slaveStatus = [slave.status for slave in s.slaves]
                totalPage = sum([slave.total_page_done for slave in slaveStatus])
                ave_page = 0
                filter_progress = s.get_filter_progress()
                if len(s.slaves) > 0:
                    ave_page = int(sum([slave.page_per_site for slave in slaveStatus]) / len(s.slaves))
                total_result = sum([slave.result for slave in slaveStatus])
                total_cap_slave = sum([slave.cap_slave for slave in slaveStatus])
                total_cap_process = sum([slave.cap_slave * slave.cap_process for slave in slaveStatus])
                total_cap_page = sum([slave.cap_slave * slave.cap_process * slave.cap_concurrent_page
                                      for slave in slaveStatus])
                status = ServerStatus(
                    wait_job=s.job_all - s.job_done, done_job=s.job_done, all_job=s.job_all,
                    total_page_done=totalPage, page_per_site=ave_page, result=total_result,
                    cpu_cores=CPU[0], cpu_percent=CPU[1],
                    toal_memory=MEM[0], memory_percent=MEM[1],
                    net_recieved=NET[0], net_send=NET[1],
                    cap_slave=total_cap_slave, cap_process=total_cap_process,
                    cap_concurrent_page=total_cap_page,
                    filter_done=filter_progress[0], filter_total=filter_progress[1])
                server = Server(server_type=ServerType.ty_Host, status=status,
                                address=ServerAddress("localhost", MiningTCPServer.DefaultListenPort))
                servers = []
                servers.append(server)
                servers += s.slaves
                reply.data = MiningList(s.ref, servers)
            except Exception:
                reply.cmd = ServerCommand.Com_ReplyError
                reply.data = "getting status failed"
            CommandProcessor.send_command(out_buffer, reply)
        elif command.cmd == ServerCommand.Com_DataBase_Status:  # test this
            reply.data = s.get_db_stats()  # send back a copy
            CommandProcessor.send_command(out_buffer, reply)
        elif command.cmd == ServerCommand.Com_Set_DB_Filter:
            data = command.data
            if isinstance(data, DBFilterCollection):
                if data != s.filter_shadow:
                    s.filter_shadow = data
                    s.update_db_stats(True)
            else:
                reply.cmd = ServerCommand.Com_ReplyError
                reply.data = "wrong data type for filters, should be DBFilterCollection"
            CommandProcessor.send_command(out_buffer, reply)
        elif command.cmd == ServerCommand.Com_Progress:  # test this
            reply.data = PrograssData(ref=s.ref, done=s.job_done, all_job=s.job_all,
                                      offset=s.offset, duration=s.end_time - s.start_time,
                                      in_progress=s.in_progress)
            CommandProcessor.send_command(out_buffer, reply)
        else:
            reply.cmd = ServerCommand.Com_ReplyError
            reply.data = "command is not valid, please try again"
            CommandProcessor.send_command(out_buffer, reply)
        # print("finished cmd ", command.cmd)
        self.send_and_receive()  # recursive to make a conversation
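# Worked example of the capacity roll-up in Com_Status above, with illustrative
# numbers: two slaves, each reporting cap_slave=1, cap_process=4, and
# cap_concurrent_page=10, yield total_cap_slave=2, total_cap_process=8, and
# total_cap_page=80 concurrent pages across the cluster.
assert sum([1 * 4 * 10 for _ in range(2)]) == 80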