def putrawres(self, rawres_list):
    """Record a finished sub-task and fold its raw resources into this task.

    Args:
        rawres_list: RawRes objects produced by the completed sub-task;
            may be empty when the page yielded nothing.
    """

    def build_res(raw, kw: Keyword):
        # Map one raw crawl hit onto a persistent Res row.
        return Res(
            keyword=kw,
            link=raw.reslink,
            web=raw.weblink,
            type=raw.type,
            filename=raw.filename,
            filesize=raw.filesize,
        )

    # Refresh the task's liveness timestamp.
    makelog("更新 task 最后一次活跃时间")
    self.last_active_time = time.time()

    # Advance the done-counter and derived progress/status.
    makelog("更新状态和进度")
    self.subtask_done_counter += 1
    self.progress = self.subtask_done_counter * 100 / self.subtask_total_counter
    self.statu = (
        "Done"
        if self.subtask_done_counter == self.subtask_total_counter
        else "Digging"
    )

    # Collect resource instances only when the sub-task found anything.
    if rawres_list:
        makelog("获取关键字")
        kw = Keyword.objects.get(keyword=self.keyword)
        makelog("收集资源实例")
        self.reslist.extend(build_res(raw, kw) for raw in rawres_list)

    makelog("SubTask done! {}".format(self.keyword), 4)
def add(self, task: Task):
    """Register a task under its key.

    Args:
        task (Task): task to register.

    Raises:
        TaskAlreadyExist: a task with the same key is already registered.

    Returns:
        bool: True on success.
    """
    self.get_lock()
    if self.tasks.get(task.key):
        # Key already taken: release the lock before signalling the conflict.
        self.release_lock()
        raise TaskAlreadyExist()
    # Free slot: store the task, then release the lock and report success.
    self.tasks[task.key] = task
    makelog("task added!", 4)
    self.release_lock()
    return True
def udHots(self):
    """Refresh the cached hot-keyword list.

    Stores the values of the 50 most-visited keywords flagged
    ``showInRec`` on ``self.hots``. Errors are logged, never raised.
    """
    try:
        hots = list(
            Keyword.objects.filter(
                showInRec="True").order_by("-visitTimes")[0:50].values())
        self.hots = hots
    except Exception as e:
        # FIX: the message previously said "udhotkeylist", a stale name that
        # no longer matches this method and made log grepping misleading.
        makelog("Error in udHots!\n" + str(e), 1)
def minitask():
    """Download self.link, extract thunder/ed2k/magnet links from the page
    source and upload them as RawRes objects to the shared cache."""

    def get_source_code():
        # Fetch the page, manually following up to 3 redirects so that
        # self.link ends up pointing at the final location. Best effort:
        # any failure returns an empty string.
        sourcecode = ''
        try:
            n = 0
            status_code = 302
            while status_code in [302, 301] and n < 3:
                r = net(self.link, allow_redirects=False)
                status_code = r.status_code
                if status_code in [301, 302]:
                    self.link = r.headers['location']
                    n = n + 1
                else:
                    r.encoding = r.apparent_encoding
                    # Collect the page source.
                    sourcecode = r.text
        except Exception:
            # FIX: was a bare `except:`, which also swallowed
            # SystemExit/KeyboardInterrupt; keep best-effort semantics
            # for ordinary network/parse failures only.
            pass
        return sourcecode

    def get_rawres(sourcecode):
        # Patterns for the three supported link schemes.
        th_r = re.compile(
            r'''thunder://[ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=]+'''
        )
        ed_r = re.compile(r'''ed2k://[\s\S]+?(?:[/])''')
        magnet_r = re.compile(
            r'''magnet:\?\S+?(?=['"“”‘’《》<>$()():])''')
        # Scan the source with each pattern and wrap hits as RawRes.
        rawres_list = []
        for res_container in [
                [th_r.findall(sourcecode), 'thunder'],
                [ed_r.findall(sourcecode), 'ed2k'],
                [magnet_r.findall(sourcecode), 'magnet'],
        ]:
            for reslink in res_container[0]:
                # Discard absurdly long matches (likely garbage).
                if len(reslink) < 800:
                    raw_res = RawRes(self.keyword, reslink, self.link,
                                     res_container[1])
                    # Fill in filename/filesize details from the link.
                    raw_res.reslinkparser()
                    rawres_list.append(raw_res)
        return rawres_list

    sourcecode = get_source_code()
    rawres_list = get_rawres(sourcecode)
    # Hand the results to the manager cache for the owning task.
    CACHE.rawres_upload(self.keyword, rawres_list)
    makelog('MiniTask Done!', 4)
def parsetask():
    """Run a Bing search for the task keyword and enqueue one MiniTask
    per result link."""
    sengine = Bing(keyWord=self.keyword + ' 下载', amount=self.DEEPTH)
    makelog('search engine start', 3)
    results = sengine.Search()
    self.weblinklist = []
    for res in results:
        self.weblinklist.append(res['link'])
    # Push one crawl sub-task per discovered web link, in a single batch.
    subtasks = []
    for weblink in self.weblinklist:
        subtasks.append(
            SubTask(task_type='MiniTask',
                    keyword=self.keyword,
                    weblink=weblink))
    CACHE.subtaskqueue_puts(self.keyword, subtasks)
def __init__(self, keyword, subtaskqueue):
    """Create a task for `keyword` and seed the queue with its ParseTask.

    Args:
        keyword: search keyword this task digs resources for.
        subtaskqueue: shared queue consumed by worker engines.
    """
    self.keyword = keyword
    self.statu = 'Initing'          # lifecycle: Initing -> Digging -> Done
    self.progress = 0               # percentage, 0..100
    self.subtask_done_counter = 0
    self.subtask_total_counter = 0
    self.reslist = []               # collected resource rows
    # Kick the pipeline off with a single ParseTask for this keyword.
    initial = SubTask(
        task_type='ParseTask',
        keyword=keyword,
        DEEPTH=DEEPTH,
    )
    subtaskqueue.put(initial)
    self.last_active_time = time.time()
    makelog('Task inited {}'.format(self.keyword))
def parsetask():
    """Search Bing for the keyword and enqueue a MiniTask per hit."""
    engine = Bing(keyWord=self.keyword + " 下载", amount=self.DEEPTH)
    makelog("search engine start", 3)
    hits = engine.Search()
    self.weblinklist = [hit["link"] for hit in hits]
    # One MiniTask per discovered link, pushed as a single batch.
    batch = [
        SubTask(task_type="MiniTask", keyword=self.keyword, weblink=link)
        for link in self.weblinklist
    ]
    CACHE.subtaskqueue_puts(self.keyword, batch)
def search(self, kw=None, amount=10):
    """Collect at least `amount` results for `kw`, paging until timeout.

    Args:
        kw: keyword to search; None keeps the current keyword and the
            results accumulated so far.
        amount: minimum number of results wanted.

    Returns:
        list: accumulated results (may be shorter than `amount` if the
        search timed out).
    """
    # FIX: compare against None with `is not`, not `!=` (PEP 8; `!=` can
    # be hijacked by a custom __ne__).
    if kw is not None and kw != self.keyword:
        # New keyword: restart paging from scratch.
        self.pageNo, self.results = 0, []
        self.keyword = kw
    started = time.time()
    # Keep requesting pages until we have enough results or run out of time.
    while len(self.results) < amount and (time.time() - started) < self.timeout:
        try:
            self._addResult(self.info.url, self.getRequestParams())
            self.pageNo += 1
        except Exception as e:
            makelog(f"Failed to get page! {e}", 1)
    if len(self.results) < amount:
        makelog(f"Search {self.keyword} timed out", 2)
    return self.results
def getPageResults():
    """Fetch one results page and parse it; raise when parsing yields
    nothing so the surrounding retry machinery can try again."""
    resp = self.session.get(url, params=params, headers=BaseEngine.headers)
    try:
        resp.raise_for_status()
    except Exception:
        # FIX: was a bare `except:`; keep the best-effort behavior (the
        # body may still be parseable) but stop swallowing
        # KeyboardInterrupt/SystemExit.
        makelog("Connection error", 2)
    soup = BeautifulSoup(resp.text, "html.parser")
    res = self.parseResult(soup)
    if len(res) == 0:
        makelog(f"Parse fail: {url} {params}", 2)
        # Keep the offending page around for offline parser debugging.
        with open("error.html", "w+") as f:
            f.write(resp.text)
        raise Exception()  # fail once
    return res
def udSugs(self):
    """Rebuild the '*'-separated suggestion string from popular keywords.

    Walks keywords flagged ``showInRec`` by descending visit count, keeps
    those passing checkKeyword, and stops after 10000 entries or when
    visit counts drop below 2. Errors are logged, never raised.
    """
    try:
        n = 0
        parts = []
        for kw in Keyword.objects.filter(
                showInRec="True").order_by("-visitTimes")[0:20000]:
            if n > 10000 or kw.visitTimes < 2:
                break
            elif checkKeyword(kw.keyword):
                parts.append(kw.keyword + "*")
                n += 1
        # FIX: build via join instead of repeated string += (quadratic
        # on CPython-unfriendly paths; join is a single O(n) pass).
        self.sugs = "".join(parts)
        makelog(str(n) + " : " + str(len(self.sugs)))
    except Exception as e:
        makelog("Error in udSugs!\n" + str(e), 1)
def __init__(self, task_type: str, keyword: str, weblink=None, DEEPTH=None):
    """Build a sub-task.

    Args:
        task_type: 'ParseTask' (requires DEEPTH) or 'MiniTask'
            (uses weblink).
        keyword: keyword the sub-task works on.
        weblink: page URL for a MiniTask.
        DEEPTH: search depth for a ParseTask.

    Raises:
        ValueError: task_type is unknown or required extras are missing.
    """
    self.task_type = task_type
    self.keyword = keyword
    # FIX: `is not None` instead of `!= None` (PEP 8 singleton comparison).
    if DEEPTH is not None and task_type == 'ParseTask':
        self.DEEPTH = DEEPTH
    elif task_type == 'MiniTask':
        self.link = weblink
    else:
        makelog('Task type error!', 1)
        # FIX: a bare `raise` with no active exception produced
        # RuntimeError("No active exception to re-raise"); raise a
        # meaningful error instead.
        raise ValueError('Task type error: {}'.format(task_type))
    makelog('SubTask inited:{}'.format(self.task_type), 4)
def putrawres(self, rawres_list):
    """Mark one sub-task as finished and stash its resources on the task.

    Args:
        rawres_list: RawRes objects produced by the completed sub-task.
    """

    def to_row(raw):
        # Translate a RawRes into a Resourcetable row.
        return Resourcetable(keyword=raw.keyword,
                             link=raw.reslink,
                             web=raw.weblink,
                             type=raw.type,
                             filename=raw.filename,
                             filesize=raw.filesize)

    # Keep the liveness timestamp fresh.
    self.last_active_time = time.time()
    # Progress bookkeeping: one more sub-task done.
    self.subtask_done_counter += 1
    self.progress = self.subtask_done_counter * 100 / self.subtask_total_counter
    if self.subtask_done_counter == self.subtask_total_counter:
        self.statu = 'Done'
    else:
        self.statu = 'Digging'
    self.reslist.extend(to_row(raw) for raw in rawres_list)
    makelog('SubTask done! {}'.format(self.keyword), 4)
def loaddetting():
    """Load ./setting.json into the module-level configuration globals.

    (The misspelled name is kept as-is: callers reference it.)
    """
    global ENGINENAME, PASSWORD, HOST, PORT, PROCESSAMOUNT
    with open('./setting.json') as f:
        setting = json.load(f)
    ENGINENAME, PASSWORD = setting['EngineName'], setting['Password']
    HOST, PORT = setting['Host'], setting['Port']
    process_override = setting['ProcessOverride']
    # Worker count scales with CPU cores by the configured factor.
    PROCESSAMOUNT = int(cpu_count() * float(process_override))
    # NOTE(review): this writes the password to the log in clear text —
    # confirm that is acceptable for this deployment.
    summary = (
        'load setting success:\nEngineName:{}\nPassword:{}\nHost:{}\nPort:{}\nProcessOverride:{}\nProcess:{}'
        .format(
            ENGINENAME,
            PASSWORD,
            HOST,
            PORT,
            process_override,
            PROCESSAMOUNT,
        ))
    makelog(summary, 2)
def serve(self):
    """Maintenance loop: every `check_gap` seconds, log a per-status task
    tally and sync finished tasks to the backend, dropping the ones that
    synced successfully. Runs forever."""
    last_check = 0
    while True:
        if time.time() - last_check > self.check_gap:
            try:
                # Serialize access to self.tasks.
                self.get_lock()
                # Tally tasks per status and log one summary line.
                tally = {}
                for task in self.tasks.values():
                    tally[task.status] = tally.get(task.status, 0) + 1
                summary = ""
                for name, count in tally.items():
                    summary += "{}:{} ".format(name, count)
                makelog(summary)
                # Sync every finished task; remember which ones succeeded.
                synced_keys = [
                    key for key, task in self.tasks.items()
                    if task.status == "done" and self.sync(task)
                ]
                # Drop the successfully synced tasks.
                for key in synced_keys:
                    del self.tasks[key]
            except:
                makelog("未知异常:{}".format(traceback.format_exc()), 1)
            finally:
                # Always release the lock and reset the check timer.
                self.release_lock()
                last_check = time.time()
        time.sleep(1)
def config():
    """Interactively (re)write setting.json.

    For each known key the user is shown the recommended and current
    values; pressing enter keeps the current value.
    """
    makelog('Enginex Config :', 2)
    setting = {
        'EngineName': None,
        'Password': None,
        'Host': '0.0.0.0',
        'Port': 23333,
        'ProcessOverride': 2.0,
    }
    # Load whatever settings already exist; fall back to the defaults.
    # FIX: open via `with` so the handle is closed deterministically.
    try:
        with open('setting.json') as f:
            usersetting = json.load(f)
    except Exception:
        usersetting = setting
    for key, value in setting.items():
        # FIX: use .get so an older settings file that lacks a key no
        # longer crashes with KeyError; the recommended default is
        # offered instead.
        current = usersetting.get(key, value)
        v = input('please input {} \nrecomend: {}\ncurrent: {}:'.format(
            key, value, current))
        if v == '':
            setting[key] = current
        else:
            setting[key] = v
    with open('setting.json', 'w') as f:
        json.dump(setting, f, ensure_ascii=False, indent=4)
    makelog('enginex config done!', 2)
def _addResult(self, url, params):
    """Fetch one results page (with retries) and extend self.results."""

    @retry(tries=BaseEngine.n_retries,
           delay=random.random() * BaseEngine.dt_retries)
    def getPageResults():
        # One attempt: GET the page, parse it, raise to trigger a retry
        # when nothing could be parsed.
        resp = self.session.get(url, params=params, headers=BaseEngine.headers)
        try:
            resp.raise_for_status()
        except Exception:
            # FIX: was a bare `except:`; keep best-effort (the body may
            # still be parseable) but stop swallowing
            # KeyboardInterrupt/SystemExit.
            makelog("Connection error", 2)
        soup = BeautifulSoup(resp.text, "html.parser")
        res = self.parseResult(soup)
        if len(res) == 0:
            makelog(f"Parse fail: {url} {params}", 2)
            # Keep the offending page for offline parser debugging.
            with open("error.html", "w+") as f:
                f.write(resp.text)
            raise Exception()  # fail once; @retry may try again
        return res

    makelog(
        f"{self.info.name} {self.keyword} #{self.pageNo} n={len(self.results)}"
    )
    self.results.extend(getPageResults())
def sync(task):
    """Push a finished task to the backend API.

    Args:
        task: object exposing get_dict(); sent as the POST JSON body.

    Returns:
        bool: True when the backend acknowledged with code 200.
    """
    back_end_url = "http://127.0.0.1/api/"

    @retry(tries=5, delay=1, backoff=1)
    def req(task):
        # (Dropped a stray debug `print(task)` that was left in here.)
        r = requests.post(back_end_url, json=task.get_dict())
        # FIX: raise_for_status was referenced but never CALLED (missing
        # parentheses), so HTTP error statuses were silently ignored and
        # never retried.
        r.raise_for_status()
        return r.json()

    makelog("syncing...")
    try:
        r_data = req(task)
    except Exception:
        makelog("sync: req 时发生异常:{}".format(traceback.format_exc()), 1)
        return False
    else:
        if r_data.get("code") == 200:
            makelog("synced!", 4)
            return True
        else:
            makelog("sync: 返回状态异常:{}".format(r_data))
            return False
for res in Res.objects.filter(keyword=task.keyword) ] for res in task.reslist: if res.link not in prelinklist: savereslist.append(res) else: prelinklist.append(res.link) # 储存 Res.objects.bulk_create(savereslist) # 删除任务 self.tasks.remove(task) if __name__ == "__main__": while True: makelog("Manager-x 2.0 start!", 2) try: # 启动服务 cache = initManager(isManager=True, obj=Cache(), port=port, password=password) reguler_list = [ reguler("saveRes", 2, cache), reguler("udCasts", 10 * 60, cache), reguler("udDonors", 10 * 60, cache), reguler("udHots", 3 * 60 * 60, cache), reguler("udSugs", 3 * 60 * 60, cache), reguler("udResAmount", 24 * 60 * 60, cache), reguler("udKeywordAmount", 24 * 60 * 60, cache),
def udCasts(self):
    """Refresh the cached list of online casts; errors are logged only."""
    try:
        online_casts = Cast.objects.filter(online="True").values()
        self.casts = list(online_casts)
    except Exception as e:
        makelog("Error in udCasts!\n" + str(e), 1)
def udDonors(self):
    """Refresh the cached donor list, newest first; errors are logged only."""
    try:
        self.donors = list(Donor.objects.all().order_by("-time").values())
    except Exception as e:
        # FIX: the message previously said "uddonnateinfo" — a stale,
        # misspelled name; use the real method name so logs are searchable.
        makelog("Error in udDonors!\n" + str(e), 1)
def do(self):
    """Execute this sub-task.

    'MiniTask' scrapes one page for resource links; 'ParseTask' runs a
    web search for the keyword and fans out one MiniTask per hit.
    """

    @retry(tries=2)
    def net(link, params=None, allow_redirects=True):
        # Pool of browser user-agent strings; one is picked at random
        # per request.
        # FIX: every entry used to begin with a stray "User-Agent," prefix
        # (copy-paste artifact), which corrupted the header VALUE that was
        # actually sent to servers.
        UA = [
            'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
            'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
            'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0;',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv,2.0.1) Gecko/20100101 Firefox/4.0.1',
            'Mozilla/5.0 (Windows NT 6.1; rv,2.0.1) Gecko/20100101 Firefox/4.0.1',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
        ]
        r = requests.get(link,
                         headers={'User-Agent': random.choice(UA)},
                         timeout=5,
                         params=params,
                         allow_redirects=allow_redirects)
        r.raise_for_status()
        return r

    def minitask():
        # Scrape self.link for downloadable resource links and upload them.

        def get_source_code():
            # Fetch the page, manually following up to 3 redirects so
            # self.link ends up at the final location. Best effort: any
            # failure yields an empty string.
            sourcecode = ''
            try:
                n = 0
                status_code = 302
                while status_code in [302, 301] and n < 3:
                    r = net(self.link, allow_redirects=False)
                    status_code = r.status_code
                    if status_code in [301, 302]:
                        self.link = r.headers['location']
                        n = n + 1
                    else:
                        r.encoding = r.apparent_encoding
                        # Collect the page source.
                        sourcecode = r.text
            except Exception:
                # FIX: was a bare `except:`, which also swallowed
                # SystemExit/KeyboardInterrupt.
                pass
            return sourcecode

        def get_rawres(sourcecode):
            # Patterns for the three supported link schemes.
            th_r = re.compile(
                r'''thunder://[ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=]+'''
            )
            ed_r = re.compile(r'''ed2k://[\s\S]+?(?:[/])''')
            magnet_r = re.compile(
                r'''magnet:\?\S+?(?=['"“”‘’《》<>$()():])''')
            # Scan the source with each pattern and wrap hits as RawRes.
            rawres_list = []
            for res_container in [
                    [th_r.findall(sourcecode), 'thunder'],
                    [ed_r.findall(sourcecode), 'ed2k'],
                    [magnet_r.findall(sourcecode), 'magnet'],
            ]:
                for reslink in res_container[0]:
                    # Discard absurdly long matches (likely garbage).
                    if len(reslink) < 800:
                        raw_res = RawRes(self.keyword, reslink, self.link,
                                         res_container[1])
                        # Fill in filename/filesize details from the link.
                        raw_res.reslinkparser()
                        rawres_list.append(raw_res)
            return rawres_list

        sourcecode = get_source_code()
        rawres_list = get_rawres(sourcecode)
        # Hand the results to the manager cache for the owning task.
        CACHE.rawres_upload(self.keyword, rawres_list)
        makelog('MiniTask Done!', 4)

    def parsetask():
        # Search the keyword and enqueue one MiniTask per result link.
        sengine = Bing(keyWord=self.keyword + ' 下载', amount=self.DEEPTH)
        makelog('search engine start', 3)
        results = sengine.Search()
        self.weblinklist = [res['link'] for res in results]
        CACHE.subtaskqueue_puts(self.keyword, [
            SubTask(task_type='MiniTask',
                    keyword=self.keyword,
                    weblink=weblink) for weblink in self.weblinklist
        ])

    makelog('{} Start!'.format(self.task_type), 3)
    if self.task_type == 'MiniTask':
        minitask()
    elif self.task_type == 'ParseTask':
        parsetask()
    else:
        makelog('Error unknow task{}'.format(self.task_type), 1)
else: setting[key] = v json.dump(setting, open('setting.json', 'w'), ensure_ascii=False, indent=4) makelog('enginex config done!', 2) def subtask_pool_fuc(subtask): subtask.do() if __name__ == '__main__': # 载入设置 loaddetting() while True: makelog('Enginex 4.3 start !', 2) try: # 连接到服务器 CACHE = initManager(host=HOST, port=PORT, password=PASSWORD) makelog('Manager-x connected !', 2) # 建立进程池 task_pool = Pool(processes=PROCESSAMOUNT, maxtasksperchild=1) # 建立一个结果清理队列 results = [] applyed_count = PROCESSAMOUNT # 循环检查新任务 engine_status_update_time = 0 while True: now_time = time.time()