Example #1
    def putrawres(self, rawres_list):
        def rawres_to_res(rawres, kw: Keyword):
            return Res(
                keyword=kw,
                link=rawres.reslink,
                web=rawres.weblink,
                type=rawres.type,
                filename=rawres.filename,
                filesize=rawres.filesize,
            )

        # Update the task's last active time
        makelog("Update task last-active time")
        self.last_active_time = time.time()
        # Update status and progress
        makelog("Update status and progress")
        self.subtask_done_counter += 1
        self.progress = self.subtask_done_counter * 100 / self.subtask_total_counter
        if self.subtask_done_counter == self.subtask_total_counter:
            self.statu = "Done"
        else:
            self.statu = "Digging"
        # Collect resource instances
        if len(rawres_list) > 0:
            makelog("Fetching keyword")
            kw = Keyword.objects.get(keyword=self.keyword)
            makelog("Collecting resource instances")
            for rawres in rawres_list:
                self.reslist.append(rawres_to_res(rawres, kw))
        makelog("SubTask done! {}".format(self.keyword), 4)
Example #2
File: hub.py Project: iridesc/taskhub
    def add(self, task: Task):
        """Add a task to the hub.

        Args:
            task (Task): the task to add

        Raises:
            TaskAlreadyExist: the task key already exists

        Returns:
            bool: True on success
        """

        # Acquire the lock
        self.get_lock()
        # Check whether the key already exists
        if not self.tasks.get(task.key):
            # Not present: add it
            self.tasks[task.key] = task
            makelog("task added!", 4)
            # Release the lock and return
            self.release_lock()
            return True
        else:
            # Already present: release the lock and raise
            self.release_lock()
            raise TaskAlreadyExist()
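
A minimal usage sketch (hypothetical caller; hub stands for a Hub instance and task for a Task built elsewhere):

try:
    hub.add(task)
except TaskAlreadyExist:
    makelog("task already registered: {}".format(task.key), 2)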
Example #3
 def udHots(self):
     # Refresh the hot-keyword cache: the 50 most-visited keywords flagged for recommendation
     try:
         hots = list(
             Keyword.objects.filter(
                 showInRec="True").order_by("-visitTimes")[0:50].values())
         self.hots = hots
     except Exception as e:
         makelog("Error in udhotkeylist!\n" + str(e), 1)
Example #4
        def minitask():
            def get_source_code():
                sourcecode = ''
                try:
                    n = 0
                    status_code = 302
                    while status_code in [302, 301] and n < 3:
                        r = net(self.link, allow_redirects=False)
                        status_code = r.status_code
                        if status_code in [301, 302]:
                            self.link = r.headers['location']
                            n = n + 1
                        else:
                            r.encoding = r.apparent_encoding
                            # Collect the page source
                            sourcecode = r.text
                except Exception:
                    # Swallow network errors; an empty (or partial) source string is returned below
                    pass

                return sourcecode

            def get_rawres(sourcecode):
                # Link-matching patterns
                th_r = re.compile(
                    r'''thunder://[ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=]+'''
                )
                ed_r = re.compile(r'''ed2k://[\s\S]+?(?:[/])''')
                magnet_r = re.compile(
                    r'''magnet:\?\S+?(?=['"“”‘’《》<>$()():])''')

                # Scan the source for resource links
                rawres_list = []
                for res_container in [
                    [th_r.findall(sourcecode), 'thunder'],
                    [ed_r.findall(sourcecode), 'ed2k'],
                    [magnet_r.findall(sourcecode), 'magnet'],
                ]:
                    for reslink in res_container[0]:
                        if len(reslink) < 800:
                            raw_res = RawRes(self.keyword, reslink, self.link,
                                             res_container[1])
                            # Parse the link to fill in file name and size
                            raw_res.reslinkparser()
                            # Add to the result list
                            rawres_list.append(raw_res)
                return rawres_list

            # print('---------------------------')
            sourcecode = get_source_code()
            rawres_list = get_rawres(sourcecode)
            # Locate the owning task and upload the raw results
            CACHE.rawres_upload(self.keyword, rawres_list)
            makelog('MiniTask Done!', 4)
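
For illustration only, a standalone sketch of how the thunder:// pattern above behaves; the written-out character class is equivalent to [A-Za-z0-9+/=], and the sample HTML and link are made up:

import re

th_r = re.compile(r'thunder://[A-Za-z0-9+/=]+')
sample = '<a href="thunder://QUFodHRwOi8vZXhhbXBsZS5jb20vZmlsZS56aXBaWg==">download</a>'
print(th_r.findall(sample))
# ['thunder://QUFodHRwOi8vZXhhbXBsZS5jb20vZmlsZS56aXBaWg==']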
Example #5
        def parsetask():

            # '下载' means "download"; it is appended to bias the query toward download pages
            sengine = Bing(keyWord=self.keyword + ' 下载', amount=self.DEEPTH)
            makelog('search engine start', 3)
            results = sengine.Search()
            self.weblinklist = [res['link'] for res in results]

            # Upload the resulting SubTasks
            CACHE.subtaskqueue_puts(self.keyword, [
                SubTask(task_type='MiniTask',
                        keyword=self.keyword,
                        weblink=weblink) for weblink in self.weblinklist
            ])
Example #6
 def __init__(self, keyword, subtaskqueue):
     self.keyword = keyword
     self.statu = 'Initing'
     self.progress = 0
     self.subtask_done_counter = 0
     self.subtask_total_counter = 0
     self.reslist = []
     # Seed the queue with an initial ParseTask for this keyword
     subtaskqueue.put(
         SubTask(
             task_type='ParseTask',
             keyword=keyword,
             DEEPTH=DEEPTH,
         ))
     self.last_active_time = time.time()
     makelog('Task inited {}'.format(self.keyword))
Example #7
        def parsetask():

            sengine = Bing(keyWord=self.keyword + " 下载", amount=self.DEEPTH)
            makelog("search engine start", 3)
            results = sengine.Search()
            self.weblinklist = [res["link"] for res in results]

            # Upload the resulting SubTasks
            CACHE.subtaskqueue_puts(
                self.keyword,
                [
                    SubTask(task_type="MiniTask",
                            keyword=self.keyword,
                            weblink=weblink) for weblink in self.weblinklist
                ],
            )
Example #8
    def search(self, kw=None, amount=10):
        if kw is not None and kw != self.keyword:
            self.pageNo, self.results = 0, []
            self.keyword = kw  # kw=None keeps the previous keyword
        started = time.time()
        while len(self.results) < amount and (time.time() -
                                              started) < self.timeout:
            try:
                self._addResult(self.info.url, self.getRequestParams())
                self.pageNo += 1
            except Exception as e:
                makelog(f"Failed to get page! {e}", 1)

        if len(self.results) < amount:
            makelog(f"Search {self.keyword} timed out", 2)
        return self.results
Example #9
 def getPageResults():
     resp = self.session.get(url,
                             params=params,
                             headers=BaseEngine.headers)
     try:
         resp.raise_for_status()
     except Exception:
         makelog("Connection error", 2)
     soup = BeautifulSoup(resp.text, "html.parser")
     res = self.parseResult(soup)
     if len(res) == 0:
         makelog(f"Parse fail: {url} {params}", 2)
         # Save the page for offline debugging
         with open("error.html", "w+") as f:
             f.write(resp.text)
         raise Exception()  # fail this attempt so a retry wrapper can try again
     return res
Example #10
    def udSugs(self):
        # Rebuild the suggestion string: up to 10000 recommendable keywords joined by '*'
        try:
            n = 0
            newSugs = ""
            for kw in Keyword.objects.filter(
                    showInRec="True").order_by("-visitTimes")[0:20000]:
                if n > 10000 or kw.visitTimes < 2:
                    break
                elif checkKeyword(kw.keyword):
                    newSugs += kw.keyword + "*"
                    n += 1
            self.sugs = newSugs
            makelog(str(n) + " : " + str(len(self.sugs)))

        except Exception as e:
            makelog("Error in udSugs!\n" + str(e), 1)
Example #11
    def __init__(self,
                 task_type: str,
                 keyword: str,
                 weblink=None,
                 DEEPTH=None):
        self.task_type = task_type
        self.keyword = keyword

        if DEEPTH is not None and task_type == 'ParseTask':
            self.DEEPTH = DEEPTH

        elif task_type == 'MiniTask':
            self.link = weblink
        else:
            makelog('Task type error!', 1)
            raise ValueError('unknown task type: {}'.format(task_type))
        makelog('SubTask inited:{}'.format(self.task_type), 4)
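
A minimal construction sketch (the keyword and weblink values are placeholders):

parse_sub = SubTask(task_type='ParseTask', keyword='ubuntu iso', DEEPTH=2)
mini_sub = SubTask(task_type='MiniTask', keyword='ubuntu iso',
                   weblink='http://example.com/page.html')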
Example #12
    def putrawres(self, rawres_list):
        def rawres_to_res(rawres):
            return Resourcetable(keyword=rawres.keyword,
                                 link=rawres.reslink,
                                 web=rawres.weblink,
                                 type=rawres.type,
                                 filename=rawres.filename,
                                 filesize=rawres.filesize)

        # Update the task's last active time
        self.last_active_time = time.time()
        # Update status and progress
        self.subtask_done_counter += 1
        self.progress = self.subtask_done_counter * 100 / self.subtask_total_counter
        if self.subtask_done_counter == self.subtask_total_counter:
            self.statu = 'Done'
        else:
            self.statu = 'Digging'
        for rawres in rawres_list:
            self.reslist.append(rawres_to_res(rawres))
        makelog('SubTask done! {}'.format(self.keyword), 4)
Example #13
def loaddetting():
    # Load ./setting.json into the module-level configuration globals
    global ENGINENAME, PASSWORD, HOST, PORT, PROCESSAMOUNT
    with open('./setting.json') as f:
        setting = json.load(f)

    ENGINENAME = setting['EngineName']
    PASSWORD = setting['Password']
    HOST = setting['Host']
    PORT = setting['Port']
    process_override = setting['ProcessOverride']
    PROCESSAMOUNT = int(cpu_count() * float(process_override))

    makelog(
        'load setting success:\nEngineName:{}\nPassword:{}\nHost:{}\nPort:{}\nProcessOverride:{}\nProcess:{}'
        .format(
            ENGINENAME,
            PASSWORD,
            HOST,
            PORT,
            process_override,
            PROCESSAMOUNT,
        ), 2)
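
For reference, a setting.json in the shape loaddetting() expects could be produced like this; EngineName and Password are placeholders, while Host, Port and ProcessOverride mirror the defaults suggested by config() further below:

import json

sample_setting = {
    'EngineName': 'my-engine',
    'Password': 'change-me',
    'Host': '0.0.0.0',
    'Port': 23333,
    'ProcessOverride': 2.0,
}
with open('./setting.json', 'w') as f:
    json.dump(sample_setting, f, ensure_ascii=False, indent=4)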
Example #14
    def serve(self):
        t = 0
        while True:
            if time.time() - t > self.check_gap:

                try:
                    # Acquire the lock
                    self.get_lock()
                    # Log a summary of task statuses
                    status_count = dict()
                    for key, task in self.tasks.items():
                        status_count[task.status] = status_count.get(task.status, 0) + 1
                    status_str = ""
                    for statu, amount in status_count.items():
                        status_str += "{}:{}  ".format(statu, amount)
                    makelog(status_str)

                    # Find tasks that are ready to be synced
                    for_del_key = []
                    for key, task in self.tasks.items():
                        if task.status == "done":
                            # Push the finished task to the back end
                            if self.sync(task):
                                for_del_key.append(key)
                    # Drop the tasks that synced successfully
                    for key in for_del_key:
                        del self.tasks[key]

                except Exception:
                    makelog("Unexpected exception: {}".format(traceback.format_exc()), 1)

                finally:
                    # Release the lock
                    self.release_lock()

                t = time.time()
                time.sleep(1)
Example #15
def config():
    makelog('Enginex Config :', 2)
    setting = {
        'EngineName': None,
        'Password': None,
        'Host': '0.0.0.0',
        'Port': 23333,
        'ProcessOverride': 2.0,
    }
    try:
        with open('setting.json') as f:
            usersetting = json.load(f)
    except Exception:
        usersetting = setting

    for key, value in setting.items():
        v = input('please input {} \nrecommended: {}\ncurrent: {}:'.format(
            key, value, usersetting[key]))
        if v == '':
            setting[key] = usersetting[key]
        else:
            setting[key] = v

    with open('setting.json', 'w') as f:
        json.dump(setting, f, ensure_ascii=False, indent=4)
    makelog('enginex config done!', 2)
Example #16
    def _addResult(self, url, params):
        @retry(tries=BaseEngine.n_retries,
               delay=random.random() * BaseEngine.dt_retries)
        def getPageResults():
            resp = self.session.get(url,
                                    params=params,
                                    headers=BaseEngine.headers)
            try:
                resp.raise_for_status()
            except Exception:
                makelog("Connection error", 2)
            soup = BeautifulSoup(resp.text, "html.parser")
            res = self.parseResult(soup)
            if len(res) == 0:
                makelog(f"Parse fail: {url} {params}", 2)
                # Save the page for offline debugging
                with open("error.html", "w+") as f:
                    f.write(resp.text)
                raise Exception()  # fail this attempt so @retry can try again
            return res

        makelog(
            f"{self.info.name} {self.keyword} #{self.pageNo} n={len(self.results)}"
        )
        self.results.extend(getPageResults())
Example #17
def sync(task):

    back_end_url = "http://127.0.0.1/api/"

    @retry(tries=5, delay=1, backoff=1)
    def req(task):
        print(task)
        r = requests.post(back_end_url, json=task.get_dict())
        r.raise_for_status()
        return r.json()

    makelog("syncing...")
    try:
        r_data = req(task)
    except Exception:
        makelog("sync: exception during req: {}".format(traceback.format_exc()), 1)
        return False
    else:
        if r_data.get("code") == 200:
            makelog("synced!", 4)
            return True
        else:
            makelog("sync: 返回状态异常:{}".format(r_data))
            return False
Example #18
                    for res in Res.objects.filter(keyword=task.keyword)
                ]
                for res in task.reslist:
                    # Keep only links not already stored for this keyword
                    if res.link not in prelinklist:
                        savereslist.append(res)
                        prelinklist.append(res.link)
                # Bulk-save the new resources
                Res.objects.bulk_create(savereslist)
                # Remove the finished task
                self.tasks.remove(task)


if __name__ == "__main__":
    while True:
        makelog("Manager-x 2.0 start!", 2)
        try:
            # Start the cache manager service
            cache = initManager(isManager=True,
                                obj=Cache(),
                                port=port,
                                password=password)

            reguler_list = [
                reguler("saveRes", 2, cache),
                reguler("udCasts", 10 * 60, cache),
                reguler("udDonors", 10 * 60, cache),
                reguler("udHots", 3 * 60 * 60, cache),
                reguler("udSugs", 3 * 60 * 60, cache),
                reguler("udResAmount", 24 * 60 * 60, cache),
                reguler("udKeywordAmount", 24 * 60 * 60, cache),
Example #19
 def udCasts(self):
     try:
         self.casts = list(Cast.objects.filter(online="True").values())
     except Exception as e:
         makelog("Error in udCasts!\n" + str(e), 1)
Example #20
 def udDonors(self):
     try:
         self.donors = list(Donor.objects.all().order_by("-time").values())
     except Exception as e:
         makelog("Error in uddonnateinfo!\n" + str(e), 1)
Example #21
    def do(self):
        @retry(tries=2)
        def net(link, params=None, allow_redirects=True):
            # A small pool of desktop User-Agent strings to rotate through
            UA = [
                'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
                'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
                'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
                'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
                'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
                'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
            ]

            r = requests.get(link,
                             headers={'User-Agent': random.choice(UA)},
                             timeout=5,
                             params=params,
                             allow_redirects=allow_redirects)
            # print(r.request.url)
            r.raise_for_status()
            return r

        def minitask():
            def get_source_code():
                sourcecode = ''
                try:
                    n = 0
                    status_code = 302
                    while status_code in [302, 301] and n < 3:
                        r = net(self.link, allow_redirects=False)
                        status_code = r.status_code
                        if status_code in [301, 302]:
                            self.link = r.headers['location']
                            n = n + 1
                        else:
                            r.encoding = r.apparent_encoding
                            # Collect the page source
                            sourcecode = r.text
                except Exception:
                    # Swallow network errors; an empty (or partial) source string is returned below
                    pass

                return sourcecode

            def get_rawres(sourcecode):
                # Link-matching patterns
                th_r = re.compile(
                    r'''thunder://[ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=]+'''
                )
                ed_r = re.compile(r'''ed2k://[\s\S]+?(?:[/])''')
                magnet_r = re.compile(
                    r'''magnet:\?\S+?(?=['"“”‘’《》<>$()():])''')

                # Scan the source for resource links
                rawres_list = []
                for res_container in [
                    [th_r.findall(sourcecode), 'thunder'],
                    [ed_r.findall(sourcecode), 'ed2k'],
                    [magnet_r.findall(sourcecode), 'magnet'],
                ]:
                    for reslink in res_container[0]:
                        if len(reslink) < 800:
                            raw_res = RawRes(self.keyword, reslink, self.link,
                                             res_container[1])
                            # Parse the link to fill in file name and size
                            raw_res.reslinkparser()
                            # Add to the result list
                            rawres_list.append(raw_res)
                return rawres_list

            # print('---------------------------')
            sourcecode = get_source_code()
            rawres_list = get_rawres(sourcecode)
            # Locate the owning task and upload the raw results
            CACHE.rawres_upload(self.keyword, rawres_list)
            makelog('MiniTask Done!', 4)

        def parsetask():

            # '下载' means "download"; it is appended to bias the query toward download pages
            sengine = Bing(keyWord=self.keyword + ' 下载', amount=self.DEEPTH)
            makelog('search engine start', 3)
            results = sengine.Search()
            self.weblinklist = [res['link'] for res in results]

            # Upload the resulting SubTasks
            CACHE.subtaskqueue_puts(self.keyword, [
                SubTask(task_type='MiniTask',
                        keyword=self.keyword,
                        weblink=weblink) for weblink in self.weblinklist
            ])

        makelog('{} Start!'.format(self.task_type), 3)

        if self.task_type == 'MiniTask':
            minitask()
        elif self.task_type == 'ParseTask':
            parsetask()
        else:
            makelog('Error: unknown task type {}'.format(self.task_type), 1)
Example #22
        else:
            setting[key] = v

    with open('setting.json', 'w') as f:
        json.dump(setting, f, ensure_ascii=False, indent=4)
    makelog('enginex config done!', 2)


def subtask_pool_fuc(subtask):
    subtask.do()


if __name__ == '__main__':
    # Load settings
    loaddetting()
    while True:
        makelog('Enginex 4.3 start !', 2)
        try:
            # Connect to the manager server
            CACHE = initManager(host=HOST, port=PORT, password=PASSWORD)
            makelog('Manager-x connected !', 2)

            # Create the worker process pool
            task_pool = Pool(processes=PROCESSAMOUNT, maxtasksperchild=1)
            # Track submitted results so they can be cleaned up later
            results = []
            applyed_count = PROCESSAMOUNT

            # Poll for new tasks in a loop
            engine_status_update_time = 0
            while True:
                now_time = time.time()