def listen_ss(self, p: str, time_str: str, interval_sec: int = 60):
    """
    Computes its wait time independently; not meant to be used together with
    BangumiController.py. Polls for the latest danmu of a bangumi episode on a
    schedule. At initialization, any episode URL from the same season works.
    For an already-published episode, pass the current time as time_str. The
    point of passing a time is to cut down unnecessary "is the target episode
    available yet" checks and thereby the chance of getting banned. For a
    future episode, the script starts once the scheduled time arrives, and it
    will not fail even if the show goes public a few minutes late. Danmu
    fetching begins as soon as the episode becomes watchable, and the fetch
    interval is adjusted dynamically from the danmu increment between two
    consecutive fetches.
    :param p: video part (episode number); it may be unpublished, as long as
              the URL used at initialization belongs to the same series
    :param time_str: video release time in "yyyy-mm-ddThh:mm" format,
                     e.g. "2020-01-02T03:04"
    :param interval_sec: initial fetch interval in seconds, must be > 10
    :return:
    """
    target_time = Converter.str_to_timestamp(time_str)
    interval_sec = max(interval_sec, 11)
    sec_wait = max(11, target_time - int(time.time()))
    print("wait:", sec_wait, "seconds")
    time.sleep(sec_wait - 10)
    # poll until the target episode becomes available
    while True:
        url = "https://www.bilibili.com/bangumi/play/" + self.ssid
        response = Spider.get_html(url)
        ep_json = self.get_epinfo_in_html(response)
        new_series = ep_json['epList']
        if len(new_series) >= int(p):
            print("episode available, start fetching")
            time.sleep(5)
            target_ep = new_series[int(p) - 1]["id"]
            new_url = "https://www.bilibili.com/bangumi/play/ep" + str(target_ep)
            self._get_info_ep(new_url)
            break
        print("target episode not found, waiting", interval_sec, "seconds")
        time.sleep(interval_sec)
    previous_danmu = None
    while True:
        content_bytes = Spider.get_current_danmu(self.cid, self.url)
        now = datetime.fromtimestamp(time.time(), timezone(timedelta(hours=8))).strftime('%Y-%m-%d %H:%M:%S')
        print(now, "fetched danmu")
        with open(self.fileName + '_latest_' + str(int(time.time())) + '.xml', 'wb') as f:
            f.write(content_bytes)
        danmu = DanmuFile.init_from_str(content_bytes.decode('utf-8'))
        if previous_danmu is not None:
            _, inc, _ = DanmuCombinator.diff(previous_danmu, danmu)
            ratio = len(inc) / int(danmu.max_limit)
            print("increment ratio:", ratio)
            if ratio > 0.5:
                # pool is cycling fast: shrink the interval, but keep it > 10s
                interval_sec = max(int(interval_sec / 5), 11)
                print("interval changed to:", interval_sec)
            if ratio < 0.3:
                # pool is cycling slowly: back off, capped at 30 minutes
                interval_sec = min(int(interval_sec * 1.5), 1800)
                print("interval changed to:", interval_sec)
        previous_danmu = danmu
        time.sleep(int(interval_sec))
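# Usage sketch for listen_ss. Assumption: a DanmuMaster-like object has been
# initialized from any episode URL of the same season (the constructor is not
# part of this excerpt, so `master` and its setup are hypothetical). Episode 5
# is scheduled for 2020-01-02T03:04; polling starts then with a 60 s interval.
#
#     master = DanmuMaster()  # hypothetical setup, see class constructor
#     master.listen_ss(p="5", time_str="2020-01-02T03:04", interval_sec=60)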
def listen_ss_once(self):
    content_bytes = Spider.get_current_danmu(self.cid, self.url)
    if content_bytes is None:
        return -1
    now = datetime.fromtimestamp(time.time(), timezone(timedelta(hours=8))).strftime('%Y-%m-%d %H:%M:%S')
    print('[TASK]', now, "fetched: [", self.title, "]")
    with open(self.fileName + '_latest_' + str(int(time.time())) + '.xml', 'wb') as f:
        f.write(content_bytes)
    danmu = DanmuFile.init_from_str(content_bytes.decode('utf-8'))
    ratio = -1
    if self.danmu_set is not None:
        dep, inc, com = DanmuCombinator.diff(self.danmu_set, danmu)
        dep_int, inc_int, com_int = len(dep), len(inc), len(com)
        print("[TASK] existing danmu [", dep_int + com_int, "], new danmu [", inc_int, end=' ], ')
        ratio = inc_int / int(danmu.max_limit)
        print("increment ratio: [", format(ratio, '0.5f'), "]")
    else:
        print("[TASK] first fetch")
    self.danmu_set = danmu
    self.timeProgress = int(time.time())
    return ratio
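# Scheduler sketch. Assumption: this shows how a caller such as
# BangumiController.py might use the returned ratio; the real controller is
# not part of this excerpt, so `task` and the thresholds mirror listen_ss
# rather than confirmed controller code. A high increment ratio means the
# danmu pool is cycling fast, so poll sooner; -1 means the request failed or
# this was the first fetch, so the interval is left unchanged.
#
#     interval = 60
#     while True:
#         ratio = task.listen_ss_once()
#         if ratio > 0.5:
#             interval = max(interval // 5, 11)
#         elif 0 <= ratio < 0.3:
#             interval = min(int(interval * 1.5), 1800)
#         time.sleep(interval)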
def _get_current_danmu(self):
    content_bytes = Spider.get_current_danmu(self.cid, self.url)
    if content_bytes is None:
        return False
    # danmu file to be merged with the history danmu
    with open(self.fileName + '.xml', 'wb') as f:
        f.write(content_bytes)
    # danmu currently in the (size-limited) danmu pool
    with open(self.fileName + '_latest.xml', 'wb') as f:
        f.write(content_bytes)
    return True
def _get_info_av(self, url: str):
    html = Spider.get_html(url)
    pattern = re.compile(r'"cid":(\d+),"page":%s' % self.page)
    pattern1 = re.compile(r'"title":"(.*?)","pubdate":(\d+)')
    self.cid = re.search(pattern, html).group(1)
    self.title, timeUnix_str = re.search(pattern1, html).groups()
    self.timeUnix = int(timeUnix_str)
    folder = "harvest/" + self.no + '_' + DanmuMaster.process_filename(self.title) + "/"
    if not os.path.exists(folder):
        os.mkdir(folder)
    file_name = DanmuMaster.process_filename(self.no + '_' + self.title + '_p' + self.page)
    self.fileName = folder + file_name
def _get_history_danmu(self, date: str):
    """
    Send a history danmu request for a specific date and return the XML string.
    :param date: date string in 'YYYY-MM-DD' format
    :return: xml string in UTF-8 encoding
    """
    content_bytes = Spider.get_history_danmu(self.cid, self.url, date, self.cookie_path)
    xml_str = content_bytes.decode('utf-8')
    with open(self.fileName + '_' + date + '.xml', 'wb') as f:
        f.write(content_bytes)
    print('data length', len(xml_str))
    return xml_str
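# Usage sketch (hypothetical dates): pull the archived danmu pool for a few
# specific days; each call also writes a dated .xml snapshot next to fileName.
#
#     for day in ("2020-01-02", "2020-01-03"):
#         xml_str = master._get_history_danmu(day)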
def check_ep_exist(self):
    response = Spider.get_html(self.url)
    if response is None:
        print("[WARNING] failed to fetch:", self.title)
        return False
    ep_json = self.get_epinfo_in_html(response)
    ep_int = int(self.no[2:])
    new_series = ep_json['epList']
    for ep in new_series:
        if ep['id'] == ep_int:
            self.init_from_ep_json(ep_json, ep_int, self.cookie_path)
            print("[TASK] new episode:", ep_json['h1Title'], "is out")
            return True
    return False
def _get_info_ep(self, url: str):
    html = None
    none_times = 0
    # retry up to 5 times before giving up
    while html is None and none_times < 5:
        html = Spider.get_html(url)
        none_times += 1
    if html is None:
        print("failed to fetch bangumi info after repeated requests")
        exit(1)
    ep_json = self.get_epinfo_in_html(html)
    self._resolve_ep_json(ep_json)
def _get_current_danmu(self):
    content_bytes = None
    none_count = 0
    # retry up to 5 times before giving up
    while content_bytes is None and none_count < 5:
        content_bytes = Spider.get_current_danmu(self.cid, self.url)
        none_count += 1
    if content_bytes is None:
        print("repeated requests failed.")
        return False
    # danmu file to be merged with the history danmu
    with open(self.fileName + '.xml', 'wb') as f:
        f.write(content_bytes)
    # danmu currently in the (size-limited) danmu pool
    with open(self.fileName + '_latest.xml', 'wb') as f:
        f.write(content_bytes)
    return True
def _get_info_ep(self, url: str):
    html = Spider.get_html(url)
    ep_json = self.get_epinfo_in_html(html)
    self._resolve_ep_json(ep_json)
def _get_history_month(self, month: str):
    content_bytes = Spider.get_history_month(self.cid, self.url, month, self.cookie_path)
    json_str = content_bytes.decode('utf-8')
    return json_str
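# Usage sketch tying _get_history_month and _get_history_danmu together.
# Assumption: the returned JSON carries the list of dates that actually have
# archived danmu; the exact key ("data" below) is an assumption about the
# payload shape and is not confirmed by this excerpt.
#
#     import json
#     month_json = json.loads(master._get_history_month("2020-01"))
#     for day in month_json.get("data") or []:
#         master._get_history_danmu(day)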