Python ParserBeautifulSoup примеры, epg2xml.providers.ParserBeautifulSoup Python примеры использования

Пример #1

0

Показать файл

 def get_svc_channels(self):
     """Build the service channel list from Daum search result pages.

     One search ("<category> 편성표") is issued per channel category. Pages
     that contain no element with ``disp-attr="B3T"`` are skipped. Channel
     names are read from the channel navigation layer, falling back to the
     ``wrap_sub`` links when that layer yields nothing. Each channel is added
     to ``self.svc_channel_list`` as a dict with ``Name``, ``ServiceId`` and
     ``Category`` keys.
     """
     search_template = "https://search.daum.net/search?DA=B3T&w=tot&rtmaxcoll=B3T&q={}"
     navi_selector = 'div[id="channelNaviLayer"] > div[class="layer_tv layer_all scroll"] > div > ul > li'
     fallback_selector = 'div[class="wrap_sub"] > span > a'
     categories = ("지상파", "종합편성", "케이블", "스카이라이프", "해외위성", "라디오")

     for category in categories:
         query = f"{category} 편성표"
         html = self.request(
             search_template.format(query), None, method="GET", output="html")
         page = BeautifulSoup(html)

         # Only pages carrying the B3T marker are processed.
         if not page.find_all(attrs={"disp-attr": "B3T"}):
             continue

         names = [str(tag.text.strip()) for tag in page.select(navi_selector)]
         if not names:
             names += [
                 str(tag.text.strip()) for tag in page.select(fallback_selector)
             ]

         # The service id uses "SKYLIFE" in place of the Korean category name,
         # while the stored Category keeps the original text.
         id_prefix = category.replace("스카이라이프", "SKYLIFE")
         entries = []
         for name in names:
             entries.append({
                 "Name": name,
                 "ServiceId": f"{id_prefix} {name}",
                 "Category": category,
             })
         self.svc_channel_list += entries

Пример #2

0

Показать файл

Файл: lg.py Проект: epg2xml/epg2xml

 def get_svc_channels(self):
     """Collect the channel list from the U+ TV channel page, one request per category.

     For every ``a[name="chList"]`` link found under an ``li`` the channel
     name, channel number and service id are extracted with regular
     expressions and appended to ``self.svc_channel_list`` as a dict with
     ``Name``, ``No``, ``ServiceId`` and ``Category`` keys.
     """
     # (display name, category code sent to the server)
     categories = (
         ("지상파", "00"),
         ("스포츠/취미", "01"),
         ("영화", "02"),
         ("뉴스/경제", "03"),
         ("교양/다큐", "04"),
         ("여성/오락", "05"),
         ("어린이/교육", "06"),
         ("홈쇼핑", "07"),
         ("공공/종교", "08"),
     )
     # Link text: everything before the last "(" is the channel name ...
     name_re = re.compile(r".+(?=[(])")
     # ... and the digits right after "Ch." are the channel number.
     number_re = re.compile(r"(?<=Ch[.])\d+")
     # onclick attribute: digits preceded by "(" or "'" and followed by "'" or ",".
     svcid_re = re.compile(r"(?<=[('])\d+(?=[',])")

     endpoint = "https://www.uplus.co.kr/css/chgi/chgi/RetrieveTvChannel.hpi"
     query = {"code": "12810"}

     for category_name, category_code in categories:
         query["category"] = category_code
         html = self.request(endpoint, query, method="GET", output="html")
         page = BeautifulSoup(html)
         for anchor in page.select('li > a[name="chList"]'):
             label = anchor.text
             entry = {
                 "Name": name_re.search(label).group(),
                 "No": str(number_re.search(label).group()),
                 "ServiceId": svcid_re.search(anchor["onclick"]).group(),
                 "Category": category_name,
             }
             self.svc_channel_list.append(entry)

Пример #3

0

Показать файл

 def get_svc_channels(self):
     """Replace ``self.svc_channel_list`` with the channels listed on the KT page.

     The text of each ``span.ch`` inside an ``li > a`` link is URL-unquoted
     and split on whitespace: the first token is used as both channel
     number and service id, the remaining tokens form the channel name.
     """
     endpoint = "https://tv.kt.com/tv/channel/pChList.asp"
     form = {"ch_type": "1", "parent_menu_id": "0"}
     html = self.request(endpoint, form, method="POST", output="html")
     page = BeautifulSoup(html)

     # Some channels (TV-only or paid channels) are blocked on the web site
     # but may still have data in practice.
     channels = []
     for anchor in page.select("li > a"):
         label = unquote(anchor.find("span", {"class": "ch"}).text.strip())
         tokens = label.split()
         channels.append({
             "Name": " ".join(tokens[1:]),
             "No": str(tokens[0]),
             "ServiceId": tokens[0],
         })
     self.svc_channel_list = channels

Пример #4

0

Показать файл

 def get_programs(self, lazy_write=False):
     """Fetch daily schedules from the KT schedule page for every requested channel.

     For each channel, ``FETCH_LIMIT`` consecutive days starting today are
     requested. Every table row holds one hour; its cells hold parallel
     lists of ``<p>`` tags for minute, programme title and category.
     Parsed programmes are appended to the channel's ``programs`` list.
     Any exception raised while fetching or parsing one day is logged and
     the loop moves on to the next day.

     Args:
         lazy_write: When false, ``to_xml`` is called on each channel as
             soon as all of its days have been processed.
     """
     endpoint = "https://tv.kt.com/tv/channel/pSchedule.asp"
     form = {
         "ch_type": "1",  # 1: live 2: skylife 3: uhd live 4: uhd skylife
         "view_type": "1",  # 1: daily 2: weekly
         "service_ch_no": "SVCID",
         "seldate": "EPGDATE",
     }
     for position, channel in enumerate(self.req_channels, start=1):
         log.info("%03d/%03d %s", position, len(self.req_channels), channel)
         fetch_days = int(self.cfg["FETCH_LIMIT"])
         for offset in range(fetch_days):
             target_day = date.today() + timedelta(days=offset)
             form["service_ch_no"] = channel.svcid
             form["seldate"] = target_day.strftime("%Y%m%d")
             try:
                 html = self.request(
                     endpoint, form, method="POST", output="html")
                 table = BeautifulSoup(html, parse_only=SoupStrainer("tbody"))
                 for tr in table.find_all("tr"):
                     columns = tr.find_all("td")
                     hour_txt = columns[0].text.strip()
                     minute_tags = columns[1].find_all("p")
                     title_tags = columns[2].find_all("p")
                     genre_tags = columns[3].find_all("p")
                     for minute_tag, title_tag, genre_tag in zip(
                             minute_tags, title_tags, genre_tags):
                         entry = EPGProgram(channel.id)
                         start_txt = f"{str(target_day)} {hour_txt}:{minute_tag.text.strip()}"
                         entry.stime = datetime.strptime(
                             start_txt, "%Y-%m-%d %H:%M")
                         entry.title = title_tag.text.replace("방송중 ", "").strip()
                         entry.category = genre_tag.text.strip()
                         # NOTE(review): every image with an alt text overwrites
                         # the rating, so the last image wins and an alt without
                         # leading digits sets it to 0 - confirm this is intended.
                         for icon in title_tag.find_all("img", alt=True):
                             age = re.match(r"([\d,]+)", icon["alt"])
                             if age:
                                 entry.rating = int(age.group(1))
                             else:
                                 entry.rating = 0
                         channel.programs.append(entry)
             except Exception:
                 log.exception("파싱 에러: %s", channel)
         if not lazy_write:
             channel.to_xml(self.cfg, no_endtime=self.no_endtime)

Пример #5

0

Показать файл

    def get_programs(self, lazy_write=False):
        """Scrape programme schedules for every requested channel from Daum search.

        For each channel in ``self.req_channels`` the "<svcid> 편성표" search
        page is fetched and its schedule table is walked column by column
        (one column per listed day, 24 hour cells per column). Parsed
        programmes are appended to the channel's ``programs`` list.

        Channels whose page lacks the ``disp-attr="B3T"`` marker or the date
        header are skipped with a warning. Programmes for which no title
        could be found are skipped; titles that ``self.title_regex`` does not
        match are kept unparsed.

        Args:
            lazy_write: When false, ``to_xml`` is called on each channel
                right after it has been parsed.
        """
        url = "https://search.daum.net/search?DA=B3T&w=tot&rtmaxcoll=B3T&q={}"
        # The title pattern is the same for every programme: compile it once.
        title_pattern = re.compile(self.title_regex)
        for idx, _ch in enumerate(self.req_channels):
            log.info("%03d/%03d %s", idx + 1, len(self.req_channels), _ch)
            search_url = url.format(quote(_ch.svcid + " 편성표"))
            data = self.request(search_url, None, method="GET", output="html")
            soup = BeautifulSoup(data)
            if not soup.find_all(attrs={"disp-attr": "B3T"}):
                log.warning("EPG 정보가 없거나 없는 채널입니다: %s", _ch)
                continue
            days = soup.select(
                'div[class="tbl_head head_type2"] > span > span[class="date"]')
            if not days:
                # Without a date header there is nothing to walk, and
                # days[0] below would raise IndexError.
                log.warning("EPG 정보가 없거나 없는 채널입니다: %s", _ch)
                continue

            # Estimate the year: the page only shows month and day.
            # Parse together with the current year; parsing "%m.%d" alone
            # uses the default year 1900, which is not a leap year, so
            # "02.29" would raise ValueError.
            currdate = datetime.now()  # expected to be later than basedate
            basedate = datetime.strptime(
                f"{currdate.year}.{days[0].text.strip()}", "%Y.%m.%d")
            if (basedate - currdate).days > 0:
                # A first column in the future means it belongs to last year
                # (e.g. a late-December column viewed in early January).
                basedate = basedate.replace(year=basedate.year - 1)

            for nd, _ in enumerate(days):
                hours = soup.select(
                    f'[id="tvProgramListWrap"] > table > tbody > tr > td:nth-of-type({nd+1})'
                )
                if len(hours) != 24:
                    log.warning("24개의 시간 행이 있어야 합니다: %s, 현재: %d", _ch,
                                len(hours))
                    break
                for nh, hour in enumerate(hours):
                    for dl in hour.select("dl"):
                        _prog = EPGProgram(_ch.id)
                        nm = int(dl.select("dt")[0].text.strip())
                        _prog.stime = basedate + timedelta(
                            days=nd, hours=nh, minutes=nm)
                        for atag in dl.select("dd > a"):
                            _prog.title = atag.text.strip()
                            # TODO: Get more details via daum search
                        for span in dl.select("dd > span"):
                            class_val = " ".join(span["class"])
                            if class_val == "":
                                _prog.title = span.text.strip()
                            elif "ico_re" in class_val:
                                _prog.rebroadcast = True
                            elif "ico_rate" in class_val:
                                _prog.rating = int(
                                    class_val.split("ico_rate")[1].strip())
                            else:
                                # ico_live ico_hd ico_subtitle ico_hand ico_uhd ico_talk ico_st
                                _prog.extras.append(span.text.strip())
                        if not _prog.title:
                            # Neither a link nor a class-less span supplied a
                            # title; the regex search would fail on it.
                            log.warning("제목이 없는 프로그램을 건너뜁니다: %s", _ch)
                            continue
                        match = title_pattern.search(_prog.title)
                        if match:
                            _prog.title = match.group("title") or None
                            _prog.part_num = match.group("part") or None
                            _prog.ep_num = match.group("epnum") or ""
                            _prog.title_sub = match.group("subname1") or ""
                            _prog.title_sub = match.group(
                                "subname2") or _prog.title_sub
                            if _prog.part_num:
                                _prog.title += f" {_prog.part_num}부"
                        # When the pattern does not match, the raw title is kept.
                        _ch.programs.append(_prog)
            if not lazy_write:
                _ch.to_xml(self.cfg, no_endtime=self.no_endtime)

Пример #6

0

Показать файл

    def get_programs(self, lazy_write=False):
        """Fetch daily schedules from the SK Broadband mobile page for every requested channel.

        At most ``max_ndays`` days (today included) are requested per
        channel; a warning is logged when ``FETCH_LIMIT`` asks for more.
        The raw HTML is cleaned with a series of regex substitutions before
        it is parsed, and each ``li.list`` row becomes one programme that is
        appended to the channel's ``programs`` list. Any exception raised
        while fetching or parsing one day is logged and the loop moves on to
        the next day.

        Args:
            lazy_write: When false, ``to_xml`` is called on each channel as
                soon as all of its days have been processed.
        """
        # Upper bound on the number of days requested (see the warning text below).
        max_ndays = 4
        if int(self.cfg["FETCH_LIMIT"]) > max_ndays:
            log.warning(
                """

***********************************************************************

%s는 당일포함 %d일치만 EPG를 제공하고 있습니다.

***********************************************************************
        """,
                self.provider_name,
                max_ndays,
            )
        url = "http://m.skbroadband.com/content/realtime/Channel_List.do"
        # Placeholder values; both keys are overwritten before every request.
        params = {"key_depth2": "SVCID", "key_depth3": "EPGDATE"}

        for idx, _ch in enumerate(self.req_channels):
            log.info("%03d/%03d %s", idx + 1, len(self.req_channels), _ch)
            for nd in range(min(int(self.cfg["FETCH_LIMIT"]), max_ndays)):
                day = date.today() + timedelta(days=nd)
                params.update({
                    "key_depth2": _ch.svcid,
                    "key_depth3": day.strftime("%Y%m%d")
                })
                try:
                    data = self.request(url,
                                        params,
                                        method="GET",
                                        output="html")
                    # Clean-up of the raw markup before parsing. The order of
                    # these substitutions is kept exactly as written.
                    # Rewrite the declared charset label inside the document.
                    data = re.sub("EUC-KR", "utf-8", data)
                    # Drop HTML comments (may span several lines: re.S).
                    data = re.sub("<!--(.*?)-->", "", data, 0, re.I | re.S)
                    # Remove the listed "round_flag" badge spans entirely.
                    data = re.sub(
                        '<span class="round_flag flag02">(.*?)</span>', "",
                        data)
                    data = re.sub(
                        '<span class="round_flag flag03">(.*?)</span>', "",
                        data)
                    data = re.sub(
                        '<span class="round_flag flag04">(.*?)</span>', "",
                        data)
                    data = re.sub(
                        '<span class="round_flag flag09">(.*?)</span>', "",
                        data)
                    data = re.sub(
                        '<span class="round_flag flag10">(.*?)</span>', "",
                        data)
                    data = re.sub(
                        '<span class="round_flag flag11">(.*?)</span>', "",
                        data)
                    data = re.sub(
                        '<span class="round_flag flag12">(.*?)</span>', "",
                        data)
                    data = re.sub('<strong class="hide">프로그램 안내</strong>', "",
                                  data)
                    # Rewrite the rest of each line that opens a "cont"/"tit"
                    # paragraph via SK.replacement.
                    # NOTE(review): SK.replacement is defined elsewhere;
                    # presumably it repairs the <p> markup - confirm.
                    data = re.sub('<p class="cont">(.*)',
                                  partial(SK.replacement, tag="p"), data)
                    data = re.sub('<p class="tit">(.*)',
                                  partial(SK.replacement, tag="p"), data)
                    # Parse only the schedule container.
                    strainer = SoupStrainer("div",
                                            {"id": "uiScheduleTabContent"})
                    soup = BeautifulSoup(data, parse_only=strainer)
                    for row in soup.find_all("li", {"class": "list"}):
                        _prog = EPGProgram(_ch.id)
                        _prog.stime = datetime.strptime(
                            f"{str(day)} {row.find('p', {'class': 'time'}).text}",
                            "%Y-%m-%d %H:%M")
                        # Hidden <i> labels carry either the age rating
                        # ("N세 이상") or extra flags.
                        for itag in row.select('i[class="hide"]'):
                            itxt = itag.text.strip()
                            if "세 이상" in itxt:
                                _prog.rating = int(
                                    itxt.replace("세 이상", "").strip())
                            else:
                                _prog.extras.append(itxt)
                        cell = row.find("p", {"class": "cont"})
                        # Rows without a "cont" paragraph are not added.
                        if cell:
                            # Remove the first nested span so its text does
                            # not end up in the title.
                            if cell.find("span"):
                                cell.span.decompose()
                            _prog.title = cell.text.strip()
                            matches = re.match(self.title_regex, _prog.title)
                            # When the pattern does not match, the raw title is kept.
                            if matches:
                                _prog.title = matches.group(1) or ""
                                _prog.title_sub = matches.group(5) or ""
                                _prog.rebroadcast = bool(matches.group(7))
                                _prog.ep_num = matches.group(3) or ""
                            _ch.programs.append(_prog)
                except Exception:
                    log.exception("파싱 에러: %s", _ch)
            if not lazy_write:
                _ch.to_xml(self.cfg, no_endtime=self.no_endtime)

Пример #7

0

Показать файл

Файл: lg.py Проект: epg2xml/epg2xml

    def get_programs(self, lazy_write=False):
        max_ndays = 5
        if int(self.cfg["FETCH_LIMIT"]) > max_ndays:
            log.warning(
                """

***********************************************************************

%s는 당일포함 %d일치만 EPG를 제공하고 있습니다.

***********************************************************************
            """,
                self.provider_name,
                max_ndays,
            )
        url = "http://www.uplus.co.kr/css/chgi/chgi/RetrieveTvSchedule.hpi"
        params = {"chnlCd": "SVCID", "evntCmpYmd": "EPGDATE"}
        for idx, _ch in enumerate(self.req_channels):
            log.info("%03d/%03d %s", idx + 1, len(self.req_channels), _ch)
            for nd in range(min(int(self.cfg["FETCH_LIMIT"]), max_ndays)):
                day = date.today() + timedelta(days=nd)
                params.update({
                    "chnlCd": _ch.svcid,
                    "evntCmpYmd": day.strftime("%Y%m%d")
                })
                try:
                    data = self.request(url,
                                        params,
                                        method="POST",
                                        output="html")
                    data = data.replace("<재>", "&lt;재&gt;").replace(
                        " [..", "").replace(" (..", "")
                    soup = BeautifulSoup(data,
                                         parse_only=SoupStrainer("table"))
                    if not str(soup):
                        log.warning("EPG 정보가 없거나 없는 채널입니다: %s", _ch)
                        # 오늘 없으면 내일도 없는 채널로 간주
                        break
                    for row in soup.find("table").tbody.find_all("tr"):
                        cell = row.find_all("td")
                        _prog = EPGProgram(_ch.id)
                        _prog.stime = datetime.strptime(
                            f"{str(day)} {cell[0].text}", "%Y-%m-%d %H:%M")
                        for span in cell[1].select("span > span[class]"):
                            span_txt = span.text.strip()
                            if "cte_all" in span["class"]:
                                _prog.rating = 0 if span_txt == "All" else int(
                                    span_txt)
                            else:
                                _prog.extras.append(span_txt)
                        cell[1].find("span", {"class": "tagGroup"}).decompose()
                        _prog.title = cell[1].text.strip()
                        matches = re.match(self.title_regex, _prog.title)
                        if matches:
                            _prog.title = (matches.group(1) or "").strip()
                            _prog.title_sub = (matches.group(2) or "").strip()
                            _prog.ep_num = matches.group(3) or ""
                            _prog.rebroadcast = bool(matches.group(4))
                        _prog.category = cell[2].text.strip()
                        _ch.programs.append(_prog)
                except Exception:
                    log.exception("파싱 에러: %s", _ch)
            if not lazy_write:
                _ch.to_xml(self.cfg, no_endtime=self.no_endtime)

Python ParserBeautifulSoup примеры использования