def pull(self) -> NatalieArticle:
    """Scrape the article page held in ``self.content`` into a NatalieArticle.

    Required fields (category, title, update_datetime, content) raise if
    their elements are missing; best-effort fields (star, thumbnail) fall
    back to defaults and only log the miss.

    Returns:
        NatalieArticle built from the parsed page.
    """
    # Breadcrumb <li> texts minus the last entry (the article itself) form
    # the category path.
    # BUG FIX: attrs must be a dict ({"class": ...}); the original passed a
    # set literal ({"class", ...}), which BeautifulSoup does not accept.
    breadcrumb_items = self.content.find(
        "ul", {"class": "NA_breadcrumb"}).findAll("li")
    category = ",".join([li.text for li in breadcrumb_items][:-1])
    title = self.content.find(
        "div", {"class": "NA_articleHeader"}).find("h1").text
    update_datetime = self.content.find(
        "p", {"class": "NA_date"}).find("time").text
    # Best-effort: default star count when the element is absent.
    star = "0"
    try:
        star = self.content.find("p", {"class": "NA_res2"}).find("a").text
    except Exception:
        logging.info(f"{self.url} star not found")
    # Best-effort: placeholder image when the article has no figure.
    thumbnail = "https://s3-ap-northeast-1.amazonaws.com/lifull-homes-press/uploads/press/2019/09/14-300x225.jpg"
    try:
        thumbnail = self.content.find(
            "p", {"class": "NA_figure"}).find("img").get("src")
    except Exception:
        logging.info(f"{self.url} thumbnail not found")
    # Strip commas/newlines/quotes that would break downstream CSV handling.
    content = self.content.find(
        "div", {"class": "NA_articleBody"}).text.replace(
            ",", "").replace("\n", "").replace("\"", "")
    return NatalieArticle(url=self.url,
                          category=category,
                          title=title,
                          update_datetime=update_datetime,
                          star=star,
                          thumbnail=thumbnail,
                          content=content)
def save_all_category_as_json(self, output_path: str):
    """Pull every category ranking and write it out as a JSON file.

    The file is written to ``{output_path}/{base_file_name}_{today}.json``.

    Args:
        output_path: Directory the JSON file is written into.
    """
    dj = self.pull_all_category()
    # Renamed from ``json`` so the stdlib json module is not shadowed.
    json_text = dj.to_json(indent=4, ensure_ascii=False)
    full_output_path = f"{output_path}/{self.base_file_name}_{get_today_date()}.json"
    logging.info(
        f"save_all_category_as_json: output to {full_output_path}")
    # ensure_ascii=False produces non-ASCII output, so pin the encoding
    # instead of relying on the platform default.
    with open(full_output_path, "w", encoding="utf-8") as f:
        f.write(json_text)
def get_page(self, key=""):
    """Fetch the tabelog ranking page (https://tabelog.com/<key>/rank).

    Args:
        key: Optional path segment (area/category); empty string requests
            the overall ranking.

    Returns:
        The HTML response object produced by ``self.request.get``.
    """
    suffix = "rank" if key == "" else f"{key}/rank"
    url = f"{self.base_url}/{suffix}"
    logging.info(f"request to {url}")
    return self.request.get(url=url, response_content_type="html")
def _parse_article(article):
    """Parse one list entry and pull its full article page.

    ``self`` is a free variable from the enclosing scope.

    Returns:
        The pulled NatalieArticleSite result, or None when the entry
        cannot be parsed (the failure is logged).
    """
    try:
        # The cleaned title is unused below, but looking it up validates
        # the entry's structure before we follow the link.
        raw_title = article.find("dt", {"class": "NA_title"}).text
        title = raw_title.replace("\n", "").replace(",", "").replace("\"", "")
        detail_link = article.find("a").get("href")
        return NatalieArticleSite(request=self.request, url=detail_link).pull()
    except Exception:
        logging.error(
            f"_parse_article error occured : {sys.exc_info()}")
        return None
def category_ranking_menu(self):
    """Return the per-category ranking menu as (label, url) tuples.

    Scrapes the level-2 ranking navigation and appends the overall
    ranking ("総合") entry at the end.

    Returns:
        list[tuple[str, str]]: (category label, ranking page URL) pairs.
    """
    # BUG FIX: attrs must be dicts ({"class": ...}); the original passed
    # set literals ({"class", ...}), which BeautifulSoup does not accept.
    items = self.get_page().content.find(
        "ul", {"class": "rank-level2"}).findAll("li", {"class": "level2"})
    # (Removed a dead ``result = []`` that was immediately reassigned.)
    result = [(li.text.replace("\n", ""), li.find("a").get("href"))
              for li in items]
    result.append(("総合", f"{self.base_url}/rank"))
    logging.info(result)
    return result
def _to_result(mecab_result: str):
    """Convert one line of MeCab output into a MecabResult.

    Expected input shape: "<surface>\\t<features>" where features is a
    comma-separated list starting with the word type and ending with the
    kana reading. ``sentence`` is a free variable from the enclosing scope.

    Returns:
        MecabResult for the parsed line, or an all-empty MecabResult when
        the line does not have the expected shape (logged at info level).
    """
    try:
        columns = mecab_result.split("\t")
        # columns[1] raises IndexError for lines without a tab (e.g. EOS),
        # which deliberately routes them to the fallback below.
        features = columns[1].split(",")
        return MecabResult(sencence=sentence,
                           word=columns[0],
                           word_type=features[0],
                           word_kana=features[-1])
    except Exception:
        logging.info(f"CatsMeCab parse: {mecab_result} cannot parse")
        return MecabResult(sencence="", word="", word_type="", word_kana="")
def pr_comment(self):
    """Extract the PR comment title and first line from the page soup.

    Returns:
        dict with keys ``pr_comment_title`` and ``pr_comment_first``;
        both are empty strings when the page has no PR comment block.
    """
    pr_comment_wrap = self.soup.find("div", {"class": "pr-comment-wrap"})
    if pr_comment_wrap is not None:  # identity check, not ``!= None``
        pr_comment_title = pr_comment_wrap.find(
            "h3", {"class": "pr-comment-title"}).text
        pr_comment_first = pr_comment_wrap.find(
            "span", {"class": "pr-comment__first"}).text
    else:
        # logging.warn is deprecated; warning() is the supported spelling.
        logging.warning(f"{self.url} comment_wrap is None")
        pr_comment_title = ""
        pr_comment_first = ""
    return {
        "pr_comment_title": pr_comment_title,
        "pr_comment_first": pr_comment_first
    }
def pull_ranking(self, url) -> GunosyRanking:
    """Scrape a Gunosy ranking page into a GunosyRanking.

    Entries that fail to parse are logged and skipped rather than
    aborting the whole ranking.

    Args:
        url: Ranking page URL.

    Returns:
        GunosyRanking with the page title and successfully parsed articles.
    """
    page = self.request.get(url=url, response_content_type="html").content
    # BUG FIX: attrs must be dicts ({"class": ...}); the original passed
    # set literals ({"class", ...}), which BeautifulSoup does not accept.
    ranking_title = page.find("h1", {"class": "list_header_title"}).text
    logging.info(f"pull_ranking: {ranking_title}")

    def _parse_page(list_content):
        # Parse one ranking entry; returns None on failure so it can be
        # filtered out below.
        try:
            list_thumb = list_content.find("div", {"class": "list_thumb"})
            detail_url = list_thumb.find("a").get("href")
            # style attribute looks like: background-image:url(//host/path)
            thumb_url = "https://" + list_thumb.find("img").get(
                "style").split("(//")[1].replace(")", "")
            article = GunosyArticleSite(request=self.request,
                                        url=detail_url).pull()
            runk_num = list_content.find(
                "span", {"class": "list_rank_no"}).text
            return GunosyRanknedArticle(thumb_url=thumb_url,
                                        detail_url=detail_url,
                                        runk_num=runk_num,
                                        article=article)
        except Exception:
            # Was a bare ``except:`` that print()ed; narrow the catch and
            # report through logging like the rest of the module.
            logging.error(f"{list_content} page cannot parse")
            return None

    list_contents = page.findAll("div", {"class": "list_content"})
    articles = [a for a in map(_parse_page, list_contents) if a is not None]
    return GunosyRanking(ranking_title=ranking_title, articles=articles)
def program_list(cls, api_key, area="130", service="g1", date=None) -> List[NHKProgram]:
    """Fetch the NHK program list for the given area/service/date.

    Args:
        api_key: NHK Program List API key.
        area: Area code (default "130").
        service: Service id (default "g1").
        date: Date string for the listing ("YYYY-MM-DD"); today when None.

    Returns:
        Parsed program list, or None when the request/parse fails
        (the failure is logged).
    """
    # Removed a leaked ``request = CatsRequest()`` that was immediately
    # shadowed by the ``with`` statement below.
    with CatsRequest() as request:
        if date is None:
            date = get_today_date(split="-")
        try:
            # BUG FIX: area/service were hard-coded as 130/g1, silently
            # ignoring the parameters; use them in the URL.
            api_url = (f"http://api.nhk.or.jp/v2/pg/list/"
                       f"{area}/{service}/{date}.json?key={api_key}")
            # BUG FIX: the extra positional frame-name argument made
            # logging try to %-format a message with no placeholders,
            # which raised a formatting error on every call.
            logging.info(f"api_url: {api_url}")
            payload = request.get(url=api_url,
                                  response_content_type="json").content
            logging.debug(f"response: {payload}")
            return NHKProgramTable.parse_response(response_json=payload)
        except Exception:
            logging.error(
                f"{sys._getframe().f_code.co_name} was failed. {sys.exc_info()}")
def save_program_list_as_csv(cls, output_dir: str, api_key, area="130", service="g1", date=None):
    """Fetch the program list and write it as a CSV under ``output_dir``.

    Args:
        output_dir: Directory the CSV file is written into.
        api_key: NHK Program List API key.
        area: Area code (default "130").
        service: Service id (default "g1").
        date: Date string for the listing; today when None.
    """
    programs = cls.program_list(api_key, area, service, date)
    frame = PandasConverter.dataclass_to_dataframe(programs)
    csv_path = NHKProgramTable.csv_path(output_dir=output_dir)
    logging.info(f"{sys._getframe().f_code.co_name} output to {csv_path}")
    frame.to_csv(csv_path)