示例#1
0
 def pull(self) -> NatalieArticle:
     """Scrape the article page held in ``self.content`` into a ``NatalieArticle``.

     The breadcrumb, header, date and body lookups are not guarded, so a page
     missing one of those elements raises instead of returning a partial
     article.  The star count and the thumbnail are optional: when their
     lookup fails a default is used and the miss is logged.

     Returns:
         NatalieArticle: built from ``self.url`` and the scraped fields.
     """
     # attrs has to map attribute name -> value; the original passed the set
     # literal {"class", "NA_breadcrumb"} here by mistake.
     breadcrumb = self.content.find("ul", {"class": "NA_breadcrumb"})
     # Every breadcrumb entry except the last one makes up the category.
     category = ",".join(
         [item.text for item in breadcrumb.findAll("li")][:-1])
     title = self.content.find("div", {
         "class": "NA_articleHeader"
     }).find("h1").text
     update_datetime = self.content.find("p", {
         "class": "NA_date"
     }).find("time").text

     # Optional field: keep "0" when the star element is absent.
     star = "0"
     try:
         star = self.content.find("p", {"class": "NA_res2"}).find("a").text
     except Exception:
         logging.info(f"{self.url} star not found")

     # Optional field: fall back to a fixed placeholder image.
     thumbnail = "https://s3-ap-northeast-1.amazonaws.com/lifull-homes-press/uploads/press/2019/09/14-300x225.jpg"
     try:
         thumbnail = self.content.find("p", {
             "class": "NA_figure"
         }).find("img").get("src")
     except Exception:
         logging.info(f"{self.url} thumbnail not found")

     # Commas, newlines and double quotes are stripped from the body text
     # (presumably so it can be stored in a CSV cell - TODO confirm).
     content = self.content.find("div", {
         "class": "NA_articleBody"
     }).text.replace(",", "").replace("\n", "").replace("\"", "")
     return NatalieArticle(url=self.url,
                           category=category,
                           title=title,
                           update_datetime=update_datetime,
                           star=star,
                           thumbnail=thumbnail,
                           content=content)
示例#2
0
 def save_all_category_as_json(self, output_path: str):
     """Write the result of ``pull_all_category()`` to a dated JSON file.

     Args:
         output_path: Directory in which the file is created.  The file is
             named ``{base_file_name}_{get_today_date()}.json``.
     """
     dj = self.pull_all_category()
     # Not called ``json`` so the stdlib module name is not shadowed.
     json_text = dj.to_json(indent=4, ensure_ascii=False)
     full_output_path = f"{output_path}/{self.base_file_name}_{get_today_date()}.json"
     logging.info(
         f"save_all_category_as_json: output to {full_output_path}")
     # ensure_ascii=False leaves non-ASCII characters unescaped, so the file
     # is written explicitly as UTF-8 instead of the platform default.
     with open(full_output_path, "w", encoding="utf-8") as f:
         f.write(json_text)
示例#3
0
 def get_page(self, key=""):
     """Request a ranking page below ``self.base_url``.

     Args:
         key: Path segment placed before ``rank``.  With the default empty
             string the URL is ``{base_url}/rank``, otherwise
             ``{base_url}/{key}/rank``.

     Returns:
         Whatever ``self.request.get`` returns for the HTML request.
     """
     path = "rank" if key == "" else f"{key}/rank"
     url = f"{self.base_url}/{path}"
     logging.info(f"request to {url}")
     return self.request.get(url=url, response_content_type="html")
示例#4
0
 def _parse_article(article):
     """Follow one listing entry to its detail page and scrape it.

     Returns:
         The result of ``NatalieArticleSite(...).pull()`` for the entry's
         first link, or ``None`` when anything raises (the error is logged).
     """
     try:
         title_tag = article.find("dt", {"class": "NA_title"})
         # The cleaned title is not used afterwards, but evaluating it raises
         # for entries without a title element, which are therefore skipped.
         title = title_tag.text.replace("\n", "").replace(",", "").replace("\"", "")
         detail_link = article.find("a").get("href")
         site = NatalieArticleSite(request=self.request, url=detail_link)
         return site.pull()
     except Exception:
         logging.error(
             f"_parse_article error occured : {sys.exc_info()}")
         return None
示例#5
0
 def category_ranking_menu(self):
     """Fetch the list of rankings, one entry per category.

     Returns:
         list: ``(label, href)`` tuples for each ``li`` with class ``level2``
         inside the ``ul`` with class ``rank-level2`` on the page returned by
         ``get_page()``, followed by ``("総合", f"{base_url}/rank")``.
         Newlines are removed from the labels.
     """
     # attrs has to map attribute name -> value; the original passed set
     # literals such as {"class", "rank-level2"} by mistake.
     menu = self.get_page().content.find("ul", {"class": "rank-level2"})
     items = menu.findAll("li", {"class": "level2"})
     result = [(item.text.replace("\n", ""), item.find("a").get("href"))
               for item in items]
     # The overall ranking gets an explicit entry at the end of the list.
     result.append(("総合", f"{self.base_url}/rank"))
     logging.info(result)
     return result
示例#6
0
 def _to_result(mecab_result: str):
     """Convert one MeCab output line into a ``MecabResult``.

     The line is split on the tab into the word and a comma-separated
     feature list; the first feature becomes ``word_type`` and the last one
     ``word_kana``.  A line that cannot be split this way is logged and
     replaced by a ``MecabResult`` whose fields are all empty strings.
     """
     try:
         columns = mecab_result.split("\t")
         features = columns[1].split(",")
         return MecabResult(sencence=sentence,
                            word=columns[0],
                            word_type=features[0],
                            word_kana=features[-1])
     except Exception:
         logging.info(f"CatsMeCab parse: {mecab_result} cannot parse")
         empty = ""
         return MecabResult(sencence=empty,
                            word=empty,
                            word_type=empty,
                            word_kana=empty)
示例#7
0
 def pr_comment(self):
     """Extract the PR comment block from ``self.soup``.

     Returns:
         dict: ``pr_comment_title`` and ``pr_comment_first`` holding the text
         of the ``h3.pr-comment-title`` and ``span.pr-comment__first``
         elements.  Both are empty strings when the page has no
         ``div.pr-comment-wrap`` (a warning is logged in that case).
     """
     pr_comment_wrap = self.soup.find("div", {"class": "pr-comment-wrap"})
     # Identity check instead of ``!= None`` so the comparison never goes
     # through the element's own equality operators.
     if pr_comment_wrap is None:
         # logging.warn is a deprecated alias of logging.warning.
         logging.warning(f"{self.url} comment_wrap is None")
         return {"pr_comment_title": "", "pr_comment_first": ""}

     pr_comment_title = pr_comment_wrap.find("h3", {
         "class": "pr-comment-title"
     }).text
     pr_comment_first = pr_comment_wrap.find(
         "span", {
             "class": "pr-comment__first"
         }).text
     return {
         "pr_comment_title": pr_comment_title,
         "pr_comment_first": pr_comment_first
     }
示例#8
0
 def pull_ranking(self, url) -> GunosyRanking:
     """Scrape one ranking page and every article it links to.

     Args:
         url: Address of the ranking page, fetched through ``self.request``.

     Returns:
         GunosyRanking: the page title plus one ``GunosyRanknedArticle`` per
         ``div.list_content`` entry that could be parsed.  Entries that fail
         to parse are logged and left out.
     """
     page = self.request.get(url=url, response_content_type="html").content
     # attrs has to map attribute name -> value; the original passed set
     # literals such as {"class", "list_header_title"} by mistake.
     ranking_title = page.find("h1", {"class": "list_header_title"}).text
     logging.info(f"pull_ranking: {ranking_title}")

     def _parse_page(list_content):
         """Parse one ranking entry; return ``None`` when it cannot be parsed."""
         try:
             list_thumb = list_content.find("div", {"class": "list_thumb"})
             detail_url = list_thumb.find("a").get("href")
             # The image address is cut out of the inline style: the part
             # after "(//" with the closing parenthesis removed.
             style = list_thumb.find("img").get("style")
             thumb_url = "https://" + style.split("(//")[1].replace(")", "")
             article = GunosyArticleSite(request=self.request, url=detail_url).pull()
             runk_num = list_content.find("span", {"class": "list_rank_no"}).text
             return GunosyRanknedArticle(
                 thumb_url=thumb_url,
                 detail_url=detail_url,
                 runk_num=runk_num,
                 article=article)
         except Exception:
             # Was a bare ``except:`` with print(); narrowed so that
             # KeyboardInterrupt/SystemExit propagate, and routed through
             # logging like the other scrapers in this file.
             logging.error(f"{list_content} page cannot parse")
             return None

     list_contents = page.findAll("div", {"class": "list_content"})
     parsed = (_parse_page(entry) for entry in list_contents)
     articles = [article for article in parsed if article is not None]
     return GunosyRanking(ranking_title=ranking_title, articles=articles)
示例#9
0
 def program_list(cls, api_key, area="130", service="g1", date=None) -> List[NHKProgram]:
     """Fetch the NHK program list for one area, service and date.

     Args:
         api_key: API key sent as the ``key`` query parameter.
         area: Area path segment of the request URL (default ``"130"``).
         service: Service path segment of the request URL (default ``"g1"``).
         date: Date path segment; when ``None`` it is taken from
             ``get_today_date(split="-")``.

     Returns:
         The value of ``NHKProgramTable.parse_response`` on success, or
         ``None`` when the request or the parsing raised (the failure is
         logged, not re-raised).
     """
     # The original also created a second, never-closed CatsRequest before
     # the ``with`` statement; only the managed instance is kept.
     with CatsRequest() as request:
         if date is None:
             date = get_today_date(split="-")
         try:
             # area/service were accepted but ignored: the URL hard-coded
             # "130/g1".  The defaults keep existing calls unchanged.
             api_url = f"http://api.nhk.or.jp/v2/pg/list/{area}/{service}/{date}.json?key={api_key}"
             # The original passed the function name as an extra positional
             # argument; with no placeholder in the message that makes the
             # logging call fail while formatting the record.
             logging.info(f"api_url: {api_url}")
             response_json = request.get(url=api_url, response_content_type="json").content
             logging.debug(f"response: {response_json}")
             return NHKProgramTable.parse_response(response_json=response_json)
         except Exception:
             logging.error(f"{sys._getframe().f_code.co_name} was failed. {sys.exc_info()}")
             return None
示例#10
0
 def save_program_list_as_csv(cls, output_dir: str, api_key, area="130", service="g1", date=None):
     """Fetch a program list and write it out as a CSV file.

     Args:
         output_dir: Passed to ``NHKProgramTable.csv_path`` to build the
             destination path.
         api_key, area, service, date: Forwarded to ``program_list``.
     """
     programs = cls.program_list(api_key=api_key,
                                 area=area,
                                 service=service,
                                 date=date)
     frame = PandasConverter.dataclass_to_dataframe(programs)
     destination = NHKProgramTable.csv_path(output_dir=output_dir)
     logging.info(f"{sys._getframe().f_code.co_name} output to {destination}")
     frame.to_csv(destination)