def save_shaped_race_info(self, shaped_race_info_list: List[ShapedRaceData]):
    # Convert the shaped race data into a DataFrame.
    df_data = self.formatter.data_to_df(shaped_race_info_list)
    if df_data.shape[0] == 0:
        logger.info("no data to save.")
        return

    # Use the same timestamp source for both the date and time parts of the file name.
    current_date_ymd = self.current_datetime.strftime("%Y-%m-%d")
    current_time = self.current_datetime.strftime("%H%M%S")

    # TODO: move this credential path out of the code (see the sketch below).
    os.environ[
        'GOOGLE_APPLICATION_CREDENTIALS'] = r'/Users/daikimiyazaki/.config/pndnism-project-fc40cb799b41.json'
    os.makedirs(
        f"./horse_info_crawler/race/data/race_histories/{current_date_ymd}",
        exist_ok=True)
    client = storage.Client()
    bucket = client.get_bucket('pndnism_horse_data')
    save_path = f"./horse_info_crawler/race/data/race_histories/{current_date_ymd}/shaped_race_history_{current_time}.csv"
    cs_save_path = f"race/data/race_histories/{current_date_ymd}/shaped_race_history_{current_time}.csv"

    # Save the DataFrame locally as CSV, then upload the same content to Cloud Storage.
    df_data.to_csv(save_path, index=False)
    bucket.blob(cs_save_path).upload_from_string(
        df_data.to_csv(index=False), 'text/csv')
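# --- Sketch: externalizing the GCS credentials path --------------------------
# A minimal sketch for the TODO above, not the project's actual implementation.
# The GCS_CREDENTIALS_PATH name is an assumption; only the fact that
# google-cloud-storage honours GOOGLE_APPLICATION_CREDENTIALS (or an explicit
# key file) is real library behaviour.
import os

from google.cloud import storage


def build_gcs_client() -> storage.Client:
    # Prefer an explicit key file supplied via an environment variable.
    key_path = os.environ.get("GCS_CREDENTIALS_PATH")
    if key_path:
        return storage.Client.from_service_account_json(key_path)
    # Otherwise fall back to Application Default Credentials
    # (GOOGLE_APPLICATION_CREDENTIALS or `gcloud auth application-default login`).
    return storage.Client()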
def get(self, listing_page_url: str) -> ListingPage:
    # If listing_page_url is a relative path, convert it to an absolute URL.
    listing_page_absolute_url = urllib.parse.urljoin(
        NETKEIBA_BASE_URL, listing_page_url)
    logger.info(f"Accessing {listing_page_absolute_url}.")
    response = requests.get(listing_page_absolute_url)
    response.raise_for_status()
    return self.parser.parse(response.content)
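# --- Example: how urljoin resolves listing-page paths ------------------------
# A small illustration of the relative-to-absolute conversion above. The base
# URL shown here is an assumed example value; the real value comes from
# NETKEIBA_BASE_URL.
import urllib.parse

base = "https://db.netkeiba.com/"  # assumed example value of NETKEIBA_BASE_URL
print(urllib.parse.urljoin(base, "/race/list/20210101/"))
# -> https://db.netkeiba.com/race/list/20210101/
print(urllib.parse.urljoin(base, "https://db.netkeiba.com/race/list/20210102/"))
# -> already-absolute URLs pass through unchanged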
def exec(self, crawl_limit: Optional[int] = None):
    logger.info(f'Start crawl_race_histories. crawl_limit: {crawl_limit}')
    race_histories = self._crawl_race_histories(crawl_limit)
    # Convert race_histories to CSV and persist them (locally and to Cloud Storage).
    self.race_info_repository.save_shaped_race_info(
        self._shape_race_infos(race_histories))
    logger.info("End crawl_race_histories.")
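# --- Sketch: kicking off a crawl ----------------------------------------------
# A hypothetical usage example only: the RaceInfoUseCase name and its
# constructor wiring are assumptions based on the method names in this module,
# not part of the source.
if __name__ == "__main__":
    use_case = RaceInfoUseCase()   # hypothetical construction
    use_case.exec(crawl_limit=10)  # small limit for a dry run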
def get(self, race_info_page_url: str) -> RaceInfo:
    # If race_info_page_url is a relative path, convert it to an absolute URL.
    race_info_page_absolute_url = urllib.parse.urljoin(
        NETKEIBA_BASE_URL, race_info_page_url)
    logger.info(f"Accessing {race_info_page_absolute_url}.")
    response = requests.get(race_info_page_absolute_url)
    response.raise_for_status()
    return self.parser.parse(response.content)
def exec(self, crawl_limit: Optional[int] = None):
    logger.info(f'Start crawl_horse_info. crawl_limit: {crawl_limit}')
    horse_info = self._crawl_horse_info(crawl_limit)
    # Convert the horse info to CSV and persist it (locally and to Cloud Storage).
    self.horse_info_repository.save_shaped_horse_info(
        self._shape_horse_infos(horse_info))
    logger.info("End crawl_horse_info.")
def _shape_race_infos(
        self, race_info_list: List[RaceInfo]) -> List[ShapedRaceData]:
    shaped_race_info_list = []
    for race_info in race_info_list:
        try:
            # If shaping raises an error, skip that RaceInfo.
            shaped_race_info_list.append(self._shape_race_info(race_info))
        except UnsupportedFormatError as e:
            # TODO: report these skips to an error monitor such as Sentry (see the sketch below).
            logger.info(f"Skip getting race info: {e}")
        except InvalidFormatError as e:
            logger.warning(f"Skip getting race info: {e}")
    return shaped_race_info_list
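# --- Sketch: wiring the TODO above to Sentry ----------------------------------
# A minimal sketch only; sentry_sdk is a real library, but the SENTRY_DSN env
# var name and the helper below are assumptions, not the project's code.
import os

import sentry_sdk

# Initialise once at process start-up.
sentry_sdk.init(dsn=os.environ.get("SENTRY_DSN"))


def report_skipped_record(exc: Exception) -> None:
    # Forward a skipped-record exception to Sentry in addition to logging it.
    sentry_sdk.capture_exception(exc)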
def _shape_horse_infos(
        self, horse_info_list: List[HorseInfo]) -> List[ShapedHorseInfo]:
    shaped_horse_info_list = []
    for horse_info in horse_info_list:
        try:
            # If shaping raises an error, skip that HorseInfo.
            shaped_horse_info_list.append(
                self._shape_horse_info(horse_info))
        except UnsupportedFormatError as e:
            # TODO: report these skips to an error monitor such as Sentry.
            logger.info(f"Skip getting horse info: {e}")
        except InvalidFormatError as e:
            logger.warning(f"Skip getting horse info: {e}")
    return shaped_horse_info_list
def _crawl_horse_info(self, crawl_limit: Optional[int] = None
                      ) -> List[HorseInfo]:
    horse_info = []
    crawl_end_flg = False
    crawled_urls = self._check_crawled_urls()
    # Crawl the listing pages to collect the horse detail page URLs.
    listing_page_url = self.horse_info_listing_page_scraper.LISTING_PAGE_START_URLS
    count = 0
    while listing_page_url:
        listing_page = self.horse_info_listing_page_scraper.get(
            listing_page_url)
        for_log = listing_page_url[:20] + "~" + listing_page_url[-20:]
        logger.info(
            f"listing_page_url: {for_log}, horse_info_page_urls count: {len(listing_page.horse_info_page_urls)}"
        )
        # Access each horse detail page and collect its data into the structure
        # that will be uploaded as CSV. If an error occurs, skip that record.
        for horse_info_page_url in listing_page.horse_info_page_urls:
            count += 1
            if NETKEIBA_BASE_URL[:-1] + horse_info_page_url in crawled_urls:
                logger.info("already crawled. skip...")
                crawl_end_flg = True
                break
            try:
                # Fetch the detail page once and reuse the result.
                crawled_horse_info = self._get_horse_info(horse_info_page_url)
                if crawled_horse_info:
                    horse_info.append(crawled_horse_info)
                else:
                    raise DetailPageNotFoundError("table not found.")
            except DetailPageNotFoundError as e:
                # TODO: report these skips to an error monitor such as Sentry.
                logger.warning(f"Skip getting horse: {e}")
            # Throttling (currently disabled; see the sketch after this function).
            # if count == 100:
            #     logger.info("10sec crawler idling... ")
            #     time.sleep(10)
            #     count = 0
            if crawl_limit and len(horse_info) >= crawl_limit:
                # Stop crawling once crawl_limit items have been collected.
                logger.info(
                    f"Finish crawl. horse_histories count: {len(horse_info)}"
                )
                return horse_info
        # Move on to the next listing page if next_page_url is set.
        if crawl_end_flg:
            break
        logger.info(f"next_page_url: {listing_page.next_page_url}")
        listing_page_url = listing_page.next_page_url
    logger.info(f"Finish crawl. horse_histories count: {len(horse_info)}")
    return horse_info
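# --- Sketch: the throttling hinted at by the commented-out block ---------------
# A minimal sketch, assuming the intent is "pause 10 seconds after every 100
# detail-page requests"; the CrawlThrottle class is not part of the source.
import time


class CrawlThrottle:
    def __init__(self, every: int = 100, pause_sec: float = 10.0):
        self.every = every
        self.pause_sec = pause_sec
        self._count = 0

    def tick(self) -> None:
        # Call once per detail-page request; sleep after every `every` calls.
        self._count += 1
        if self._count % self.every == 0:
            logger.info(f"{self.pause_sec}sec crawler idling...")
            time.sleep(self.pause_sec)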
def get(self, horse_info_page_url: str) -> HorseInfo:
    # Optional Tor/SOCKS proxy routing (currently disabled; see the sketch after this function).
    # args = ['sudo', 'service', 'tor', 'restart']
    # subprocess.call(args)
    # socks.set_default_proxy(socks.PROXY_TYPE_SOCKS5, '127.0.0.1', 9050)
    # socket.socket = socks.socksocket
    # proxies = {
    #     'http': 'socks5://127.0.0.1:9050',
    #     'https': 'socks5://127.0.0.1:9050'
    # }

    # If horse_info_page_url is a relative path, convert it to an absolute URL.
    horse_info_page_absolute_url = urllib.parse.urljoin(
        NETKEIBA_BASE_URL, horse_info_page_url)
    logger.info(f"Accessing {horse_info_page_absolute_url}.")
    response = requests.get(horse_info_page_absolute_url)
    response.raise_for_status()
    return self.parser.parse(response.content, horse_info_page_absolute_url)
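# --- Sketch: routing requests through Tor, as the commented-out lines hint ----
# A minimal sketch using requests' own SOCKS proxy support (requires
# `pip install requests[socks]`); the 127.0.0.1:9050 address mirrors the
# commented-out values above and assumes a local Tor daemon.
import requests

TOR_PROXIES = {
    "http": "socks5h://127.0.0.1:9050",   # socks5h resolves DNS via the proxy
    "https": "socks5h://127.0.0.1:9050",
}


def get_via_tor(url: str) -> requests.Response:
    response = requests.get(url, proxies=TOR_PROXIES, timeout=30)
    response.raise_for_status()
    return response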
def _crawl_race_histories(self, crawl_limit: Optional[int] = None
                          ) -> List[RaceInfo]:
    race_histories = []
    crawl_end_flg = False
    crawled_urls = self._check_crawled_urls()
    # Crawl the listing pages to collect the race detail page URLs.
    listing_page_url = self.race_info_listing_page_scraper.LISTING_PAGE_START_URLS
    while listing_page_url:
        listing_page = self.race_info_listing_page_scraper.get(
            listing_page_url)
        for_log = listing_page_url[:20] + "~" + listing_page_url[-20:]
        logger.info(
            f"listing_page_url: {for_log}, race_info_page_urls count: {len(listing_page.race_info_page_urls)}"
        )
        # Access each race detail page and collect its data into the structure
        # that will be uploaded as CSV. If an error occurs, skip that record.
        for race_info_page_url in listing_page.race_info_page_urls:
            if NETKEIBA_BASE_URL[:-1] + race_info_page_url in crawled_urls:
                logger.info("already crawled. skip...")
                # crawl_end_flg = True
                # break
            try:
                # Fetch the detail page once and reuse the result.
                crawled_race_info = self._get_race_info(race_info_page_url)
                if crawled_race_info:
                    race_histories.append(crawled_race_info)
                else:
                    raise DetailPageNotFoundError("table not found.")
            except DetailPageNotFoundError as e:
                # TODO: report these skips to an error monitor such as Sentry.
                logger.warning(f"Skip getting race: {e}")
            if crawl_limit and len(race_histories) >= crawl_limit:
                # Stop crawling once crawl_limit items have been collected.
                logger.info(
                    f"Finish crawl. race_histories count: {len(race_histories)}"
                )
                return race_histories
        # Move on to the next listing page if next_page_url is set.
        if crawl_end_flg:
            break
        logger.info(f"next_page_url: {listing_page.next_page_url}")
        listing_page_url = listing_page.next_page_url
    logger.info(
        f"Finish crawl. race_histories count: {len(race_histories)}")
    return race_histories
def parse(self, html, url) -> HorseInfo:
    soup = BeautifulSoup(html, "lxml")
    if len(soup.find_all("table", summary="のプロフィール")) != 0:
        profile_table = soup.find_all("table", summary="のプロフィール")[0]
    else:
        # The profile table is missing from the static HTML on some pages,
        # so fall back to rendering the page with Selenium.
        logger.info("crawling by selenium...")
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')
        # Hard-coded chromedriver path (see the sketch after this function).
        driver = webdriver.Chrome(
            options=options,
            executable_path=
            '/Users/daikimiyazaki/workspace/pndnism/horse_race_prediction/horse_info_crawler/horse_info_crawler/components/chromedriver',
        )
        driver.get(url)
        content = driver.page_source
        driver.quit()
        soup = BeautifulSoup(content, "lxml")
        profile_table = soup.find_all("table", summary="のプロフィール")[0]

    # Map each profile header cell to its value cell.
    profile_dic = {}
    for i, j in zip(profile_table.find_all("th"),
                    profile_table.find_all("td")):
        profile_dic[i.text] = j.text
    blood_table = soup.find_all("dd", class_="DB_ProfHead_dd_01")[0]
    return HorseInfo(
        horse_url=self._parse_horse_url(soup),
        name=self._parse_name(soup),
        birthday=self._parse_birthday(profile_dic),
        trainer_name=self._parse_trainer_name(profile_dic),
        owner_name=self._parse_owner_name(profile_dic),
        producer=self._parse_producer(profile_dic),
        origin_place=self._parse_origin_place(profile_dic),
        mother=self._parse_mother(blood_table),
        father=self._parse_father(blood_table),
        mother_of_father=self._parse_mother_of_father(blood_table),
        father_of_father=self._parse_father_of_father(blood_table),
        mother_of_mother=self._parse_mother_of_mother(blood_table),
        father_of_mother=self._parse_father_of_mother(blood_table))
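# --- Sketch: removing the hard-coded chromedriver path -------------------------
# A minimal sketch, assuming Selenium 4 and a CHROMEDRIVER_PATH environment
# variable; both the env var name and the helper are assumptions, not the
# project's code.
import os

from selenium import webdriver
from selenium.webdriver.chrome.service import Service


def build_headless_driver() -> webdriver.Chrome:
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    service = Service(executable_path=os.environ["CHROMEDRIVER_PATH"])
    return webdriver.Chrome(service=service, options=options)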