def CrawActor(self, curr_actor_url):
    """Crawl a single IMDB actor/actress page.

    Extracts the celebrity's name, portrait URL, and birth information,
    dumps the record to the shared JSON file, downloads the portrait, then
    appends every movie linked from the filmography sections to the crawl
    queue.

    :param curr_actor_url: URL of the actor/actress page to crawl.
    :return: None. Side effects: writes to ``self.json_file_`` (guarded by
        ``self.file_lock_``), saves an image under ``../data/IMDB``, and
        appends movie URLs to ``self.actor_queue_``.
    """
    try:
        # BUG FIX: original call had no timeout, so a hung server would
        # block this crawler thread forever.
        source_code = requests.get(curr_actor_url, timeout=30)
        soup = BeautifulSoup(source_code.text, "html.parser")

        # The poster <img> doubles as the "this celebrity has a picture /
        # is a real profile page" check.
        poster = soup.find('img', {'id': 'name-poster'})
        if not poster:
            return
        img_url = poster.get('src')
        # BUG FIX: guard against a missing alt attribute — the original
        # `link.get('alt').replace(...)` raised AttributeError and aborted
        # the whole page when alt was absent.
        actor_name = (poster.get('alt') or '').replace(' Picture', '')

        curr_actor_info = people_info.PeopleInfo()
        curr_actor_info.SetName(actor_name)
        curr_actor_info.SetLink(curr_actor_url)

        # Birth day / year / place are distinguished by the anchor hrefs
        # inside the #name-born-info div; any of them may be absent.
        birth_day = ""
        birth_year = ""
        location = ""
        born_info = soup.find('div', {'id': 'name-born-info'})
        if born_info is not None:
            # Renamed loop variable: the original reused `link` and
            # shadowed the poster element found above.
            for born_link in born_info.findAll('a'):
                curr_href = born_link.get("href") or ""
                if "birth_monthday" in curr_href:
                    birth_day = born_link.text
                elif "birth_year" in curr_href:
                    birth_year = born_link.text
                elif "birth_place" in curr_href:
                    location = born_link.text
        curr_actor_info.SetBirthYear(birth_year)
        curr_actor_info.SetBirthDay(birth_day)
        curr_actor_info.SetLocation(location)

        # Serialize writes to the shared JSON file across crawler threads.
        with self.file_lock_:
            curr_actor_info.JsonDump(self.json_file_)
        url2img.url2img('../data/IMDB', actor_name, img_url)

        # Queue every movie from the filmography sections for later crawling.
        for section in soup.find_all('div', {'class': 'filmo-category-section'}):
            for single_movie in section.find_all('b'):
                if self.close_thread_call_:  # cooperative shutdown request
                    return
                self.sleep_event.wait()  # honor pause requests
                # BUG FIX: the original used `try/finally: pass`, which
                # suppresses nothing — an unprintable movie title (console
                # encoding error) aborted the entire page via the outer
                # handler. Swallow only encoding errors, as the original
                # comment intended.
                try:
                    print(single_movie.text)
                except UnicodeEncodeError:
                    pass
                movie_url = self.host_name_ + single_movie.contents[0].get('href')
                movie_url = self.RemoveQuestionMark(movie_url)
                print(movie_url)
                self.actor_queue_.append(movie_url)
    # BUG FIX: the bare `except:` also swallowed SystemExit and
    # KeyboardInterrupt, making the crawler impossible to interrupt, and
    # gave no hint about what failed or where.
    except Exception as err:
        print("some error encountered")
        print("  url=%s error=%r" % (curr_actor_url, err))