def __init__(self, url):
    """Build a newpct page object: fetch the page, derive an id from the
    URL hash, extract the page info, then drop the parsed soup.

    :param url: address of the newpct page to scrape.
    """
    super(newpct_page, self).__init__()
    self.url = url
    self.bs4 = bs4(url)
    # A stable-enough identifier for this page within a single run.
    self.id = hash(url)
    self.get_info()
    # Release the parsed document once the info has been extracted.
    self.bs4 = None
def get_links_pagination(self, url):
    """Collect every anchor href found inside the first ``ul.buscar-list``
    element of the given page.

    :param url: page to fetch and scan for result links.
    :return: list of hrefs, each UTF-8 encoded (bytes on Python 3).
    """
    listing = bs4(url).soup.find_all("ul", class_="buscar-list")[0]
    return [anchor['href'].encode('utf-8')
            for anchor in listing.findAll('a', href=True)]
def __init__(self, url):
    """Build a filmaffinity page object: fetch the page, take the first
    numeric run in the URL as the film id, extract the page info, then
    drop the parsed soup.

    :param url: address of the filmaffinity page; must contain the
        numeric film id somewhere in its path.
    """
    # Run the base-class initialisation first (it was previously called
    # last, after get_info(), which is both inconsistent with the other
    # page classes and risks the parent resetting attributes set here).
    super(filmaffinity_page, self).__init__()
    self.url = url
    self.bs4 = bs4(url)
    # Raw string for the regex (plain '\d+' raises a DeprecationWarning
    # on modern Python); avoid shadowing the builtin `id`.
    matches = re.findall(r'\d+', url)
    self.id = matches[0]
    self.get_info()
    # Release the parsed document once the info has been extracted.
    self.bs4 = None
def get_links_vo(url):
    """Recursively gather all VO (original-version) links reachable from
    ``url``.

    Links matching ``tag_vo`` are kept; links that look like pagination
    (``pagination_vo``) are followed one level deep — they are recursed
    into only when the current ``url`` is not itself a pagination page,
    and pagination URLs are never included in the result.

    :param url: starting page to scan.
    :return: flat list of matching hrefs.
    """
    collected = []
    page = bs4(url)
    url_is_pagination = any(re.findall(pagination_vo, url))
    for anchor in page.get_all_links():
        href = anchor['href']
        if not any(re.findall(tag_vo, href)):
            continue  # not a VO link at all
        href_is_pagination = any(re.findall(pagination_vo, href))
        if href_is_pagination and not url_is_pagination:
            collected += get_links_vo(href)
        elif not href_is_pagination:
            collected.append(href)
    return collected
def __init__(self, url):
    """Build a newpct series page object: fetch the page, derive an id
    from the URL hash, load the cached JSON state for this series from
    ``../state/newpct_series/<id>.json`` (relative to this module),
    extract the page info, then drop the parsed soup.

    :param url: address of the series page to scrape.
    """
    self.url = url
    self.bs4 = bs4(url)
    self.id = hash(url)
    # State file lives one directory above this module, under
    # state/newpct_series/, keyed by the URL hash.
    state_root = os.path.abspath(
        os.path.join(os.path.dirname(__file__), os.pardir))
    file_path = state_root + '/state/newpct_series/%s.json' % self.id
    self.json = self.readJSONfile(file_path)
    self.get_info()
    # Release the parsed document once the info has been extracted.
    self.bs4 = None
def get_links_hd(url):
    """Recursively gather all HD links reachable from ``url``.

    Links matching ``tag_hd`` are kept; links that look like pagination
    (``pagination_hd``) are followed one level deep — they are recursed
    into only when the current ``url`` is not itself a pagination page,
    and pagination URLs are never included in the result.

    :param url: starting page to scan.
    :return: flat list of matching hrefs.
    """
    # NOTE: structure deliberately mirrors get_links_vo(); the hard-coded
    # debug URL that used to be commented out here has been removed.
    return_link_list = []
    this_bs4 = bs4(url)
    for link in this_bs4.get_all_links():
        if any(re.findall(tag_hd, link['href'])):
            if any(re.findall(pagination_hd, link['href'])) and not any(
                    re.findall(pagination_hd, url)):
                return_link_list += get_links_hd(link['href'])
            elif not any(re.findall(pagination_hd, link['href'])):
                return_link_list.append(link['href'])
    return return_link_list