def parse_html(self):
    """Download the story page and fill in the basic story fields.

    The page at ``self.get_real_url()`` is requested through
    ``default_cache`` and parsed into ``self.tree``.  The summary, title,
    author and author link are then each looked up with
    ``self.get_value_from_tree``.
    """
    # The tree is stored before any lookup; get_value_from_tree presumably
    # reads self.tree -- TODO confirm against its definition.
    self.tree = html.fromstring(
        default_cache.get_page(self.get_real_url())
    )

    lookups = (
        ("summary", AO3_SUMMARY_FINDER),
        ("title", AO3_TITLE),
        ("author", AO3_AUTHOR_NAME),
        ("authorlink", AO3_AUTHOR_URL),
    )
    for attribute, finder in lookups:
        setattr(self, attribute, self.get_value_from_tree(finder))
def parse_html(self):
    """Download the story page and fill in story fields and stats.

    The page at ``self.get_real_url()`` is fetched through ``default_cache``
    and parsed into ``self.tree``.  Summary, title, author and author link
    come from ``self.get_value_from_tree``; ``self.stats`` is an
    ``AO3Metadata`` built from the ``sid`` group of ``AO3_LINK_REGEX``
    matched against ``self.url`` together with the parsed tree.
    """
    real_url = self.get_real_url()
    self.tree = html.fromstring(default_cache.get_page(real_url))

    self.summary = self.get_value_from_tree(AO3_SUMMARY_FINDER)
    self.title = self.get_value_from_tree(AO3_TITLE)
    self.author = self.get_value_from_tree(AO3_AUTHOR_NAME)
    self.authorlink = self.get_value_from_tree(AO3_AUTHOR_URL)

    # The story id is taken from self.url (not the real URL fetched above).
    link_groups = AO3_LINK_REGEX.match(self.url).groupdict()
    self.stats = AO3Metadata(link_groups["sid"], self.tree)
def parse_html(self):
    """Download the story page and fill in story fields and stats.

    Works from the page at ``self.get_real_url()`` (fetched through
    ``default_cache``, parsed into ``self.tree``).  The author link value
    returned by ``get_value_from_tree`` is appended to the site's base URL
    before being stored.  ``self.stats`` is an ``AO3Metadata`` built from
    the ``sid`` group of ``AO3_LINK_REGEX`` matched against ``self.url``.
    """
    source = default_cache.get_page(self.get_real_url())
    self.tree = html.fromstring(source)

    self.summary = self.get_value_from_tree(AO3_SUMMARY_FINDER)
    self.title = self.get_value_from_tree(AO3_TITLE)
    self.author = self.get_value_from_tree(AO3_AUTHOR_NAME)

    # The looked-up author URL is concatenated directly onto the host.
    author_path = self.get_value_from_tree(AO3_AUTHOR_URL)
    self.authorlink = "https://www.archiveofourown.org" + author_path

    story_id = AO3_LINK_REGEX.match(self.url).groupdict()["sid"]
    self.stats = AO3Metadata(story_id, self.tree)
def parse_html(self):
    """Fetch the story page and extract summary, stats, title and author.

    The page at ``self.url`` is fetched through ``default_cache`` and parsed
    into ``self.tree``.  The text nodes matched by ``FFA_SUMMARY_AND_META``
    are joined with spaces into ``self.summary_and_meta``; the summary is
    the text between ``"Summary: "`` and ``"Rated:"`` in that string, with
    newlines turned into spaces and surrounding whitespace stripped.
    """
    page = default_cache.get_page(self.url)
    tree = html.fromstring(page)
    self.tree = tree

    meta_text = ' '.join(tree.xpath(FFA_SUMMARY_AND_META))
    self.summary_and_meta = meta_text

    # DOTALL lets the lazy capture run across line breaks up to "Rated:".
    summary_chunks = re.findall(
        'Summary: (.*?)(?=Rated:)', meta_text, re.DOTALL)
    self.summary = ''.join(summary_chunks).replace("\n", " ").strip()

    story_id = str(FFA_LINK_REGEX.match(self.url).groupdict()["sid"])
    self.stats = FFAMetadata(story_id, self.tree)

    self.title = tree.xpath(FFA_TITLE)[0]
    self.author = tree.xpath(FFA_AUTHOR_NAME)[0]

    author_path = tree.xpath(FFA_AUTHOR_URL)[0]
    self.authorlink = 'http://www.hpfanficarchive.com/stories/' + author_path
def parse_html(self):
    """Fetch the story page and extract summary, stats, title and author.

    The page at ``self.url`` is fetched through ``default_cache`` and
    parsed.  The text nodes matched by ``FFA_SUMMARY_AND_META`` are
    concatenated into ``self.summary_and_meta``; the summary is whatever
    follows ``"Summary: "`` up to the end of that line.  ``self.make_stats()``
    is then called (presumably it populates ``self.stats`` -- verify against
    its definition), after which title, author and author link are read
    from the tree.

    The summary and stats are reported through the ``logging`` module at
    DEBUG level instead of being printed to stdout.
    """
    import logging

    tree = html.fromstring(default_cache.get_page(self.url))

    self.summary_and_meta = ''.join(tree.xpath(FFA_SUMMARY_AND_META))
    self.summary = ''.join(
        re.findall('Summary: (.*?)\n', self.summary_and_meta))
    self.make_stats()

    self.title = tree.xpath(FFA_TITLE)[0]
    self.author = tree.xpath(FFA_AUTHOR_NAME)[0]
    self.authorlink = ('http://www.hpfanficarchive.com/stories/'
                       + tree.xpath(FFA_AUTHOR_URL)[0])

    # Diagnostic output only: keep it off stdout so callers that use stdout
    # for their own output are not polluted by the parser.
    logger = logging.getLogger(__name__)
    logger.debug("summary: %s", self.summary)
    logger.debug("stats: %s", self.stats)
def parse_html(self):
    """Fetch the story page with the bypass cookie and extract its fields.

    The page at ``self.get_url()`` is requested through ``default_cache``
    with a ``Cookie`` header and with redirects disabled, then parsed into
    ``self.tree``.  ``self.stats`` is an ``AFFMetadata`` built from
    ``(self.archive, self.id)`` and the tree.  The title is the first
    ``AFF_TITLE_XPATH`` match, stripped, with its first ``len("Story: ")``
    characters removed.
    """
    story_url = self.get_url()
    # Got this header from the ficsave codebase
    bypass_headers = {"Cookie": AFF_BYPASS_COOKIE}
    page = default_cache.get_page(
        story_url,
        headers=bypass_headers,
        # Do not even try to follow to the adult form url.
        allow_redirects=False,
    )
    tree = html.fromstring(page)
    self.tree = tree

    # We will generate the stats ourselves.
    self.stats = AFFMetadata((self.archive, self.id), tree)

    raw_title = tree.xpath(AFF_TITLE_XPATH)[0].strip()
    # The slice is unconditional: the prefix is not checked before removal.
    self.title = raw_title[len("Story: "):]
    self.author = tree.xpath(AFF_AUTHOR_NAME)[0].strip()
    self.authorlink = tree.xpath(AFF_AUTHOR_URL)[0]
def parse_html(self):
    """Fetch the story page and read its fields from the profile block.

    The page at ``self.get_url()`` is requested through ``default_cache``
    with a random ``throttle`` value between 1.0 and 4.0 (its unit is
    defined by ``get_page``).  Title, summary, author, author link and
    image nodes are read from the element with id ``profile_top``.  The
    author text is appended to the existing ``self.author`` with ``+=``
    rather than replacing it.  ``self.stats`` is the result of
    ``self.parser(None, tree)``.
    """
    story_url = self.get_url()
    delay = randint(1000, 4000) / 1000
    tree = html.fromstring(default_cache.get_page(story_url, throttle=delay))

    title_nodes = tree.xpath('//*[@id="profile_top"]/b/text()')
    self.title = title_nodes[0]

    summary_nodes = tree.xpath('//*[@id="profile_top"]/div/text()')
    self.summary = summary_nodes[0]

    author_nodes = tree.xpath('//*[@id="profile_top"]/a[1]/text()')
    self.author += author_nodes[0]

    author_href = tree.xpath('//*[@id="profile_top"]/a[1]/@href')[0]
    self.authorlink = 'https://www.' + self.site + author_href

    # Stored as the raw XPath result list, not a single element.
    self.image = tree.xpath('//*[@id="profile_top"]/span[1]/img')
    self.tree = tree
    self.stats = self.parser(None, tree)
def parse_html(self):
    """Fetch the story page with the bypass cookie and extract its fields.

    Requests ``self.get_url()`` through ``default_cache`` with a ``Cookie``
    header and ``allow_redirects=False``, parses the response into
    ``self.tree`` and fills in ``stats`` (an ``AFFMetadata`` for
    ``(self.archive, self.id)``), ``title``, ``author`` and ``authorlink``.
    """
    request_options = {
        # Got this header from the ficsave codebase
        "headers": {"Cookie": AFF_BYPASS_COOKIE},
        # Do not even try to follow to the adult form url.
        "allow_redirects": False,
    }
    self.tree = tree = html.fromstring(
        default_cache.get_page(self.get_url(), **request_options)
    )

    # We will generate the stats ourselves.
    self.stats = AFFMetadata((self.archive, self.id), tree)

    # Drop the first len("Story: ") characters of the stripped title text;
    # the prefix itself is not verified.
    prefix_length = len("Story: ")
    self.title = tree.xpath(AFF_TITLE_XPATH)[0].strip()[prefix_length:]
    self.author = tree.xpath(AFF_AUTHOR_NAME)[0].strip()
    self.authorlink = tree.xpath(AFF_AUTHOR_URL)[0]
def parse_html(self):
    """Fetch the story page and read its fields from the profile block.

    The page at ``self.get_url()`` is requested through ``default_cache``
    with a random ``throttle`` value between 1.0 and 4.0 (its unit is
    defined by ``get_page``).  Title, summary, author, author link and
    image nodes are read from the element with id ``profile_top``.  The
    author text is appended to the existing ``self.author`` with ``+=``.
    ``self.stats`` is the result of ``self.parser(None, tree)``.

    Raises:
        site.StoryDoesNotExist: if the title XPath matches nothing.  In
            that case no attribute of ``self`` has been modified.
    """
    page = default_cache.get_page(
        self.get_url(), throttle=randint(1000, 4000) / 1000)
    tree = html.fromstring(page)

    # Validate through a local so that self.title never holds the raw
    # result list, even when the story turns out to be missing.
    titles = tree.xpath('//*[@id="profile_top"]/b/text()')
    if not titles:
        raise site.StoryDoesNotExist
    self.title = titles[0]

    self.summary = tree.xpath('//*[@id="profile_top"]/div/text()')[0]
    self.author += tree.xpath('//*[@id="profile_top"]/a[1]/text()')[0]
    self.authorlink = 'https://www.' + self.site + tree.xpath(
        '//*[@id="profile_top"]/a[1]/@href')[0]
    # Stored as the raw XPath result list, not a single element.
    self.image = tree.xpath('//*[@id="profile_top"]/span[1]/img')
    self.tree = tree
    self.stats = self.parser(None, tree)
def parse_html(self):
    """Fetch the story page and extract summary, stats, title and author.

    The page at ``self.url`` is fetched through ``default_cache`` and parsed
    into ``self.tree``.  The text nodes matched by ``SIYE_SUMMARY_AND_META``
    are joined with spaces into ``self.summary_and_meta``; the summary is
    the text between ``"Summary: "`` and ``"Hitcount:"`` in that string,
    with newlines turned into spaces and surrounding whitespace stripped.

    Title and author both come from the ``SIYE_TITLE_AUTHOR_NAME`` result
    list (items 0 and 2 respectively), which is evaluated once.
    """
    self.tree = tree = html.fromstring(default_cache.get_page(self.url))

    self.summary_and_meta = ' '.join(tree.xpath(SIYE_SUMMARY_AND_META))
    # DOTALL lets the lazy capture span line breaks up to "Hitcount:".
    self.summary = ''.join(
        re.findall(
            'Summary: (.*?)(?=Hitcount:)', self.summary_and_meta, re.DOTALL
        )
    ).replace("\n", " ").strip()

    self.stats = SIYEMetadata(
        str(SIYE_LINK_REGEX.match(self.url).groupdict()["sid"]), self.tree
    )

    # One query serves both fields; running it twice only repeated the
    # same tree traversal.
    title_and_author = tree.xpath(SIYE_TITLE_AUTHOR_NAME)
    self.title = title_and_author[0]
    self.author = title_and_author[2]
    self.authorlink = ('http://www.siye.co.uk/'
                       + tree.xpath(SIYE_AUTHOR_URL)[0])
def parse_html(self):
    """Populate story fields from the page at ``self.url``.

    Sets ``tree``, ``summary_and_meta``, ``summary``, ``stats``, ``title``,
    ``author`` and ``authorlink``.  The summary is the portion of the
    space-joined ``FFA_SUMMARY_AND_META`` text between ``"Summary: "`` and
    ``"Rated:"``, with newlines replaced by spaces and the ends stripped.
    ``stats`` is an ``FFAMetadata`` for the ``sid`` group that
    ``FFA_LINK_REGEX`` captures from ``self.url``.
    """
    tree = html.fromstring(default_cache.get_page(self.url))
    self.tree = tree

    self.summary_and_meta = ' '.join(tree.xpath(FFA_SUMMARY_AND_META))

    matches = re.findall(
        'Summary: (.*?)(?=Rated:)', self.summary_and_meta, re.DOTALL
    )
    flattened = ''.join(matches).replace("\n", " ")
    self.summary = flattened.strip()

    link_groups = FFA_LINK_REGEX.match(self.url).groupdict()
    self.stats = FFAMetadata(str(link_groups["sid"]), self.tree)

    self.title = tree.xpath(FFA_TITLE)[0]
    self.author = tree.xpath(FFA_AUTHOR_NAME)[0]
    # The author URL from the page is appended to the stories base URL.
    self.authorlink = (
        'http://www.hpfanficarchive.com/stories/'
        + tree.xpath(FFA_AUTHOR_URL)[0]
    )
def parse_html(self):
    """Fetch the story page and collect its fields and raw stats text.

    The page at ``self.get_url()`` is requested through ``default_cache``
    with a random ``throttle`` value between 1.0 and 4.0 (its unit is
    defined by ``get_page``).  Title, summary, author, author link and
    image nodes are read from the element with id ``profile_top``.  The
    author text is appended to the existing ``self.author`` with ``+=``.

    ``self.raw_stats`` is built as a list holding, in order: the text of
    the last link in each ``pre_story_links`` span, a ``'\\n'`` separator,
    and the text nodes of the stats span inside ``profile_top``.
    """
    page = default_cache.get_page(
        self.get_url(), throttle=randint(1000, 4000) / 1000)
    tree = html.fromstring(page)

    self.title = tree.xpath('//*[@id="profile_top"]/b/text()')[0]
    self.summary = tree.xpath('//*[@id="profile_top"]/div/text()')[0]
    self.author += tree.xpath('//*[@id="profile_top"]/a[1]/text()')[0]
    self.authorlink = 'https://www.' + self.site + \
        tree.xpath('//*[@id="profile_top"]/a[1]/@href')[0]
    # Stored as the raw XPath result list, not a single element.
    self.image = tree.xpath('//*[@id="profile_top"]/span[1]/img')

    self.raw_stats = []
    self.raw_stats.extend(
        tree.xpath('//*[@id="pre_story_links"]/span/a[last()]/text()'))
    self.raw_stats.extend(['\n'])

    # XPath changes depending on the presence of an image.
    # Test the list's truthiness: the previous ``len(...) is not 0`` was an
    # identity comparison against an int literal, which only worked through
    # CPython's small-integer caching.
    if self.image:
        self.raw_stats.extend(
            tree.xpath('//*[@id="profile_top"]/span[4]//text()'))
    else:
        self.raw_stats.extend(
            tree.xpath('//*[@id="profile_top"]/span[3]//text()'))
def parse_html(self):
    """Fetch the story page with the bypass cookie and extract its fields.

    The page at ``self.get_url()`` is requested through ``default_cache``
    with a ``Cookie`` header and redirects disabled.  ``self.stats`` is a
    single string: for every ``(title, result)`` pair in
    ``AFF_GENERATED_META``, ``result(tree, self.archive, self.id)`` is
    called and rendered as ``"<title>: <value>"``, and the pieces are joined
    with ``" - "``.  Title, author and author link are read from the tree.
    """
    story_url = self.get_url()
    # Got this header from the ficsave codebase
    cookie_header = {"Cookie": AFF_BYPASS_COOKIE}
    page = default_cache.get_page(
        story_url,
        headers=cookie_header,
        # Do not even try to follow to the adult form url.
        allow_redirects=False,
    )
    tree = html.fromstring(page)

    # We will generate the stats ourselves.
    stat_parts = [
        label + ": " + str(compute(tree, self.archive, self.id))
        for label, compute in AFF_GENERATED_META
    ]
    self.stats = " - ".join(stat_parts)

    stripped_title = tree.xpath(AFF_TITLE_XPATH)[0].strip()
    # Unconditionally drops the first len("Story: ") characters.
    self.title = stripped_title[len("Story: "):]
    self.author = tree.xpath(AFF_AUTHOR_NAME)[0].strip()
    self.authorlink = tree.xpath(AFF_AUTHOR_URL)[0]