示例#1
0
    def parse_html(self):
        """Fetch the story page and fill in the basic metadata attributes.

        The page at ``self.get_real_url()`` is read through ``default_cache``,
        parsed with ``html.fromstring`` and stored on ``self.tree``.  The
        summary, title, author and author link are then looked up with
        ``self.get_value_from_tree`` using the matching ``AO3_*`` constants.
        """
        real_url = self.get_real_url()
        self.tree = html.fromstring(default_cache.get_page(real_url))

        # Attribute name -> finder constant, resolved in this order.
        lookups = (
            ("summary", AO3_SUMMARY_FINDER),
            ("title", AO3_TITLE),
            ("author", AO3_AUTHOR_NAME),
            ("authorlink", AO3_AUTHOR_URL),
        )
        for attribute, finder in lookups:
            setattr(self, attribute, self.get_value_from_tree(finder))
示例#2
0
    def parse_html(self):
        """Parse the story page into metadata attributes and a stats object.

        Sets ``self.tree`` to the parsed page, fills ``summary``, ``title``,
        ``author`` and ``authorlink`` via ``self.get_value_from_tree``, and
        builds ``self.stats`` as an ``AO3Metadata`` from the ``sid`` group of
        ``AO3_LINK_REGEX`` matched against ``self.url``.
        """
        raw_page = default_cache.get_page(self.get_real_url())
        self.tree = html.fromstring(raw_page)

        find = self.get_value_from_tree
        self.summary = find(AO3_SUMMARY_FINDER)
        self.title = find(AO3_TITLE)
        self.author = find(AO3_AUTHOR_NAME)
        self.authorlink = find(AO3_AUTHOR_URL)

        # The "sid" named group of the link regex identifies the story.
        link_groups = AO3_LINK_REGEX.match(self.url).groupdict()
        self.stats = AO3Metadata(link_groups["sid"], self.tree)
示例#3
0
 def parse_html(self):
     """Parse the story page, storing metadata and an absolute author link.

     The parsed page is kept on ``self.tree``.  ``summary``, ``title`` and
     ``author`` come from ``self.get_value_from_tree``; the author link
     returned for ``AO3_AUTHOR_URL`` is prefixed with the site address.
     ``self.stats`` is an ``AO3Metadata`` built from the ``sid`` group of
     ``AO3_LINK_REGEX`` matched against ``self.url``.
     """
     self.tree = html.fromstring(
         default_cache.get_page(self.get_real_url()))

     for attribute, finder in (
             ("summary", AO3_SUMMARY_FINDER),
             ("title", AO3_TITLE),
             ("author", AO3_AUTHOR_NAME)):
         setattr(self, attribute, self.get_value_from_tree(finder))

     # The looked-up author URL is appended to the site address.
     author_path = self.get_value_from_tree(AO3_AUTHOR_URL)
     self.authorlink = "https://www.archiveofourown.org" + author_path

     story_id = AO3_LINK_REGEX.match(self.url).groupdict()["sid"]
     self.stats = AO3Metadata(story_id, self.tree)
示例#4
0
    def parse_html(self):
        """Parse the page at ``self.url`` into summary, stats and author data.

        The text between ``Summary: `` and ``Rated:`` in the joined
        ``FFA_SUMMARY_AND_META`` results becomes ``self.summary`` (newlines
        turned into spaces, surrounding whitespace stripped).  Title, author
        and author link use the first result of their XPath queries, so an
        empty result raises ``IndexError``.
        """
        tree = html.fromstring(default_cache.get_page(self.url))
        self.tree = tree

        self.summary_and_meta = ' '.join(tree.xpath(FFA_SUMMARY_AND_META))

        # DOTALL lets the summary span several lines before "Rated:".
        summary_parts = re.findall(
            'Summary: (.*?)(?=Rated:)', self.summary_and_meta, re.DOTALL)
        flattened = ''.join(summary_parts).replace("\n", " ")
        self.summary = flattened.strip()

        story_id = str(FFA_LINK_REGEX.match(self.url).groupdict()["sid"])
        self.stats = FFAMetadata(story_id, self.tree)

        self.title = tree.xpath(FFA_TITLE)[0]
        self.author = tree.xpath(FFA_AUTHOR_NAME)[0]
        author_href = tree.xpath(FFA_AUTHOR_URL)[0]
        self.authorlink = 'http://www.hpfanficarchive.com/stories/' + author_href
示例#5
0
    def parse_html(self):
        """Parse the page at ``self.url`` and print the summary and stats.

        ``self.summary`` is every ``Summary: ...`` capture up to a newline in
        the joined ``FFA_SUMMARY_AND_META`` results.  ``self.make_stats()`` is
        called before title, author and author link are read from the first
        result of their XPath queries (``IndexError`` if a query is empty).
        """
        page_tree = html.fromstring(default_cache.get_page(self.url))

        self.summary_and_meta = ''.join(page_tree.xpath(FFA_SUMMARY_AND_META))
        summary_matches = re.findall('Summary: (.*?)\n', self.summary_and_meta)
        self.summary = ''.join(summary_matches)

        # presumably make_stats() populates self.stats, printed below — verify
        self.make_stats()

        self.title = page_tree.xpath(FFA_TITLE)[0]
        self.author = page_tree.xpath(FFA_AUTHOR_NAME)[0]
        author_href = page_tree.xpath(FFA_AUTHOR_URL)[0]
        self.authorlink = 'http://www.hpfanficarchive.com/stories/' + author_href

        # NOTE(review): these prints look like leftover debug output — confirm
        # before relying on or removing them.
        print(self.summary)
        print(self.stats)
示例#6
0
    def parse_html(self):
        """Fetch the story page with the bypass cookie and extract metadata.

        The page is requested without following redirects, parsed, and kept
        on ``self.tree``.  ``self.stats`` is an ``AFFMetadata`` built from
        ``(self.archive, self.id)`` and the tree.  The title drops the first
        ``len("Story: ")`` characters of the stripped title text; title,
        author and author link raise ``IndexError`` if their XPath is empty.
        """
        story_url = self.get_url()
        # Got this header from the ficsave codebase
        request_headers = {"Cookie": AFF_BYPASS_COOKIE}
        page = default_cache.get_page(
            story_url,
            headers=request_headers,
            # Do not even try to follow to the adult form url.
            allow_redirects=False,
        )
        tree = html.fromstring(page)
        self.tree = tree

        # We will generate the stats ourselves.
        self.stats = AFFMetadata((self.archive, self.id), tree)

        prefix_length = len("Story: ")
        stripped_title = tree.xpath(AFF_TITLE_XPATH)[0].strip()
        self.title = stripped_title[prefix_length:]

        author_name = tree.xpath(AFF_AUTHOR_NAME)[0]
        self.author = author_name.strip()
        self.authorlink = tree.xpath(AFF_AUTHOR_URL)[0]
示例#7
0
    def parse_html(self):
        """Fetch the story page and read metadata from ``profile_top``.

        The page is fetched with a throttle of ``randint(1000, 4000) / 1000``.
        Title, summary, author name and author link each use the first XPath
        match, so a missing element raises ``IndexError``.  The author name
        is appended to the existing ``self.author`` value, the image query
        result is stored as-is, and ``self.stats`` is produced by
        ``self.parser(None, tree)``.
        """
        story_url = self.get_url()
        delay = randint(1000, 4000) / 1000
        tree = html.fromstring(
            default_cache.get_page(story_url, throttle=delay))

        def first(query):
            # First match of an XPath query on the parsed page.
            return tree.xpath(query)[0]

        self.title = first('//*[@id="profile_top"]/b/text()')
        self.summary = first('//*[@id="profile_top"]/div/text()')
        self.author += first('//*[@id="profile_top"]/a[1]/text()')

        # The author href is appended to the site address.
        site_root = 'https://www.' + self.site
        self.authorlink = site_root + first('//*[@id="profile_top"]/a[1]/@href')

        self.image = tree.xpath('//*[@id="profile_top"]/span[1]/img')
        self.tree = tree
        self.stats = self.parser(None, tree)
示例#8
0
    def parse_html(self):
        """Download the story page (no redirects) and populate metadata.

        Sends ``AFF_BYPASS_COOKIE`` as the ``Cookie`` header, stores the
        parsed page on ``self.tree`` and builds ``self.stats`` as an
        ``AFFMetadata`` from ``(self.archive, self.id)`` and the tree.  The
        stripped title text loses its first ``len("Story: ")`` characters.
        An empty XPath result for title, author or author link raises
        ``IndexError``.
        """
        fetch_options = {
            # Got this header from the ficsave codebase
            "headers": {"Cookie": AFF_BYPASS_COOKIE},
            # Do not even try to follow to the adult form url.
            "allow_redirects": False,
        }
        raw_page = default_cache.get_page(self.get_url(), **fetch_options)
        self.tree = tree = html.fromstring(raw_page)

        # We will generate the stats ourselves.
        story_key = (self.archive, self.id)
        self.stats = AFFMetadata(story_key, tree)

        title_text = tree.xpath(AFF_TITLE_XPATH)[0].strip()
        self.title = title_text[len("Story: "):]
        self.author = tree.xpath(AFF_AUTHOR_NAME)[0].strip()
        self.authorlink = tree.xpath(AFF_AUTHOR_URL)[0]
示例#9
0
    def parse_html(self):
        """Fetch the story page and read metadata from ``profile_top``.

        Raises ``site.StoryDoesNotExist`` when the title query returns no
        results; in that case ``self.title`` is left holding the empty
        result.  Summary, author name and author link use the first XPath
        match (``IndexError`` if empty).  The author name is appended to the
        existing ``self.author`` value and ``self.stats`` comes from
        ``self.parser(None, tree)``.
        """
        story_url = self.get_url()
        raw_page = default_cache.get_page(
            story_url, throttle=randint(1000, 4000) / 1000)
        tree = html.fromstring(raw_page)

        # No title on the page is treated as "story does not exist".
        title_matches = tree.xpath('//*[@id="profile_top"]/b/text()')
        self.title = title_matches
        if len(title_matches) == 0:
            raise site.StoryDoesNotExist
        self.title = title_matches[0]

        summary_matches = tree.xpath('//*[@id="profile_top"]/div/text()')
        self.summary = summary_matches[0]

        self.author += tree.xpath('//*[@id="profile_top"]/a[1]/text()')[0]

        site_root = 'https://www.' + self.site
        author_href = tree.xpath('//*[@id="profile_top"]/a[1]/@href')[0]
        self.authorlink = site_root + author_href

        self.image = tree.xpath('//*[@id="profile_top"]/span[1]/img')
        self.tree = tree
        self.stats = self.parser(None, tree)
示例#10
0
    def parse_html(self):
        """Parse the page at ``self.url`` into summary, stats and author data.

        ``self.summary`` is the text between ``Summary: `` and ``Hitcount:``
        in the joined ``SIYE_SUMMARY_AND_META`` results, with newlines turned
        into spaces and surrounding whitespace stripped.  The title is result
        0 and the author result 2 of the ``SIYE_TITLE_AUTHOR_NAME`` query;
        too few results raise ``IndexError``.
        """
        tree = html.fromstring(default_cache.get_page(self.url))
        self.tree = tree

        self.summary_and_meta = ' '.join(tree.xpath(SIYE_SUMMARY_AND_META))

        # DOTALL lets the summary span several lines before "Hitcount:".
        captured = re.findall(
            'Summary: (.*?)(?=Hitcount:)', self.summary_and_meta, re.DOTALL)
        single_line = ''.join(captured).replace("\n", " ")
        self.summary = single_line.strip()

        story_id = str(SIYE_LINK_REGEX.match(self.url).groupdict()["sid"])
        self.stats = SIYEMetadata(story_id, self.tree)

        # The same query yields both values at different positions.
        self.title = tree.xpath(SIYE_TITLE_AUTHOR_NAME)[0]
        self.author = tree.xpath(SIYE_TITLE_AUTHOR_NAME)[2]

        author_href = tree.xpath(SIYE_AUTHOR_URL)[0]
        self.authorlink = 'http://www.siye.co.uk/' + author_href
示例#11
0
    def parse_html(self):
        """Parse the page at ``self.url`` into summary, stats and author data.

        The joined ``FFA_SUMMARY_AND_META`` results are kept on
        ``self.summary_and_meta``; the text between ``Summary: `` and
        ``Rated:`` becomes ``self.summary`` with newlines replaced by spaces
        and surrounding whitespace stripped.  ``self.stats`` is an
        ``FFAMetadata`` built from the ``sid`` group of ``FFA_LINK_REGEX``.
        Title, author and author link take the first XPath match
        (``IndexError`` if empty).
        """
        parsed_page = html.fromstring(default_cache.get_page(self.url))
        self.tree = parsed_page

        meta_fragments = parsed_page.xpath(FFA_SUMMARY_AND_META)
        self.summary_and_meta = ' '.join(meta_fragments)

        # Lazy match up to (not including) "Rated:", across line breaks.
        pieces = re.findall(
            'Summary: (.*?)(?=Rated:)', self.summary_and_meta, re.DOTALL)
        self.summary = ''.join(pieces).replace("\n", " ").strip()

        link_groups = FFA_LINK_REGEX.match(self.url).groupdict()
        self.stats = FFAMetadata(str(link_groups["sid"]), self.tree)

        self.title = parsed_page.xpath(FFA_TITLE)[0]
        self.author = parsed_page.xpath(FFA_AUTHOR_NAME)[0]

        author_path = parsed_page.xpath(FFA_AUTHOR_URL)[0]
        self.authorlink = 'http://www.hpfanficarchive.com/stories/' + author_path
    def parse_html(self):
        """Fetch the story page and collect metadata plus raw stats text.

        Title, summary, author name and author link are taken from the first
        match of their ``profile_top`` XPath queries; the author name is
        appended to the existing ``self.author`` value.  ``self.raw_stats``
        is filled with the text of the last ``pre_story_links`` anchor, a
        newline, and the text nodes of the stats span.

        Raises:
            IndexError: if the title, summary, author or author-link query
                returns no results.
        """
        page = default_cache.get_page(
            self.get_url(), throttle=randint(1000, 4000) / 1000)
        tree = html.fromstring(page)

        self.title = tree.xpath('//*[@id="profile_top"]/b/text()')[0]
        self.summary = tree.xpath('//*[@id="profile_top"]/div/text()')[0]
        self.author += tree.xpath('//*[@id="profile_top"]/a[1]/text()')[0]
        self.authorlink = 'https://www.' + self.site + \
            tree.xpath('//*[@id="profile_top"]/a[1]/@href')[0]
        self.image = tree.xpath('//*[@id="profile_top"]/span[1]/img')

        self.raw_stats = []
        self.raw_stats.extend(
            tree.xpath('//*[@id="pre_story_links"]/span/a[last()]/text()'))
        self.raw_stats.append('\n')

        # XPath changes depending on the presence of an image.  Use
        # truthiness rather than an identity comparison against the literal
        # 0, which only worked through CPython's small-int caching and is a
        # SyntaxWarning on Python 3.8+.
        if self.image:
            self.raw_stats.extend(
                tree.xpath('//*[@id="profile_top"]/span[4]//text()'))
        else:
            self.raw_stats.extend(
                tree.xpath('//*[@id="profile_top"]/span[3]//text()'))
示例#13
0
    def parse_html(self):
        """Fetch the story page (no redirects) and build a stats string.

        ``self.stats`` joins one ``"<title>: <value>"`` entry per
        ``(title, result)`` pair in ``AFF_GENERATED_META`` with ``" - "``,
        where each value is ``str(result(tree, self.archive, self.id))``.
        The stripped title text loses its first ``len("Story: ")``
        characters.  An empty XPath result for title, author or author link
        raises ``IndexError``.
        """
        story_url = self.get_url()
        page = default_cache.get_page(
            story_url,
            # Got this header from the ficsave codebase
            headers={"Cookie": AFF_BYPASS_COOKIE},
            # Do not even try to follow to the adult form url.
            allow_redirects=False,
        )
        tree = html.fromstring(page)

        # We will generate the stats ourselves.
        stat_entries = [
            label + ": " + str(compute(tree, self.archive, self.id))
            for label, compute in AFF_GENERATED_META
        ]
        self.stats = " - ".join(stat_entries)

        raw_title = tree.xpath(AFF_TITLE_XPATH)[0].strip()
        self.title = raw_title[len("Story: "):]
        self.author = tree.xpath(AFF_AUTHOR_NAME)[0].strip()
        self.authorlink = tree.xpath(AFF_AUTHOR_URL)[0]