示例#1
0
    def getCookie(self):
        """
        Log in to Patreon through the remote-debug browser interface.

        Returns a (success, message) tuple: short-circuits with
        (True, "Already logged in") when the existing cookie still
        validates, otherwise performs the login flow and returns the
        result of a fresh checkCookie() call.

        Raises:
            exceptions.NotLoggedInException: a captcha blocked the login.
            exceptions.CannotAccessException: the login form was submitted
                but the username never appeared in the rendered page.
        """
        if self.checkCookie()[0]:
            return True, "Already logged in"

        self.log.info("Trying to avoid rate limiting!")
        time.sleep(5)

        self.log.info("Not logged in. Doing login.")

        home = "https://www.patreon.com/home"
        login = "******"
        try:
            soup = self.get_soup(home)
        except Exception:
            # This used to drop into pdb.set_trace() and then fall through
            # with `soup` undefined. Log with traceback and propagate instead.
            self.log.exception("Failed to fetch login page %s", home)
            raise

        # These won't work for the particular recaptcha flavor patreon uses. Sigh.
        if soup.find_all("div", class_="g-recaptcha"):
            soup = self.handle_recaptcha(soup, home, login)

        if soup.find_all("div", id="hcaptcha_widget"):
            soup = self.handle_hcaptcha(soup, home, login)

        if soup.find_all("div", class_="g-recaptcha"):
            self.log.error("Failed after attempting to solve recaptcha!")
            raise exceptions.NotLoggedInException(
                "Login failed due to recaptcha!")

        # Check by id to match the detection above. The old guard checked
        # class_="hcaptcha_widget" (and reported "recaptcha"), so it never
        # matched the widget found by id and was a no-op.
        if soup.find_all("div", id="hcaptcha_widget"):
            self.log.error("Failed after attempting to solve hcaptcha!")
            raise exceptions.NotLoggedInException(
                "Login failed due to hcaptcha!")

        # So patreon uses RavenJS, which does a bunch of really horrible change-watching crap on input
        # fields. I couldn't figure out how to properly fire the on-change events, so let's just
        # use the debug-protocol interface to type our login info in manually.
        self.cr.execute_javascript_function(
            "document.querySelector(\"input[id='email']\").focus()")
        for char in settings[self.pluginShortName]['username']:
            self.cr.Input_dispatchKeyEvent(type='char', text=char)
        self.cr.execute_javascript_statement(
            "document.querySelector(\"input[id='password']\").focus()")
        for char in settings[self.pluginShortName]['password']:
            self.cr.Input_dispatchKeyEvent(type='char', text=char)

        self.cr.execute_javascript_statement(
            "document.querySelector(\"button[type='submit']\").click()")

        content = self.cr.get_rendered_page_source()

        # Heuristic success check: the logged-in page should contain our
        # username somewhere in the rendered markup.
        if settings[self.pluginShortName]['username'] not in content:
            raise exceptions.CannotAccessException("Could not log in?")

        self.wg.saveCookies()

        return self.checkCookie()
示例#2
0
    def _get_art_post(self, postId, artistName):
        """
        Fetch one Patreon post via the API and download its contents.

        Retrieves the post JSON (with included media), saves the post file,
        attachments, media items, video embeds, and any images referenced
        from the post body HTML.

        Args:
            postId: Patreon post id used in the API path.
            artistName: creator name, used for the on-disk save location.

        Returns:
            dict with 'page_desc', 'page_title', 'post_time', 'post_tags',
            'post_embeds', 'dl_path' (list of saved file paths) and
            'status' ('Succeeded'), or {'status': ''} when the post is not
            viewable or has no included content.

        Raises:
            exceptions.CannotAccessException: post flagged as not viewable.
        """
        post = self.get_api_json("/posts/{pid}".format(pid=postId) +
                                 "?include=media")

        post_content = post['data']
        post_info = post_content['attributes']

        # .get() is None when the key is absent, so this only triggers on an
        # explicit False, same as the old `'key' in d and d[key] is False`.
        if post_info.get('current_user_can_view') is False:
            self.log.warning(
                "You apparently cannot view post %s for artist %s. Ignoring.",
                postId, artistName)
            fail = {'status': ''}
            return fail

        if 'included' not in post:
            self.log.warning(
                "No contents on post %s for artist %s (%s). Please report if this is in error.",
                postId, artistName, post_info['url'])
            fail = {'status': ''}
            return fail

        attachments = {
            item['id']: item
            for item in post['included'] if item['type'] == 'attachment'
        }
        media = {
            item['id']: item
            for item in post['included'] if item['type'] == 'media'
        }

        tags = []
        tag_rel = post_content['relationships'].get('user_defined_tags')
        if tag_rel:
            for tagmeta in tag_rel['data']:
                # Tag ids look like "<something>;<tagname>"; keep the name.
                tags.append(tagmeta['id'].split(";")[-1])

        # NOTE(review): this checks post_content, not post_info, unlike the
        # viewability check above — presumably intentional belt-and-braces;
        # confirm against the API schema.
        if 'current_user_can_view' in post_content and not post_content[
                'current_user_can_view']:
            raise exceptions.CannotAccessException(
                "You can't view that content!")

        ret = {
            'page_desc': post_info['content'],
            'page_title': post_info['title'],
            'post_time': dateutil.parser.parse(
                post_info['published_at']).replace(tzinfo=None),
            'post_tags': tags,
            'post_embeds': [],
        }

        files = []
        try:
            if "post_file" in post_info and post_info['post_file']:
                furl = urllib.parse.unquote(post_info['post_file']['url'])
                fpath = self.save_image(artistName, postId,
                                        post_info['post_file']['name'], furl)
                files.append(fpath)

            if post_info.get('post_type') == 'video_embed':
                fpath = self.fetch_video_embed(post_info)
                if fpath:
                    files.append(fpath)
                ret['post_embeds'].append(post_info)

            for aid, dat_struct in attachments.items():
                fpath = self.save_attachment(artistName, postId, dat_struct)
                files.append(fpath)

            for aid, dat_struct in media.items():
                fpath = self.save_media(artistName, postId, dat_struct)
                files.append(fpath)

            if 'embed' in post_info and post_info['embed']:
                for item in self._handle_embed(post_info['embed']):
                    # Bugfix: this used to append the stale `fpath` left over
                    # from the loops above instead of the embed result.
                    files.append(item)
                ret['post_embeds'].append(post_info['embed'])

        except urllib.error.URLError:
            self.log.error("Failure retreiving content from post: %s", post)

        # Images referenced inline in the post body HTML.
        ctnt_soup = bs4.BeautifulSoup(post_info['content'], 'lxml')
        for img in ctnt_soup.find_all("img", src=True):
            furl = img['src']
            fparsed = urllib.parse.urlparse(furl)
            fname = fparsed.path.split("/")[-1]
            fpath = self.save_image(artistName, postId, fname, furl)
            files.append(fpath)

        # Youtube etc are embedded as iframes.
        for ifr in ctnt_soup.find_all("iframe", src=True):
            ret['post_embeds'].append(ifr['src'])

        if files:
            # Bugfix: was len(attachments), which undercounted (it ignored
            # post files, media, embeds and inline images).
            self.log.info("Found %s images/attachments on post.", len(files))
        else:
            self.log.warning("No images/attachments on post %s!", postId)

        # Drop None/empty entries from failed saves.
        files = [filen for filen in files if filen]
        ret['dl_path'] = files
        ret['status'] = 'Succeeded'

        return ret
示例#3
0
    def _getArtPage(self, dlPathBase, artPageUrl, artistName):
        """
        Fetch one submission page and download its image(s).

        Parses the page layout (2, 3, 4 or 5 main content divs, depending on
        whether the submission is part of a sequence and/or has a story
        section), extracts every content image URL, and fetches each one.

        Args:
            dlPathBase: base directory for downloads.
            artPageUrl: URL of the submission page.
            artistName: poster's name (unused here, kept for interface
                consistency with callers).

        Returns:
            A build_page_ret() dict with status 'Succeeded', 'Deleted' or
            'Failed'.

        Raises:
            exceptions.NotLoggedInException / CannotAccessException for
            restricted content; ValueError on an unrecognized page layout.
        """

        soup = self.wg.getSoup(artPageUrl)
        soups = str(soup)
        if 'That submission has been deleted.' in soups:
            self.log.warning("Item %s has been deleted by the poster!",
                             artPageUrl)
            return self.build_page_ret(status="Deleted", fqDlPath=None)

        if 'The owner has restricted this submission to members' in soups:
            raise exceptions.NotLoggedInException("Not logged in?")
        if "You may need to go to that member's userpage (use name link above) and request that they give you access" in soups:
            raise exceptions.CannotAccessException(
                "Friends only thingie (why is this even a thing)!")

        datespan = soup.find('span', id='submittime_exact')
        if not datespan:
            self.log.warning("No date-span in content on page '%s'!!",
                             artPageUrl)
            return self.build_page_ret(status='Failed',
                                       pageTitle="Missing content!",
                                       fqDlPath=None)

        postTime = self._extractPostTimestamp(soup)
        postTags = self._extractPostTags(soup)

        titleBar, dummy_stats, dummy_footer = soup.body.find_all(
            'div', class_='elephant_555753', recursive=False)

        story_div = None

        mainDivs = soup.body.find_all('div',
                                      class_='elephant_white',
                                      recursive=False)
        # Was a bare print(); route debug chatter through the logger.
        self.log.debug("Maindivs: %s", len(mainDivs))
        if len(mainDivs) == 2:
            # Single item (possibly part of a sequence): image + description.
            self.log.debug("Sequence item!")
            imgDiv, desc_div = mainDivs
            footer = desc_div
            # NOTE(review): _getContentUrlFromPage may plausibly return a
            # falsy value here (it is guarded in the branches below), which
            # would put None in the list — confirm against the helper.
            imageURL = [self._getContentUrlFromPage(imgDiv)]
        elif len(mainDivs) in (3, 4):
            # Sequence page: image + sequence listing + description
            # (+ separate footer when there are 4 divs).
            if len(mainDivs) == 3:
                imgDiv, seqDiv, desc_div = mainDivs
                footer = desc_div
            else:
                imgDiv, seqDiv, desc_div, footer = mainDivs

            imageURL = set()
            base_img = self._getContentUrlFromPage(imgDiv)
            if base_img:
                imageURL.add(base_img)

            for img in self._getSeqImageDivs(seqDiv):
                imageURL.add(img)
            for img in self._getSeqImageDivs(desc_div):
                imageURL.add(img)

            self.log.info("Found %s item series on page!", len(imageURL))

        elif len(mainDivs) == 5:
            # Story page: header + image + story + description + footer.
            _header_div, img_div, story_div, desc_div, footer = mainDivs

            imageURL = set()
            base_img = self._getContentUrlFromPage(img_div)
            if base_img:
                imageURL.add(base_img)

            for img in self._getSeqImageDivs(img_div):
                imageURL.add(img)
            for img in self._getSeqImageDivs(story_div):
                imageURL.add(img)
            for img in self._getSeqImageDivs(desc_div):
                imageURL.add(img)

            self.log.info("Found %s item series on page!", len(imageURL))
        else:
            # Unknown layout: dump the markup for offline analysis, then bail.
            soupp = str(mainDivs)
            with open("log.html", "w") as fp:
                fp.write(soupp)
            self.log.error("Unknown page layout:\n%s", soupp)

            raise ValueError("Unknown number of mainDivs! %s" % len(mainDivs))

        itemTitle = self._extractTitle(titleBar)
        itemCaption = self._extractDescription(story_div, desc_div, footer)

        if not imageURL:
            # Lazy %-args instead of eager string concatenation.
            self.log.error("OH NOES!!! No image on page = %s", artPageUrl)
            return self.build_page_ret(status="Failed", fqDlPath=None)

        imPaths = []
        for image in imageURL:
            recPath = self._fetchImage(image, dlPathBase, itemCaption,
                                       itemTitle, artPageUrl)
            imPaths.append(recPath)

        return self.build_page_ret(status="Succeeded",
                                   fqDlPath=imPaths,
                                   pageDesc=itemCaption,
                                   pageTitle=itemTitle,
                                   postTags=postTags,
                                   postTime=postTime)