Exemplo n.º 1
0
 def _parse_html(self):
     """Parse the HTTP response body into ``self.html``.

     The HTML comment delimiters (``<!--`` and ``-->``) are removed from
     the body text before it is handed to ``utils.make_html_element``, so
     markup that was wrapped in comments is parsed like regular content.
     The unmodified body text is stored in ``self.cursor_blob``.
     """
     # TODO: Why are we uncommenting HTML?
     response = self.response
     uncommented = response.text.replace('<!--', '').replace('-->', '')
     self.html = utils.make_html_element(uncommented, url=response.url)
     self.cursor_blob = response.text
Exemplo n.º 2
0
    def extract_text(self) -> PartialPost:
        """Extract the post's text, split into its own and its shared part.

        Not all content is fully loaded when skimming through pages, so when
        the listing markup matches ``more_url_regex`` and ``post_story_regex``
        yields the story link, the story is requested individually and its
        ``.story_body_container`` is used instead of the listing element.
        If the requested page has no such container, the listing element is
        kept so that at least the truncated text is returned.

        Returns:
            A dict with the keys ``text`` (everything), ``post_text`` (nodes
            before the first ``header``) and ``shared_text`` (that header and
            everything after it), paragraphs joined by a blank line; or
            ``None`` when no ``p``/``header`` nodes are found.
        """
        element = self.element

        has_more = self.more_url_regex.search(element.html)
        if has_more:
            match = self.post_story_regex.search(element.html)
            if match:
                url = utils.urljoin(FB_MOBILE_BASE_URL,
                                    match.group(1).replace("&amp;", "&"))
                response = self.request(url)
                full_story = response.html.find('.story_body_container',
                                                first=True)
                # The lookup can come back empty (e.g. a login or block page
                # was served). Calling .find on None would raise
                # AttributeError, so fall back to the listing element.
                if full_story is not None:
                    element = full_story

        nodes = element.find('p, header')
        if nodes:
            post_text = []
            shared_text = []
            ended = False
            # NOTE(review): the first matched node is skipped on purpose by
            # the original code - presumably the post's own header; confirm.
            for node in nodes[1:]:
                # A header after the first node starts the shared section;
                # the header's own text is counted as shared text.
                if node.tag == 'header':
                    ended = True

                # Remove '... More'
                # This button is meant to display the hidden text that is already loaded
                # Not to be confused with the 'More' that opens the article in a new page
                if node.tag == 'p':
                    node = utils.make_html_element(html=node.html.replace(
                        '>… <', '><', 1).replace('>More<', '', 1))

                if not ended:
                    post_text.append(node.text)
                else:
                    shared_text.append(node.text)

            # Separation between paragraphs
            paragraph_separator = '\n\n'

            text = paragraph_separator.join(
                itertools.chain(post_text, shared_text))
            post_text = paragraph_separator.join(post_text)
            shared_text = paragraph_separator.join(shared_text)

            return {
                'text': text,
                'post_text': post_text,
                'shared_text': shared_text,
            }

        return None
Exemplo n.º 3
0
    def _parse_json(self):
        """Populate ``self.html`` and ``self.cursor_blob`` from a JSON response.

        The first ``len(self.json_prefix)`` characters of the response text
        (the ``for (;;);`` guard) are sliced off and the remainder is decoded
        as JSON. Each entry of ``payload.actions`` is then inspected: a
        ``replace`` action provides the HTML fragment, a ``script`` action
        provides the cursor blob. When several actions share a command, the
        last one wins.

        Raises:
            AssertionError: if ``self.html`` or ``self.cursor_blob`` is still
                ``None`` after all actions have been processed.
        """
        prefix_length = len(self.json_prefix)
        data = json.loads(
            self.response.text[prefix_length:])  # Strip 'for (;;);'

        for action in data['payload']['actions']:
            command = action['cmd']
            if command == 'replace':
                self.html = utils.make_html_element(action['html'],
                                                    url=FB_MOBILE_BASE_URL)
            elif command == 'script':
                self.cursor_blob = action['code']

        # Explicit raises instead of ``assert`` statements: asserts are
        # removed when Python runs with -O, which would silently disable this
        # validation of remote data. AssertionError is kept as the exception
        # type so existing handlers continue to match.
        if self.html is None:
            raise AssertionError("JSON response contained no 'replace' action")
        if self.cursor_blob is None:
            raise AssertionError("JSON response contained no 'script' action")
Exemplo n.º 4
0
    def extract_text(self) -> PartialPost:
        """Extract the post's text, with a manual browser fallback.

        Not all content is fully loaded when skimming through pages, so when
        the listing markup matches ``more_url_regex`` and ``post_story_regex``
        yields the story link, the story is requested individually and its
        ``.story_body_container`` is used instead of the listing element.

        If querying that element fails (for instance because the container
        was not found and the element is ``None``), the story URL is opened
        in a Selenium Chrome window, the operator is asked to press ENTER
        once done with it, and the container (or ``#MPhotoContent``) is read
        from the browser instead. The driver is kept in the module-level
        ``WD`` and created on first use.

        Returns:
            A dict with the keys ``text`` (everything), ``post_text`` (nodes
            before the first ``header``) and ``shared_text`` (that header and
            everything after it), paragraphs joined by a newline; or ``None``
            when no ``p``/``header`` nodes are found.

        Raises:
            Exception: the original lookup error is re-raised when no story
                page had been requested (there is nothing to open in the
                browser). If the browser fallback fails as well, the error
                of the final lookup propagates and stops the scraper.
        """
        global WD

        element = self.element
        # Pre-bound so the fallback below can tell whether a story page was
        # actually requested instead of tripping over unbound locals.
        url = None
        response = None

        has_more = self.more_url_regex.search(element.html)
        if has_more:
            match = self.post_story_regex.search(element.html)
            if match:
                url = utils.urljoin(FB_MOBILE_BASE_URL,
                                    match.group(1).replace("&amp;", "&"))
                response = self.request(url)
                element = response.html.find('.story_body_container',
                                             first=True)

        # better facebook-protection handling
        # ===========================================
        try:
            # will fail if article is None
            nodes = element.find('p, header')
        except Exception as e1:
            if response is None:
                # No story page was fetched, so the browser fallback has no
                # URL to open; surface the real error unchanged.
                raise

            print(f"\n{url}")
            print(e1)
            # The served page may lack a <title>; that must not abort the
            # fallback before the browser is even tried.
            title = response.html.find("title", first=True)
            if title is not None:
                print(title.text[:20])

            try:
                if WD is None:
                    options = webdriver.ChromeOptions()
                    options.add_argument(
                        "--lang=en-US")  # force english chrome
                    WD = webdriver.Chrome('./chromedriver',
                                          chrome_options=options)
                WD.get(url)
                input("press <ENTER> when done with chrome window...")
                css_sel = WD.find_elements_by_css_selector(
                    ".story_body_container"
                ) or WD.find_elements_by_css_selector("#MPhotoContent") or None
                element = rHTML(html=css_sel[0].get_attribute("outerHTML"),
                                url=FB_MOBILE_BASE_URL)
            except Exception as e:
                # Best effort: report and let the lookup below decide.
                print(f"\nWebDriver failed.\n{e}")

            # will fail if WD failed/didn't find the selectors, so it will actually "bubble" up and stop the entire scraper
            nodes = element.find('p, header')
        # ===========================================

        if nodes:
            post_text = []
            shared_text = []
            ended = False
            # NOTE(review): the first matched node is skipped on purpose by
            # the original code - presumably the post's own header; confirm.
            for node in nodes[1:]:
                # A header after the first node starts the shared section;
                # the header's own text is counted as shared text.
                if node.tag == 'header':
                    ended = True

                # Remove '... More'
                # This button is meant to display the hidden text that is already loaded
                # Not to be confused with the 'More' that opens the article in a new page
                if node.tag == 'p':
                    node = utils.make_html_element(html=node.html.replace(
                        '>… <', '><', 1).replace('>More<', '', 1))

                if not ended:
                    post_text.append(node.text)
                else:
                    shared_text.append(node.text)

            text = '\n'.join(itertools.chain(post_text, shared_text))
            post_text = '\n'.join(post_text)
            shared_text = '\n'.join(shared_text)

            return {
                'text': text,
                'post_text': post_text,
                'shared_text': shared_text,
            }

        return None