def __get_item_content(article: ResultSet):
    content = article.find('div', class_="text-container")
    if content is None:
        content = article.find('p', class_=None, recursive=True)
    if content is not None:
        return content.getText().strip()
    else:
        return None

def __get_item_author(article: ResultSet):
    author = article.find(class_="author-block__info", recursive=True)
    if author is None:
        author = article.find(class_="story-block__byline", recursive=True)
    if author is not None:
        return author.getText().strip()
    else:
        return None

def __get_item_content(article: ResultSet):
    content = article.find('p', class_="story-block__standfirst", recursive=True)
    if content is None:
        content = article.find('p', class_="standfirst-content", recursive=True)
    if content is None:
        content = article.find('p', class_=None)
    if content is not None:
        return content.getText().strip()
    else:
        return None
Example #4
    def __init__(self, user_keys: dict, mobile=False, config=None):
        if config is None:
            config = {}

        self.near = config['near'] if 'near' in config else ''
        self.dark = config['dark'] if 'dark' in config else False
        self.nojs = config['nojs'] if 'nojs' in config else False
        self.new_tab = config['new_tab'] if 'new_tab' in config else False
        self.mobile = mobile
        self.user_keys = user_keys
        self.main_divs = ResultSet('')
        self._elements = 0
def __get_item_url(article: ResultSet):
    headline_elem = AbcNewsItemParser.__get_headline_elem(article)
    # Not all articles use a common headline element, so fall back to the
    # first link when no dedicated headline is found
    if headline_elem is None:
        return article.find('a')['href']
    else:
        return headline_elem.find('a')['href']
    def get_price(self, listing: element.ResultSet) -> str:
        """Gets a price of an apartment from a "li" element passed in
        by finding the first "div" element.

        :param listing: "li" element for one apartment
        :type listing: element.ResultSet
        :return: price of an apartment
        :rtype: str
        """
        price = listing.find("div", class_="price").text
        return price
    def get_listing_link(self, listing: element.ResultSet) -> str:
        """Gets a link of a listing from a "li" element passed in
        by finding the first "a" element.

        :param listing: "li" element for one apartment
        :type listing: element.ResultSet
        :return: a link to a listing
        :rtype: str
        """
        listing_link = listing.find("a")["href"]
        return listing_link
    def get_title(self, listing: element.ResultSet) -> str:
        """Gets a title of a listing from a "li" element passed in
        by finding the first "h2" element.

        :param listing: "li" element for one apartment
        :type listing: element.ResultSet
        :return: title of an apartment's listing
        :rtype: str
        """
        title = listing.find("h2", class_="title-list").text
        return title
Example #9
def _get_percentage_util(columns: ResultSet, row: PageElement) -> str:
    column_number = 0
    column: PageElement
    for column in columns:
        if str(column.get("contents")) == "Util%":
            column_number = columns.index(column)
            break
    sibling = row
    if column_number > 0:
        for _ in range(0, column_number - 1):
            sibling = sibling.findNext("tablecell")
    return sibling.findNext("tablecell").get("contents")
Example #10
def crawl_url(url: str) -> ResultSet:
    """Uses Selenium WebDriver to run an automated Chrome browser and crawl the page.
    This is required because the hackathon website relies on JavaScript (XHR requests)
    to load all events for the year.

    Args:
        url: URL to be crawled

    Returns:
        BeautifulSoup ResultSet with relevant Page Source to be processed

    Raises:
        NoSuchElementException: When the More button can no longer be found (debugging only)
        StaleElementReferenceException: When the More button is not visible in the window
    """
    # Selenium drives a real Chrome browser so that the XHR-loaded content renders.
    driver = webdriver.Chrome()
    driver.get(url)
    # TODO: Use more efficient method for waiting.
    time.sleep(2)
    scroll_down(driver)
    try:
        more_button_xpath = "/html/body/div[6]/div[3]/div[3]/a"
        more_button = driver.find_element_by_xpath(more_button_xpath)
    except NoSuchElementException:
        more_button_xpath = "/html/body/div[6]/div[2]/div[3]/a"
        more_button = driver.find_element_by_xpath(more_button_xpath)
    while True:
        scroll_down(driver)
        # TODO: Use more efficient method for waiting
        time.sleep(0.7)
        # TODO: optimize Try-Except
        try:
            driver.find_element_by_xpath(more_button_xpath)
        except NoSuchElementException as e:
            logging.debug(e)
            break
        try:
            more_button.click()
        except StaleElementReferenceException as e:
            logging.error(e)
    # Parse the read client by creating a BS4 object
    s_page = BeautifulSoup(driver.page_source, "html.parser")
    driver.quit()
    # We find the elements at the right side of the page
    container = s_page.find_all("div", {"class": ["ht-eb-card__right"]})
    # "row ht-idt-card__right__container"]})
    return container
    def get_attribute(self, listing: element.ResultSet, attribute_name: str) -> str:
        """ "Gets table row value by searching for corresponding keys which are
        passsed in as an attribute_name.

        :param listing: "li" element for one apartment
        :type listing: element.ResultSet
        :param attribute_name: a keyword to search for in a li element
        :type attribute_name: str
        :return: value that corresponds to a keyword
        :rtype: str
        """

        sq_meters = listing.find("span", title=attribute_name).text
        return sq_meters
    def _get_product_id(self, product: ResultSet) -> Optional[str]:
        """Return a product id from product url"""
        product_url_select = product.select(CSS_SELECTORS['url'])
        if not product_url_select:
            return None

        url = product_url_select[0].get('href')
        logger.info(f'Url: {url}')

        if 'slredirect' in url:
            url = self._request_executor.get_normal_url(url)

        product_id = url.split("/")[-2]
        logger.info(f"Get product id: {product_id}")
        return product_id
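
For reference, a hypothetical shape of the selector map this method reads from; the real keys and selectors are defined elsewhere in the scraper:

# Hypothetical selector map assumed by _get_product_id above;
# only the 'url' key is exercised in this snippet.
CSS_SELECTORS = {
    'url': 'a.product-link',
}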
Example #13
def parse_keywords(container: ResultSet, year: int) -> pd.DataFrame:
    """This method parses all the keywords that had shown up on the page.

    Args:
        year: year of the parsed url for dataframe.
        container: BeautifulSoup ResultSet with relevant Page Source to be processed

    Returns:
        A sorted Dataframe-Object with keywords, the year and its number of occurrences.

    """
    keyword_list = []  # type: List[str]
    final_list = []  # type: List[list]
    for tag in container:
        tag_link_list = tag.find_all("a", {"class": "ht-card-tag"})
        for tag_link in tag_link_list:
            keyword_list.append(tag_link.contents[0])
    for k, v in Counter(keyword_list).items():
        final_list.append([year, k, v])
    data_frame = pd.DataFrame(final_list, columns=["Year", "Tag", "Count"])
    return data_frame
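
A rough sketch tying crawl_url (Example #10) and parse_keywords together; the URL is a placeholder, not the real hackathon site:

# Placeholder URL; substitute the real hackathon listing page
cards = crawl_url("https://example.com/hackathons/2020")
df = parse_keywords(cards, year=2020)
print(df.sort_values("Count", ascending=False).head())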
def __get_headline_elem(article: ResultSet):
    # Article can sometimes have no headline, e.g. in Daily Cartoon
    return article.find(class_="story-block__heading")
Example #15
class Filter:
    def __init__(self, user_keys: dict, mobile=False, config=None):
        if config is None:
            config = {}

        self.near = config['near'] if 'near' in config else ''
        self.dark = config['dark'] if 'dark' in config else False
        self.nojs = config['nojs'] if 'nojs' in config else False
        self.new_tab = config['new_tab'] if 'new_tab' in config else False
        self.alt_redirect = config['alts'] if 'alts' in config else False
        self.mobile = mobile
        self.user_keys = user_keys
        self.main_divs = ResultSet('')
        self._elements = 0

    def __getitem__(self, name):
        return getattr(self, name)

    @property
    def elements(self):
        return self._elements

    def reskin(self, page):
        # Aesthetic only re-skinning
        if self.dark:
            page = page.replace('fff', '000').replace('202124', 'ddd').replace('1967D2', '3b85ea')

        return page

    def encrypt_path(self, msg, is_element=False):
        # Encrypts path to avoid plaintext results in logs
        if is_element:
            # Element paths are tracked differently in order for the element key to be regenerated
            # once all elements have been loaded
            enc_path = Fernet(self.user_keys['element_key']).encrypt(msg.encode()).decode()
            self._elements += 1
            return enc_path

        return Fernet(self.user_keys['text_key']).encrypt(msg.encode()).decode()

    def clean(self, soup):
        self.main_divs = soup.find('div', {'id': 'main'})
        self.remove_ads()
        self.fix_question_section()
        self.update_styling(soup)

        for img in [_ for _ in soup.find_all('img') if 'src' in _.attrs]:
            self.update_element_src(img, 'image/png')

        for audio in [_ for _ in soup.find_all('audio') if 'src' in _.attrs]:
            self.update_element_src(audio, 'audio/mpeg')

        for link in soup.find_all('a', href=True):
            self.update_link(link)

        input_form = soup.find('form')
        if input_form is not None:
            input_form['method'] = 'POST'

        # Ensure no extra scripts passed through
        for script in soup('script'):
            script.decompose()

        # Update default footer and header
        footer = soup.find('footer')
        if footer:
            # Remove divs that have multiple links beyond just page navigation
            [_.decompose() for _ in footer.find_all('div', recursive=False) if len(_.find_all('a', href=True)) > 2]

        header = soup.find('header')
        if header:
            header.decompose()

        return soup

    def remove_ads(self):
        if not self.main_divs:
            return

        for div in [_ for _ in self.main_divs.find_all('div', recursive=True)]:
            has_ad = len([_ for _ in div.find_all('span', recursive=True) if has_ad_content(_.text)])
            _ = div.decompose() if has_ad else None

    def fix_question_section(self):
        if not self.main_divs:
            return

        question_divs = [_ for _ in self.main_divs.find_all('div', recursive=False) if len(_.find_all('h2')) > 0]
        for question_div in question_divs:
            questions = [_ for _ in question_div.find_all('div', recursive=True) if _.text.endswith('?')]
            for question in questions:
                question['style'] = 'padding: 10px; font-style: italic;'

    def update_element_src(self, element, mime):
        element_src = element['src']
        if element_src.startswith('//'):
            element_src = 'https:' + element_src
        elif element_src.startswith(LOGO_URL):
            # Re-brand with Whoogle logo
            element['src'] = '/static/img/logo.png'
            element['style'] = 'height:40px;width:162px'
            return
        elif element_src.startswith(GOOG_IMG):
            element['src'] = BLANK_B64
            return

        element['src'] = '/element?url=' + self.encrypt_path(element_src, is_element=True) + \
                         '&type=' + urlparse.quote(mime)
        # TODO: Non-mobile image results link to website instead of image
        # if not self.mobile:
        # img.append(BeautifulSoup(FULL_RES_IMG.format(element_src), 'html.parser'))

    def update_styling(self, soup):
        # Remove unnecessary button(s)
        for button in soup.find_all('button'):
            button.decompose()

        # Remove svg logos
        for svg in soup.find_all('svg'):
            svg.decompose()

        # Update logo
        logo = soup.find('a', {'class': 'l'})
        if logo and self.mobile:
            logo['style'] = 'display:flex; justify-content:center; align-items:center; color:#685e79; ' \
                            'font-size:18px; '

        # Fix search bar length on mobile
        try:
            search_bar = soup.find('header').find('form').find('div')
            search_bar['style'] = 'width: 100%;'
        except AttributeError:
            pass

    def update_link(self, link):
        # Replace href with only the intended destination (no "utm" type tags)
        href = link['href'].replace('https://www.google.com', '')
        if '/advanced_search' in href or 'tbm=shop' in href:
            # TODO: The "Shopping" tab requires further filtering (see #136)
            # Temporarily removing all links to that tab for now.
            link.decompose()
            return
        elif self.new_tab:
            link['target'] = '_blank'

        result_link = urlparse.urlparse(href)
        query_link = parse_qs(result_link.query)['q'][0] if '?q=' in href else ''

        if query_link.startswith('/'):
            # Internal google links (i.e. mail, maps, etc) should still be forwarded to Google
            link['href'] = 'https://google.com' + query_link
        elif '/search?q=' in href:
            # "li:1" implies the query should be interpreted verbatim, so we wrap it in double quotes
            if 'li:1' in href:
                query_link = '"' + query_link + '"'
            new_search = '/search?q=' + self.encrypt_path(query_link)

            query_params = parse_qs(urlparse.urlparse(href).query)
            for param in VALID_PARAMS:
                param_val = query_params[param][0] if param in query_params else ''
                new_search += '&' + param + '=' + param_val
            link['href'] = new_search
        elif 'url?q=' in href:
            # Strip unneeded arguments
            link['href'] = filter_link_args(query_link)

            # Add no-js option
            if self.nojs:
                gen_nojs(link)
        else:
            link['href'] = href

        # Replace link location if "alts" config is enabled
        if self.alt_redirect:
            # Search and replace all link descriptions with alternative location
            link['href'] = get_site_alt(link['href'])
            link_desc = link.find_all(text=re.compile('|'.join(SITE_ALTS.keys())))
            if len(link_desc) == 0:
                return

            # Replace link destination
            link_desc[0].replace_with(get_site_alt(link_desc[0]))
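
A minimal sketch of driving this Filter, assuming the project's module-level imports (ResultSet, Fernet, etc.) are in place; it exercises only the cosmetic reskin path, with freshly generated Fernet keys standing in for the real per-user keys:

from cryptography.fernet import Fernet

user_keys = {
    'text_key': Fernet.generate_key(),
    'element_key': Fernet.generate_key(),
}
f = Filter(user_keys, mobile=False, config={'dark': True})

# Dark mode swaps the light palette for a dark one
print(f.reskin('<body style="background:#fff">hi</body>'))
# -> <body style="background:#000">hi</body>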
def __get_headline_elem(article: ResultSet):
    headline_elem = article.find('h3')
    return headline_elem
Example #17
class Filter:
    def __init__(self, user_keys: dict, mobile=False, config=None):
        if config is None:
            config = {}

        self.near = config['near'] if 'near' in config else ''
        self.dark = config['dark'] if 'dark' in config else False
        self.nojs = config['nojs'] if 'nojs' in config else False
        self.new_tab = config['new_tab'] if 'new_tab' in config else False
        self.mobile = mobile
        self.user_keys = user_keys
        self.main_divs = ResultSet('')
        self._elements = 0

    def __getitem__(self, name):
        return getattr(self, name)

    @property
    def elements(self):
        return self._elements

    def reskin(self, page):
        # Aesthetic only re-skinning
        page = page.replace('>G<', '>Wh<')
        pattern = re.compile('4285f4|ea4335|fbcc05|34a853|fbbc05',
                             re.IGNORECASE)
        page = pattern.sub('685e79', page)
        if self.dark:
            page = page.replace('fff', '000').replace('202124', 'ddd').replace(
                '1967D2', '3b85ea')

        return page

    def encrypt_path(self, msg, is_element=False):
        # Encrypts path to avoid plaintext results in logs
        if is_element:
            # Element paths are tracked differently in order for the element key to be regenerated
            # once all elements have been loaded
            enc_path = Fernet(self.user_keys['element_key']).encrypt(
                msg.encode()).decode()
            self._elements += 1
            return enc_path

        return Fernet(self.user_keys['text_key']).encrypt(
            msg.encode()).decode()

    def clean(self, soup):
        self.main_divs = soup.find('div', {'id': 'main'})
        self.remove_ads()
        self.fix_question_section()
        self.update_styling(soup)

        for img in [_ for _ in soup.find_all('img') if 'src' in _.attrs]:
            self.update_element_src(img, 'image/png')

        for audio in [_ for _ in soup.find_all('audio') if 'src' in _.attrs]:
            self.update_element_src(audio, 'audio/mpeg')

        for link in soup.find_all('a', href=True):
            self.update_link(link)

        input_form = soup.find('form')
        if input_form is not None:
            input_form['method'] = 'POST'

        # Ensure no extra scripts passed through
        for script in soup('script'):
            script.decompose()

        # Update default footer and header
        footer = soup.find('footer')
        if footer:
            # Remove divs that have multiple links beyond just page navigation
            [
                _.decompose() for _ in footer.find_all('div', recursive=False)
                if len(_.find_all('a', href=True)) > 2
            ]

        header = soup.find('header')
        if header:
            header.decompose()

        return soup

    def remove_ads(self):
        if not self.main_divs:
            return

        for div in [_ for _ in self.main_divs.find_all('div', recursive=True)]:
            has_ad = len([
                _ for _ in div.find_all('span', recursive=True)
                if has_ad_content(_.text)
            ])
            _ = div.decompose() if has_ad else None

    def fix_question_section(self):
        if not self.main_divs:
            return

        question_divs = [
            _ for _ in self.main_divs.find_all('div', recursive=False)
            if len(_.find_all('h2')) > 0
        ]
        for question_div in question_divs:
            questions = [
                _ for _ in question_div.find_all('div', recursive=True)
                if _.text.endswith('?')
            ]
            for question in questions:
                question['style'] = 'padding: 10px; font-style: italic;'

    def update_element_src(self, element, mime):
        element_src = element['src']
        if element_src.startswith('//'):
            element_src = 'https:' + element_src
        elif element_src.startswith(LOGO_URL):
            # Re-brand with Whoogle logo
            element['src'] = '/static/img/logo.png'
            element['style'] = 'height:40px;width:162px'
            return
        elif element_src.startswith(GOOG_IMG):
            element['src'] = BLANK_B64
            return

        element['src'] = '/element?url=' + self.encrypt_path(element_src, is_element=True) + \
                         '&type=' + urlparse.quote(mime)
        # TODO: Non-mobile image results link to website instead of image
        # if not self.mobile:
        # img.append(BeautifulSoup(FULL_RES_IMG.format(element_src), 'html.parser'))

    def update_styling(self, soup):
        # Remove unnecessary button(s)
        for button in soup.find_all('button'):
            button.decompose()

        # Remove svg logos
        for svg in soup.find_all('svg'):
            svg.decompose()

        # Update logo
        logo = soup.find('a', {'class': 'l'})
        if logo and self.mobile:
            logo['style'] = 'display:flex; justify-content:center; align-items:center; color:#685e79; ' \
                            'font-size:18px; '

        # Fix search bar length on mobile
        try:
            search_bar = soup.find('header').find('form').find('div')
            search_bar['style'] = 'width: 100%;'
        except AttributeError:
            pass

        # Set up dark mode if active
        if self.dark:
            soup.find(
                'html'
            )['style'] = 'scrollbar-color: #333 #111;color:#fff !important;background:#000 !important'
            for input_element in soup.findAll('input'):
                input_element['style'] = 'color:#fff;background:#000;'

            for span_element in soup.findAll('span'):
                span_element['style'] = 'color: white;'

            for href_element in soup.findAll('a'):
                href_element['style'] = 'color: white' if href_element[
                    'href'].startswith('/search') else ''

    def update_link(self, link):
        # Replace href with only the intended destination (no "utm" type tags)
        href = link['href'].replace('https://www.google.com', '')
        if '/advanced_search' in href:
            link.decompose()
            return
        elif self.new_tab:
            link['target'] = '_blank'

        result_link = urlparse.urlparse(href)
        query_link = parse_qs(
            result_link.query)['q'][0] if '?q=' in href else ''

        if query_link.startswith('/'):
            link['href'] = 'https://google.com' + query_link
        elif '/search?q=' in href:
            new_search = '/search?q=' + self.encrypt_path(query_link)

            query_params = parse_qs(urlparse.urlparse(href).query)
            for param in VALID_PARAMS:
                param_val = query_params[param][
                    0] if param in query_params else ''
                new_search += '&' + param + '=' + param_val
            link['href'] = new_search
        elif 'url?q=' in href:
            # Strip unneeded arguments
            link['href'] = filter_link_args(query_link)

            # Add no-js option
            if self.nojs:
                gen_nojs(link)
        else:
            link['href'] = href
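
The palette substitution in this version's reskin, isolated as a standalone snippet (the sample string is made up):

import re

# Google's brand colors are rewritten to a single accent color,
# case-insensitively, exactly as reskin does above
pattern = re.compile('4285f4|ea4335|fbcc05|34a853|fbbc05', re.IGNORECASE)
print(pattern.sub('685e79', 'fill:#4285F4;stroke:#EA4335'))
# -> fill:#685e79;stroke:#685e79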
def __get_headline_elem(article: ResultSet):
    headline_elem = article.find(class_='headline', recursive=True)
    return headline_elem
Example #19
def __init__(self, user_key: str, config: Config, mobile=False) -> None:
    self.config = config
    self.mobile = mobile
    self.user_key = user_key
    self.main_divs = ResultSet('')
    self._elements = 0
Example #20
class Filter:
    # Limit used for determining if a result is a "regular" result or a list
    # type result (such as "people also asked", "related searches", etc)
    RESULT_CHILD_LIMIT = 7

    def __init__(self, user_key: str, config: Config, mobile=False) -> None:
        self.config = config
        self.mobile = mobile
        self.user_key = user_key
        self.main_divs = ResultSet('')
        self._elements = 0

    def __getitem__(self, name):
        return getattr(self, name)

    @property
    def elements(self):
        return self._elements

    def encrypt_path(self, path, is_element=False) -> str:
        # Encrypts path to avoid plaintext results in logs
        if is_element:
            # Element paths are encrypted separately from text, to allow key
            # regeneration once all items have been served to the user
            enc_path = Fernet(self.user_key).encrypt(path.encode()).decode()
            self._elements += 1
            return enc_path

        return Fernet(self.user_key).encrypt(path.encode()).decode()

    def clean(self, soup) -> BeautifulSoup:
        self.main_divs = soup.find('div', {'id': 'main'})
        self.remove_ads()
        self.remove_block_titles()
        self.remove_block_url()
        self.collapse_sections()
        self.update_styling(soup)

        for img in [_ for _ in soup.find_all('img') if 'src' in _.attrs]:
            self.update_element_src(img, 'image/png')

        for audio in [_ for _ in soup.find_all('audio') if 'src' in _.attrs]:
            self.update_element_src(audio, 'audio/mpeg')

        for link in soup.find_all('a', href=True):
            self.update_link(link)

        input_form = soup.find('form')
        if input_form is not None:
            input_form['method'] = 'GET' if self.config.get_only else 'POST'

        # Ensure no extra scripts passed through
        for script in soup('script'):
            script.decompose()

        # Update default footer and header
        footer = soup.find('footer')
        if footer:
            # Remove divs that have multiple links beyond just page navigation
            [
                _.decompose() for _ in footer.find_all('div', recursive=False)
                if len(_.find_all('a', href=True)) > 3
            ]

        header = soup.find('header')
        if header:
            header.decompose()

        return soup

    def remove_ads(self) -> None:
        """Removes ads found in the list of search result divs

        Returns:
            None (The soup object is modified directly)
        """
        if not self.main_divs:
            return

        for div in [_ for _ in self.main_divs.find_all('div', recursive=True)]:
            div_ads = [
                _ for _ in div.find_all('span', recursive=True)
                if has_ad_content(_.text)
            ]
            _ = div.decompose() if len(div_ads) else None

    def remove_block_titles(self) -> None:
        if not self.main_divs or not self.config.block_title:
            return
        block_title = re.compile(self.config.block_title)
        for div in [_ for _ in self.main_divs.find_all('div', recursive=True)]:
            block_divs = [
                _ for _ in div.find_all('h3', recursive=True)
                if block_title.search(_.text) is not None
            ]
            _ = div.decompose() if len(block_divs) else None

    def remove_block_url(self) -> None:
        if not self.main_divs or not self.config.block_url:
            return
        block_url = re.compile(self.config.block_url)
        for div in [_ for _ in self.main_divs.find_all('div', recursive=True)]:
            block_divs = [
                _ for _ in div.find_all('a', recursive=True)
                if block_url.search(_.attrs['href']) is not None
            ]
            _ = div.decompose() if len(block_divs) else None

    def collapse_sections(self) -> None:
        """Collapses long result sections ("people also asked", "related
         searches", etc) into "details" elements

        These sections are typically the only sections in the results page that
        have more than ~5 child divs within a primary result div.

        Returns:
            None (The soup object is modified directly)
        """
        minimal_mode = read_config_bool('WHOOGLE_MINIMAL')

        def pull_child_divs(result_div: BeautifulSoup):
            try:
                return result_div.findChildren(
                    'div', recursive=False)[0].findChildren('div',
                                                            recursive=False)
            except IndexError:
                return []

        if not self.main_divs:
            return

        # Loop through results and check for the number of child divs in each
        for result in self.main_divs:
            result_children = pull_child_divs(result)
            if minimal_mode:
                if len(result_children) in (1, 3):
                    continue
            else:
                if len(result_children) < self.RESULT_CHILD_LIMIT:
                    continue

            # Find and decompose the first element with an inner HTML text val.
            # This typically extracts the title of the section (i.e. "Related
            # Searches", "People also ask", etc)
            label = 'Collapsed Results'
            for elem in result_children:
                if elem.text:
                    label = elem.text
                    elem.decompose()
                    break

            # Create the new details element to wrap around the result's
            # first parent
            parent = None
            idx = 0
            while not parent and idx < len(result_children):
                parent = result_children[idx].parent
                idx += 1

            details = BeautifulSoup(features='html.parser').new_tag('details')
            summary = BeautifulSoup(features='html.parser').new_tag('summary')
            summary.string = label
            details.append(summary)

            if parent and not minimal_mode:
                parent.wrap(details)
            elif parent and minimal_mode:
                # Remove parent element from document if "minimal mode" is
                # enabled
                parent.decompose()

    def update_element_src(self, element: Tag, mime: str) -> None:
        """Encrypts the original src of an element and rewrites the element src
        to use the "/element?src=" pass-through.

        Returns:
            None (The soup element is modified directly)

        """
        src = element['src']

        if src.startswith('//'):
            src = 'https:' + src

        if src.startswith(LOGO_URL):
            # Re-brand with Whoogle logo
            element.replace_with(
                BeautifulSoup(render_template('logo.html'),
                              features='html.parser'))
            return
        elif src.startswith(GOOG_IMG) or GOOG_STATIC in src:
            element['src'] = BLANK_B64
            return

        element['src'] = f'{Endpoint.element}?url=' + self.encrypt_path(
            src, is_element=True) + '&type=' + urlparse.quote(mime)

    def update_styling(self, soup) -> None:
        # Remove unnecessary button(s)
        for button in soup.find_all('button'):
            button.decompose()

        # Remove svg logos
        for svg in soup.find_all('svg'):
            svg.decompose()

        # Update logo
        logo = soup.find('a', {'class': 'l'})
        if logo and self.mobile:
            logo['style'] = ('display:flex; justify-content:center; '
                             'align-items:center; color:#685e79; '
                             'font-size:18px; ')

        # Fix search bar length on mobile
        try:
            search_bar = soup.find('header').find('form').find('div')
            search_bar['style'] = 'width: 100%;'
        except AttributeError:
            pass

    def update_link(self, link: Tag) -> None:
        """Update internal link paths with encrypted path, otherwise remove
        unnecessary redirects and/or marketing params from the url

        Args:
            link: A bs4 Tag element to inspect and update

        Returns:
            None (the tag is updated directly)

        """
        # Replace href with only the intended destination (no "utm" type tags)
        href = link['href'].replace('https://www.google.com', '')
        if 'advanced_search' in href or 'tbm=shop' in href:
            # FIXME: The "Shopping" tab requires further filtering (see #136)
            # Temporarily removing all links to that tab for now.
            link.decompose()
            return

        result_link = urlparse.urlparse(href)
        q = extract_q(result_link.query, href)

        if q.startswith('/'):
            # Internal google links (i.e. mail, maps, etc) should still
            # be forwarded to Google
            link['href'] = 'https://google.com' + q
        elif '/search?q=' in href:
            # "li:1" implies the query should be interpreted verbatim,
            # which is accomplished by wrapping the query in double quotes
            if 'li:1' in href:
                q = '"' + q + '"'
            new_search = 'search?q=' + self.encrypt_path(q)

            query_params = parse_qs(urlparse.urlparse(href).query)
            for param in VALID_PARAMS:
                if param not in query_params:
                    continue
                param_val = query_params[param][0]
                new_search += '&' + param + '=' + param_val
            link['href'] = new_search
        elif 'url?q=' in href:
            # Strip unneeded arguments
            link['href'] = filter_link_args(q)

            # Add no-js option
            if self.config.nojs:
                append_nojs(link)

            if self.config.new_tab:
                link['target'] = '_blank'
        else:
            if href.startswith(MAPS_URL):
                # Maps links don't work if a site filter is applied
                link['href'] = MAPS_URL + "?q=" + clean_query(q)
            else:
                link['href'] = href

        # Replace link location if "alts" config is enabled
        if self.config.alts:
            # Search and replace all link descriptions
            # with alternative location
            link['href'] = get_site_alt(link['href'])
            link_desc = link.find_all(
                text=re.compile('|'.join(SITE_ALTS.keys())))
            if len(link_desc) == 0:
                return

            # Replace link description
            link_desc = link_desc[0]
            for site, alt in SITE_ALTS.items():
                if site not in link_desc:
                    continue
                new_desc = BeautifulSoup(features='html.parser').new_tag('div')
                new_desc.string = str(link_desc).replace(site, alt)
                link_desc.replace_with(new_desc)
                break

    def view_image(self, soup) -> BeautifulSoup:
        """Replaces the soup with a new one that handles mobile results and
        adds the link of the image full res to the results.

        Args:
            soup: A BeautifulSoup object containing the image mobile results.

        Returns:
            BeautifulSoup: The new BeautifulSoup object
        """

        # get some tags that are unchanged between mobile and pc versions
        search_input = soup.find_all('td', attrs={'class': "O4cRJf"})[0]
        search_options = soup.find_all('div', attrs={'class': "M7pB2"})[0]
        cor_suggested = soup.find_all('table', attrs={'class': "By0U9"})
        next_pages = soup.find_all('table', attrs={'class': "uZgmoc"})[0]
        information = soup.find_all('div', attrs={'class': "TuS8Ad"})[0]

        results = []
        # find results div
        results_div = soup.find_all('div', attrs={'class': "nQvrDb"})[0]
        # find all the results
        results_all = results_div.find_all('div', attrs={'class': "lIMUZd"})

        for item in results_all:
            urls = item.find('a')['href'].split('&imgrefurl=')

            # Skip urls that are not two-element lists
            if len(urls) != 2:
                continue

            img_url = urlparse.unquote(urls[0].replace(
                f'/{Endpoint.imgres}?imgurl=', ''))

            try:
                # Try to strip out only the necessary part of the web page link
                web_page = urlparse.unquote(urls[1].split('&')[0])
            except IndexError:
                web_page = urlparse.unquote(urls[1])

            img_tbn = urlparse.unquote(item.find('a').find('img')['src'])

            results.append({
                'domain': urlparse.urlparse(web_page).netloc,
                'img_url': img_url,
                'web_page': web_page,
                'img_tbn': img_tbn
            })

        soup = BeautifulSoup(render_template('imageresults.html',
                                             length=len(results),
                                             results=results,
                                             view_label="View Image"),
                             features='html.parser')
        # replace search input object
        soup.find_all('td', attrs={'class':
                                   "O4cRJf"})[0].replaceWith(search_input)
        # replace search options object (All, Images, Videos, etc.)
        soup.find_all('div', attrs={'class':
                                    "M7pB2"})[0].replaceWith(search_options)
        # replace correction suggested by google object if exists
        if len(cor_suggested):
            soup.find_all('table',
                          attrs={'class':
                                 "By0U9"})[0].replaceWith(cor_suggested[0])
        # replace next page object at the bottom of the page
        soup.find_all('table', attrs={'class':
                                      "uZgmoc"})[0].replaceWith(next_pages)
        # replace information about user connection at the bottom of the page
        soup.find_all('div', attrs={'class':
                                    "TuS8Ad"})[0].replaceWith(information)
        return soup
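
The details/summary wrapping performed by collapse_sections, reduced to a standalone bs4 sketch with made-up markup:

from bs4 import BeautifulSoup

soup = BeautifulSoup('<div id="s"><p>People also ask</p></div>', 'html.parser')
details = BeautifulSoup(features='html.parser').new_tag('details')
summary = BeautifulSoup(features='html.parser').new_tag('summary')
summary.string = 'Collapsed Results'
details.append(summary)

# wrap() replaces the div with <details> and re-inserts it after the summary
soup.find('div', id='s').wrap(details)
print(soup)
# -> <details><summary>Collapsed Results</summary><div id="s">...</div></details>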
Example #21
def __get_item_url(article: ResultSet):
    return article.find('a', class_="story__link")['href']

def __get_item_author(article: ResultSet):
    byline = article.find(class_='byline')
    if byline is not None:
        return byline.find('a').getText().strip()
    else:
        return None

def __get_item_url(article: ResultSet):
    headline = TheAustralianNewsItemParser.__get_headline_elem(article)
    if headline is None:
        return article.find('a')['href']
    else:
        return headline.find('a')['href']

def __get_topic_text(article: ResultSet):
    topic = article.find(class_="story-block__kicker")
    if topic is not None:
        return topic.getText().strip()
    else:
        return None
Example #25
def __get_headline_text(article: ResultSet):
    return article.find(class_="story__headline__text").getText().strip()
Example #26
def get_cells(row: element.ResultSet) -> List[str]:
    """
    Gets the text of all th and td cells within a single tr element
    """
    return [el.text for el in row.find_all(['th', 'td'])]
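
A quick check of get_cells on a hypothetical table row:

from bs4 import BeautifulSoup

row = BeautifulSoup(
    "<tr><th>Name</th><td>Ada</td><td>36</td></tr>", "html.parser"
).find("tr")
print(get_cells(row))  # -> ['Name', 'Ada', '36']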
def __get_headline_text(article: ResultSet):
    headline_elem = AbcNewsItemParser.__get_headline_elem(article)
    # Not all articles use a common headline element; fall back to the link text
    if headline_elem is None:
        headline_elem = article.find('a')
    return headline_elem.getText().strip()
def __get_topic_text(article: ResultSet):
    topic = article.find(class_="topic__string")
    if topic is not None:
        return topic.getText().strip()
    else:
        return None
Example #29
def _find_advert_anhor(self, div: element.ResultSet) -> element.Tag:
    """Searches for an anchor tag in the given div."""
    return div.find('a', {'href': True, 'class': True, 'title': False})
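
Finally, the attribute filter this helper relies on, shown standalone: in bs4, True matches tags that have the attribute and False matches tags that lack it (the markup is made up):

from bs4 import BeautifulSoup

div = BeautifulSoup(
    '<div>'
    '<a href="/ad" class="promo">advert</a>'
    '<a href="/item" class="card" title="Item">item</a>'
    '</div>', 'html.parser').find('div')

# Matches the anchor with href and class but no title attribute
print(div.find('a', {'href': True, 'class': True, 'title': False}))
# -> <a class="promo" href="/ad">advert</a>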