示例#1
0
class WebpageSummarizer(object):
    """
    Generates summary of a given web page.

    The same instance can be used for several pages: the browser is closed
    after every call and transparently replaced on the next one.
    """
    def __init__(self):
        self.browser = self._new_browser()

    @staticmethod
    def _new_browser():
        """
        Builds a browser that sends the desktop Firefox headers used for scraping.
        :return: A freshly configured StatefulBrowser.
        """
        browser = StatefulBrowser(user_agent='Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:73.0) Gecko/20100101 Firefox/73.0')
        browser.session.headers.update({'Upgrade-Insecure-Requests': '1'})
        return browser

    def summarize_webpage(self, url, summarization_ratio):
        """
        Takes a web page URL and returns the title and a summary of the web page.

        Any error while fetching or parsing the page is printed, not raised;
        whatever was collected up to that point is returned.

        :param url: Web page URL.
        :param summarization_ratio: Fraction of original text to include in the summary.
        :return: Web page title and summarized web page text. The title is ''
                 when the page has no <title>; the text is the full paragraph
                 text when no summary could be produced, and '' when the page
                 could not be read at all.
        """
        title = summarized_text = ''
        try:
            # close() (see the finally clause) discards the browser's session,
            # so a browser closed by a previous call has to be replaced before
            # another page can be opened.
            if self.browser.session is None:
                self.browser = self._new_browser()

            self.browser.open(url)
            page = self.browser.get_current_page()

            # A page without a <title> tag should not cost us its text.
            if page.title is not None:
                title = page.title.text.strip()

            # Find all the paragraphs because they contain the main web page text
            page_text = ' '.join(p.text for p in page.find_all('p'))

            # Fall back to the full text whenever no summary can be produced.
            summarized_text = page_text
            if page_text.strip():
                try:
                    summary = summarize(page_text, ratio=summarization_ratio).strip()
                except ValueError as e:
                    # NOTE(review): assumes summarize() signals unusable input
                    # (e.g. text that is too short) with ValueError - confirm
                    # against the summarizer actually imported by this module.
                    print(e)
                    summary = ''
                if summary:
                    summarized_text = summary
        except Exception as e:
            print(e)
        finally:
            self.browser.close()

        return title, summarized_text
示例#2
0
def scrape_HTML(url):
    """Scrapes the HTML from W4MPJobs.

    Opens ``url``, sets three radio-button groups on the first form of the
    page, submits that form and returns the text of the response. The browser
    is closed even when one of those steps fails.

    :param url: URL of the page holding the search form.
    :return: Body of the response to the submitted form, as text.
    """
    # One entry per radio group; each is applied with its own set_radio call.
    radio_choices = {
        # Number of results: select "all"
        "ctl00$MainContent$RadioButtonList2": 9999,
        # Salary: select NMW or more
        "ctl00$MainContent$rblSalary": "nmwormore",
        # Location: select outside London (other values: "inlondon", "both")
        "ctl00$MainContent$rblJobs": "outside",
    }

    browser = StatefulBrowser()
    try:
        page = browser.open(url)
        form = Form(page.soup.form)

        for field_name, value in radio_choices.items():
            form.set_radio({field_name: value})

        # Submits the form and reads the response as text before the browser
        # is closed by the finally clause
        response = browser.submit(form, page.url)
        return response.text
    finally:
        # Closes the browser, also when opening, filling or submitting raised
        browser.close()
示例#3
0
class Session:
    """
    Browser-backed session for logging in to Facebook and reading profile
    and search pages.

    ``browser_wrapper`` is the object used to perform requests; it must expose
    ``open(browser, url)`` and ``submit_selected(browser)``.

    The session can be used as a context manager; leaving the ``with`` block
    (or garbage collection of the object) releases the browser.
    """
    BASE_URL = 'https://m.facebook.com'

    def __init__(self, browser_wrapper):
        self._connected = False
        self._current_html = None
        self._browser_wrapper = browser_wrapper
        self._browser = StatefulBrowser()
        # Request headers live on the browser's underlying session; assigning
        # an ``addHeaders`` attribute on the browser has no effect.
        self._browser.session.headers.update({
            'User-Agent': 'Firefox',
            'Accept-Language': 'en-US,en;q=0.5',
        })

    def __del__(self):
        self._dispose()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self._dispose()

    @property
    def connected(self):
        """True once a log in has been confirmed and until log out."""
        return self._connected

    def log_in(self, username, password):
        """
        Submit the login form with the given credentials.

        The session is marked as connected only if the resulting page contains
        the search form; when it does not, no error is raised and
        ``connected`` stays False.

        :param username: Value for the ``email`` form field.
        :param password: Value for the ``pass`` form field.
        :return: This session, to allow chaining.
        :raises LogInError: If opening the page or submitting the form fails.
        """
        try:
            # Log in to non-mobile site is more reliable
            self._browser_wrapper.open(self._browser,
                                       'https://www.facebook.com')
            self._browser.select_form('form[id="login_form"]')
            self._browser['email'] = username
            self._browser['pass'] = password
            self._browser_wrapper.submit_selected(self._browser)
            # Check if we really are in account profile page
            if self._browser.get_current_page().find('form',
                                                     action='/search/top/'):
                self._connected = True
        except Exception as exc:
            # Keep the original failure attached for debugging
            raise LogInError(f'Unable to log in as {username}') from exc
        return self

    def log_out(self):
        """Close the browser and mark the session as disconnected."""
        if self._connected:
            self._browser.close()
            self._connected = False

    def profile_info(self, id_):
        """
        Retrieve informations for a given profile.

        :param id_: Profile identifier appended to the base URL.
        :return: Tuple ``(name, image, info)``, or None if the page could not
                 be fetched or parsed.
        :raises NotConnectedError: If the session is not logged in.
        """
        self._ensure_connected()
        try:
            self._browser_wrapper.open(self._browser,
                                       f'{Session.BASE_URL}/{id_}')
            page = self._browser.get_current_page()
            name = self._sanitize_title(page.find('title').text)
            image = parse_image(self._browser.get_current_page(), name)
            info = parse_info(self._browser.get_current_page())
            return name, image, info
        except Exception:
            return None

    def search(self, query):
        """
        Execute search of a given text returning a tuple with ID,
        descriptions and URI.

        Uses the authenticated search page when connected and the public
        directory otherwise.

        :param query: Free text; words are joined with ``+`` in the URL.
        :return: Result of ``parse_search`` for the result page, or None if
                 the page could not be fetched or parsed.
        """
        from urllib.parse import quote

        # Percent-encode each word so characters such as '&' or '#' cannot
        # alter the structure of the URL.
        url_query = '+'.join(quote(term, safe='') for term in query.split())
        url_path = f'/search/top/?q={url_query}' \
            if self._connected else f'/public/{url_query}'
        try:
            # url_path already carries the query; it must not be appended again
            self._browser_wrapper.open(
                self._browser, f'{Session.BASE_URL}{url_path}')
            return parse_search(self._browser.get_current_page(),
                                Session.BASE_URL)
        except Exception:
            return None

    def _ensure_connected(self):
        """Raise NotConnectedError unless the session is logged in."""
        if not self._connected:
            raise NotConnectedError('No active connection or required login')

    def _sanitize_title(self, title):
        """Return the part of a page title before the ' - ' separator."""
        # Handle cases like 'Some One - Home'. Only a dash surrounded by
        # spaces is a separator, so hyphenated names are kept whole.
        if ' - ' in title:
            return title.split(' - ')[0].strip()
        return title

    def _dispose(self):
        """Release the browser, whether or not the session was logged in."""
        # getattr: __del__ may run on an object whose __init__ did not finish
        if getattr(self, '_connected', False):
            self.log_out()
            return
        browser = getattr(self, '_browser', None)
        if browser is not None:
            browser.close()
示例#4
0
from mechanicalsoup import StatefulBrowser

# Demo: browse python.org, follow a link, submit the search form and list the
# links of the result page.
browser = StatefulBrowser()

try:
    browser.open("http://www.python.org/")

    # Follow the link whose URL matches "/blogs/" and show where we landed
    browser.follow_link("/blogs/")
    #browser.follow_link(text="Python News")
    print(browser.get_url())

    # Select the page's form and print a summary of it
    browser.select_form()
    browser.get_current_form().print_summary()

    # Fill in the "q" field and submit the selected form
    browser["q"] = "Raymond Hettinger"
    browser.submit_selected()

    print(browser.get_url())
    print("-" * 40)
    print(browser.links())
finally:
    # Close the browser even if one of the steps above raised
    browser.close()