def crawl_thread(thread: AttrDict, browser: Browser, page_size: int = 15) -> list:
    """Collect the parsed posts from every page of a forum thread.

    The thread's first page is fetched once to count pages: the count is the
    number of ``.pagination ul li`` elements minus one, with a floor of one.
    Each page is then requested with ``&start=<offset>`` appended to
    ``thread.url`` and every ``.post`` element is passed to
    ``parse_thread_page``.

    Args:
        thread: Object exposing the thread address as ``thread.url``.
        browser: Object whose ``get(url)`` returns a page with a ``soup``.
        page_size: Number of posts per page, used to compute the offsets.

    Returns:
        A list with one ``parse_thread_page`` result per post, in page order.
    """
    first_page = browser.get(thread.url)
    pagination_items = first_page.soup.select('.pagination ul li')
    page_count = len(pagination_items) - 1
    if page_count < 1:
        # Threads without pagination still have exactly one page.
        page_count = 1

    posts = []
    for page_index in range(page_count):
        offset = page_size * page_index
        response = browser.get(thread.url + '&start=' + str(offset))
        for post in response.soup.select('.post'):
            posts.append(parse_thread_page(post))
    return posts
Пример #2
0
def login(username, password):
    """Return a logged-in ``Browser`` for Kaggle, reusing a cached session if possible.

    A pickled session is looked up at ``~/<CONFIG_DIR_NAME>/browser.pickle``.
    When it exists and was stored for the same username and password, the
    cached browser is returned without any network traffic. Otherwise the
    Kaggle login form is filled in and submitted, and the resulting browser
    is pickled for next time.

    Args:
        username: Kaggle user name.
        password: Kaggle password.

    Returns:
        The authenticated browser object.

    Side effects:
        On a failed login the error text is printed and the process exits
        with status 1 (``sys.exit(1)``).
    """
    config_dir_path = os.path.join(os.path.expanduser('~'), CONFIG_DIR_NAME)
    pickle_path = os.path.join(config_dir_path, 'browser.pickle')
    if os.path.isfile(pickle_path):
        # NOTE(security): pickle.load can execute arbitrary code from the file
        # and the cache stores the password in clear text. Only the 0o700
        # directory mode below protects it -- review before wider use.
        try:
            with open(pickle_path, 'rb') as file:
                data = pickle.load(file)
            if data['username'] == username and \
                    data['password'] == password:
                return data['browser']
        except Exception:
            # The cache is best-effort: an unreadable or stale file just means
            # a fresh login. Unlike a bare ``except:``, this no longer swallows
            # KeyboardInterrupt / SystemExit.
            pass

    login_url = 'https://www.kaggle.com/account/login'
    browser = Browser()

    login_page = browser.get(login_url)
    login_form = login_page.soup.select("#login-account")[0]
    login_form.select("#UserName")[0]['value'] = username
    login_form.select("#Password")[0]['value'] = password
    login_result = browser.submit(login_form, login_page.url)
    # Landing on the login URL again is treated as a failed login.
    if login_result.url == login_url:
        error = (login_result.soup.select(
            '#standalone-signin .validation-summary-errors')[0].get_text())
        print('There was an error logging in: ' + error)
        sys.exit(1)

    if not os.path.isdir(config_dir_path):
        os.mkdir(config_dir_path, 0o700)

    with open(pickle_path, 'wb') as f:
        pickle.dump(
            dict(username=username, password=password, browser=browser), f)

    return browser
def get_links(start_url: str, browser: Browser, num_pages: int = 1, page_size: int = 50) -> list:
    """Gather topic links from consecutive listing pages.

    Each page is requested as ``start_url + '&start=<offset>'`` with offsets
    ``0, page_size, 2 * page_size, ...``. Every ``.topiclist.topics .row``
    element is passed to ``parse_link`` together with the base address, which
    is ``start_url`` with its last ``/``-separated segment removed.

    Args:
        start_url: Address of the first listing page.
        browser: Object whose ``get(url)`` returns a page with a ``soup``.
        num_pages: How many listing pages to visit.
        page_size: Number of rows per page, used to compute the offsets.

    Returns:
        A list of ``parse_link`` results in page order.
    """
    base = "/".join(start_url.split('/')[:-1])
    collected = []
    for page_index in range(num_pages):
        listing = browser.get(start_url + '&start=' + str(page_size * page_index))
        rows = listing.soup.select('.topiclist.topics .row')
        collected.extend(parse_link(row, base) for row in rows)
    return collected
Пример #4
0
def search(url, keyword, cloudflare_needed=False):
    """Submit *keyword* through the first search-looking form on *url*.

    The page is fetched and the first ``<form>`` whose markup mentions
    "search"/"Search" is chosen. Its text input (or, failing that, its
    ``type="search"`` input) receives the keyword. Submit controls that
    mention search are named ``'Search'``, the others get an empty name, and
    the form is submitted with the ``'Search'`` control.

    Args:
        url: Address of the page containing the search form.
        keyword: Text to search for.
        cloudflare_needed: When true, the browser wraps ``create_scraper()``.

    Returns:
        The ``text`` of the response to the form submission.
    """
    session = Browser(create_scraper()) if cloudflare_needed else Browser()
    landing = session.get(url)

    search_text = re.compile(r'[\s\S]*[Ss]earch[\s\S]*')

    def mentions_search(tag):
        # True when the tag's markup contains "search" or "Search".
        return bool(re.search(search_text, str(tag)))

    candidates = [f for f in landing.soup.find_all('form') if mentions_search(f)]
    form = candidates[0]

    search_input = form.find('input', attrs={'type': 'text'})
    if search_input is None:
        search_input = form.find('input', attrs={'type': 'search'})
    search_input['value'] = keyword

    for button in form.find_all(attrs={'type': 'submit'}):
        button['name'] = 'Search' if mentions_search(button) else ''

    # An action that already contains the page URL is used as-is; otherwise
    # it is appended to the page URL.
    action = form.attrs['action']
    target = action if url in action else url + action

    wrapped = Form(form)
    wrapped.choose_submit('Search')
    return session.submit(wrapped, target).text
def login(username, password):
    """Return a logged-in ``Browser`` for Kaggle, reusing a cached session if possible.

    A pickled session is looked up at ``~/.kaggle-cli/browser.pickle``. When
    it exists and was stored for the same username and password, the cached
    browser is returned without any network traffic. Otherwise the login page
    is fetched, its anti-forgery token is extracted and the credentials are
    posted. On success the browser is pickled for next time.

    Args:
        username: Kaggle user name.
        password: Kaggle password.

    Returns:
        The authenticated browser, or ``None`` when the login response
        contains an ``"errors"`` entry (its message is printed).

    Raises:
        AttributeError: If no anti-forgery token is found on the login page
            (``re.search`` returns ``None``).
    """
    config_dir_path = os.path.join(os.path.expanduser('~'), '.kaggle-cli')
    pickle_path = os.path.join(config_dir_path, 'browser.pickle')

    if os.path.isfile(pickle_path):
        # NOTE(security): pickle.load can execute arbitrary code from the file
        # and the cache stores the password in clear text. Only the 0o700
        # directory mode below protects it -- review before wider use.
        try:
            with open(pickle_path, 'rb') as file:
                data = pickle.load(file)
            if data['username'] == username and \
                    data['password'] == password:
                return data['browser']
        except Exception:
            # The cache is best-effort: an unreadable or stale file just means
            # a fresh login. Unlike a bare ``except:``, this no longer swallows
            # KeyboardInterrupt / SystemExit.
            pass

    browser = Browser()
    login_url = 'https://www.kaggle.com/account/login'

    login_page = browser.get(login_url)

    # Raw strings keep the patterns identical while avoiding the invalid
    # "\[" escape that newer Python versions warn about.
    token = re.search(
        r"antiForgeryToken: '(?P<token>.+)'",
        str(login_page.soup)
    ).group(1)

    login_result_page = browser.post(
        login_url,
        data={
            'username': username,
            'password': password,
            '__RequestVerificationToken': token
        }
    )

    error_match = re.search(
        r'"errors":\["(?P<error>.+)"\]',
        str(login_result_page.soup)
    )

    if error_match:
        print(error_match.group(1))
        return None

    if not os.path.isdir(config_dir_path):
        os.mkdir(config_dir_path, 0o700)

    with open(pickle_path, 'wb') as f:
        pickle.dump(dict(
            username=username, password=password, browser=browser
        ), f)

    return browser
Пример #6
0
class Kaggle:
    """Small client that logs in to Kaggle and downloads files through one ``Browser``."""

    def __init__(self):
        # One browser instance is shared by login and downloads.
        self.browser = Browser()

    def login(self, username, password):
        """Log in to Kaggle with the given credentials.

        The login page is fetched, its anti-forgery token is extracted and
        the credentials are posted back together with that token.

        Args:
            username: Kaggle user name.
            password: Kaggle password.

        Returns:
            ``self.browser`` on success, or ``None`` when the response
            contains an ``"errors"`` entry (its message is printed).

        Raises:
            AttributeError: If no anti-forgery token is found on the login
                page (``re.search`` returns ``None``).
        """
        login_url = 'https://www.kaggle.com/account/login'
        login_page = self.browser.get(login_url)

        # Raw strings keep the patterns identical while avoiding the invalid
        # "\[" escape that newer Python versions warn about.
        token = re.search(r"antiForgeryToken: '(?P<token>.+)'",
                          str(login_page.soup)).group(1)

        login_result_page = self.browser.post(
            login_url,
            data={
                'username': username,
                'password': password,
                '__RequestVerificationToken': token,
            })
        error_match = re.search(r'"errors":\["(?P<error>.+)"\]',
                                str(login_result_page.soup))
        if error_match:
            print(error_match.group(1))
            return None
        return self.browser

    def download_dataset(self, url, local_file):
        """Stream the resource at *url* into *local_file*.

        A HEAD request reads ``Content-Length`` first. When it is positive,
        an existing *local_file* is removed so the append-mode write starts
        from an empty file. The body is read in 1024-byte chunks when the
        length exceeds 1024 bytes and in 1-byte chunks otherwise.

        Args:
            url: Address of the file to download.
            local_file: Path of the file to write.

        Raises:
            KeyError: If the HEAD response has no ``Content-Length`` header.
        """
        headers = self.browser.request('head', url).headers
        content_length = int(headers['Content-Length'])
        if content_length > 0:
            target = Path(local_file)
            if target.is_file():
                # Drop the stale copy; the file is opened in append mode below.
                os.remove(local_file)

        # Kept separate from the loop variable below, which used to reuse
        # this name.
        chunk_size = 1024 if content_length > 1024 else 1

        stream = self.browser.get(url, stream=True)
        try:
            with open(local_file, 'ab') as f:
                for chunk in stream.iter_content(chunk_size=chunk_size):
                    if chunk:  # filter out keep-alive new chunks
                        f.write(chunk)
        finally:
            # Release the streamed connection even if writing fails midway.
            stream.close()
Пример #7
0
    def _login(self):
        """Log in to Kaggle with ``self.username`` / ``self.password``.

        The ``#login-account`` form on the login page is filled in and
        submitted. Ending up on the login URL again is treated as a failure.

        Returns:
            The ``Browser`` that performed the login.

        Raises:
            Exception: With the text of the first validation-summary element
                when the login fails.
        """
        login_url = 'https://www.kaggle.com/account/login'
        session = Browser()

        page = session.get(login_url)
        form = page.soup.select("#login-account")[0]
        user_field = form.select("#UserName")[0]
        user_field['value'] = self.username
        password_field = form.select("#Password")[0]
        password_field['value'] = self.password

        result = session.submit(form, page.url)
        if result.url != login_url:
            return session

        failures = result.soup.select('#standalone-signin .validation-summary-errors')
        raise Exception('There was an error logging in: ' + failures[0].get_text())
Пример #8
0
    def __get_login_browser(self, username, password):
        """Log in to Kaggle and return the authenticated ``Browser``.

        The ``#login-account`` form on the login page is filled in with the
        given credentials and submitted. Ending up on the login URL again is
        treated as a failure.

        Args:
            username: Kaggle user name.
            password: Kaggle password.

        Returns:
            The ``Browser`` that performed the login.

        Side effects:
            On a failed login the error text is printed and the process exits
            with status 1 (``sys.exit(1)``).
        """
        # The previously computed ``pickle_path`` was never used and has been
        # removed; this method does not cache the session.
        login_url = 'https://www.kaggle.com/account/login'
        browser = Browser()

        login_page = browser.get(login_url)
        login_form = login_page.soup.select("#login-account")[0]
        login_form.select("#UserName")[0]['value'] = username
        login_form.select("#Password")[0]['value'] = password
        login_result = browser.submit(login_form, login_page.url)
        if login_result.url == login_url:
            error = (login_result.soup.select(
                '#standalone-signin .validation-summary-errors')[0].get_text())
            print('There was an error logging in: ' + error)
            sys.exit(1)

        return browser
Пример #9
0
 def login(self, username, password=None):
     # Fill in and submit a login form with the given credentials, print the
     # outcome, and on success keep the browser on ``self.userSession``.
     if password is None:
         # NOTE(review): the next line is corrupted in this copy of the source.
         # The "******" run replaced the code between the getpass prompt and
         # the "#login-account" selector, so the definitions of ``browser``,
         # ``login_page`` and ``login_form`` are not visible here and the line
         # is not valid Python. Restore it from the upstream file.
         password = getpass(prompt="Kaggle Password: "******"#login-account")[0]
     login_form.select("#UserName")[0]['value'] = username
     login_form.select("#Password")[0]['value'] = password
     login_result = browser.submit(login_form, login_page.url)
     # Any validation-summary element in the response is treated as a failure.
     if len(
             login_result.soup.select(
                 '#standalone-signin .validation-summary-errors')) != 0:
         print(
             "Something went wrong when trying to log you in\nHere is the error from Kaggle\n %s"
             % login_result.soup.select(
                 '#standalone-signin .validation-summary-errors')
             [0].get_text())
     else:
         print("Logged in Successfully")
         # Keep the authenticated browser on the instance.
         self.userSession = browser
Пример #10
0
    def _login(self):
        """Log in to Kaggle with ``self.username`` / ``self.password``.

        The login page is fetched, its anti-forgery token is extracted and
        the credentials are posted back together with that token.

        Returns:
            The ``Browser`` that performed the login.

        Raises:
            Exception: With the reported message when the response contains
                an ``"errors"`` entry.
            AttributeError: If no anti-forgery token is found on the login
                page (``re.search`` returns ``None``).
        """
        login_url = 'https://www.kaggle.com/account/login'
        browser = Browser()

        login_page = browser.get(login_url)
        # Raw strings keep the patterns identical while avoiding the invalid
        # "\[" escape that newer Python versions warn about.
        token = re.search(r"antiForgeryToken: '(?P<token>.+)'",
                          str(login_page.soup)).group(1)
        login_result_page = browser.post(
            login_url,
            data={
                'username': self.username,
                'password': self.password,
                '__RequestVerificationToken': token,
            })

        error_match = re.search(r'"errors":\["(?P<error>.+)"\]',
                                str(login_result_page.soup))
        if error_match:
            raise Exception('There was an error logging in: ' +
                            error_match.group(1))

        return browser
Пример #11
0
# Demo script: fill in and submit a login form with mechanicalsoup, then print
# the text of the "form1" form found on the response page.
# Sadly, mechanicalsoup cannot run JavaScript.
import mechanicalsoup
from mechanicalsoup import Browser
from pprint import pprint
from bs4 import BeautifulSoup

# Address of the page that holds the login form (left blank in this snippet).
url = ""

br = Browser()
page = br.get(url)
# The second <form> on the page is the one that gets filled in.
form = page.soup.select("form")[1]

# user name
form.find("input", {"name": "DDDDD"})["value"] = ""
# password
form.find("input", {"name": "upass"})["value"] = ""
# It is OK to leave the 0MKKey field empty.
form.find("input", {"name": "0MKKey"})["value"] = ""

# (the ["value"] = ... assignments above go through the tag's __setitem__)
success_page = br.submit(form, page.url)  # submit current form

print("--------------------------------------------------------------------")
print('form texts:')
# Look up the form named "form1" on the response page and print its text.
success_info = success_page.soup.find('form', {'name': 'form1'})
pprint(success_info.get_text())

print("--------------------------------------------------------------------")

print('scripts:')
Пример #12
0
def login(username=None, password=None):
    # Return a logged-in Browser for Kaggle. A pickled session stored under the
    # config directory is reused when it matches the credentials; otherwise the
    # credentials are posted together with the login page's anti-forgery token
    # and the resulting browser is pickled. Returns None after printing the
    # message when the response contains an "errors" entry.
    if username is None:
        # NOTE(review): the next line is corrupted in this copy of the source.
        # The "******" runs replaced the code between the username prompt, the
        # password prompt and the start of the ``config_dir_path`` assignment,
        # so the line is not valid Python. Restore it from the upstream file.
        username = input('Please provide username: '******'Please provide password: '******'~'),
        CONFIG_DIR_NAME
    )
    pickle_path = os.path.join(
        config_dir_path,
        'browser.pickle'
    )

    if os.path.isfile(pickle_path):
        # NOTE(review): pickle.load can execute arbitrary code from the file and
        # the cache stores the password in clear text -- confirm this is
        # acceptable.
        try:
            with open(pickle_path, 'rb') as file:
                data = pickle.load(file)
                # Reuse the cached browser only for the same credentials.
                if data['username'] == username and \
                        data['password'] == password:
                    return data['browser']
        # NOTE(review): the bare except also swallows KeyboardInterrupt and
        # SystemExit; any failure here falls through to a fresh login.
        except:
            pass

    browser = Browser()
    login_url = 'https://www.kaggle.com/account/login'

    login_page = browser.get(login_url)

    # Raises AttributeError when the pattern is absent, since re.search then
    # returns None.
    token = re.search(
        'antiForgeryToken: \'(?P<token>.+)\'',
        str(login_page.soup)
    ).group(1)

    login_result_page = browser.post(
        login_url,
        data={
            'username': username,
            'password': password,
            '__RequestVerificationToken': token
        }
    )

    # NOTE(review): "\[" in a non-raw string is an invalid escape sequence that
    # newer Python versions warn about; a raw string would keep the pattern
    # unchanged.
    error_match = re.search(
        '"errors":\["(?P<error>.+)"\]',
        str(login_result_page.soup)
    )

    if error_match:
        print(error_match.group(1))
        return

    # Create the config directory so that only its owner can access it.
    if not os.path.isdir(config_dir_path):
        os.mkdir(config_dir_path, 0o700)

    # Cache the credentials and the authenticated browser for the next call.
    with open(pickle_path, 'wb') as f:
        pickle.dump(dict(
            username=username, password=password, browser=browser
        ), f)

    return browser