def crawl_thread(thread: AttrDict, browser: Browser, page_size: int = 15) -> list:
    """Collect the parsed posts from every page of a thread.

    The thread's first page is fetched once to count the entries matched by
    ``.pagination ul li``; that count minus one (but never less than one) is
    used as the number of pages.  Each page is then requested by appending
    ``&start=<offset>`` to ``thread.url``, with offsets advancing in steps of
    ``page_size``.

    Args:
        thread: Object exposing the thread address as ``thread.url``.
        browser: Browser used to issue the GET requests.
        page_size: Offset step between consecutive pages.

    Returns:
        A flat list holding ``parse_thread_page(post)`` for every ``.post``
        element found, in page order.
    """
    first_page = browser.get(thread.url)
    pagination_items = first_page.soup.select('.pagination ul li')
    total_pages = max(len(pagination_items) - 1, 1)

    posts = []
    for page_index in range(total_pages):
        offset = page_size * page_index
        listing = browser.get(thread.url + '&start=' + str(offset))
        post_nodes = listing.soup.select('.post')
        posts.extend([parse_thread_page(node) for node in post_nodes])
    return posts
def login(username, password):
    """Return a ``Browser`` logged in to Kaggle, reusing a cached one if possible.

    A previously pickled browser stored in ``~/<CONFIG_DIR_NAME>/browser.pickle``
    is returned when it was saved for the same username and password.
    Otherwise the Kaggle login form is filled in and submitted; on success the
    new browser is pickled (together with the credentials) and returned.

    Args:
        username: Kaggle user name.
        password: Kaggle password.

    Returns:
        The cached or freshly logged-in browser.

    Raises:
        SystemExit: With status 1 when the login form submission lands back
            on the login URL (the failure message is printed first).
    """
    config_dir_path = os.path.join(os.path.expanduser('~'), CONFIG_DIR_NAME)
    pickle_path = os.path.join(config_dir_path, 'browser.pickle')

    # SECURITY NOTE(review): the cache is a pickle that also stores the
    # password in plain text.  Unpickling executes arbitrary code if the file
    # has been tampered with; consider a safer storage format.
    if os.path.isfile(pickle_path):
        try:
            with open(pickle_path, 'rb') as cache_file:
                data = pickle.load(cache_file)
            if data['username'] == username and data['password'] == password:
                return data['browser']
        except Exception:
            # The cache is best-effort: an unreadable or malformed file just
            # means a fresh login.  Unlike a bare ``except``, this lets
            # KeyboardInterrupt and SystemExit propagate.
            pass

    login_url = 'https://www.kaggle.com/account/login'
    browser = Browser()
    login_page = browser.get(login_url)

    login_form = login_page.soup.select("#login-account")[0]
    login_form.select("#UserName")[0]['value'] = username
    login_form.select("#Password")[0]['value'] = password
    login_result = browser.submit(login_form, login_page.url)

    # Ending up on the login URL again is treated as a failed login.
    if login_result.url == login_url:
        errors = login_result.soup.select(
            '#standalone-signin .validation-summary-errors')
        # Do not let a missing error element hide the failure behind an
        # IndexError.
        error = errors[0].get_text() if errors else 'unknown error'
        print('There was an error logging in: ' + error)
        sys.exit(1)

    # exist_ok avoids the check-then-create race of isdir() + mkdir().
    os.makedirs(config_dir_path, mode=0o700, exist_ok=True)
    with open(pickle_path, 'wb') as cache_file:
        pickle.dump(
            dict(username=username, password=password, browser=browser),
            cache_file)

    return browser
def get_links(start_url: str, browser: Browser, num_pages: int = 1, page_size: int = 50) -> list:
    """Gather the parsed topic links from the first ``num_pages`` listing pages.

    Each page is requested as ``start_url + '&start=<offset>'`` with offsets
    advancing in steps of ``page_size``.  Every element matching
    ``.topiclist.topics .row`` is handed to ``parse_link`` together with the
    part of ``start_url`` that precedes its last ``/``.

    Args:
        start_url: Address of the listing; the offset parameter is appended.
        browser: Browser used to issue the GET requests.
        num_pages: How many listing pages to fetch.
        page_size: Offset step between consecutive pages.

    Returns:
        A flat list of ``parse_link`` results, in page order.
    """
    # Everything before the final '/' (empty string when there is no '/').
    domain = start_url.rpartition('/')[0]

    collected = []
    for page_index in range(num_pages):
        offset = page_size * page_index
        listing = browser.get(start_url + '&start=' + str(offset))
        rows = listing.soup.select('.topiclist.topics .row')
        collected.extend([parse_link(row, domain) for row in rows])
    return collected
def search(url, keyword, cloudflare_needed=False):
    """Submit ``keyword`` through the first search form found on ``url``.

    The page is fetched, the first ``<form>`` whose markup mentions
    "search"/"Search" is selected, its text (or search) input is filled with
    ``keyword`` and the form is submitted.

    Args:
        url: Address of the page that contains the search form.
        keyword: Text to put into the search input.
        cloudflare_needed: When true, the browser is built on top of the
            session returned by ``create_scraper()``.

    Returns:
        The ``text`` of the response to the form submission.

    Raises:
        IndexError: If no form on the page mentions "search".
        TypeError: If the form has no ``text`` or ``search`` input.
    """
    # Local import so this block does not rely on a module-level import.
    from urllib.parse import urljoin

    # Named ``browser`` rather than ``requests`` to avoid shadowing the
    # well-known module name.
    if cloudflare_needed:
        browser = Browser(create_scraper())
    else:
        browser = Browser()
    web_page = browser.get(url)

    # ``re.search`` already scans the whole string, so padding the pattern
    # with ``[\s\S]*`` on both sides is unnecessary.
    search_text = re.compile(r'[Ss]earch')

    candidates = [
        candidate for candidate in web_page.soup.find_all('form')
        if search_text.search(str(candidate))
    ]
    if not candidates:
        # Exception type kept for backward compatibility with ``forms[0]``.
        raise IndexError('no search form found on ' + str(url))
    form = candidates[0]

    search_input = form.find('input', attrs={'type': 'text'})
    if search_input is None:
        search_input = form.find('input', attrs={'type': 'search'})
    if search_input is None:
        # Exception type kept for backward compatibility with the implicit
        # failure of item assignment on ``None``.
        raise TypeError('search form has no text or search input')
    search_input['value'] = keyword

    # Give the search button a known name so it can be chosen below, and
    # blank out the names of the other submit controls.
    for inp in form.find_all(attrs={'type': 'submit'}):
        if search_text.search(str(inp)):
            inp['name'] = 'Search'
        else:
            inp['name'] = ''

    # Resolve the form target the way a browser does: absolute actions win,
    # relative ones are resolved against the page URL, and a missing action
    # submits to the page itself.  Plain concatenation produced addresses
    # such as "http://host/pagesearch.php" or "http://host//search".
    target = urljoin(url, form.attrs.get('action', ''))

    form = Form(form)
    form.choose_submit('Search')
    return browser.submit(form, target).text
def login(username, password):
    """Return a ``Browser`` logged in to Kaggle, reusing a cached one if possible.

    A previously pickled browser stored in ``~/.kaggle-cli/browser.pickle`` is
    returned when it was saved for the same username and password.  Otherwise
    the login page is fetched, its anti-forgery token is extracted and the
    credentials are posted; on success the new browser is pickled (together
    with the credentials) and returned.

    Args:
        username: Kaggle user name.
        password: Kaggle password.

    Returns:
        The cached or freshly logged-in browser, or ``None`` (after printing
        a message) when the token cannot be found or the response reports
        errors.
    """
    config_dir_path = os.path.join(os.path.expanduser('~'), '.kaggle-cli')
    pickle_path = os.path.join(config_dir_path, 'browser.pickle')

    # SECURITY NOTE(review): the cache is a pickle that also stores the
    # password in plain text.  Unpickling executes arbitrary code if the file
    # has been tampered with; consider a safer storage format.
    if os.path.isfile(pickle_path):
        try:
            with open(pickle_path, 'rb') as cache_file:
                data = pickle.load(cache_file)
            if data['username'] == username and data['password'] == password:
                return data['browser']
        except Exception:
            # The cache is best-effort: an unreadable or malformed file just
            # means a fresh login.  Unlike a bare ``except``, this lets
            # KeyboardInterrupt and SystemExit propagate.
            pass

    browser = Browser()
    login_url = 'https://www.kaggle.com/account/login'
    login_page = browser.get(login_url)

    token_match = re.search(
        r"antiForgeryToken: '(?P<token>.+)'",
        str(login_page.soup)
    )
    if token_match is None:
        # Previously this crashed with AttributeError on ``None.group``;
        # report it the same way as any other login failure.
        print('Could not find the anti-forgery token on the login page')
        return None

    login_result_page = browser.post(
        login_url,
        data={
            'username': username,
            'password': password,
            '__RequestVerificationToken': token_match.group('token')
        }
    )

    # Raw string: ``\[`` is an invalid escape in a normal string literal.
    error_match = re.search(
        r'"errors":\["(?P<error>.+)"\]',
        str(login_result_page.soup)
    )
    if error_match:
        print(error_match.group('error'))
        return None

    # exist_ok avoids the check-then-create race of isdir() + mkdir().
    os.makedirs(config_dir_path, mode=0o700, exist_ok=True)
    with open(pickle_path, 'wb') as cache_file:
        pickle.dump(dict(
            username=username,
            password=password,
            browser=browser
        ), cache_file)

    return browser
class Kaggle:
    """Small Kaggle client that logs in and downloads files through one browser."""

    def __init__(self):
        # Single browser reused for the login and every later request.
        self.browser = Browser()

    def login(self, username, password):
        """Log in to Kaggle with the given credentials.

        The login page is fetched, its anti-forgery token is extracted and
        the credentials are posted together with that token.

        Args:
            username: Kaggle user name.
            password: Kaggle password.

        Returns:
            ``self.browser`` on success, or ``None`` (after printing a
            message) when the token cannot be found or the response reports
            errors.
        """
        login_url = 'https://www.kaggle.com/account/login'
        login_page = self.browser.get(login_url)

        token_match = re.search(r"antiForgeryToken: '(?P<token>.+)'",
                                str(login_page.soup))
        if token_match is None:
            # Previously this crashed with AttributeError on ``None.group``;
            # report it the same way as any other login failure.
            print('Could not find the anti-forgery token on the login page')
            return None

        login_result_page = self.browser.post(login_url, data={
            'username': username,
            'password': password,
            '__RequestVerificationToken': token_match.group('token')
        })

        # Raw string: ``\[`` is an invalid escape in a normal string literal.
        error_match = re.search(r'"errors":\["(?P<error>.+)"\]',
                                str(login_result_page.soup))
        if error_match:
            print(error_match.group('error'))
            return None

        return self.browser

    def download_dataset(self, url, local_file):
        """Download ``url`` into ``local_file``, replacing any existing file.

        A HEAD request is issued first; nothing is downloaded or written when
        the reported ``Content-Length`` is not positive.

        Args:
            url: Address of the file to download.
            local_file: Path of the file to write.

        Raises:
            KeyError: If the HEAD response carries no ``Content-Length``.
        """
        headers = self.browser.request('head', url).headers
        content_length = int(headers['Content-Length'])
        if content_length <= 0:
            return

        # Kept separate from the loop variable below; the original reused one
        # name for both the chunk size and the received data.
        chunk_size = 1024 if content_length > 1024 else 1

        stream = self.browser.get(url, stream=True)
        # 'wb' truncates an existing file, replacing the former
        # delete-then-append sequence.
        with open(local_file, 'wb') as out_file:
            for data in stream.iter_content(chunk_size=chunk_size):
                if data:  # filter out keep-alive new chunks
                    out_file.write(data)
def _login(self):
    """Log in to Kaggle through the login form and return the browser.

    Uses ``self.username`` and ``self.password`` to fill the ``#UserName``
    and ``#Password`` fields of the ``#login-account`` form.

    Returns:
        The browser that submitted the form.

    Raises:
        Exception: When the submission lands back on the login URL; the
            message carries the text of the validation error summary.
    """
    login_url = 'https://www.kaggle.com/account/login'
    session = Browser()
    page = session.get(login_url)

    form = page.soup.select("#login-account")[0]
    user_field = form.select("#UserName")[0]
    user_field['value'] = self.username
    password_field = form.select("#Password")[0]
    password_field['value'] = self.password

    outcome = session.submit(form, page.url)

    # Any address other than the login URL counts as a successful login.
    if outcome.url != login_url:
        return session

    summary = outcome.soup.select('#standalone-signin .validation-summary-errors')
    error = summary[0].get_text()
    raise Exception('There was an error logging in: ' + error)
def __get_login_browser(self, username, password):
    """Log in to Kaggle through the login form and return the browser.

    Args:
        username: Value for the ``#UserName`` field.
        password: Value for the ``#Password`` field.

    Returns:
        The browser that submitted the login form.

    Raises:
        SystemExit: With status 1 when the submission lands back on the
            login URL (the failure message is printed first).
    """
    # The unused ``pickle_path`` local of the original was removed; nothing
    # in this method reads or writes a cache file.
    login_url = 'https://www.kaggle.com/account/login'
    browser = Browser()
    login_page = browser.get(login_url)

    login_form = login_page.soup.select("#login-account")[0]
    login_form.select("#UserName")[0]['value'] = username
    login_form.select("#Password")[0]['value'] = password
    login_result = browser.submit(login_form, login_page.url)

    # Ending up on the login URL again is treated as a failed login.
    if login_result.url == login_url:
        errors = login_result.soup.select(
            '#standalone-signin .validation-summary-errors')
        # Do not let a missing error element hide the failure behind an
        # IndexError.
        error = errors[0].get_text() if errors else 'unknown error'
        print('There was an error logging in: ' + error)
        sys.exit(1)

    return browser
def login(self, username, password=None):
    """Log in to Kaggle through the login form and keep the browser.

    Args:
        username: Value for the ``#UserName`` field.
        password: Value for the ``#Password`` field; prompted for with
            ``getpass`` when ``None``.

    The outcome is printed, and the browser is stored on
    ``self.userSession``.
    """
    if password is None:
        password = getpass(prompt="Kaggle Password: ")

    # NOTE(review): the original text between the password prompt and the
    # "#login-account" selector was destroyed by a credential-masking
    # artifact ("******").  The three statements below are reconstructed
    # from the usual form-login sequence; confirm the login URL against the
    # project history.
    login_url = 'https://www.kaggle.com/account/login'
    browser = Browser()
    login_page = browser.get(login_url)

    login_form = login_page.soup.select("#login-account")[0]
    login_form.select("#UserName")[0]['value'] = username
    login_form.select("#Password")[0]['value'] = password
    login_result = browser.submit(login_form, login_page.url)

    # Select the error summary once instead of querying the page twice.
    errors = login_result.soup.select(
        '#standalone-signin .validation-summary-errors')
    if errors:
        print(
            "Something went wrong when trying to log you in\nHere is the error from Kaggle\n %s"
            % errors[0].get_text())
    else:
        print("Logged in Successfully")

    # NOTE(review): the collapsed source does not show whether this
    # assignment belonged to the success branch only; it is kept
    # unconditional so the attribute always exists after login() -- confirm.
    self.userSession = browser
def _login(self):
    """Log in to Kaggle with ``self.username``/``self.password``.

    The login page is fetched, its anti-forgery token is extracted and the
    credentials are posted together with that token.

    Returns:
        The browser that performed the login.

    Raises:
        Exception: When the anti-forgery token cannot be found on the login
            page, or when the login response reports errors.
    """
    login_url = 'https://www.kaggle.com/account/login'
    browser = Browser()
    login_page = browser.get(login_url)

    token_match = re.search(r"antiForgeryToken: '(?P<token>.+)'",
                            str(login_page.soup))
    if token_match is None:
        # Previously surfaced as an AttributeError on ``None.group``.
        raise Exception('There was an error logging in: '
                        'anti-forgery token not found on the login page')

    login_result_page = browser.post(login_url, data={
        'username': self.username,
        'password': self.password,
        '__RequestVerificationToken': token_match.group('token')
    })

    # Raw string: ``\[`` is an invalid escape in a normal string literal.
    error_match = re.search(r'"errors":\["(?P<error>.+)"\]',
                            str(login_result_page.soup))
    if error_match:
        raise Exception('There was an error logging in: ' + error_match.group('error'))

    return browser
# so sad mechanicalsoup can't work with javascript
import mechanicalsoup
from mechanicalsoup import Browser
from pprint import pprint
from bs4 import BeautifulSoup

# Address of the login page.  It is blank in the checked-in script and must
# be filled in before running.
url = ""
if not url:
    # Fail with a clear message instead of an opaque error from br.get().
    raise SystemExit("Set `url` to the login page address before running this script.")

br = Browser()
page = br.get(url)

# The login form is the second <form> on the page.
form = page.soup.select("form")[1]

# user name
form.find("input", {"name": "DDDDD"})["value"] = ""
# password
form.find("input", {"name": "upass"})["value"] = ""
# it is ok to keep 0MKKey empty
form.find("input", {"name": "0MKKey"})["value"] = ""
# (the method used above is __setitem__)

success_page = br.submit(form, page.url)  # submit current form

print("--------------------------------------------------------------------")
print('form texts:')
success_info = success_page.soup.find('form', {'name': 'form1'})
pprint(success_info.get_text())

print("--------------------------------------------------------------------")
print('scripts:')
def login(username=None, password=None):
    """Return a ``Browser`` logged in to Kaggle, reusing a cached one if possible.

    Missing credentials are asked for interactively.  A previously pickled
    browser stored in ``~/<CONFIG_DIR_NAME>/browser.pickle`` is returned when
    it was saved for the same username and password.  Otherwise the login
    page is fetched, its anti-forgery token is extracted and the credentials
    are posted; on success the new browser is pickled (together with the
    credentials) and returned.

    Args:
        username: Kaggle user name; prompted for when ``None``.
        password: Kaggle password; prompted for when ``None``.

    Returns:
        The cached or freshly logged-in browser, or ``None`` (after printing
        a message) when the token cannot be found or the response reports
        errors.
    """
    # NOTE(review): the two prompts were damaged by a credential-masking
    # artifact ("******") in the original text.  They are reconstructed
    # here; using getpass for the password is an assumption -- confirm
    # against the project history.
    from getpass import getpass

    if username is None:
        username = input('Please provide username: ')
    if password is None:
        password = getpass('Please provide password: ')

    config_dir_path = os.path.join(os.path.expanduser('~'), CONFIG_DIR_NAME)
    pickle_path = os.path.join(config_dir_path, 'browser.pickle')

    # SECURITY NOTE(review): the cache is a pickle that also stores the
    # password in plain text.  Unpickling executes arbitrary code if the file
    # has been tampered with; consider a safer storage format.
    if os.path.isfile(pickle_path):
        try:
            with open(pickle_path, 'rb') as cache_file:
                data = pickle.load(cache_file)
            if data['username'] == username and data['password'] == password:
                return data['browser']
        except Exception:
            # The cache is best-effort: an unreadable or malformed file just
            # means a fresh login.  Unlike a bare ``except``, this lets
            # KeyboardInterrupt and SystemExit propagate.
            pass

    browser = Browser()
    login_url = 'https://www.kaggle.com/account/login'
    login_page = browser.get(login_url)

    token_match = re.search(
        r"antiForgeryToken: '(?P<token>.+)'",
        str(login_page.soup)
    )
    if token_match is None:
        # Previously this crashed with AttributeError on ``None.group``;
        # report it the same way as any other login failure.
        print('Could not find the anti-forgery token on the login page')
        return None

    login_result_page = browser.post(
        login_url,
        data={
            'username': username,
            'password': password,
            '__RequestVerificationToken': token_match.group('token')
        }
    )

    # Raw string: ``\[`` is an invalid escape in a normal string literal.
    error_match = re.search(
        r'"errors":\["(?P<error>.+)"\]',
        str(login_result_page.soup)
    )
    if error_match:
        print(error_match.group('error'))
        return None

    # exist_ok avoids the check-then-create race of isdir() + mkdir().
    os.makedirs(config_dir_path, mode=0o700, exist_ok=True)
    with open(pickle_path, 'wb') as cache_file:
        pickle.dump(dict(
            username=username,
            password=password,
            browser=browser
        ), cache_file)

    return browser