def LogIn():
    """Interactive Amazon sign-in flow for the Kodi add-on.

    Prompts for email/password, drives a MechanicalSoup browser through
    Amazon's sign-in page (including MFA, device-challenge questions,
    captchas, claim pickers and the mobile-approval polling form), and on
    success persists the user and returns the session cookie jar.
    Returns the RequestsCookieJar on success, False otherwise.
    """

    def _insertLF(string, begin=70):
        # Break a long label at the first space after `begin` so dialogs wrap nicely.
        spc = string.find(' ', begin)
        return string[:spc] + '\n' + string[spc + 1:] if spc > 0 else string

    def _MFACheck(br, email, soup):
        """Handle one round of MFA / DCQ / captcha / approval challenges.

        Fills the relevant form on `br` based on which challenge markers are
        present in the page `soup`. Returns the browser (possibly with a form
        ready to submit) or None if the user cancelled.
        """
        Log('MFA, DCQ or Captcha form')
        uni_soup = soup.__unicode__()
        try:
            form = br.select_form('form[name="signIn"]')
        except mechanicalsoup.LinkNotFoundError:
            # Challenge pages don't always carry the signIn form; fall back to the first form.
            form = br.select_form()

        if 'auth-mfa-form' in uni_soup:
            # One-time-password (OTP) prompt.
            msg = soup.find('form', attrs={'id': 'auth-mfa-form'})
            msgtxt = msg.p.get_text(strip=True)
            kb = xbmc.Keyboard('', msgtxt)
            kb.doModal()
            if kb.isConfirmed() and kb.getText():
                br['otpCode'] = kb.getText()
            else:
                return None
        elif 'ap_dcq_form' in uni_soup:
            # Device Challenge Questions: user picks a question and answers it.
            msg = soup.find('div', attrs={'id': 'message_warning'})
            g.dialog.ok(g.__plugin__, msg.p.get_text(strip=True))
            dcq = soup.find('div', attrs={'id': 'ap_dcq1a_pagelet'})
            dcq_title = dcq.find('div', attrs={
                'id': 'ap_dcq1a_pagelet_title'
            }).get_text(strip=True)
            q_title = []
            q_id = []
            for q in dcq.findAll('div', attrs={'class': 'dcq_question'}):
                if q.span.label:
                    label = q.span.label.get_text(strip=True).replace(
                        ' ', '').replace('\n', '')
                    if q.span.label.span:
                        label = label.replace(str(q.span.label.span),
                                              q.span.label.span.text)
                    q_title.append(_insertLF(label))
                    q_id.append(q.input['id'])

            # Only show a selection dialog when there is more than one question.
            sel = g.dialog.select(_insertLF(dcq_title, 60), q_title) if len(q_title) > 1 else 0
            if sel < 0:
                return None

            ret = g.dialog.input(q_title[sel])
            if ret:
                br[q_id[sel]] = ret
            else:
                return None
        elif ('ap_captcha_img_label' in uni_soup) or ('auth-captcha-image-container' in uni_soup):
            # form.find_by_type('input', 'text', {'id': 'ap-credential-autofill-hint'}):
            # Image captcha during sign-in: re-ask for credentials plus the captcha guess.
            wnd = _Captcha((getString(30008).split('…')[0]), soup, email)
            wnd.doModal()
            if wnd.email and wnd.cap and wnd.pwd:
                form.set_input({
                    'email': wnd.email,
                    'password': wnd.pwd,
                    'guess': wnd.cap
                })
            else:
                return None
            del wnd
        elif 'claimspicker' in uni_soup:
            # Verification-channel picker (e.g. send code via SMS or email).
            msg = soup.find('form', attrs={'name': 'claimspicker'})
            cs_title = msg.find('div', attrs={
                'class': 'a-row a-spacing-small'
            }).get_text(strip=True)
            cs_quest = msg.find('label', attrs={'class': 'a-form-label'})
            cs_hint = msg.find(lambda tag: tag.name == 'div' and tag.get(
                'class') == ['a-row']).get_text(strip=True)
            choices = []
            if cs_quest:
                for c in soup.findAll('div', attrs={'data-a-input-name': 'option'}):
                    choices.append((c.span.get_text(strip=True),
                                    c.input['name'], c.input['value']))
                sel = g.dialog.select(
                    '%s - %s' % (cs_title, cs_quest.get_text(strip=True)),
                    [k[0] for k in choices])
            else:
                # No options to choose: sentinel 100 means "confirmed, nothing to set".
                sel = 100 if g.dialog.ok(cs_title, cs_hint) else -1
            if sel > -1:
                if sel < 100:
                    form.set_radio({choices[sel][1]: choices[sel][2]})
            else:
                return None
        elif 'auth-select-device-form' in uni_soup:
            # Pick which registered device receives the verification.
            sd_form = soup.find('form', attrs={'id': 'auth-select-device-form'})
            sd_hint = sd_form.parent.p.get_text(strip=True)
            choices = []
            for c in sd_form.findAll('label'):
                choices.append((c.span.get_text(strip=True),
                                c.input['name'], c.input['value']))
            sel = g.dialog.select(sd_hint, [k[0] for k in choices])
            if sel > -1:
                form.set_radio({choices[sel][1]: choices[sel][2]})
            else:
                return None
        elif 'fwcim-form' in uni_soup:
            # "Fix was created in mobile" style code entry, optionally with captcha.
            msg = soup.find(
                'div',
                attrs={
                    'class': 'a-row a-spacing-micro cvf-widget-input-code-label'
                })
            if msg:
                ret = g.dialog.input(msg.get_text(strip=True))
                if ret:
                    br['code'] = ret
                else:
                    return None
            if soup.find('img', attrs={'alt': 'captcha'}):
                wnd = _Challenge(soup)
                wnd.doModal()
                if wnd.cap:
                    submit = soup.find('input', value='verifyCaptcha')
                    form.choose_submit(submit)
                    form.set_input({'cvf_captcha_input': wnd.cap})
                else:
                    return None
                del wnd
        elif 'validateCaptcha' in uni_soup:
            wnd = _Challenge(soup)
            wnd.doModal()
            if wnd.cap:
                # MechanicalSoup is using the field names, not IDs
                # id is captchacharacters, which causes exception to be raised
                form.set_input({'field-keywords': wnd.cap})
            else:
                return None
            del wnd
        elif 'pollingForm' in uni_soup:
            # Mobile-app approval: poll Amazon until the user approves on their phone.
            try:
                from urlparse import urlparse, parse_qs  # Python 2
            except ImportError:
                from urllib.parse import urlparse, parse_qs  # Python 3
            msg = soup.find(
                'span',
                attrs={
                    'class': 'a-size-medium transaction-approval-word-break a-text-bold'
                }).get_text(strip=True)
            msg += '\n'
            rows = soup.find('div', attrs={'id': 'channelDetails'})
            for row in rows.find_all('div', attrs={'class': 'a-row'}):
                # Collapse runs of whitespace into "label: value" lines.
                msg += re.sub('\\s{2,}', ': ', row.get_text())
            pd = _ProgressDialog(msg)
            pd.show()
            refresh = time.time()
            form_id = form_poll = 'pollingForm'
            per = 0
            while True:
                # Bounce the progress bar between ~0 and ~100 while waiting.
                if per > 99:
                    val = -5
                if per < 1:
                    val = 5
                per += val
                pd.sl_progress.setPercent(per)
                if pd.iscanceled:
                    br = None
                    break
                if time.time() > refresh + 5:
                    # Re-submit the polling form every ~5 seconds.
                    url = br.get_url()
                    br.select_form('form[id="{}"]'.format(form_id))
                    br.submit_selected()
                    response, soup = _parseHTML(br)
                    form_id = form_poll
                    WriteLog(response.replace(py2_decode(email), '**@**'),
                             'login-pollingform')
                    stat = soup.find(
                        'input',
                        attrs={'name': 'transactionApprovalStatus'})['value']
                    Log(stat)
                    if stat in [
                            'TransactionCompleted', 'TransactionCompletionTimeout'
                    ]:
                        # Approved (or timed out on Amazon's side): follow the OpenID return URL.
                        parsed_url = urlparse(url)
                        query = parse_qs(parsed_url.query)
                        br.open(query['openid.return_to'][0])
                        break
                    elif stat in [
                            'TransactionExpired', 'TransactionResponded'
                    ]:
                        # Next submit goes to the resend form instead.
                        form_id = 'resend-approval-form'
                    else:
                        refresh = time.time()
                        br.open(url)
                sleep(0.1)
            pd.close()
        return br

    def _setLoginPW(visible):
        # Ask for the account password; hidden input unless `visible` is truthy.
        # Returns the password string or False if the user cancelled.
        keyboard = xbmc.Keyboard('', getString(30003))
        keyboard.setHiddenInput(visible is False)
        keyboard.doModal(60000)
        if keyboard.isConfirmed() and keyboard.getText():
            password = keyboard.getText()
            return password
        return False

    class LoginLocked(Exception):
        # Raised (and silently swallowed by LoginLock) when a login is already in progress.
        pass

    from contextlib import contextmanager

    @contextmanager
    def LoginLock():
        """Config-backed mutex so only one login flow runs at a time.

        Yields True when another login already holds the lock. The lock is
        released on normal exit or unexpected error; LoginLocked raised in
        the with-body is swallowed here, so the caller just falls through.
        """
        try:
            bLocked = 'false' != getConfig('loginlock', 'false')
            if not bLocked:
                writeConfig('loginlock', 'true')
            yield bLocked
        except LoginLocked:
            # Already locked
            pass
        except Exception as e:
            # Something went horribly wrong, release and re-raise
            writeConfig('loginlock', 'false')
            raise e
        else:
            # All fine, release
            writeConfig('loginlock', 'false')

    with LoginLock() as locked:
        if locked:
            # Another login in flight: LoginLock suppresses this and we return False below.
            raise LoginLocked
        g = Globals()
        s = Settings()
        Log('Login')
        from .users import loadUser, addUser
        user = getTerritory(loadUser(empty=True))
        if False is user[1]:
            return False
        user = user[0]
        password = ''

        keyboard = xbmc.Keyboard('', getString(30002))
        keyboard.doModal()
        if keyboard.isConfirmed() and keyboard.getText():
            email = keyboard.getText()
            password = _setLoginPW(s.show_pass)

        if password:
            cj = requests.cookies.RequestsCookieJar()
            br = mechanicalsoup.StatefulBrowser(
                soup_config={'features': 'html.parser'})
            br.set_cookiejar(cj)
            br.session.verify = s.verifySsl
            # Up to 5 attempts to reach a sign-in page with a usable form,
            # rotating the user agent on each failure.
            caperr = -5
            while caperr:
                Log('Connect to SignIn Page %s attempts left' % -caperr)
                br.session.headers.update(
                    {'User-Agent': getConfig('UserAgent')})
                br.open(user['baseurl'] + (
                    '/gp/aw/si.html' if not user['pv'] else '/auth-redirect/'))
                try:
                    form = br.select_form('form[name="signIn"]')
                except mechanicalsoup.LinkNotFoundError:
                    getUA(True)
                    caperr += 1
                    WriteLog(str(br.get_current_page()), 'login-si')
                    xbmc.sleep(randint(750, 1500))
                else:
                    break
            else:
                # while-else: all attempts exhausted.
                g.dialog.ok(getString(30200), getString(30213))
                return False

            form.set_input({'email': email, 'password': password})
            if 'true' == g.addon.getSetting(
                    'rememberme') and form.find_by_type(
                        'input', 'checkbox', {'name': 'rememberMe'}):
                form.set_checkbox({'rememberMe': True})
            # Browser-like headers so Amazon treats this as a normal web login.
            br.session.headers.update({
                'Accept':
                'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                'Accept-Encoding': 'gzip, deflate',
                'Accept-Language': g.userAcceptLanguages,
                'Cache-Control': 'max-age=0',
                'Connection': 'keep-alive',
                'Content-Type': 'application/x-www-form-urlencoded',
                'Origin': '/'.join(br.get_url().split('/')[0:3]),
                'Upgrade-Insecure-Requests': '1'
            })
            br.submit_selected()
            response, soup = _parseHTML(br)
            # Scrub the email address out of anything we log.
            WriteLog(response.replace(py2_decode(email), '**@**'), 'login')

            # Keep resolving challenges until none of the known markers remain.
            while any(sp in response for sp in [
                    'auth-mfa-form', 'ap_dcq_form', 'ap_captcha_img_label',
                    'claimspicker', 'fwcim-form', 'auth-captcha-image-container',
                    'validateCaptcha', 'pollingForm', 'auth-select-device-form'
            ]):
                br = _MFACheck(br, email, soup)
                if br is None:
                    return False
                if not br.get_current_form() is None:
                    br.submit_selected()
                response, soup = _parseHTML(br)
                WriteLog(response.replace(py2_decode(email), '**@**'),
                         'login-mfa')

            if 'accountFixup' in response:
                # Skip the "add a phone number" interstitial.
                Log('Login AccountFixup')
                skip_link = br.find_link(id='ap-account-fixup-phone-skip-link')
                br.follow_link(skip_link)
                response, soup = _parseHTML(br)
                WriteLog(response.replace(py2_decode(email), '**@**'),
                         'login-fixup')

            if 'action=sign-out' in response:
                # A sign-out link means we are logged in; extract the display name.
                try:
                    usr = re.search(
                        r'action=sign-out[^"]*"[^>]*>[^?]+\s+([^?]+?)\s*\?',
                        response).group(1)
                except AttributeError:
                    usr = getString(30209)
                if s.multiuser:
                    usr = g.dialog.input(getString(30135), usr)
                    if not usr:
                        return False
                user['name'] = usr
                user['cookie'] = requests.utils.dict_from_cookiejar(cj)
                remLoginData(False)
                g.addon.setSetting('login_acc', usr)
                if not s.multiuser:
                    g.dialog.ok(getString(30215),
                                '{0} {1}'.format(getString(30014), usr))
                addUser(user)
                g.genID()
                return cj
            elif 'message_error' in response:
                writeConfig('login_pass', '')
                msg = soup.find('div', attrs={'id': 'message_error'})
                Log('Login Error: %s' % msg.get_text(strip=True))
                g.dialog.ok(getString(30200), msg.get_text(strip=True))
            elif 'message_warning' in response:
                msg = soup.find('div', attrs={'id': 'message_warning'})
                Log('Login Warning: %s' % msg.get_text(strip=True))
            elif 'auth-error-message-box' in response:
                msg = soup.find('div', attrs={'id': 'auth-error-message-box'})
                Log('Login MFA: %s' % msg.get_text(strip=True))
                g.dialog.ok(msg.div.h4.get_text(strip=True),
                            msg.div.div.get_text(strip=True))
            elif 'error-slot' in response:
                msg_title = soup.find('div', attrs={
                    'class': 'ap_error_page_title'
                }).get_text(strip=True)
                msg_cont = soup.find('div', attrs={
                    'class': 'ap_error_page_message'
                }).get_text(strip=True)
                Log('Login Error: {}'.format(msg_cont))
                g.dialog.ok(msg_title, msg_cont)
            else:
                g.dialog.ok(getString(30200), getString(30213))
    return False
def test_follow_link_arg(httpbin, expected, kwargs):
    """Links can be filtered by BeautifulSoup keyword args when following."""
    surfer = mechanicalsoup.StatefulBrowser()
    fake_page = '<a href="/foo">Bar</a><a href="/get">Link</a>'
    surfer.open_fake_page(fake_page, httpbin.url)
    surfer.follow_link(bs4_kwargs=kwargs)
    assert surfer.url == httpbin + expected
def test_no_404(httpbin):
    """A 404 response is returned as-is by default (no exception raised)."""
    surfer = mechanicalsoup.StatefulBrowser()
    response = surfer.open(httpbin + "/nosuchpage")
    assert 404 == response.status_code
def test_get_set_debug():
    """The debug flag starts off and can be switched on."""
    surfer = mechanicalsoup.StatefulBrowser()
    # Fresh browsers are not in debug mode.
    assert not surfer.get_debug()
    surfer.set_debug(True)
    assert surfer.get_debug()
def test_get_selected_form_unselected():
    """Accessing .form before select_form raises; get_current_form is None."""
    surfer = mechanicalsoup.StatefulBrowser()
    surfer.open_fake_page('<form></form>')
    with pytest.raises(AttributeError,
                       match="No form has been selected yet."):
        surfer.form
    assert surfer.get_current_form() is None
def test_user_agent(httpbin):
    """A user_agent passed at construction is sent with requests."""
    surfer = mechanicalsoup.StatefulBrowser(user_agent='007')
    response = surfer.open(httpbin + "/user-agent")
    assert {'user-agent': '007'} == response.json()
#!/usr/bin/python # encoding:utf-8 """ @author:jiat @contact:[email protected] @file:note2.py @time:2018/7/1 14:21 """ import mechanicalsoup # 加载扩展包 browser = mechanicalsoup.StatefulBrowser() # 创建一个浏览器对象 browser.open("http://httpbin.org/") # 打开网页 print(browser.get_url()) # 查看浏览器当前的网站 browser.follow_link("forms") # 追加url print(browser.get_url()) # 查看当前url print(browser.get_current_page()) # 获取当前网页的内容 browser.select_form('form[action="/post"]') # 从当前网页中选择一个表单,参数为CSS选择器 # 填写表单内容 browser["custname"] = "Me" browser["custtel"] = "00 00 0001" browser["custemail"] = "*****@*****.**" browser["size"] = "medium" browser["topping"] = "onion" browser["topping"] = ("bacon", "cheese") browser["comments"] = "This pizza looks really good :-)" # 使用本地浏览器打开当前页面 browser.launch_browser()
def __init__(self, username, password):
    """Remember the credentials and create the browser session."""
    self.browser = mechanicalsoup.StatefulBrowser()
    self.username = username
    self.password = password
def loadDataFromWeb(self, idxUrl):
    """Scrape a Reuters index page (name, quotes, composites) into self.

    Opens self.reutersUrl + idxUrl, extracts the index name, daily quote
    fields and the paginated table of composite instruments, storing the
    results on self (name, *Price attributes, asOfDate, indexComposites).
    Always returns None.
    """
    # Hard cap on rows collected, guarding against a runaway pagination loop.
    SAFE_GARD_SIZE = 10000
    LABEL_DATA_AS_OF_START = "Data as of"
    LABEL_DATA_AS_OF_END = "|"
    self.indexComposites = []
    # self.generalMethods.printLog("Reuters: self.ric = " + self.ric)
    # get the web page
    browser = mechanicalsoup.StatefulBrowser()
    browser.open(self.reutersUrl + idxUrl)
    self.reutersIdxPage = browser.get_current_page()
    # get index name (last whitespace-separated token of the section title)
    compNameItem = self.reutersIdxPage.find("div", attrs={
        "id": "sectionTitle"
    }).find("h1")
    self.name = compNameItem.text.replace("\n", "").strip().split(" ")[-1]
    # get index daily price; "--" placeholders are stored as 0
    headerQuoteLabelItems = self.reutersIdxPage.findAll(
        "div", attrs={"class": "label"})
    for headerQuoteLabelItem in headerQuoteLabelItems:
        if ("price" == headerQuoteLabelItem.text.lower()):
            closePriceTxt = headerQuoteLabelItem.findNext("div").text
            if ("--" != closePriceTxt):
                self.closePrice = float(closePriceTxt.replace(",", ""))
            else:
                self.closePrice = 0
        elif ("open" == headerQuoteLabelItem.text.lower()):
            openPriceTxt = headerQuoteLabelItem.findNext("div").text
            if ("--" != openPriceTxt):
                self.openPrice = float(openPriceTxt.replace(",", ""))
            else:
                self.openPrice = 0
        elif ("prev close" == headerQuoteLabelItem.text.lower()):
            prevClosePriceTxt = headerQuoteLabelItem.findNext("div").text
            if ("--" != prevClosePriceTxt):
                self.prevClosePrice = float(
                    prevClosePriceTxt.replace(",", ""))
            else:
                self.prevClosePrice = 0
    # day's high/low come from the info table rather than the quote header
    idxInfoTblItems = self.reutersIdxPage.findAll("td",
                                                  attrs={"class": "label"})
    for idxInfoTblItem in idxInfoTblItems:
        if ("day's high" == idxInfoTblItem.text.lower()):
            dayHighPriceTxt = idxInfoTblItem.findNext("td").text
            if ("--" != dayHighPriceTxt):
                self.dayHighPrice = float(dayHighPriceTxt.replace(",", ""))
            else:
                self.dayHighPrice = 0
            # self.dayHighPrice = float(idxInfoTblItem.findNext("td").text.replace(",", ""))
        elif ("day's low" == idxInfoTblItem.text.lower()):
            dayLowPriceTxt = idxInfoTblItem.findNext("td").text
            if ("--" != dayLowPriceTxt):
                self.dayLowPrice = float(dayLowPriceTxt.replace(",", ""))
            else:
                self.dayLowPrice = 0
    # Walk the paginated composites table; loop ends when there is no
    # "next" button or the safety cap is hit.
    while (len(self.indexComposites) < SAFE_GARD_SIZE):
        # get the Data As Of Date (only on the first page)
        if (0 == len(self.indexComposites)):
            asOfDateItem = self.reutersIdxPage.find(
                "div", attrs={"class": "timestamp"})
            rawAsOfDateTxt = asOfDateItem.text.strip(
            )  #Data as of Fri Feb 2, 2018 | 4:47pm EST.
            dateStart = asOfDateItem.text.lower().find(
                LABEL_DATA_AS_OF_START.lower()) + len(
                    LABEL_DATA_AS_OF_START)
            dateEnd = asOfDateItem.text.lower().find(
                LABEL_DATA_AS_OF_END.lower())
            rawAsOfDateTxt = rawAsOfDateTxt[dateStart:dateEnd].strip()
            try:
                self.asOfDate = datetime.datetime.strptime(
                    rawAsOfDateTxt, "%a %b %d, %Y")
            except ValueError as timeError:
                # Unparseable timestamp: fall back to "now".
                self.asOfDate = datetime.datetime.now()
            # get the composites instrument header (first row of the output)
            compoInstrTblItem = self.reutersIdxPage.find(
                "table", attrs={"class": "dataTable sortable"})
            compoInstrRow = []
            compoInstrRow.append("Data_Record_Date")
            compoInstrRow.append("Ticker")
            compoInstrRow.append("RIC")
            tblHeaderItem = compoInstrTblItem.findAll("th")
            for tblHeader in tblHeaderItem:
                compoInstrRow.append(tblHeader.text.strip())
            compoInstrRow.append("Date_As_Of_Date")
            self.indexComposites.append(compoInstrRow)
        # get the composites instrument data
        compoInstrTblItem = self.reutersIdxPage.find(
            "table", attrs={"class": "dataTable sortable"})
        tblRowItem = compoInstrTblItem.findAll("tr")
        for tblRow in tblRowItem:
            compoInstrRow = []
            instrName = ""
            instrTicker = ""
            instrRic = ""
            compoInstrRow.append(self.dataRecordDate)
            tblCells = tblRow.findAll("td")
            for tblCell in tblCells:
                instrLink = tblCell.find("a")
                if (None != instrLink):
                    # Linked cell: derive ticker and RIC from the href tail.
                    instrName = instrLink.text.strip()
                    instrTicker = instrLink.attrs["href"].split(
                        "/")[-1].split(".")[0]
                    instrRic = instrLink.attrs["href"].split("/")[-1]
                    compoInstrRow.append(instrTicker)
                    compoInstrRow.append(instrRic)
                    compoInstrRow.append(instrName)
                else:
                    compoInstrRow.append(tblCell.text.strip())
            # Only fully-populated data rows (8 cells) are kept; header rows
            # have no <td> cells and end up shorter.
            if (len(compoInstrRow) == 8):
                compoInstrRow.append(
                    self.asOfDate.strftime(
                        clsGeneralConstants.GeneralConstants.
                        DATA_RECORD_DATE_FORMAT))
                self.indexComposites.append(compoInstrRow)
        #get the next button
        pageNavItem = self.reutersIdxPage.find(
            "div", attrs={"class": "pageNavigation"})
        nextButItem = pageNavItem.find('li', attrs={"class": "next"})
        if (None != nextButItem):
            nextLinkItem = nextButItem.find("a")
            browser.open(self.reutersUrl + nextLinkItem.attrs["href"])
            self.reutersIdxPage = browser.get_current_page()
        else:
            break
    return None
def trigger(self):
    """Attempt a Gmail login and detect which 2FA method is requested.

    Submits self.user / self.password to the Gmail sign-in forms and
    inspects the response page for known 2FA input elements. Returns a
    dict with 'type' ('sms', 'u2f', 'touchscreen', 'authenticator',
    'backup' or None), optional 'code'/'name', plus the response headers
    and session cookies. Any exception is captured under 'error'.
    """
    raw_headers = None
    data_2fa = {
        'type': None,
        'code': None,
        'name': None,
        'action': None,
        'headers': [],
        'cookies': [],
    }
    try:
        browser = mechanicalsoup.StatefulBrowser(
            soup_config={'features': 'html'},
            raise_on_404=True,
            user_agent='Python-urllib/2.7',
        )
        page = browser.open('https://www.gmail.com')
        # Two-step login: email form first, then the password form.
        user_form = browser.select_form('form')
        user_form.set('Email', self.user)
        user_response = browser.submit(user_form, page.url)
        pass_form = mechanicalsoup.Form(user_response.soup.form)
        pass_form.set('Passwd', self.password)
        pass_response = browser.submit(pass_form, page.url)
        raw_headers = pass_response.headers
        soup = pass_response.soup
        raw = soup.text
        # Probe for the element each 2FA flavour renders.
        sms = soup.find('input', {'id': 'idvPreregisteredPhonePin'})
        sms_old = soup.find('button', {'id': 'idvPreresteredPhoneSms'})
        u2f = soup.find('input', {'id': 'id-challenge'})
        touch = soup.find('input', {'id': 'authzenToken'})
        authenticator = soup.find('input', {'id': 'totpPin'})
        backup = soup.find('input', {'id': 'backupCodePin'})
        if sms or sms_old:
            data_2fa['type'] = 'sms'
            if sms_old:
                # Legacy flow: submit once more to trigger the SMS.
                final_form = mechanicalsoup.Form(pass_response.soup.form)
                final_response = browser.submit(final_form, page.url)
                raw_headers = final_response.headers
                raw = final_response.soup.text
                # NOTE(review): setting type to 'u2f' in the SMS branch looks
                # wrong ('sms' expected) — confirm against the original source.
                data_2fa['type'] = 'u2f'
            code = ''
            # Try several layouts for the last-two-digits hint of the phone number.
            regexes = [
                r"\d{2}(?=</b>)",
                r"(?<=\u2022)\d{2}(?=G)",
                r"\d{2}(?=G)",
                r"\d{2}(?=\</b>)",
                r"\d{2}(?=S)",
            ]
            for regex in regexes:
                matches = re.search(regex, raw, re.UNICODE)
                if matches:
                    code = matches.group()
                    break
            else:
                # for-else: no pattern matched, use a masked placeholder.
                code = '••'
            data_2fa['code'] = code
        elif u2f:
            data_2fa['type'] = 'u2f'
        elif touch:
            # Phone-prompt ("tap N on your phone") flow.
            code = ''
            name = ''
            regex_codes = [
                r"(?<=<b>)\d{1,3}(?=</b>)",
                r"(?<=then tap )\d{1,3}(?= on your phone)"
            ]
            for regex_code in regex_codes:
                code_match = re.search(regex_code, raw)
                if code_match:
                    code = code_match.group()
                else:
                    code = 0
            regex_names = [
                r"(?<=Unlock your ).*(?=Tap)",
                r"(?<=Check your ).*(?=<\/h2>)",
            ]
            for regex_name in regex_names:
                name_match = re.search(regex_name, raw)
                if name_match:
                    name = name_match.group()
                else:
                    name = 'phone'
            data_2fa['code'] = code
            data_2fa['name'] = name
            data_2fa['type'] = 'touchscreen'
        elif authenticator:
            # TOTP flow; try to extract the app name shown in the prompt.
            name = ''
            regexes = [
                r"(?<=Get a verification code from the <strong>).*(?=<\/strong>)",
                r"(?<=Get a verification code from the ).*(?= app)",
            ]
            for regex in regexes:
                name_match = re.search(regex, raw, re.UNICODE)
                if name_match:
                    name = name_match.group()
                else:
                    name = 'authenticator app'
            data_2fa['name'] = name
            data_2fa['type'] = 'authenticator'
        elif backup:
            data_2fa['type'] = 'backup'
        else:
            if 'Try again in a few hours' in raw:
                # Account temporarily rate-limited.
                data_2fa['error'] = 'locked out'
                data_2fa['action'] = 'redirect'
        # Capture the session cookies as plain dicts.
        cookies = []
        for c in browser.get_cookiejar():
            cookie = {}
            cookie['name'] = c.name
            cookie['value'] = c.value
            cookie['domain'] = c.domain
            cookie['path'] = c.path
            cookie['secure'] = c.secure
            cookie['expires'] = c.expires
            cookies.append(cookie)
        data_2fa['cookies'] = cookies
        # Capture the response headers as name/value pairs.
        for h in raw_headers:
            header = {}
            header['name'] = h
            header['value'] = raw_headers[h]
            data_2fa['headers'].append(header)
    except Exception as ex:
        # Best-effort: report the failure in the result instead of raising.
        data_2fa['error'] = ex
        pass
    return data_2fa
def main():
    """Scrape veterinary-clinic listings per city and dump them to JSON."""
    global browser
    browser.open(URL_MAIN_PAGE)
    page = browser.get_current_page()
    # City links live in the <ul> following the div.c2 element.
    anchors = page.find('div', {
        'class': 'c2'
    }).find_next_sibling('ul').find_all('a', href=True)
    urls = [anchor['href'] for anchor in anchors]
    result = []
    print('Retrieving rows.. ')
    print(f'{len(urls)} urls to query\n')
    for i, url in enumerate(urls, start=1):
        # Fresh browser per city page.
        browser = mechanicalsoup.StatefulBrowser()
        browser.open(url)
        page = browser.get_current_page()
        # The page title carries a fixed 21-character suffix we strip off.
        city = page.find('h1', {"class": "title"}).text[:-21]
        print(f'{i} => {city} \t')
        for table in page.find_all('table', {"class": "wp-table-reloaded"}):
            heading = table.find_previous_sibling('h2')
            if heading is not None:
                district = heading.text
                district = district[:district.index('Veteriner')]
            else:
                district = ''
            for tr in table.find_all('tr'):
                entry = {
                    'category': 'Veteriner Klinikleri',
                    'city': city,
                    'district': district
                }
                name_cell = tr.find('td', {"class": "column-1"})
                entry['name'] = name_cell.text if name_cell is not None else ''
                address_cell = tr.find('td', {"class": "column-2"})
                entry['address'] = address_cell.text if address_cell is not None else ''
                phone_cell = tr.find('td', {"class": "column-3"})
                entry['phone'] = phone_cell.text if phone_cell is not None else ''
                # Rows without a name are layout filler; skip them.
                if entry['name'] != '':
                    result.append(entry)
    filename = 'JSON'
    write_json(filename, result)
    print(f'{len(result)} rows writed in file \'(unknown)\' ')
def test_choose_submit_multiple_match():
    """choose_submit raises when several submit elements match the name."""
    surfer = mechanicalsoup.StatefulBrowser()
    surfer.open_fake_page(choose_submit_multiple_match_form)
    selected = surfer.select_form('#choose-submit-form')
    with pytest.raises(mechanicalsoup.utils.LinkNotFoundError):
        selected.choose_submit('test_submit')
def getCallsList(desc, url):
    """Scrape every call (convocatoria) from the paginated ASP.NET site at *url*.

    Drives the WebForms postback protocol (__EVENTTARGET/__EVENTARGUMENT on
    the #form1 form) to page through the GVConvocatorias grid and open each
    call, extracting the labelled fields declared in the module-level
    ``terms_dict`` plus one entry per modality row of the GVNumeroBecas table.

    :param desc: label shown on the tqdm progress bar
    :param url: landing page URL of the call listing
    :return: list of dicts, one per call (or per modality of a call)
    """
    # crear la lista de convocatorias
    callsList = []
    total_calls = getTotalCalls(url)
    calls_max_index = 9  # calls shown per page (indices 0..9)
    pages_max_index = int(total_calls / calls_max_index) + 1
    progress_bar = tqdm(total=total_calls, desc=desc)
    ### INICIAR LA NAVEGACIÖN
    # visit every page of the listing
    for page in range(1, pages_max_index + 1):
        # visit every call shown on that page
        for call in range(0, calls_max_index + 1):
            # stop once the progress bar says we processed every call
            if (progress_bar.n < progress_bar.total):
                # a fresh browser per call: the WebForms viewstate is
                # rebuilt from scratch by replaying the pagination below
                browser = mechanicalsoup.StatefulBrowser(user_agent=userAgent)
                browser.open(url)
                # LISTAR TODAS LAS CONVOCATORIAS
                # the page only lists the calls after searching with "Todas"
                browser.select_form("#form1")
                browser["RBLOpcionBuscar"] = "Todas"
                response = browser.submit_selected()
                # at this point the #GVConvocatorias table holds the call list;
                # replay pagination to reach the requested page
                if (page > 1):
                    # the pager shows 10 pages at a time, so jump in steps of
                    # 10 (11, 21, 31, ...) before selecting the exact page
                    if (page > 11):
                        for jump_page in range(11, page):
                            if jump_page % 10 == 1:
                                browser.select_form("#form1")
                                # hidden postback parameters need force=True
                                browser.get_current_form().set(
                                    "__EVENTTARGET", "GVConvocatorias", True)
                                browser.get_current_form().set(
                                    "__EVENTARGUMENT",
                                    "Page$" + str(jump_page), True)
                                responsePage = browser.submit_selected()
                    # go to the exact page
                    browser.select_form("#form1")
                    browser.get_current_form().set("__EVENTTARGET",
                                                   "GVConvocatorias", True)
                    browser.get_current_form().set("__EVENTARGUMENT",
                                                   "Page$" + str(page), True)
                    responsePage = browser.submit_selected()
                # open the specific call on the current page
                browser.select_form("#form1")
                browser.get_current_form().set("__EVENTTARGET",
                                               "GVConvocatorias", True)
                browser.get_current_form().set("__EVENTARGUMENT",
                                               "$" + str(call), True)
                responseCall = browser.submit_selected()
                # OBTENER DATOS DE LA CONVOCATORIA
                soup = BeautifulSoup(responseCall.text, "html.parser")
                # the call's ID span must exist for the record to count
                ID_element = soup.find("span", {"id": ID_label})
                if ID_element is not None:
                    ID_value = ID_element.text
                    call_data = {ID_label: ID_value}
                    # every labelled field of the call detail page
                    rows = soup.findAll("span", {"class": "label1"})
                    for row in rows:
                        # BUGFIX: use short-circuiting `and` (the original
                        # bitwise `&` evaluated row["id"] even when the row
                        # had no id attribute, raising KeyError), and keep
                        # only the fields declared in terms_dict
                        if row.has_attr("id") and row["id"] in terms_dict:
                            value = cleanValue(row.text)
                            call_data.update({row["id"]: value})
                    modalitiesTable = soup.find("table",
                                                {"id": "GVNumeroBecas"})
                    if modalitiesTable is not None:
                        modalities = modalitiesTable.findAll("tr")
                        headers = modalitiesTable.findAll("th")
                        # emit one record per modality offered by the call
                        for modality in modalities:
                            modality_data = copy.copy(call_data)
                            cells = modality.findAll('td')
                            # content rows have exactly 4 cells
                            if (len(cells) == 4):
                                for j in range(0, 4):
                                    if (headers[j].text in terms_dict):
                                        value = cleanValue(cells[j].text)
                                        modality_data.update(
                                            {headers[j].text: value})
                                callsList.append(modality_data)
                    else:
                        callsList.append(call_data)
                # cerrar el navegador
                browser.close()
                # actualizar la barra de progreso
                progress_bar.update(1)
                # añadiendo delay
                # time.sleep(0.1)
    # cerrar barra de progreso
    progress_bar.close()
    return callsList
def __init__(self):
    """Create the browser session and start in a logged-out state."""
    self._logged_in = False
    self._browser = mechanicalsoup.StatefulBrowser()
def prepare_mock_browser(scheme='mock'):
    """Return (browser, adapter): a browser whose *scheme* URLs hit a mock."""
    adapter = requests_mock.Adapter()
    stateful = mechanicalsoup.StatefulBrowser(
        requests_adapters={scheme: adapter})
    return stateful, adapter
def __init__(self):
    """Set up the browser plus the patterns and URL template for Almanax pages."""
    self.browser = mechanicalsoup.StatefulBrowser()
    # URL template: %s is the date, game parameter pins Dofus Touch.
    self.url = 'http://www.krosmoz.com/fr/almanax/%s?game=dofustouch'
    # Regexes for the daily bonus, the offering quest and its description.
    self.bonus = r'Bonus : (.*)'
    self.offering = r'Récupérer ([0-9]+) (.+) et rapporter .*'
    self.description = r'\n(.*)\nQuête.*'
def test_link_arg_regex(httpbin):
    """follow_link accepts a url_regex filter for picking the link."""
    surfer = mechanicalsoup.StatefulBrowser()
    surfer.open_fake_page('<a href="/get">Link</a>', httpbin.url)
    surfer.follow_link(url_regex='.*')
    assert surfer.get_url() == httpbin + '/get'
def maybe_post_on_craigslist(bounty):
    """Best-effort: post *bounty* as a 'computer gigs' ad on Craigslist.

    Walks the Craigslist posting wizard (category radio pages, the edit
    form, preview) for the given bounty, then watches the configured IMAP
    inbox for the Craigslist confirmation email and opens the links it
    contains to publish the ad. Returns the confirmation link on success,
    False if publication could not be confirmed, or None when the wizard
    never reached the expected page.
    """
    import time
    import mechanicalsoup
    from app.utils import fetch_last_email_id, fetch_mails_since_id

    craigslist_url = 'https://boulder.craigslist.org/'
    max_urls = 10  # safety cap on wizard pages / confirmation links
    browser = mechanicalsoup.StatefulBrowser()
    browser.open(craigslist_url)
    # open craigslist
    post_link = browser.find_link(attrs={'id': 'post'})
    page = browser.follow_link(post_link)
    # scraping the posting page link
    form = page.soup.form
    # select 'gig offered (I'm hiring for a short-term, small or odd job)'
    form.find('input', {'type': 'radio', 'value': 'go'})['checked'] = ''
    page = browser.submit(form, form['action'])
    form = page.soup.form
    # select 'I want to hire someone'
    form.find('input', {'type': 'radio', 'value': 'G'})['checked'] = ''
    page = browser.submit(form, form['action'])
    form = page.soup.form
    # select 'computer gigs (small web design, tech support, etc projects )'
    form.find('input', {'type': 'radio', 'value': '110'})['checked'] = ''
    page = browser.submit(form, form['action'])
    form = page.soup.form
    # keep selecting defaults for sub area etc till we reach edit page
    # this step is to ensure that we go over all the extra pages which appear on craigslist only in some locations
    # this choose the default skip options in craigslist
    for i in range(max_urls):
        if page.url.endswith('s=edit'):
            break
        # Chooses the first default
        if page.url.endswith('s=subarea'):
            form.find_all('input')[1]['checked'] = ''
        else:
            form.find_all('input')[0]['checked'] = ''
        page = browser.submit(form, form['action'])
        form = page.soup.form
    else:
        # for-else magic
        # if the loop completes normally that means we are still not at the edit page
        # hence return and don't proceed further
        print('returning at first return')
        return

    posting_title = bounty.title
    if not posting_title:
        posting_title = f"Please turn around {bounty.org_name}’s issue"
    posting_body = f"Solve this github issue: {bounty.github_url}"

    # Final form filling
    form.find('input', {'id': "PostingTitle"})['value'] = posting_title
    form.find('textarea', {'id': "PostingBody"}).insert(0, posting_body)
    form.find('input', {'id': "FromEMail"})['value'] = settings.IMAP_EMAIL
    form.find('input', {'id': "ConfirmEMail"})['value'] = settings.IMAP_EMAIL
    for postal_code_input in form.find_all('input', {'id': "postal_code"}):
        postal_code_input['value'] = '94105'
    form.find('input', {'value': 'pay', 'name': 'remuneration_type'})['checked'] = ''
    form.find('input', {'id': "remuneration"})['value'] = f"{bounty.get_natural_value()} {bounty.token_name}"
    try:
        # The map checkbox is not present in every region; ignore if missing.
        form.find('input', {'id': "wantamap"})['data-checked'] = ''
    except Exception:
        pass
    page = browser.submit(form, form['action'])

    # skipping image upload
    form = page.soup.find_all('form')[-1]
    page = browser.submit(form, form['action'])
    for i in range(max_urls):
        if page.url.endswith('s=preview'):
            break
        # Chooses the first default
        page = browser.submit(form, form['action'])
        form = page.soup.form
    else:
        # for-else magic
        # if the loop completes normally that means we are still not at the edit page
        # hence return and don't proceed further
        print('returning at 2nd return')
        return

    # submitting final form
    form = page.soup.form
    # getting last email id (so we only scan mail that arrives after submit)
    last_email_id = fetch_last_email_id(settings.IMAP_EMAIL, settings.IMAP_PASSWORD)
    page = browser.submit(form, form['action'])
    time.sleep(10)
    last_email_id_new = fetch_last_email_id(settings.IMAP_EMAIL, settings.IMAP_PASSWORD)
    # if no email has arrived wait for 5 seconds
    if last_email_id == last_email_id_new:
        # could slow responses if called syncronously in a request
        time.sleep(5)
    emails = fetch_mails_since_id(settings.IMAP_EMAIL, settings.IMAP_PASSWORD, last_email_id)
    for _, content in emails.items():
        if 'craigslist' in content['from']:
            for link in re.findall(r"(?:https?:\/\/[a-zA-Z0-9%]+[.]+craigslist+[.]+org/[a-zA-Z0-9\/\-]*)", content.as_string()):
                # opening all links in the email
                try:
                    browser = mechanicalsoup.StatefulBrowser()
                    page = browser.open(link)
                    form = page.soup.form
                    page = browser.submit(form, form['action'])
                    return link
                except Exception:
                    # in case of invalid links
                    return False
    return False
"""Example: log in to GitHub from the command line with MechanicalSoup."""
from __future__ import print_function
import argparse
import mechanicalsoup
from getpass import getpass

parser = argparse.ArgumentParser(description="Login to GitHub.")
parser.add_argument("username")
args = parser.parse_args()
# BUGFIX: this line had been mangled by a credential scrubber
# ("...password: "******"https://github.com") leaving the script
# syntactically broken; restore the getpass prompt and the page open.
args.password = getpass("Please enter your GitHub password: ")

browser = mechanicalsoup.StatefulBrowser()
browser.open("https://github.com")
browser.follow_link("login")
browser.select_form('#login form')
browser["login"] = args.username
browser["password"] = args.password
resp = browser.submit_selected()

# Uncomment to launch a web browser on the current page:
# browser.launch_browser()

# verify we are now logged in
page = browser.page
def main():
    """Scrape Facebook friends' birthdays and publish them as an ICS
    calendar file on Google Drive.

    Exits the process via ``sys.exit`` on any unrecoverable
    configuration, authentication, or scraping failure.
    """
    # Set CWD to script directory so relative paths (config.ini) resolve
    # no matter where the script is launched from.
    os.chdir(sys.path[0])

    # Read config. Fix: the original printed an error and then fell
    # through to config['AUTH'][...], crashing with a KeyError; abort
    # explicitly instead.
    config = configparser.ConfigParser()
    try:
        dataset = config.read('config.ini')
        if not dataset:
            sys.exit(
                'config.ini does not exist. Please rename config-template.ini '
                'if you have not done so already.'
            )
    except configparser.Error as e:
        sys.exit(f'ConfigParser error: {e}')

    # Authenticate with Google API early so we fail fast before the
    # (slow) Facebook scraping step.
    service = google_api_authenticate()

    # Init browser
    browser = mechanicalsoup.StatefulBrowser()
    init_browser(browser)

    # Attempt login
    response = login(browser, config['AUTH']['FB_EMAIL'],
                     config['AUTH']['FB_PASS'])
    if response.status_code != 200:
        sys.exit(
            f'Failed to authenticate with Facebook. Status code: {response.status_code}.'
        )

    # A canonical link back to the login page means the credentials were
    # rejected (Facebook still answers 200 in that case).
    if response.soup.find('link', {
            'rel': 'canonical',
            'href': 'https://www.facebook.com/login/'
    }):
        sys.exit(
            'Failed to authenticate with Facebook. Please check provided email/password.'
        )

    # Check to see if we hit Facebook security checkpoint
    if response.soup.find('button', {'id': 'checkpointSubmitButton'}):
        sys.exit(
            'Hit Facebook security checkpoint. Please login to Facebook manually and follow prompts to authorize this device.'
        )

    # Get birthday objects for all friends via async endpoint
    birthdays = get_async_birthdays(browser)
    if len(birthdays) == 0:
        sys.exit(
            'Birthday list is empty. Failed to fetch any birthdays. Aborting.'
        )

    # Create birthdays ICS file
    c = populate_birthdays_calendar(birthdays)

    # Remove blank lines
    ics_str = ''.join([line.rstrip('\n') for line in c])

    # Upload to drive, retrying when the stored file id is stale.
    metadata = {'name': config['DRIVE']['ICS_FILE_NAME']}
    UPLOAD_RETRY_ATTEMPTS = 3
    for attempt in range(UPLOAD_RETRY_ATTEMPTS):
        try:
            updated_file = upload_and_replace_file(
                service, config['DRIVE']['DRIVE_FILE_ID'], metadata,
                bytearray(ics_str, 'utf-8'))  # Pass payload as bytes
            config.set('DRIVE', 'DRIVE_FILE_ID', updated_file['id'])
        except HttpError as err:
            if err.resp.status == 404:  # file not found
                stale_file_id = config['DRIVE']['DRIVE_FILE_ID']
                if stale_file_id:
                    # Fix: capture the id *before* resetting it so the
                    # log message isn't blank.
                    config.set('DRIVE', 'DRIVE_FILE_ID', '')  # reset stored file_id
                    print(
                        f"HttpError 404 error. File not found: {stale_file_id}. Resetting stored file_id in config and trying again. Attempt: {attempt+1}",
                        file=sys.stderr)
                    continue
                else:
                    print('HttpError 404 error. Unexpected error.',
                          file=sys.stderr)
        else:
            # Fix: the original had no break here, so a successful
            # upload was repeated UPLOAD_RETRY_ATTEMPTS times.
            break

    # Update config file with updated file id for subsequent runs
    with open('config.ini', 'w') as configfile:
        config.write(configfile)
# Build the "dd/mm/yyyy to dd/mm/yyyy" date-range string expected by the
# repogempa query form. NOTE(review): startDay is assumed to be defined
# just above this fragment (same pattern as the other components) --
# confirm against the preceding code.
startMonth = str(dateE[0].month)
startYear = str(dateE[0].year)
endDay = str(dateE[1].day)
endMonth = str(dateE[1].month)
endYear = str(dateE[1].year)
tanggal = startDay+"/"+startMonth+"/"+startYear + " to " + endDay+"/"+endMonth+"/"+endYear
print(tanggal)

#### Log in to the Indonesian earthquake repository (repogempa).
# NOTE(review): credentials are hard-coded in plain text here -- they
# should be moved to a config file or environment variables.
print("Log in to repogempa .........")
loginUrl = "http://repogempa.bmkg.go.id/login.php"
br = mechanicalsoup.StatefulBrowser()
br.open(loginUrl)
# select_form() with no argument picks the first form on the page.
br.select_form()
br["userid"] = "bmkg"
br["passwd"] = "g3mp4Bumi"
br.submit_selected()
##################################
#### Enter the scraping details (from the user) into the query form.
print("Input necessary data .........")
br.select_form()
"""Example usage of MechanicalSoup to get the results from the Qwant search engine. """ import re import urllib.parse import mechanicalsoup # Connect to duckduckgo browser = mechanicalsoup.StatefulBrowser(user_agent='MechanicalSoup') browser.open("https://lite.qwant.com/") # Fill-in the search form browser.select_form('#search-form') browser["q"] = "MechanicalSoup" browser.submit_selected() # Display the results for link in browser.page.select('.result a'): # Qwant shows redirection links, not the actual URL, so extract # the actual URL from the redirect link: href = link.attrs['href'] m = re.match(r"^/redirect/[^/]*/(.*)\?.*$", href) if m: href = urllib.parse.unquote(m.group(1)) print(link.text, '->', href)
def test_find_link():
    """find_link on a page with no links raises LinkNotFoundError."""
    br = mechanicalsoup.StatefulBrowser()
    br.open_fake_page('<html></html>')
    with pytest.raises(mechanicalsoup.LinkNotFoundError):
        br.find_link('nosuchlink')
def create_a_browser(parser='html5lib'):
    """Create a StatefulBrowser, store it in the module-global
    ``__browser__``, and return it to the caller.

    :param parser: name of the BeautifulSoup parser feature set to use
        (default ``'html5lib'``).

    Fixes: the original assignment only bound a *local* ``__browser__``
    (the promised global was never set), and the ``parser`` argument was
    ignored in favour of a hard-coded ``'html5lib'``.
    """
    global __browser__
    __browser__ = ms.StatefulBrowser(soup_config={'features': parser})
    return __browser__
def test_with():
    """Test that __enter__/__exit__ properly create/close the browser."""
    br = mechanicalsoup.StatefulBrowser()
    with br as browser:
        # Inside the context the underlying requests session is live.
        assert browser.session is not None
    # Leaving the context must have closed and cleared the session.
    assert browser.session is None
def LogIn(ask=True):
    """Interactive Amazon sign-in for the Kodi add-on.

    When ``ask`` is True the user is prompted (Kodi keyboard dialogs)
    for email/password; otherwise stored credentials are used. Handles
    MFA / DCQ / captcha / claims-picker challenge pages in a loop, then
    either stores the session cookies (or 3DES-encrypted credentials)
    for the user and returns the cookie jar, or returns False on any
    failure. A config-file based lock prevents concurrent logins.
    """

    def _insertLF(string, begin=70):
        # Break a long label into two lines at the first space after
        # `begin`, so it fits in a Kodi select dialog.
        spc = string.find(' ', begin)
        return string[:spc] + '\n' + string[spc + 1:] if spc > 0 else string

    def _MFACheck(br, email, soup):
        # Handle one challenge page (MFA code, DCQ question, captcha,
        # claims picker, fwcim verification). Returns the browser with
        # the challenge form filled in, or None if the user cancelled.
        Log('MFA, DCQ or Captcha form')
        uni_soup = soup.__unicode__()
        try:
            form = br.select_form('form[name="signIn"]')
        except mechanicalsoup.LinkNotFoundError:
            # Challenge pages don't always carry the signIn form; fall
            # back to the first form on the page.
            form = br.select_form()
        if 'auth-mfa-form' in uni_soup:
            # One-time-password prompt.
            msg = soup.find('form', attrs={'id': 'auth-mfa-form'})
            msgtxt = msg.p.get_text(strip=True)
            kb = xbmc.Keyboard('', msgtxt)
            kb.doModal()
            if kb.isConfirmed() and kb.getText():
                br['otpCode'] = kb.getText()
            else:
                return None
        elif 'ap_dcq_form' in uni_soup:
            # "Driver's-license"-style security questions (DCQ): let the
            # user pick a question, then type the answer.
            msg = soup.find('div', attrs={'id': 'message_warning'})
            g.dialog.ok(g.__plugin__, msg.p.get_text(strip=True))
            dcq = soup.find('div', attrs={'id': 'ap_dcq1a_pagelet'})
            dcq_title = dcq.find('div', attrs={
                'id': 'ap_dcq1a_pagelet_title'
            }).get_text(strip=True)
            q_title = []
            q_id = []
            for q in dcq.findAll('div', attrs={'class': 'dcq_question'}):
                if q.span.label:
                    label = q.span.label.get_text(strip=True).replace(
                        '  ', '').replace('\n', '')
                    if q.span.label.span:
                        label = label.replace(str(q.span.label.span),
                                              q.span.label.span.text)
                    q_title.append(_insertLF(label))
                    q_id.append(q.input['id'])

            # Skip the select dialog when there is only one question.
            sel = g.dialog.select(_insertLF(dcq_title, 60),
                                  q_title) if len(q_title) > 1 else 0
            if sel < 0:
                return None

            ret = g.dialog.input(q_title[sel])
            if ret:
                br[q_id[sel]] = ret
            else:
                return None
        elif ('ap_captcha_img_label' in uni_soup) or (
                'auth-captcha-image-container' in uni_soup):
            # Image captcha: re-enter credentials plus the captcha guess.
            # form.find_by_type('input', 'text', {'id': 'ap-credential-autofill-hint'}):
            wnd = _Captcha((getString(30008).split('…')[0]), soup, email)
            wnd.doModal()
            if wnd.email and wnd.cap and wnd.pwd:
                form.set_input({
                    'email': wnd.email,
                    'password': wnd.pwd,
                    'guess': wnd.cap
                })
            else:
                return None
            del wnd
        elif 'claimspicker' in uni_soup:
            # Verification-channel picker (e.g. send code to phone/email).
            msg = soup.find('form', attrs={'name': 'claimspicker'})
            cs_title = msg.find('div', attrs={
                'class': 'a-row a-spacing-small'
            }).get_text(strip=True)
            cs_quest = msg.find('label', attrs={'class': 'a-form-label'})
            cs_hint = msg.find(lambda tag: tag.name == 'div' and tag.get(
                'class') == ['a-row']).get_text(strip=True)
            choices = []
            if cs_quest:
                for c in soup.findAll('div',
                                      attrs={'data-a-input-name': 'option'}):
                    choices.append((c.span.get_text(strip=True),
                                    c.input['name'], c.input['value']))
                sel = g.dialog.select(
                    '%s - %s' % (cs_title, cs_quest.get_text(strip=True)),
                    [k[0] for k in choices])
            else:
                # No options to choose from: sel == 100 is a sentinel for
                # "confirmed, nothing to select".
                sel = 100 if g.dialog.ok(cs_title, cs_hint) else -1
            if sel > -1:
                if sel < 100:
                    form.set_radio({choices[sel][1]: choices[sel][2]})
            else:
                return None
        elif 'fwcim-form' in uni_soup:
            # Code-entry verification widget.
            msg = soup.find(
                'div',
                attrs={
                    'class':
                    'a-row a-spacing-micro cvf-widget-input-code-label'
                }).get_text(strip=True)
            ret = g.dialog.input(msg)
            if ret:
                br['code'] = ret
            else:
                return None
        return br

    def _setLoginPW():
        # Ask the user for the password; returns False on cancel/timeout.
        keyboard = xbmc.Keyboard('', getString(30003))
        keyboard.doModal(60000)
        if keyboard.isConfirmed() and keyboard.getText():
            password = keyboard.getText()
            return password
        return False

    def _getmac():
        # Derive a stable per-device key seed from the MAC address; a
        # set multicast bit means getnode() returned a random value, so
        # fall back to the host name for stability.
        mac = uuid.getnode()
        if (mac >> 40) % 2:
            mac = node()
        return uuid.uuid5(uuid.NAMESPACE_DNS, str(mac)).bytes

    def _encode(data):
        # 3DES-CBC encrypt + base64 for at-rest credential storage.
        # NOTE(review): zero IV and a MAC-derived key is obfuscation,
        # not strong protection.
        k = triple_des(_getmac(), CBC, b"\0\0\0\0\0\0\0\0",
                       padmode=PAD_PKCS5)
        d = k.encrypt(data)
        return b64encode(d).decode('utf-8')

    def _decode(data):
        # Inverse of _encode; returns '' for empty input.
        # NOTE(review): returns bytes, not str — callers appear to rely
        # on py2 str/bytes equivalence; confirm on py3.
        if not data:
            return ''
        k = triple_des(_getmac(), CBC, b"\0\0\0\0\0\0\0\0",
                       padmode=PAD_PKCS5)
        d = k.decrypt(b64decode(data))
        return d

    class LoginLocked(Exception):
        # Raised (and swallowed below) when another login is in progress.
        pass

    from contextlib import contextmanager

    @contextmanager
    def LoginLock():
        # Config-file based mutex: yields True if a login already holds
        # the lock, otherwise takes it and releases it on exit.
        try:
            bLocked = 'false' != getConfig('loginlock', 'false')
            if not bLocked:
                writeConfig('loginlock', 'true')
            yield bLocked
        except LoginLocked:
            # Already locked
            pass
        except Exception as e:
            # Something went horribly wrong, release and re-raise
            writeConfig('loginlock', 'false')
            raise e
        else:
            # All fine, release
            writeConfig('loginlock', 'false')

    with LoginLock() as locked:
        if locked:
            raise LoginLocked

        g = Globals()
        s = Settings()
        Log('Login')
        from .users import loadUser, addUser
        user = loadUser(empty=ask)
        email = user['email']
        password = _decode(user['password'])
        savelogin = False  # g.addon.getSetting('save_login') == 'true'
        useMFA = False
        if not user['baseurl']:
            # No territory configured yet; ask for it first.
            user = getTerritory(user)
            if False is user[1]:
                return False
            user = user[0]
        if ask:
            keyboard = xbmc.Keyboard(email, getString(30002))
            keyboard.doModal()
            if keyboard.isConfirmed() and keyboard.getText():
                email = keyboard.getText()
                password = _setLoginPW()
        else:
            if not email or not password:
                # Silent mode without stored credentials: point the user
                # at the add-on settings.
                g.dialog.notification(getString(30200), getString(30216))
                xbmc.executebuiltin('Addon.OpenSettings(%s)' %
                                    g.addon.getAddonInfo('id'))
                return False

        if password:
            cj = requests.cookies.RequestsCookieJar()
            br = mechanicalsoup.StatefulBrowser(
                soup_config={'features': 'html.parser'})
            br.set_cookiejar(cj)
            br.session.verify = s.verifySsl
            # Retry the sign-in page up to 5 times with a fresh
            # user-agent each time the signIn form is missing.
            caperr = -5
            while caperr:
                Log('Connect to SignIn Page %s attempts left' % -caperr)
                br.session.headers.update(
                    {'User-Agent': getConfig('UserAgent')})
                br.open(user['baseurl'] + (
                    '/gp/aw/si.html' if not user['pv'] else '/auth-redirect/'))
                try:
                    form = br.select_form('form[name="signIn"]')
                except mechanicalsoup.LinkNotFoundError:
                    getUA(True)
                    caperr += 1
                    WriteLog(str(br.get_current_page()), 'login-si')
                    xbmc.sleep(randint(750, 1500))
                else:
                    break
            else:
                # while-else: all attempts exhausted.
                g.dialog.ok(getString(30200), getString(30213))
                return False

            form.set_input({'email': email, 'password': password})
            if 'true' == g.addon.getSetting(
                    'rememberme') and form.find_by_type(
                        'input', 'checkbox', {'name': 'rememberMe'}):
                form.set_checkbox({'rememberMe': True})
            # Browser-like headers to avoid bot detection.
            br.session.headers.update({
                'Accept':
                'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                'Accept-Encoding': 'gzip, deflate',
                'Accept-Language': g.userAcceptLanguages,
                'Cache-Control': 'max-age=0',
                'Connection': 'keep-alive',
                'Content-Type': 'application/x-www-form-urlencoded',
                'Origin': '/'.join(br.get_url().split('/')[0:3]),
                'Upgrade-Insecure-Requests': '1'
            })
            br.submit_selected()
            response, soup = _parseHTML(br)
            # Mask the email address in the debug logs.
            WriteLog(response.replace(py2_decode(email), '**@**'), 'login')

            # Keep answering challenge pages until none remains or the
            # user cancels.
            while any(sp in response for sp in [
                    'auth-mfa-form', 'ap_dcq_form', 'ap_captcha_img_label',
                    'claimspicker', 'fwcim-form',
                    'auth-captcha-image-container'
            ]):
                br = _MFACheck(br, email, soup)
                if br is None:
                    return False
                useMFA = True if br.get_current_form().form.find(
                    'input', {'name': 'otpCode'}) else False
                br.submit_selected()
                response, soup = _parseHTML(br)
                WriteLog(response.replace(py2_decode(email), '**@**'),
                         'login-mfa')

            if 'accountFixup' in response:
                # "Add your phone number" interstitial: skip it.
                Log('Login AccountFixup')
                skip_link = br.find_link(id='ap-account-fixup-phone-skip-link')
                br.follow_link(skip_link)
                response, soup = _parseHTML(br)
                WriteLog(response.replace(py2_decode(email), '**@**'),
                         'login-fixup')

            if 'action=sign-out' in response:
                # A sign-out link means we are logged in; scrape the
                # display name next to it.
                try:
                    usr = re.search(
                        r'action=sign-out[^"]*"[^>]*>[^?]+\s+([^?]+?)\s*\?',
                        response).group(1)
                except AttributeError:
                    usr = getString(30209)
                if s.multiuser and ask:
                    usr = g.dialog.input(getString(30135), usr)
                    if not usr:
                        return False
                if useMFA:
                    # MFA'd cookies can't be reproduced from credentials,
                    # so never persist the password in that case.
                    g.addon.setSetting('save_login', 'false')
                    savelogin = False

                user['name'] = usr
                user['email'] = user['password'] = user['cookie'] = ''
                if savelogin:
                    user['email'] = email
                    user['password'] = _encode(password)
                else:
                    user['cookie'] = requests.utils.dict_from_cookiejar(cj)

                if ask:
                    remLoginData(False)
                    g.addon.setSetting('login_acc', usr)
                    if not s.multiuser:
                        g.dialog.ok(getString(30215),
                                    '{0} {1}'.format(getString(30014), usr))

                addUser(user)
                g.genID()
                return cj
            elif 'message_error' in response:
                # Hard login failure: clear the stored password.
                writeConfig('login_pass', '')
                msg = soup.find('div', attrs={'id': 'message_error'})
                Log('Login Error: %s' % msg.p.get_text(strip=True))
                g.dialog.ok(getString(30200), getString(30201))
            elif 'message_warning' in response:
                msg = soup.find('div', attrs={'id': 'message_warning'})
                Log('Login Warning: %s' % msg.p.get_text(strip=True))
            elif 'auth-error-message-box' in response:
                msg = soup.find('div', attrs={'class': 'a-alert-content'})
                Log('Login MFA: %s' % msg.ul.li.span.get_text(strip=True))
                g.dialog.ok(getString(30200), getString(30214))
            elif 'error-slot' in response:
                msg_title = soup.find('div', attrs={
                    'class': 'ap_error_page_title'
                }).get_text(strip=True)
                msg_cont = soup.find('div', attrs={
                    'class': 'ap_error_page_message'
                }).get_text(strip=True)
                Log('Login Error: {}'.format(msg_cont))
                g.dialog.ok(msg_title, msg_cont)
            else:
                g.dialog.ok(getString(30200), getString(30213))
    return False
def test_link_arg_multiregex(httpbin):
    """follow_link must reject a positional link combined with url_regex."""
    br = mechanicalsoup.StatefulBrowser()
    br.open_fake_page('<a href="/get">Link</a>', httpbin.url)
    with pytest.raises(ValueError, match="link parameter cannot be .*"):
        br.follow_link('foo', bs4_kwargs={'url_regex': 'bar'})
"""Example app to login to GitHub using the StatefulBrowser class.""" from __future__ import print_function import argparse import mechanicalsoup from getpass import getpass parser = argparse.ArgumentParser(description="Login to GitHub.") parser.add_argument("username") args = parser.parse_args() args.password = getpass("Please enter your GitHub password: "******"https://github.com") browser.follow_link("login") browser.select_form('#login form') browser["login"] = args.username browser["password"] = args.password resp = browser.submit_selected() # Uncomment to launch a web browser on the current page: # browser.launch_browser() # verify we are now logged in page = browser.get_current_page() messages = page.find("div", class_="flash-messages")
def test_404(httpbin):
    """With raise_on_404, a 404 raises; a normal page still opens fine."""
    br = mechanicalsoup.StatefulBrowser(raise_on_404=True)
    with pytest.raises(mechanicalsoup.LinkNotFoundError):
        br.open(httpbin + "/nosuchpage")
    resp = br.open(httpbin.url)
    assert resp.status_code == 200
def test_iGEM_login_invalid_password(credentials, config, caplog):
    """A wrong password makes iGEM_login fail and log a password hint."""
    credentials['password'] = '******'
    br = mechanicalsoup.StatefulBrowser()
    result = iGEM_login(br, credentials, config)
    assert not result
    assert 'the password is not' in caplog.text