def __search_by_config__(config):
    '''
    Run a search and return the refined, size-limited list of results.

    config keys read here:
        QUERY_TEXT   -- passed to __get_url__ to build the query URL
        REFINE_REGEX -- passed to __match_exp__ together with each
                        result's "detail" text
        RESULT_LIMIT -- maximum number of results returned

    Returns a list of dicts. Each dict has "detail"; "name" is present
    when a title div preceded the detail div, and "url" is present only
    when a link containing "bajar.php" was found in the detail div.
    '''
    # ----- start browser -----
    browser = RoboBrowser(history=True)
    browser.parser = "lxml"

    query_url = __get_url__(config["QUERY_TEXT"])
    log("Query Url: \t\t" + query_url)
    browser.open(query_url)

    all_content = str(browser.find_all())
    # NOTE(review): this flag is True when NO_RESULT_STR *is* present,
    # which looks inverted for "has results". It is only logged, so the
    # original value is kept -- confirm the intent.
    has_init_results = NO_RESULT_STR in all_content
    log("Has initial results: \t" + str(has_init_results))

    # ----- create raw results list -----
    results = []
    current_result = {}
    for div in browser.select("div"):
        div_id = div.get("id")

        if div_id == "menu_detalle_buscador":
            # A title div starts a new result.
            current_result = {}
            current_result["name"] = div.select(
                "#menu_titulo_buscador a")[0].text

        if div_id == "buscador_detalle":
            # The detail div completes the result started by the title div.
            current_result["detail"] = div.select(
                "#buscador_detalle_sub")[0].text
            for current_link in div.select("#buscador_detalle_sub_datos a"):
                current_link_href = str(current_link.get("href"))
                if "bajar.php" in current_link_href:
                    current_result["url"] = current_link_href
            results.append(current_result)

    # ----- apply regex -----
    # A list (not filter()) so the slice below also works on Python 3,
    # where filter() returns a non-subscriptable iterator.
    refine_regex = config["REFINE_REGEX"]
    result_divs = [item for item in results
                   if __match_exp__(refine_regex, item["detail"])]

    # ----- apply limit -----
    return result_divs[:config["RESULT_LIMIT"]]
def authenticate(self, username=None, password=None):
    """
    Authenticate against MyMedicare.gov by submitting its login form.

    The login page is opened with RoboBrowser, the form is filled in and
    submitted, and the login counts as successful when the browser ends
    up on the dashboard URL.

    :param username: MyMedicare user name; also the local User.username.
    :param password: MyMedicare password (only sent to the remote form).
    :return: the matching (or newly created) User on success, otherwise
        None.
    """
    # Without credentials there is nothing to submit.
    if not username or not password:
        return None

    login_url = 'https://www.mymedicare.gov/default.aspx'
    dashboard_url = "https://www.mymedicare.gov/dashboard.aspx"

    rb = RoboBrowser()
    rb.parser = 'lxml'
    rb.open(login_url)

    # Get the form content
    form = rb.get_form()
    if settings.DEBUG:
        print("Page:", rb)

    # We will be working with these form fields.
    # Set them as variables for easier re-use
    form_pwd = "ctl00$ContentPlaceHolder1$ctl00$HomePage$SWEPassword"
    # NOTE(review): this key does not look like the other ASP.NET field
    # names and appears masked in this source -- confirm the real name.
    form_usr = "******"
    form_agree = "ctl00$ContentPlaceHolder1$ctl00$HomePage$Agree"
    form_create_acc = "ctl00$ContentPlaceHolder1$ctl00$HomePage$lnk" \
                      "CreateAccount"

    # Set the form field values
    form.fields[form_usr].value = username
    form.fields[form_pwd].value = password

    # There is a javascript popup after hitting submit that sets the
    # form_agree field to "True". Default in the form is "False".
    form.fields[form_agree].value = "True"

    # Remove the CreateAccount field. It seems to drive the form
    # to the registration page.
    form.fields.pop(form_create_acc)

    # Feed the dynamic ASP.NET validator fields back into the form so
    # that it validates on submission.
    for state_field in ('__VIEWSTATEGENERATOR', '__VIEWSTATE',
                        '__EVENTVALIDATION'):
        form.fields[state_field].value = form.fields[state_field]._value

    # Prepare the form for submission
    form.serialize()

    # submit the form
    rb.submit_form(form)

    # If the login was successful then we would be redirected to the
    # dashboard. Anything else means the MyMedicare login failed.
    if rb.url != dashboard_url:
        return None

    # Get the name from the welcome banner. Defaults keep user creation
    # working when the banner is missing or empty.
    first_name = ''
    last_name = ''
    welcome = rb.find("li", {"id": "welcomeli"})
    if welcome is not None and welcome.contents:
        my_name = welcome.contents[0].replace("Welcome, ", "")
        # split() without arguments drops the empty pieces that trailing
        # or repeated spaces would otherwise produce.
        name_parts = my_name.split()
        if name_parts:
            first_name = name_parts[0]
            last_name = name_parts[-1]

    try:
        user = User.objects.get(username=username)
    except User.DoesNotExist:
        # Create a new user. Note that we can set password
        # to anything, because it won't be checked; the password
        # from the external backend is checked (coming from settings).
        user = User(username=username, password='******',
                    first_name=first_name,
                    last_name=last_name)
        user.save()
        UserProfile.objects.get_or_create(user=user, user_type='BEN')
        group = Group.objects.get(name='BlueButton')
        user.groups.add(group)
    return user
def authenticate(self, username=None, password=None):
    """
    Authenticate against MyMedicare.gov by submitting its login form.

    The user name is stripped and lower-cased first. The login page is
    then opened with RoboBrowser, the form is filled in and submitted,
    and the login counts as successful when the browser ends up on the
    dashboard URL.

    :param username: MyMedicare user name; its normalized form is also
        used as the local User.username.
    :param password: MyMedicare password (only sent to the remote form).
    :return: the matching (or newly created) User on success, otherwise
        None.
    """
    # Without credentials there is nothing to submit; this also avoids
    # calling string methods on the default username=None.
    if not username or not password:
        return None
    username = username.strip().lower()

    login_url = 'https://www.mymedicare.gov/default.aspx'
    dashboard_url = "https://www.mymedicare.gov/dashboard.aspx"

    rb = RoboBrowser()
    rb.parser = 'lxml'
    rb.open(login_url)

    # Get the form content
    form = rb.get_form()
    if settings.DEBUG:
        print("Page:", rb)

    # We will be working with these form fields.
    # Set them as variables for easier re-use
    form_pwd = "ctl00$ContentPlaceHolder1$ctl00$HomePage$SWEPassword"
    # NOTE(review): this key does not look like the other ASP.NET field
    # names and appears masked in this source -- confirm the real name.
    form_usr = "******"
    form_agree = "ctl00$ContentPlaceHolder1$ctl00$HomePage$Agree"
    form_create_acc = "ctl00$ContentPlaceHolder1$ctl00$HomePage$lnk" \
                      "CreateAccount"

    # Set the form field values
    form.fields[form_usr].value = username
    form.fields[form_pwd].value = password

    # There is a javascript popup after hitting submit that sets the
    # form_agree field to "True". Default in the form is "False".
    form.fields[form_agree].value = "True"

    # Remove the CreateAccount field. It seems to drive the form
    # to the registration page.
    form.fields.pop(form_create_acc)

    # Feed the dynamic ASP.NET validator fields back into the form so
    # that it validates on submission.
    for state_field in ('__VIEWSTATEGENERATOR', '__VIEWSTATE',
                        '__EVENTVALIDATION'):
        form.fields[state_field].value = form.fields[state_field]._value

    # Prepare the form for submission
    form.serialize()

    # submit the form
    rb.submit_form(form)

    # If the login was successful then we would be redirected to the
    # dashboard. Anything else means the MyMedicare login failed.
    if rb.url != dashboard_url:
        return None

    # Get the name from the welcome banner. Defaults keep user creation
    # working when the banner is missing or empty.
    first_name = ''
    last_name = ''
    welcome = rb.find("li", {"id": "welcomeli"})
    if welcome is not None and welcome.contents:
        my_name = welcome.contents[0].replace("Welcome, ", "")
        # split() without arguments drops the empty pieces that trailing
        # or repeated spaces would otherwise produce.
        name_parts = my_name.split()
        if name_parts:
            first_name = name_parts[0]
            last_name = name_parts[-1]

    try:
        user = User.objects.get(username=username)
    except User.DoesNotExist:
        # Create a new user. Note that we can set password
        # to anything, because it won't be checked; the password
        # from the external backend is checked (coming from settings).
        user = User(username=username, password='******',
                    first_name=first_name,
                    last_name=last_name)
        user.save()
        UserProfile.objects.get_or_create(
            user=user, user_type='BEN')
        group = Group.objects.get(name='BlueButton')
        user.groups.add(group)
    return user
def connect(request, mmg):
    """
    Login to MyMedicare.gov using RoboBrowser

    :param request: request object handed to ``messages.error`` when the
        login does not reach the dashboard.
    :param mmg: dict holding the credentials under ``'mmg_user'`` and
        ``'mmg_pwd'``. It is updated in place and is also the return
        value.
    :return: the same ``mmg`` dict with ``'status'`` ("OK" or "FAIL"),
        ``'url'`` and ``'robobrowser'`` set; on success it also carries
        ``'mmg_account'`` and ``'mmg_name'`` (either may be None when the
        corresponding page element is not found).
    """
    login_url = 'https://www.mymedicare.gov/default.aspx'
    dashboard_url = "https://www.mymedicare.gov/dashboard.aspx"

    # The caller's dict is reused (not copied) and starts out as failed.
    mmg_back = mmg
    mmg_back['status'] = "FAIL"

    PARSER = BS_PARSER
    if not PARSER:
        # The parser name is passed as a lazy %-argument; without the
        # placeholder the logging module reports a formatting error.
        logger.debug('Default Parser for BeautifulSoup: %s', 'lxml')
        PARSER = 'lxml'

    username = mmg['mmg_user']
    password = mmg['mmg_pwd']

    # Call the default page
    # We will then want to get the Viewstate and eventvalidation entries
    # we need to submit them with the form
    rb = RoboBrowser()
    mmg_back['robobrowser'] = rb

    # Set the default parser (lxml)
    # This avoids BeautifulSoup reporting an issue in the console/log
    rb.parser = PARSER

    # Open the form to start the login
    rb.open(login_url)

    # Get the form content
    form = rb.get_form()

    # We will be working with these form fields.
    # Set them as variables for easier re-use
    form_pwd = "ctl00$ContentPlaceHolder1$ctl00$HomePage$SWEPassword"
    # NOTE(review): this key does not look like the other ASP.NET field
    # names and appears masked in this source -- confirm the real name.
    form_usr = "******"
    form_agree = "ctl00$ContentPlaceHolder1$ctl00$HomePage$Agree"
    form_create_acc = "ctl00$ContentPlaceHolder1$ctl00$HomePage$lnk" \
                      "CreateAccount"

    # Set the form field values
    form.fields[form_usr].value = username
    form.fields[form_pwd].value = password

    # There is a javascript popup after hitting submit
    # It seems to set the following field to "True"
    # Default in form is "False"
    form.fields[form_agree].value = "True"

    # Remove the CreateAccount field. It seems to drive the form
    # to the registration page.
    form.fields.pop(form_create_acc)

    # Feed the dynamic ASP.NET validator fields back into the form so
    # that it validates on submission.
    for state_field in ('__VIEWSTATEGENERATOR', '__VIEWSTATE',
                        '__EVENTVALIDATION'):
        form.fields[state_field].value = form.fields[state_field]._value

    # Prepare the form for submission
    form.serialize()

    # submit the form
    rb.submit_form(form)

    if rb.url != dashboard_url:
        # Not redirected to the dashboard: report the page's own error
        # text (if any) plus a generic message, and return the failure.
        err_msg = rb.find("span",
                          {"id": "ctl00_ContentPlaceHolder1"
                                 "_ctl00_HomePage_lblError"})
        if err_msg:
            messages.error(request, err_msg.contents)
        messages.error(request,
                       "We had a problem connecting to your "
                       "Medicare account")
        mmg_back['status'] = "FAIL"
        mmg_back['url'] = rb.url
        return mmg_back

    # <ul id="headertoolbarright">
    # <li class="welcometxt" id="welcomeli">Welcome, JOHN A DOE </li>
    my_name = rb.find("li", {"id": "welcomeli"})
    if my_name:
        my_name = my_name.contents[0].replace("Welcome, ", "")

    my_account = rb.find("div", {"id": "RightContent"})
    if my_account:
        # Turn site-relative links into absolute ones that open in a new
        # tab, so the fragment can be shown outside MyMedicare.gov.
        my_account = my_account.prettify()
        my_account = my_account.replace('href="/',
                                        'target="_blank" '
                                        'href="https://www.mymedicare.gov/')

    # Successfully logged in: hand the retrieved data back to the caller.
    mmg_back['status'] = "OK"
    mmg_back['url'] = rb.url
    mmg_back['mmg_account'] = my_account
    mmg_back['mmg_name'] = my_name
    return mmg_back
def connect(request, mmg):
    """
    Login to MyMedicare.gov using RoboBrowser

    :param request: request object handed to ``messages.error`` when the
        login does not reach the dashboard.
    :param mmg: dict holding the credentials under ``'mmg_user'`` and
        ``'mmg_pwd'``. It is updated in place and is also the return
        value.
    :return: the same ``mmg`` dict with ``'status'`` ("OK" or "FAIL"),
        ``'url'`` and ``'robobrowser'`` set; on success it also carries
        ``'mmg_account'`` and ``'mmg_name'`` (either may be None when the
        corresponding page element is not found).
    """
    login_url = 'https://www.mymedicare.gov/default.aspx'
    dashboard_url = "https://www.mymedicare.gov/dashboard.aspx"

    # The caller's dict is reused (not copied) and starts out as failed.
    mmg_back = mmg
    mmg_back['status'] = "FAIL"

    PARSER = BS_PARSER
    if not PARSER:
        # The parser name is passed as a lazy %-argument; without the
        # placeholder the logging module reports a formatting error.
        logger.debug('Default Parser for BeautifulSoup: %s', 'lxml')
        PARSER = 'lxml'

    username = mmg['mmg_user']
    password = mmg['mmg_pwd']

    # Call the default page
    # We will then want to get the Viewstate and eventvalidation entries
    # we need to submit them with the form
    rb = RoboBrowser()
    mmg_back['robobrowser'] = rb

    # Set the default parser (lxml)
    # This avoids BeautifulSoup reporting an issue in the console/log
    rb.parser = PARSER

    # Open the form to start the login
    rb.open(login_url)

    # Get the form content
    form = rb.get_form()

    # We will be working with these form fields.
    # Set them as variables for easier re-use
    form_pwd = "ctl00$ContentPlaceHolder1$ctl00$HomePage$SWEPassword"
    # NOTE(review): this key does not look like the other ASP.NET field
    # names and appears masked in this source -- confirm the real name.
    form_usr = "******"
    form_agree = "ctl00$ContentPlaceHolder1$ctl00$HomePage$Agree"
    form_create_acc = "ctl00$ContentPlaceHolder1$ctl00$HomePage$lnk" \
                      "CreateAccount"

    # Set the form field values
    form.fields[form_usr].value = username
    form.fields[form_pwd].value = password

    # There is a javascript popup after hitting submit
    # It seems to set the following field to "True"
    # Default in form is "False"
    form.fields[form_agree].value = "True"

    # Remove the CreateAccount field. It seems to drive the form
    # to the registration page.
    form.fields.pop(form_create_acc)

    # Feed the dynamic ASP.NET validator fields back into the form so
    # that it validates on submission.
    for state_field in ('__VIEWSTATEGENERATOR', '__VIEWSTATE',
                        '__EVENTVALIDATION'):
        form.fields[state_field].value = form.fields[state_field]._value

    # Prepare the form for submission
    form.serialize()

    # submit the form
    rb.submit_form(form)

    if rb.url != dashboard_url:
        # Not redirected to the dashboard: report the page's own error
        # text (if any) plus a generic message, and return the failure.
        err_msg = rb.find(
            "span",
            {"id": "ctl00_ContentPlaceHolder1"
                   "_ctl00_HomePage_lblError"})
        if err_msg:
            messages.error(request, err_msg.contents)
        messages.error(
            request,
            "We had a problem connecting to your "
            "Medicare account")
        mmg_back['status'] = "FAIL"
        mmg_back['url'] = rb.url
        return mmg_back

    # <ul id="headertoolbarright">
    # <li class="welcometxt" id="welcomeli">Welcome, JOHN A DOE </li>
    my_name = rb.find("li", {"id": "welcomeli"})
    if my_name:
        my_name = my_name.contents[0].replace("Welcome, ", "")

    my_account = rb.find("div", {"id": "RightContent"})
    if my_account:
        # Turn site-relative links into absolute ones that open in a new
        # tab, so the fragment can be shown outside MyMedicare.gov.
        my_account = my_account.prettify()
        my_account = my_account.replace(
            'href="/',
            'target="_blank" '
            'href="https://www.mymedicare.gov/')

    # Successfully logged in: hand the retrieved data back to the caller.
    mmg_back['status'] = "OK"
    mmg_back['url'] = rb.url
    mmg_back['mmg_account'] = my_account
    mmg_back['mmg_name'] = my_name
    return mmg_back
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
"""Submit a keyword to the Baidu search form and print selected links.

Prints the URL of the result page, then the href of every anchor whose
target starts with http://www.baidu.com/baidu.php.
"""
import re

from robobrowser import RoboBrowser
from requests import Session
from fake_useragent import UserAgent

url = 'http://www.baidu.com'
ua = UserAgent()
keyword = 'sp68'

# Raw string with escaped dots so that "." only matches a literal dot.
link_pattern = re.compile(r"^http://www\.baidu\.com/baidu\.php")

# The context manager closes the session even if a request fails.
with Session() as s:
    br = RoboBrowser(session=s, history=True, user_agent=ua.chrome)
    br.parser = 'lxml'
    br.timeout = 1
    br.open(url)

    form = br.get_form(action='/s')
    form['wd'].value = keyword
    br.submit_form(form)

    # Single-argument print() calls behave the same on Python 2 and 3.
    print(br.url)
    for link in br.find_all('a', href=link_pattern):
        print(link['href'])
def get_medicare_email(request, mmg):
    """
    Fetch the MyMedicare.gov account page and extract the e-mail address.

    :param request: not used in this function.
    :param mmg: dict that is updated in place and is also the return
        value.
    :return: the same ``mmg`` dict with ``'mmg_email'`` set to the text
        found (or "" when nothing was found) and ``'status'`` set to "OK"
        only when the browser stayed on the account page; otherwise
        ``'status'`` is "FAIL". ``'url'`` is set on success.
    """
    target_page = "https://www.mymedicare.gov/myaccount.aspx"

    # The caller's dict is reused (not copied) and starts out as failed.
    mmg_back = mmg
    mmg_back['status'] = "FAIL"
    mmg_back['mmg_email'] = ""

    PARSER = settings.BS_PARSER
    if not PARSER:
        if settings.DEBUG:
            print('Default Parser for BeautifulSoup:', 'lxml')
        PARSER = 'lxml'

    # NOTE(review): a brand-new RoboBrowser is created here, so no
    # session from an earlier login is reused -- confirm that the
    # account page can be reached this way.
    rb = RoboBrowser()
    # Set the default parser (lxml)
    # This avoids BeautifulSoup reporting an issue in the console/log
    rb.parser = PARSER

    rb.open(target_page)

    # Get the page content
    page = rb.parsed

    if settings.DEBUG:
        print("===============================")
        print("on page:", rb.url)
        print("MyAccount:", page)

    # NOTE(review): the value looks like an element id but is matched
    # against the "class" attribute -- confirm against the page markup.
    my_email = rb.find("div",
                       attrs={"class": "ctl00_ctl00_ContentPlaceHolder1_ctl00_ctl00_ctl00_ctl01_UserInfo_pnlEmailSettings"})
    if settings.DEBUG:
        print("What email information:", my_email)

    # Both lookups can come back empty; keep the "" default in that case
    # instead of failing on None.
    mail_address = ""
    if my_email is not None:
        mail_addr = my_email.find("div", attrs={"class": "myaccount-data"})
        if mail_addr is not None:
            mail_address = mail_addr.text
            mmg_back['mmg_email'] = mail_address

    # Staying on the account page is treated as success.
    if rb.url == target_page:
        mmg_back['url'] = rb.url
        mmg_back['status'] = "OK"

    if settings.DEBUG:
        print("Email:", mail_address)
        print("url:", rb.url)

    return mmg_back