def getDistrict(m_url, element):
    """Open *m_url* and wrap the drop-down with HTML id *element* in a ``Select``.

    Args:
        m_url: URL handed to ``utils.getDriver`` (presumably returns a
            Selenium WebDriver with the page loaded -- confirm in utils).
        element: ``id`` attribute of the ``<select>`` element to locate.

    Returns:
        A ``(driver, select, count)`` tuple: the live driver (the caller is
        responsible for calling ``driver.quit()``), the ``Select`` wrapper
        and the number of options it exposes.

    Raises:
        Whatever the element lookup or ``Select`` construction raises; the
        driver is quit before the exception propagates.
    """
    driver = utils.getDriver(m_url)
    try:
        # find_element(by, value) is available in both Selenium 3 and 4,
        # whereas find_element_by_id was removed in Selenium 4.3.
        # "id" is the string value of selenium's By.ID locator strategy.
        mySelect_D = Select(driver.find_element("id", element))
        num_D = len(mySelect_D.options)
    except Exception:
        # Do not leak the browser when the drop-down cannot be found.
        driver.quit()
        raise
    # Original note: "Start from 1, 0 -- Select" -- option 0 is apparently
    # the placeholder entry, so callers are expected to iterate from index 1.
    return driver, mySelect_D, num_D
import re import urllib sys.path.insert(0, '../tools/') import utils mdir = '../data/TN/' m_url = "http://elections.tn.gov.in/PDF/" i_start = 1 i_end = 235 # 235 j_start = 0 for i in range(i_start, i_end): time.sleep(5) p_url = m_url + "ac{}.htm".format(i) driver = utils.getDriver(p_url) html = driver.page_source soup = BeautifulSoup(html, "lxml") driver.quit() find_a = soup.find_all('a', attrs={'href': re.compile("^http://")}) if len(find_a) == 0: find_a = soup.find_all('a', attrs={'href': re.compile("^dt")}) for j in range(j_start, len(find_a)): time.sleep(1) print("\n", i, j) url = find_a[j]['href'] fid = url.split("PDF/")[1].replace("/", "_") try: flag = utils.download_file(url, mdir, fid) if flag == 0: with open("tn.txt", "a") as myfile:
@author: dhingratul """ import time from bs4 import BeautifulSoup import urllib import sys sys.path.insert(0, '../tools/') import utils m_url = "http://ceomeghalaya.nic.in/erolls/erolldetails.html" mdir = '../data/Meghalaya/' page_url = "http://ceomeghalaya.nic.in/erolls/" base_url = "http://ceomeghalaya.nic.in/erolls/pdf/english/" driver = utils.getDriver(m_url) html = driver.page_source soup = BeautifulSoup(html, "lxml") table = soup.find('table') find_a = table.find_all('a', href=True) driver.quit() i_start = 1 j_start = 1 for i in range(i_start, len(find_a)): const_url = page_url + find_a[i]['href'] ac = int(find_a[i]['href'].split("-")[0]) driver = utils.getDriver(const_url) html = driver.page_source soup = BeautifulSoup(html, "lxml") rows = soup.findAll('tr') driver.quit()
def _eccs_check_time():
    """Return the current UTC time formatted as ``YYYY-MM-DDTHH:MM:SSZ``."""
    now = datetime.datetime.now(datetime.timezone.utc)
    return now.strftime('%Y-%m-%dT%H:%M:%S') + 'Z'


def _eccs_report(label_idp, fqdn_sp, test, file_text, console_text):
    """Write *file_text* to the IdP/SP HTML file, or print *console_text* if *test* is True."""
    if test is not True:
        path = "%s/%s/%s---%s.html" % (ECCS2HTMLDIR, DAY, label_idp, fqdn_sp)
        with open(path, "w") as html:
            html.write(file_text)
    else:
        print(console_text)


def checkIdP(sp, idp, test):
    """Check whether *idp* shows a usable login form when reached from *sp*.

    Args:
        sp: WAYFless URL prefix of the service provider; the IdP entityID is
            appended to it to build the URL that is loaded.
        idp: mapping with at least the keys ``'entityID'`` and
            ``'registrationAuthority'``.
        test: when exactly ``True`` the collected page/notice is printed
            instead of being written below ``ECCS2HTMLDIR/DAY``.

    Returns:
        ``None`` when no WebDriver could be created or the driver raised an
        unexpected exception; otherwise the tuple
        ``(entityID, wayfless_url, check_time, status_code, check_result)``
        where ``check_result`` is one of ``"DISABLED"``, ``"Timeout"``,
        ``"SSL-Error"``, ``"No-eduGAIN-Metadata"``, ``"Invalid-Form"`` or
        ``"OK"``.
    """
    # Disable SSL requests warning messages
    requests.packages.urllib3.disable_warnings()

    entity_id = idp['entityID']
    label_idp = getIDPlabel(entity_id)

    # WebDriver MUST be instanced here to avoid problems with SESSION
    driver = getDriver(label_idp, ECCS2SELENIUMDEBUG)

    # getDriver signals a WebDriver failure by returning None
    if driver is None:
        return None

    fqdn_sp = parse_url(sp)[2]
    wayfless_url = sp + entity_id

    robots = ""

    # Blacklists: a disabled federation takes precedence over a disabled IdP.
    blacklists = (
        (idp['registrationAuthority'], FEDS_DISABLED_DICT),
        (entity_id, IDPS_DISABLED_DICT),
    )
    for key, disabled in blacklists:
        if key in disabled:
            # The browser is not needed for a disabled entry; quit it here so
            # that this early return does not leak a WebDriver process.
            driver.quit()
            check_time = _eccs_check_time()
            notice = "%s" % disabled[key]
            _eccs_report(label_idp, fqdn_sp, test, notice, notice)
            return (entity_id, wayfless_url, check_time, "NULL", "DISABLED")

    # Open the WAYFless URL to reach the IdP login page to check
    try:
        check_time = _eccs_check_time()
        driver.get(wayfless_url)
        page_source = driver.page_source
        samlrequest_url = driver.current_url

        # Store the page_source into the appropriate HTML file (or print it)
        _eccs_report(
            label_idp, fqdn_sp, test, page_source,
            "\n[page_source of '%s' for sp '%s']\n%s" %
            (label_idp, fqdn_sp, page_source))
    except TimeoutException:
        # Store an empty string as page_source
        _eccs_report(
            label_idp, fqdn_sp, test, "",
            "\n[page_source of '%s' for sp '%s']\nNo source code" %
            (label_idp, fqdn_sp))
        return (entity_id, wayfless_url, check_time, "(failed)", "Timeout")
    except Exception as e:
        print("!!! EXCEPTION DRIVER !!!")
        print(e.__str__())
        print("IdP: %s\nSP: %s" % (entity_id, sp))
        return None
    finally:
        driver.quit()

    try:
        headers = {'User-Agent': '%s' % ROBOTS_USER_AGENT}
        fqdn_idp = getIDPfqdn(samlrequest_url)
        robots = requests.get("https://%s/robots.txt" % fqdn_idp,
                              headers=headers,
                              verify=True,
                              timeout=ECCS2REQUESTSTIMEOUT)
        # NOTE(review): requests.get() returns a Response, which never equals
        # "", so this HTTP fallback is currently unreachable. Left unchanged
        # on purpose: enabling it would add one extra request per IdP.
        if (robots == ""):
            robots = requests.get("http://%s/robots.txt" % fqdn_idp,
                                  headers=headers,
                                  verify=True,
                                  timeout=ECCS2REQUESTSTIMEOUT)
    # Catch only SSL Exception. Don't block the ECCS check if other exceptions occurred
    except requests.exceptions.SSLError as e:
        check_time = _eccs_check_time()
        _eccs_report(
            label_idp, fqdn_sp, test,
            "<p>IdP excluded from check due the following SSL Error:<br/><br/>%s</p><p>Check it on SSL Labs: <a href='https://www.ssllabs.com/ssltest/analyze.html?d=%s'>Click Here</a></p>"
            % (e.__str__(), fqdn_idp),
            "IdP excluded from check due the following SSL Error:\n\n%s\n\nCheck it on SSL Labs: https://www.ssllabs.com/ssltest/analyze.html?d=%s"
            % (e.__str__(), fqdn_idp))
        return (entity_id, wayfless_url, check_time, "(failed)", "SSL-Error")
    # Deliberately ignore every other failure on /robots.txt: only SSLError
    # excludes the IdP from the check.
    except Exception:
        robots = ""

    # A Response is truthy only when its status code is below 400.
    if robots:
        check_time = _eccs_check_time()
        robots_rule = re.compile(r'^User-agent:\sECCS\sDisallow:\s\/\s*$',
                                 re.MULTILINE)
        if robots_rule.search(robots.text):
            _eccs_report(label_idp, fqdn_sp, test,
                         "IdP excluded from check by robots.txt",
                         "IdP excluded from check by robots.txt")
            return (entity_id, wayfless_url, check_time, "NULL", "DISABLED")

    pattern_metadata = r"Unable.to.locate(\sissuer.in|).metadata(\sfor|)|no.metadata.found|profile.is.not.configured.for.relying.party|Cannot.locate.entity|fail.to.load.unknown.provider|does.not.recognise.the.service|unable.to.load.provider|Nous.n'avons.pas.pu.(charg|charger).le.fournisseur.de service|Metadata.not.found|application.you.have.accessed.is.not.registered.for.use.with.this.service|Message.did.not.meet.security.requirements"
    # NOTE(review): in the reviewed copy the head of the next two literals had
    # been replaced by "******" (secret-scrubber damage), leaving invalid
    # Python. The "<input ..." prefixes below were reconstructed from the
    # surviving tail and its parenthesis balance -- confirm against the
    # upstream source before relying on them.
    pattern_username = r'<input[\s]+[^>]*((type=\s*[\'"](text|email)[\'"]|user)|(name=\s*[\'"](name)[\'"]))[^>]*>'
    pattern_password = r'<input[\s]+[^>]*(type=\s*[\'"]password[\'"]|password)[^>]*>'

    metadata_not_found = re.search(pattern_metadata, page_source, re.I)
    username_found = re.search(pattern_username, page_source, re.I)
    password_found = re.search(pattern_password, page_source, re.I)

    # Fetch the HTTP status code of the login page. Request failures are
    # mapped onto sentinel codes instead of aborting the check.
    try:
        headers = {
            'User-Agent':
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'
        }
        status_code = str(
            requests.get(samlrequest_url,
                         headers=headers,
                         verify=False,
                         timeout=ECCS2REQUESTSTIMEOUT).status_code)
    except requests.exceptions.ConnectionError:
        print(
            "status-code: (failed) - ConnectionError for IdP '%s' with SP '%s'"
            % (entity_id, sp))
        status_code = "(failed)"
    except requests.exceptions.Timeout:
        print("status-code: 111 - TimeoutError for IdP '%s' with SP '%s'" %
              (entity_id, sp))
        status_code = "111"
    except requests.exceptions.TooManyRedirects:
        print(
            "status-code: 222 - TooManyRedirectsError for IdP '%s' with SP '%s'"
            % (entity_id, sp))
        status_code = "222"
    except requests.exceptions.RequestException as e:
        print("status-code: 333 - RequestException for IdP '%s' with SP '%s'" %
              (entity_id, sp))
        print(e.__str__())
        status_code = "333"
    except Exception as e:
        print("status-code: 555 - OtherException for IdP '%s' with SP '%s'" %
              (entity_id, sp))
        print(e.__str__())
        status_code = "555"

    if metadata_not_found:
        check_result = "No-eduGAIN-Metadata"
    elif not username_found or not password_found:
        check_result = "Invalid-Form"
    else:
        check_result = "OK"
    return (entity_id, wayfless_url, check_time, status_code, check_result)