Exemplo n.º 1
0
def getDistrict(m_url, element):
    """Open ``m_url`` and wrap the drop-down whose id is ``element``.

    Args:
        m_url: URL handed to ``utils.getDriver`` to obtain a browser session.
        element: ``id`` attribute of the ``<select>`` element to wrap.

    Returns:
        A ``(driver, select, option_count)`` tuple: the still-open driver,
        the ``Select`` wrapper around the located element and the number of
        options it exposes.  The driver is not quit here on success, so the
        caller is expected to do it.

    Raises:
        Whatever the element lookup or ``Select`` construction raises; the
        driver is quit before the exception propagates so the browser
        session is not leaked.
    """
    driver = utils.getDriver(m_url)
    try:
        # ``find_element_by_id`` was removed in Selenium 4.3; the generic
        # ``find_element("id", ...)`` form is accepted by Selenium 3 and 4.
        mySelect_D = Select(driver.find_element("id", element))
        # Per the original note: real entries start from 1, option 0 is the
        # "-- Select" placeholder, and it is included in this count.
        num_D = len(mySelect_D.options)
    except Exception:
        driver.quit()
        raise
    return driver, mySelect_D, num_D
Exemplo n.º 2
0
import re
import urllib
sys.path.insert(0, '../tools/')
import utils

mdir = '../data/TN/'
m_url = "http://elections.tn.gov.in/PDF/"

i_start = 1
i_end = 235  # 235
j_start = 0

for i in range(i_start, i_end):
    time.sleep(5)
    p_url = m_url + "ac{}.htm".format(i)
    driver = utils.getDriver(p_url)
    html = driver.page_source
    soup = BeautifulSoup(html, "lxml")
    driver.quit()
    find_a = soup.find_all('a', attrs={'href': re.compile("^http://")})
    if len(find_a) == 0:
        find_a = soup.find_all('a', attrs={'href': re.compile("^dt")})
    for j in range(j_start, len(find_a)):
        time.sleep(1)
        print("\n", i, j)
        url = find_a[j]['href']
        fid = url.split("PDF/")[1].replace("/", "_")
        try:
            flag = utils.download_file(url, mdir, fid)
            if flag == 0:
                with open("tn.txt", "a") as myfile:
Exemplo n.º 3
0
@author: dhingratul
"""
import time
from bs4 import BeautifulSoup
import urllib
import sys
sys.path.insert(0, '../tools/')
import utils

# Landing page that lists the constituencies in a table.
m_url = "http://ceomeghalaya.nic.in/erolls/erolldetails.html"
# NOTE(review): mdir and base_url are not used in the visible part of this
# script; presumably a download step that follows consumes them - confirm.
mdir = '../data/Meghalaya/'
# Prefix joined with each relative href found on the landing page.
page_url = "http://ceomeghalaya.nic.in/erolls/"
base_url = "http://ceomeghalaya.nic.in/erolls/pdf/english/"

# Fetch the landing page once, keep its HTML and release the browser.
driver = utils.getDriver(m_url)
html = driver.page_source
soup = BeautifulSoup(html, "lxml")
# Only anchors inside the first <table> that carry an href are considered.
table = soup.find('table')
find_a = table.find_all('a', href=True)
driver.quit()
# Start offsets for the loops; the anchor at index 0 is skipped.
# NOTE(review): j_start is not used in the visible part of this script.
i_start = 1
j_start = 1
for i in range(i_start, len(find_a)):
    # Per-constituency page URL built from the relative href.
    const_url = page_url + find_a[i]['href']
    # Integer that precedes the first "-" in the href.
    # NOTE(review): presumably the assembly-constituency number - confirm.
    ac = int(find_a[i]['href'].split("-")[0])
    # A fresh driver is opened for every constituency page and quit as soon
    # as its rows have been parsed.
    driver = utils.getDriver(const_url)
    html = driver.page_source
    soup = BeautifulSoup(html, "lxml")
    rows = soup.findAll('tr')
    driver.quit()
Exemplo n.º 4
0
def _eccs_check_time():
    """Return the current UTC time formatted as ``YYYY-MM-DDTHH:MM:SSZ``."""
    return datetime.datetime.now(
        datetime.timezone.utc).strftime('%Y-%m-%dT%H:%M:%S') + 'Z'


def _eccs_report(test, label_idp, fqdn_sp, html_text, console_text):
    """Store ``html_text`` in the IdP/SP result file or print ``console_text``.

    When ``test`` is anything other than ``True`` the text is written to
    ``<ECCS2HTMLDIR>/<DAY>/<label_idp>---<fqdn_sp>.html`` (overwriting it);
    when ``test`` is ``True`` nothing is written and ``console_text`` is
    printed instead.
    """
    if test is not True:
        with open(
                "%s/%s/%s---%s.html" %
            (ECCS2HTMLDIR, DAY, label_idp, fqdn_sp), "w") as html:
            html.write(html_text)
    else:
        print(console_text)


def checkIdP(sp, idp, test):
    """Check whether an IdP shows a usable login page for the given SP.

    Args:
        sp: WAYFless URL prefix of the Service Provider; the IdP entityID is
            appended to it to build the URL that is opened in the browser.
        idp: Mapping that provides at least the ``'entityID'`` and
            ``'registrationAuthority'`` keys.
        test: When ``True`` results are printed instead of being written to
            the per-IdP/SP HTML file.

    Returns:
        ``None`` when no WebDriver could be obtained or when loading the page
        raised an unexpected exception; otherwise the tuple
        ``(entityID, wayfless_url, check_time, status_code, result)`` where
        ``result`` is one of ``"DISABLED"``, ``"Timeout"``, ``"SSL-Error"``,
        ``"No-eduGAIN-Metadata"``, ``"Invalid-Form"`` or ``"OK"`` and
        ``status_code`` is ``"NULL"``, ``"(failed)"``, an HTTP status code as
        a string, or one of the sentinels ``"111"`` (timeout), ``"222"``
        (too many redirects), ``"333"`` (other requests error) and ``"555"``
        (any other exception).
    """
    # Disable SSL requests warning messages
    requests.packages.urllib3.disable_warnings()

    label_idp = getIDPlabel(idp['entityID'])
    # WebDriver MUST be instanced here to avoid problems with SESSION
    driver = getDriver(label_idp, ECCS2SELENIUMDEBUG)

    # No driver available: nothing can be checked.
    if driver is None:
        return None

    # Everything that happens while the driver is alive sits inside this
    # try/finally so that the browser is always quit, including on the
    # "DISABLED" early returns and on unexpected errors.
    try:
        fqdn_sp = parse_url(sp)[2]
        wayfless_url = sp + idp['entityID']

        # Blacklists: a disabled federation takes precedence over a disabled
        # IdP; the stored value is the notice to publish.
        disabled_notice = None
        if idp['registrationAuthority'] in FEDS_DISABLED_DICT:
            disabled_notice = (
                "%s" % FEDS_DISABLED_DICT[idp['registrationAuthority']])
        elif idp['entityID'] in IDPS_DISABLED_DICT:
            disabled_notice = "%s" % IDPS_DISABLED_DICT[idp['entityID']]

        if disabled_notice is not None:
            check_time = _eccs_check_time()
            _eccs_report(test, label_idp, fqdn_sp, disabled_notice,
                         disabled_notice)
            return (idp['entityID'], wayfless_url, check_time, "NULL",
                    "DISABLED")

        # Open the WAYFless URL to reach the IdP login page to check
        try:
            check_time = _eccs_check_time()
            driver.get(wayfless_url)
            page_source = driver.page_source
            samlrequest_url = driver.current_url

            # Put the page_source into an appropriate HTML file
            _eccs_report(
                test, label_idp, fqdn_sp, page_source,
                "\n[page_source of '%s' for sp '%s']\n%s" %
                (label_idp, fqdn_sp, page_source))

        except TimeoutException:
            # Put an empty string into the page_source file
            _eccs_report(
                test, label_idp, fqdn_sp, "",
                "\n[page_source of '%s' for sp '%s']\nNo source code" %
                (label_idp, fqdn_sp))
            return (idp['entityID'], wayfless_url, check_time, "(failed)",
                    "Timeout")

        except Exception as e:
            print("!!! EXCEPTION DRIVER !!!")
            print(e.__str__())
            print("IdP: %s\nSP: %s" % (idp['entityID'], sp))
            return None

    finally:
        driver.quit()

    robots = ""

    try:
        headers = {'User-Agent': '%s' % ROBOTS_USER_AGENT}

        fqdn_idp = getIDPfqdn(samlrequest_url)

        robots = requests.get("https://%s/robots.txt" % fqdn_idp,
                              headers=headers,
                              verify=True,
                              timeout=ECCS2REQUESTSTIMEOUT)

        # NOTE(review): a value returned by requests.get() never compares
        # equal to "", so this HTTP fallback is currently unreachable.  It is
        # kept unchanged because enabling it would alter which requests are
        # sent - decide whether the fallback is wanted and fix the condition.
        if (robots == ""):
            robots = requests.get("http://%s/robots.txt" % fqdn_idp,
                                  headers=headers,
                                  verify=True,
                                  timeout=ECCS2REQUESTSTIMEOUT)

    # Catch only SSL Exception. Don't block the ECCS check if other exceptions occurred
    except requests.exceptions.SSLError as e:
        check_time = _eccs_check_time()

        _eccs_report(
            test, label_idp, fqdn_sp,
            "<p>IdP excluded from check due the following SSL Error:<br/><br/>%s</p><p>Check it on SSL Labs: <a href='https://www.ssllabs.com/ssltest/analyze.html?d=%s'>Click Here</a></p>"
            % (e.__str__(), fqdn_idp),
            "IdP excluded from check due the following SSL Error:\n\n%s\n\nCheck it on SSL Labs: https://www.ssllabs.com/ssltest/analyze.html?d=%s"
            % (e.__str__(), fqdn_idp))

        return (idp['entityID'], wayfless_url, check_time, "(failed)",
                "SSL-Error")

    # Pass every other exceptions on /robots.txt file. Only SSLError matters.
    except Exception:
        robots = ""

    # A response object is truthy only for a successful status code, so a
    # missing robots.txt simply skips this block.
    if robots:
        check_time = _eccs_check_time()

        p = re.compile(r'^User-agent:\sECCS\sDisallow:\s\/\s*$', re.MULTILINE)
        m = p.search(robots.text)

        if m:
            _eccs_report(test, label_idp, fqdn_sp,
                         "IdP excluded from check by robots.txt",
                         "IdP excluded from check by robots.txt")

            return (idp['entityID'], wayfless_url, check_time, "NULL",
                    "DISABLED")

    # Error messages shown by IdPs that do not know the SP's metadata.
    pattern_metadata = r"Unable.to.locate(\sissuer.in|).metadata(\sfor|)|no.metadata.found|profile.is.not.configured.for.relying.party|Cannot.locate.entity|fail.to.load.unknown.provider|does.not.recognise.the.service|unable.to.load.provider|Nous.n'avons.pas.pu.(charg|charger).le.fournisseur.de service|Metadata.not.found|application.you.have.accessed.is.not.registered.for.use.with.this.service|Message.did.not.meet.security.requirements"

    # NOTE(review): the beginning of the next two patterns was masked
    # ('******') in the reviewed copy of this file, which left the lines
    # syntactically invalid.  The prefix `<input[\s]+[^>]*(` ... `type=\s*[\'`
    # is a reconstruction that balances the surviving tail - verify it
    # against the authoritative source before relying on it.
    pattern_username = r'<input[\s]+[^>]*((type=\s*[\'"](text|email)[\'"]|user)|(name=\s*[\'"](name)[\'"]))[^>]*>'
    pattern_password = r'<input[\s]+[^>]*(type=\s*[\'"]password[\'"]|password)[^>]*>'

    metadata_not_found = re.search(pattern_metadata, page_source, re.I)
    username_found = re.search(pattern_username, page_source, re.I)
    password_found = re.search(pattern_password, page_source, re.I)

    # Fetch the HTTP status code of the login page with a browser-like
    # User-Agent.  Certificate verification is off here on purpose: SSL
    # problems were already handled by the robots.txt request above.
    try:
        headers = {
            'User-Agent':
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'
        }
        status_code = str(
            requests.get(samlrequest_url,
                         headers=headers,
                         verify=False,
                         timeout=ECCS2REQUESTSTIMEOUT).status_code)

    # Handler order matters: ConnectionError is tested before Timeout, so a
    # connect timeout (which is both) is reported as "(failed)".
    except requests.exceptions.ConnectionError:
        print(
            "status-code: (failed) - ConnectionError for IdP '%s' with SP '%s'"
            % (idp['entityID'], sp))
        status_code = "(failed)"

    except requests.exceptions.Timeout:
        print("status-code: 111 - TimeoutError for IdP '%s' with SP '%s'" %
              (idp['entityID'], sp))
        status_code = "111"

    except requests.exceptions.TooManyRedirects:
        print(
            "status-code: 222 - TooManyRedirectsError for IdP '%s' with SP '%s'"
            % (idp['entityID'], sp))
        status_code = "222"

    except requests.exceptions.RequestException as e:
        print("status-code: 333 - RequestException for IdP '%s' with SP '%s'" %
              (idp['entityID'], sp))
        print(e.__str__())
        status_code = "333"

    except Exception as e:
        print("status-code: 555 - OtherException for IdP '%s' with SP '%s'" %
              (idp['entityID'], sp))
        print(e.__str__())
        status_code = "555"

    if metadata_not_found:
        return (idp['entityID'], wayfless_url, check_time, status_code,
                "No-eduGAIN-Metadata")
    elif not username_found or not password_found:
        return (idp['entityID'], wayfless_url, check_time, status_code,
                "Invalid-Form")
    else:
        return (idp['entityID'], wayfless_url, check_time, status_code, "OK")